Class: TokenEstimator::Estimator

Inherits:
Object
  • Object
show all
Defined in:
lib/token_estimator.rb

Constant Summary collapse

SUPPORTED_FILE_TYPES =
[".txt", ".csv", ".pdf", ".json", ".md", ".html", ".xlsx"]

Instance Method Summary collapse

Constructor Details

#initialize(tokenizer_name) ⇒ Estimator

Returns a new instance of Estimator.



18
19
20
# File 'lib/token_estimator.rb', line 18

# Builds an estimator backed by a pretrained tokenizer.
#
# The tokenizer is obtained from +Tokenizers.from_pretrained+ and kept in
# +@tokenizer+ for the lifetime of the instance.
#
# @param tokenizer_name [String] identifier passed straight through to
#   +Tokenizers.from_pretrained+ (presumably a pretrained model name —
#   confirm against the Tokenizers library)
def initialize(tokenizer_name)
  @tokenizer = Tokenizers.from_pretrained(tokenizer_name)
end

Instance Method Details

#count_tokens_from_csv_file(file_path) ⇒ Object



50
51
52
53
# File 'lib/token_estimator.rb', line 50

# Counts the tokens contained in a CSV file.
#
# The file is turned into text by +extract_text_from_csv+, and that text is
# then handed to +count_tokens_from_text+.
#
# @param file_path [String] path of the CSV file to measure
# @return the token count reported by +count_tokens_from_text+
def count_tokens_from_csv_file(file_path)
  count_tokens_from_text(extract_text_from_csv(file_path))
end

#count_tokens_from_excel_file(file_path) ⇒ Object



44
45
46
47
48
# File 'lib/token_estimator.rb', line 44

# Counts the tokens contained in an .xlsx workbook.
#
# Opens the workbook with +Roo::Excelx+, converts it to text through
# +extract_text_from_excel+, and counts the tokens of that text.
#
# @param file_path [String] path of the workbook, passed to +Roo::Excelx.new+
# @return the token count reported by +count_tokens_from_text+
def count_tokens_from_excel_file(file_path)
  workbook = Roo::Excelx.new(file_path)
  count_tokens_from_text(extract_text_from_excel(workbook))
end

#count_tokens_from_file(file_path) ⇒ Object



22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/token_estimator.rb', line 22

# Counts the tokens in a file, choosing the reader from the file extension.
#
# Supported extensions are .txt, .csv, .pdf, .json, .md, .html and .xlsx.
# Matching is case-insensitive, so "REPORT.TXT" is handled like "report.txt".
#
# @param file_path [String] path of the file to measure
# @return the token count produced by the format-specific counter
# @raise [UnsupportedFileTypeError] when the extension is not one of the
#   supported types; the message quotes the extension exactly as given
def count_tokens_from_file(file_path)
  extension = File.extname(file_path)
  # File.extname preserves the original case; normalise only for matching so
  # the error message below still shows what the caller actually passed.
  case extension.downcase
  when ".txt"
    count_tokens_from_txt_file(file_path)
  when ".csv"
    count_tokens_from_csv_file(file_path)
  when ".pdf"
    count_tokens_from_pdf_file(file_path)
  when ".json"
    count_tokens_from_json_file(file_path)
  when ".md"
    count_tokens_from_markdown_file(file_path)
  when ".html"
    count_tokens_from_html_file(file_path)
  when ".xlsx"
    count_tokens_from_excel_file(file_path)
  else
    raise UnsupportedFileTypeError, "File type \"#{extension}\" is not supported"
  end
end

#count_tokens_from_html(html_content) ⇒ Object



90
91
92
93
# File 'lib/token_estimator.rb', line 90

# Counts the tokens in an HTML string.
#
# The markup is reduced to text by +extract_text_from_html+ before the
# tokens are counted.
#
# @param html_content [String] HTML source to measure
# @return the token count reported by +count_tokens_from_text+
def count_tokens_from_html(html_content)
  count_tokens_from_text(extract_text_from_html(html_content))
end

#count_tokens_from_html_file(file_path) ⇒ Object



78
79
80
81
# File 'lib/token_estimator.rb', line 78

# Counts the tokens in an HTML file.
#
# Reads the whole file and delegates to +count_tokens_from_html+.
#
# @param file_path [String] path of the HTML file to read
# @return the token count reported by +count_tokens_from_html+
def count_tokens_from_html_file(file_path)
  count_tokens_from_html(File.read(file_path))
end

#count_tokens_from_json(json_data) ⇒ Object

util



85
86
87
88
# File 'lib/token_estimator.rb', line 85

# Counts the tokens in already-parsed JSON data.
#
# The data is converted to text by +extract_text_from_json+ before the
# tokens are counted.
#
# @param json_data [Object] parsed JSON value, passed unchanged to
#   +extract_text_from_json+
# @return the token count reported by +count_tokens_from_text+
def count_tokens_from_json(json_data)
  count_tokens_from_text(extract_text_from_json(json_data))
end

#count_tokens_from_json_file(file_path) ⇒ Object



72
73
74
75
76
# File 'lib/token_estimator.rb', line 72

# Counts the tokens in a JSON file.
#
# Reads the file, parses it with +JSON.parse+, and delegates the parsed
# value to +count_tokens_from_json+.
#
# @param file_path [String] path of the JSON file to read
# @return the token count reported by +count_tokens_from_json+
def count_tokens_from_json_file(file_path)
  parsed = JSON.parse(File.read(file_path))
  count_tokens_from_json(parsed)
end

#count_tokens_from_markdown_file(file_path) ⇒ Object



66
67
68
69
70
# File 'lib/token_estimator.rb', line 66

# Counts the tokens in a Markdown file.
#
# The file is read, rendered to HTML with +Kramdown::Document#to_html+, and
# the resulting HTML is counted via +count_tokens_from_html+.
#
# @param file_path [String] path of the Markdown file to read
# @return the token count reported by +count_tokens_from_html+
def count_tokens_from_markdown_file(file_path)
  rendered = Kramdown::Document.new(File.read(file_path)).to_html
  count_tokens_from_html(rendered)
end

#count_tokens_from_pdf_file(file_path) ⇒ Object



55
56
57
58
59
# File 'lib/token_estimator.rb', line 55

# Counts the tokens in a PDF file.
#
# Opens the document with +PDF::Reader+, converts it to text through
# +extract_text_from_pdf+, and counts the tokens of that text.
#
# @param file_path [String] path of the PDF, passed to +PDF::Reader.new+
# @return the token count reported by +count_tokens_from_text+
def count_tokens_from_pdf_file(file_path)
  document = PDF::Reader.new(file_path)
  count_tokens_from_text(extract_text_from_pdf(document))
end

#count_tokens_from_text(text) ⇒ Object



95
96
97
98
# File 'lib/token_estimator.rb', line 95

# Counts the tokens the configured tokenizer produces for a piece of text.
#
# Encodes +text+ with +@tokenizer+ and returns how many entries the
# encoding's +tokens+ collection holds.
#
# @param text [String] text to tokenize
# @return the value of +count+ on the encoding's tokens
def count_tokens_from_text(text)
  @tokenizer.encode(text).tokens.count
end

#count_tokens_from_txt_file(file_path) ⇒ Object



61
62
63
64
# File 'lib/token_estimator.rb', line 61

# Counts the tokens in a plain-text file.
#
# Reads the whole file and delegates to +count_tokens_from_text+.
#
# @param file_path [String] path of the text file to read
# @return the token count reported by +count_tokens_from_text+
def count_tokens_from_txt_file(file_path)
  count_tokens_from_text(File.read(file_path))
end