Class: TokenEstimator::Estimator
- Inherits:
-
Object
- Object
- TokenEstimator::Estimator
- Defined in:
- lib/token_estimator.rb
Constant Summary collapse
- SUPPORTED_FILE_TYPES =
[".txt", ".csv", ".pdf", ".json", ".md", ".html", ".xlsx"]
Instance Method Summary collapse
- #count_tokens_from_csv_file(file_path) ⇒ Object
- #count_tokens_from_excel_file(file_path) ⇒ Object
- #count_tokens_from_file(file_path) ⇒ Object
- #count_tokens_from_html(html_content) ⇒ Object
- #count_tokens_from_html_file(file_path) ⇒ Object
-
#count_tokens_from_json(json_data) ⇒ Object
Utility method: counts the tokens in the text extracted from already-parsed JSON data.
- #count_tokens_from_json_file(file_path) ⇒ Object
- #count_tokens_from_markdown_file(file_path) ⇒ Object
- #count_tokens_from_pdf_file(file_path) ⇒ Object
- #count_tokens_from_text(text) ⇒ Object
- #count_tokens_from_txt_file(file_path) ⇒ Object
-
#initialize(tokenizer_name) ⇒ Estimator
constructor
A new instance of Estimator.
Constructor Details
#initialize(tokenizer_name) ⇒ Estimator
Returns a new instance of Estimator.
18 19 20 |
# File 'lib/token_estimator.rb', line 18
#
# Sets up the estimator by loading a tokenizer through
# Tokenizers.from_pretrained and keeping it in @tokenizer.
#
# @param tokenizer_name passed unchanged to Tokenizers.from_pretrained
#   (presumably a pretrained tokenizer/model identifier -- confirm against
#   the tokenizers gem)
# @return [Estimator] a new instance of Estimator
def initialize(tokenizer_name)
  @tokenizer = Tokenizers.from_pretrained(tokenizer_name)
end
Instance Method Details
#count_tokens_from_csv_file(file_path) ⇒ Object
50 51 52 53 |
# File 'lib/token_estimator.rb', line 50
#
# Counts the tokens in a CSV file: the text is pulled out with
# extract_text_from_csv and handed to #count_tokens_from_text.
#
# @param file_path path of the CSV file, forwarded to extract_text_from_csv
# @return whatever #count_tokens_from_text returns for the extracted text
def count_tokens_from_csv_file(file_path)
  count_tokens_from_text(extract_text_from_csv(file_path))
end
#count_tokens_from_excel_file(file_path) ⇒ Object
44 45 46 47 48 |
# File 'lib/token_estimator.rb', line 44
#
# Counts the tokens in an .xlsx workbook. The workbook is opened with
# Roo::Excelx, flattened to text by extract_text_from_excel, and the text
# is handed to #count_tokens_from_text.
#
# @param file_path path of the workbook, passed to Roo::Excelx.new
# @return whatever #count_tokens_from_text returns for the extracted text
def count_tokens_from_excel_file(file_path)
  workbook = Roo::Excelx.new(file_path)
  count_tokens_from_text(extract_text_from_excel(workbook))
end
#count_tokens_from_file(file_path) ⇒ Object
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
# File 'lib/token_estimator.rb', line 22
#
# Counts the tokens in a file, picking the format-specific counter from the
# file's extension.
#
# The extension is compared case-insensitively, so "report.PDF" is handled
# the same way as "report.pdf".
#
# @param file_path path of the file to process; only its extension is
#   inspected here, the path itself is forwarded to the chosen counter
# @return whatever the selected count_tokens_from_*_file method returns
# @raise [UnsupportedFileTypeError] when the extension matches none of the
#   handled types (.txt, .csv, .pdf, .json, .md, .html, .xlsx)
def count_tokens_from_file(file_path)
  extension = File.extname(file_path)

  # Match on the down-cased extension, but keep the original spelling for
  # the error message so it reflects what the caller actually passed.
  case extension.downcase
  when ".txt"
    count_tokens_from_txt_file(file_path)
  when ".csv"
    count_tokens_from_csv_file(file_path)
  when ".pdf"
    count_tokens_from_pdf_file(file_path)
  when ".json"
    count_tokens_from_json_file(file_path)
  when ".md"
    count_tokens_from_markdown_file(file_path)
  when ".html"
    count_tokens_from_html_file(file_path)
  when ".xlsx"
    count_tokens_from_excel_file(file_path)
  else
    raise UnsupportedFileTypeError, "File type \"#{extension}\" is not supported"
  end
end
#count_tokens_from_html(html_content) ⇒ Object
90 91 92 93 |
# File 'lib/token_estimator.rb', line 90
#
# Counts the tokens in an HTML document given as a string: the text is
# obtained from extract_text_from_html and passed to #count_tokens_from_text.
#
# @param html_content HTML markup, forwarded to extract_text_from_html
# @return whatever #count_tokens_from_text returns for the extracted text
def count_tokens_from_html(html_content)
  count_tokens_from_text(extract_text_from_html(html_content))
end
#count_tokens_from_html_file(file_path) ⇒ Object
78 79 80 81 |
# File 'lib/token_estimator.rb', line 78
#
# Counts the tokens in an HTML file by reading it in full and delegating to
# #count_tokens_from_html.
#
# @param file_path path of the HTML file, read with File.read
# @return whatever #count_tokens_from_html returns for the file's contents
def count_tokens_from_html_file(file_path)
  count_tokens_from_html(File.read(file_path))
end
#count_tokens_from_json(json_data) ⇒ Object
Utility method: counts the tokens in the text extracted from already-parsed JSON data.
85 86 87 88 |
# File 'lib/token_estimator.rb', line 85
#
# Counts the tokens in already-parsed JSON data: the data is turned into
# text by extract_text_from_json and passed to #count_tokens_from_text.
#
# @param json_data parsed JSON value, forwarded to extract_text_from_json
# @return whatever #count_tokens_from_text returns for the extracted text
def count_tokens_from_json(json_data)
  count_tokens_from_text(extract_text_from_json(json_data))
end
#count_tokens_from_json_file(file_path) ⇒ Object
72 73 74 75 76 |
# File 'lib/token_estimator.rb', line 72
#
# Counts the tokens in a JSON file: the file is read in full, parsed with
# JSON.parse, and the parsed value is passed to #count_tokens_from_json.
#
# @param file_path path of the JSON file, read with File.read
# @return whatever #count_tokens_from_json returns for the parsed data
def count_tokens_from_json_file(file_path)
  parsed = JSON.parse(File.read(file_path))
  count_tokens_from_json(parsed)
end
#count_tokens_from_markdown_file(file_path) ⇒ Object
66 67 68 69 70 |
# File 'lib/token_estimator.rb', line 66
#
# Counts the tokens in a Markdown file. The file is rendered to HTML with
# Kramdown and the resulting markup is passed to #count_tokens_from_html.
#
# @param file_path path of the Markdown file, read with File.read
# @return whatever #count_tokens_from_html returns for the rendered HTML
def count_tokens_from_markdown_file(file_path)
  rendered_html = Kramdown::Document.new(File.read(file_path)).to_html
  count_tokens_from_html(rendered_html)
end
#count_tokens_from_pdf_file(file_path) ⇒ Object
55 56 57 58 59 |
# File 'lib/token_estimator.rb', line 55
#
# Counts the tokens in a PDF file. The file is opened with PDF::Reader,
# its text is obtained from extract_text_from_pdf, and that text is passed
# to #count_tokens_from_text.
#
# @param file_path path of the PDF, passed to PDF::Reader.new
# @return whatever #count_tokens_from_text returns for the extracted text
def count_tokens_from_pdf_file(file_path)
  pdf = PDF::Reader.new(file_path)
  count_tokens_from_text(extract_text_from_pdf(pdf))
end
#count_tokens_from_text(text) ⇒ Object
95 96 97 98 |
# File 'lib/token_estimator.rb', line 95
#
# Counts the tokens in a piece of text: the text is encoded with the
# tokenizer stored in @tokenizer and the entries of the resulting
# encoding's +tokens+ are counted.
#
# @param text the text to encode, passed unchanged to @tokenizer.encode
# @return the value of +count+ on the encoding's +tokens+
def count_tokens_from_text(text)
  @tokenizer.encode(text).tokens.count
end
#count_tokens_from_txt_file(file_path) ⇒ Object
61 62 63 64 |
# File 'lib/token_estimator.rb', line 61
#
# Counts the tokens in a plain-text file by reading it in full and
# delegating to #count_tokens_from_text.
#
# @param file_path path of the text file, read with File.read
# @return whatever #count_tokens_from_text returns for the file's contents
def count_tokens_from_txt_file(file_path)
  count_tokens_from_text(File.read(file_path))
end