Class: BasicTextChunker
- Inherits:
-
Object
- Object
- BasicTextChunker
- Defined in:
- lib/content_splitters/basic_sentence_splitter.rb
Instance Method Summary
-
#initialize(token_limit = 390) ⇒ BasicTextChunker
constructor
A new instance of BasicTextChunker.
- #split_into_chunks(text) ⇒ Object
Constructor Details
#initialize(token_limit = 390) ⇒ BasicTextChunker
Returns a new instance of BasicTextChunker.
4 5 6 |
# File 'lib/content_splitters/basic_sentence_splitter.rb', line 4

# Stores the per-chunk token budget that #split_into_chunks packs text into.
#
# A budget that is zero, negative, or not a number can never be satisfied by
# any chunk, so it is rejected here instead of failing (or looping) later
# while splitting.
#
# @param token_limit [Numeric] maximum estimated tokens per chunk
# @raise [ArgumentError] if token_limit is not a positive number
def initialize(token_limit = 390)
  unless token_limit.is_a?(Numeric) && token_limit.positive?
    raise ArgumentError, "token_limit must be a positive number, got #{token_limit.inspect}"
  end

  @token_limit = token_limit
end
Instance Method Details
#split_into_chunks(text) ⇒ Object
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/content_splitters/basic_sentence_splitter.rb', line 8

# Splits text into chunks whose estimated token count stays within
# @token_limit, breaking on sentence boundaries where possible.
#
# Sentences are packed greedily: a sentence is appended to the current chunk
# while the running estimate (per OpenAI.rough_token_count) stays within the
# limit, otherwise a new chunk is started. A sentence that is larger than the
# limit on its own is broken on word boundaries and packed word by word. A
# single word whose estimate exceeds the limit cannot be broken further and
# is emitted as a chunk of its own.
#
# @param text [String, nil] the text to split
# @return [Array<String>] chunks in original order; empty for nil/empty text
def split_into_chunks(text)
  return [] if text.nil? || text.empty?

  chunks = []
  pending = []        # fragments (sentences or words) of the chunk being built
  pending_tokens = 0  # summed token estimate of +pending+

  # The lookbehind splits *after* the terminator, so ".", "!" and "?" stay
  # attached to their sentence instead of being consumed by the split.
  sentences = text.split(/(?<=[.!?])\s+/).map(&:strip).reject(&:empty?)

  sentences.each do |sentence|
    sentence_tokens = OpenAI.rough_token_count(sentence)

    # Oversized sentences fall back to word granularity; each word is measured
    # with the same estimator so words and tokens are never conflated.
    fragments =
      if sentence_tokens > @token_limit
        sentence.split.map { |word| [word, OpenAI.rough_token_count(word)] }
      else
        [[sentence, sentence_tokens]]
      end

    fragments.each do |fragment, fragment_tokens|
      # Only flush a non-empty buffer, so no empty chunk is ever emitted.
      if !pending.empty? && pending_tokens + fragment_tokens > @token_limit
        chunks << pending.join(" ")
        pending = []
        pending_tokens = 0
      end

      pending << fragment
      pending_tokens += fragment_tokens
    end
  end

  chunks << pending.join(" ") unless pending.empty?
  chunks
end