Class: BasicTextChunker

Inherits:
Object
  • Object
show all
Defined in:
lib/content_splitters/basic_sentence_splitter.rb

Instance Method Summary collapse

Constructor Details

#initialize(token_limit = 390) ⇒ BasicTextChunker

Returns a new instance of BasicTextChunker.



4
5
6
# File 'lib/content_splitters/basic_sentence_splitter.rb', line 4

# Stores the token limit that #split_into_chunks reads when sizing chunks.
#
# @param token_limit [Integer] per-chunk token budget (defaults to 390)
def initialize(token_limit = 390)
  @token_limit = token_limit
end

Instance Method Details

#split_into_chunks(text) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/content_splitters/basic_sentence_splitter.rb', line 8

# Splits +text+ into chunks whose estimated token count stays within
# @token_limit.
#
# The text is cut into sentences at whitespace that follows ".", "!" or "?";
# the terminator stays attached to its sentence so chunks keep their
# punctuation. Sentences are packed greedily, in order, into the current
# chunk. A sentence whose own estimate exceeds the limit is broken into
# whitespace-separated words, which are packed the same way and sized with
# the same estimator, so the running count always reflects the real content.
#
# A single word estimated above the limit has no boundary to split on and is
# emitted as a chunk of its own, which may exceed the limit.
#
# Token estimates come from OpenAI.rough_token_count, which is defined outside
# this file; it is assumed to return a non-negative number for a String.
#
# @param text [String] the text to split
# @return [Array<String>] non-empty chunks in original order; empty when
#   +text+ contains no non-whitespace content
# @raise [ArgumentError] if @token_limit is not a positive number
def split_into_chunks(text)
  limit = @token_limit
  # A non-positive limit can never be satisfied; fail fast instead of looping.
  unless limit.is_a?(Numeric) && limit.positive?
    raise ArgumentError, "token_limit must be a positive number (got #{limit.inspect})"
  end

  # Lookbehind keeps the sentence terminator with the sentence it ends.
  sentences = text.split(/(?<=[.!?])\s+/).map(&:strip).reject(&:empty?)

  # Flatten the text into [piece, estimated_tokens] pairs: whole sentences
  # when they fit, individual words when a sentence alone is over the limit.
  pieces = sentences.flat_map do |sentence|
    sentence_tokens = OpenAI.rough_token_count(sentence)
    next [[sentence, sentence_tokens]] if sentence_tokens <= limit

    sentence.split.map { |word| [word, OpenAI.rough_token_count(word)] }
  end

  chunks = []
  current_pieces = []
  current_tokens = 0

  pieces.each do |piece, piece_tokens|
    # Close the current chunk when the next piece would push it over budget.
    if current_tokens + piece_tokens > limit && !current_pieces.empty?
      chunks << current_pieces.join(" ")
      current_pieces = []
      current_tokens = 0
    end

    current_pieces << piece
    current_tokens += piece_tokens
  end

  chunks << current_pieces.join(" ") unless current_pieces.empty?
  chunks
end