Class: Baran::RecursiveCharacterTextSplitter

Inherits:
TextSplitter show all
Defined in:
lib/baran/recursive_character_text_splitter.rb

Direct Known Subclasses

MarkdownSplitter

Instance Attribute Summary collapse

Attributes inherited from TextSplitter

#chunk_overlap, #chunk_size

Instance Method Summary collapse

Methods inherited from TextSplitter

#chunks, #joined, #merged

Constructor Details

#initialize(chunk_size: 1024, chunk_overlap: 64, separators: nil) ⇒ RecursiveCharacterTextSplitter

Returns a new instance of RecursiveCharacterTextSplitter.



7
8
9
10
# File 'lib/baran/recursive_character_text_splitter.rb', line 7

# Builds a splitter, forwarding the sizing options to the superclass
# initializer and remembering the list of separators to split on.
#
# @param chunk_size [Integer] passed through to the superclass (default 1024)
# @param chunk_overlap [Integer] passed through to the superclass (default 64)
# @param separators [Array<String>, nil] separators to use; when nil (or
#   otherwise falsy) the list ["\n\n", "\n", " "] is used instead
def initialize(chunk_size: 1024, chunk_overlap: 64, separators: nil)
  super(chunk_size: chunk_size, chunk_overlap: chunk_overlap)

  @separators =
    if separators
      separators
    else
      # Default list: paragraph break, line break, single space.
      ["\n\n", "\n", " "]
    end
end

Instance Attribute Details

#separators ⇒ Object

Returns the value of attribute separators.



5
6
7
# File 'lib/baran/recursive_character_text_splitter.rb', line 5

# Reader for the separator list held in the @separators instance variable.
#
# @return [Object] the current value of @separators
def separators
  @separators
end

Instance Method Details

#splitted(text) ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/baran/recursive_character_text_splitter.rb', line 12

# Recursively breaks +text+ into pieces and hands runs of small pieces to
# #merged.
#
# The first entry of #separators that occurs in +text+ is used to split it;
# when none occurs, the empty string is used, which splits +text+ into
# individual characters. Pieces shorter than #chunk_size are collected and
# passed to #merged together with the separator; longer pieces are split
# again by a recursive call.
#
# @param text [String] the text to split
# @return [Array] the concatenated results of #merged and of the recursive
#   calls, in the order the pieces appear in +text+
def splitted(text)
  separator = separators.find { |candidate| text.include?(candidate) } || ''

  results = []
  good_splits = []

  text.split(separator).each do |piece|
    # `piece == text` means splitting made no progress (e.g. a lone character
    # split by ''). Recursing on it would never terminate, so accept it as-is
    # even though it is not shorter than chunk_size.
    if piece.length < chunk_size || piece == text
      good_splits << piece
      next
    end

    # Flush the small pieces gathered so far before descending, so the output
    # keeps the original ordering.
    unless good_splits.empty?
      results.concat(merged(good_splits, separator))
      good_splits.clear
    end

    results.concat(splitted(piece))
  end

  results.concat(merged(good_splits, separator)) unless good_splits.empty?
  results
end