Class: Baran::TextSplitter

Inherits:
Object
  • Object
show all
Defined in:
lib/baran/text_splitter.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(chunk_size: 1024, chunk_overlap: 64) ⇒ TextSplitter

Returns a new instance of TextSplitter.



7
8
9
10
11
# File 'lib/baran/text_splitter.rb', line 7

# Stores the sizing configuration for the splitter.
#
# @param chunk_size [Integer] size limit that chunks are built against
# @param chunk_overlap [Integer] overlap setting; must be smaller than chunk_size
# @raise [RuntimeError] when chunk_overlap is greater than or equal to chunk_size
def initialize(chunk_size: 1024, chunk_overlap: 64)
  @chunk_overlap = chunk_overlap
  @chunk_size = chunk_size

  # An overlap that is not strictly smaller than the chunk size is rejected.
  return unless @chunk_overlap >= @chunk_size

  raise "Cannot have chunk_overlap >= chunk_size"
end

Instance Attribute Details

#chunk_overlap ⇒ Object

Returns the value of attribute chunk_overlap.



5
6
7
# File 'lib/baran/text_splitter.rb', line 5

# Reader for the overlap setting held in @chunk_overlap.
#
# @return [Object] the current value of @chunk_overlap
def chunk_overlap
  @chunk_overlap
end

#chunk_size ⇒ Object

Returns the value of attribute chunk_size.



5
6
7
# File 'lib/baran/text_splitter.rb', line 5

# Reader for the size limit held in @chunk_size.
#
# @return [Object] the current value of @chunk_size
def chunk_size
  @chunk_size
end

Instance Method Details

#chunks(text, metadata: nil) ⇒ Object



17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/baran/text_splitter.rb', line 17

# Splits text via #splitted and wraps every non-nil piece in a hash.
#
# Each hash has :text (the piece) and :cursor (the summed length of the
# pieces emitted before it). When metadata is truthy, the same object is
# attached to every hash under :metadata; otherwise the key is omitted.
#
# @param text [String] input handed to #splitted
# @param metadata [Object, nil] optional value attached to every chunk
# @return [Array<Hash>] one hash per non-nil piece, in order
def chunks(text, metadata: nil)
  cursor = 0

  splitted(text).compact.map do |piece|
    entry = { text: piece, cursor: cursor }
    entry[:metadata] = metadata if metadata
    # Advance by the piece length so the next entry records the running total.
    cursor += piece.length
    entry
  end
end

#joined(items, separator) ⇒ Object



31
32
33
34
# File 'lib/baran/text_splitter.rb', line 31

# Joins items with separator and strips surrounding whitespace.
#
# @param items [Array<String>] pieces to concatenate
# @param separator [String] text inserted between pieces
# @return [String, nil] the stripped result, or nil when nothing is left after stripping
def joined(items, separator)
  combined = items.join(separator).strip
  combined unless combined.empty?
end

#merged(splits, separator) ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/baran/text_splitter.rb', line 36

# Groups consecutive splits into joined strings, checked against chunk_size.
#
# Splits accumulate in a window. When adding the next split would bring the
# tracked length to chunk_size or beyond, the window is joined and emitted.
# Leading splits are then dropped until the tracked length is no greater
# than chunk_overlap and the next split fits (or the window is empty).
# Whatever remains is carried into the next group.
#
# The tracked length is the sum of the split lengths only; separator text
# added when joining is not counted. A single split longer than chunk_size
# is still emitted, and a warning is logged to STDOUT.
#
# @param splits [Array<String>] ordered pieces to group
# @param separator [String] text placed between pieces when a group is joined
# @return [Array<String, nil>] one entry per emitted group; an entry is nil
#   when #joined returns nil for that group
def merged(splits, separator)
  results = []
  window = []        # splits that make up the group being built
  window_length = 0  # summed length of the splits in window
  oversize_logger = nil

  splits.each do |split|
    if window_length + split.length >= chunk_size && !window.empty?
      results << joined(window, separator)

      # Trim from the front. The emptiness guard stops a negative
      # chunk_overlap from shifting past the end of the window.
      while !window.empty? &&
            (window_length > chunk_overlap ||
             (window_length + split.length >= chunk_size && window_length.positive?))
        window_length -= window.shift.length
      end
    end

    window << split
    window_length += split.length
    next unless window_length > chunk_size

    # Create the logger at most once per call rather than once per warning.
    oversize_logger ||= Logger.new(STDOUT)
    oversize_logger.warn("Created a chunk of size #{window_length}, which is longer than the specified #{chunk_size}")
  end

  results << joined(window, separator)
  results
end

#splitted(text) ⇒ Object

Raises:

  • (NotImplementedError)


13
14
15
# File 'lib/baran/text_splitter.rb', line 13

# Abstract hook: this implementation always raises.
#
# @param text [String] input to split (unused here)
# @raise [NotImplementedError] always
def splitted(text)
  message = "splitted method should be implemented in a subclass"
  raise NotImplementedError, message
end