Class: Documentrix::Documents::Splitters::Character

Inherits:
Object
  • Object
show all
Defined in:
lib/documentrix/documents/splitters/character.rb

Constant Summary collapse

DEFAULT_SEPARATOR =
/(?:\r?\n){2,}/

Instance Method Summary collapse

Constructor Details

#initialize(separator: DEFAULT_SEPARATOR, include_separator: false, combining_string: "\n\n", chunk_size: 4096) ⇒ Character

Returns a new instance of Character.



5
6
7
8
9
10
11
# File 'lib/documentrix/documents/splitters/character.rb', line 5

# Configures a splitter that cuts text at +separator+ and regroups the
# resulting pieces into chunks.
#
# @param separator [Regexp, String] pattern the text is split on
# @param include_separator [Boolean] when truthy, the separator is wrapped in
#   a capture group so that String#split also yields the matched separator
#   text
# @param combining_string [String] string placed after pieces when they are
#   recombined into chunks
# @param chunk_size [Integer] size threshold consulted when pieces are
#   recombined into chunks
def initialize(separator: DEFAULT_SEPARATOR, include_separator: false, combining_string: "\n\n", chunk_size: 4096)
  @separator         = separator
  @include_separator = include_separator
  @combining_string  = combining_string
  @chunk_size        = chunk_size
  return unless include_separator

  # Regexp.union returns a single Regexp unchanged but escapes a String, so a
  # literal separator such as "." keeps its literal meaning (as it has in
  # String#split) instead of being reinterpreted as a regexp wildcard.
  @separator = Regexp.new("(#{Regexp.union(separator)})")
end

Instance Method Details

#split(text) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/documentrix/documents/splitters/character.rb', line 13

# Splits +text+ at the configured separator and regroups the pieces into
# chunks.
#
# Pieces are accumulated into the current chunk while the chunk's size plus
# the piece's size stays below @chunk_size; each piece accumulated this way is
# followed by @combining_string. A piece that does not fit closes the current
# chunk and starts a new one consisting of that piece verbatim.
#
# @param text [String] the text to split
# @return [Array<String>] the non-empty chunks, in order
def split(text)
  pieces = []
  text.split(@separator) do |piece|
    if @include_separator && piece.match?(@separator)
      # With a capturing separator String#split also yields the separator
      # itself; glue it onto the piece it terminated.
      pieces.last&.concat(piece)
    else
      pieces.push(piece)
    end
  end

  chunks = []
  current = +''
  # True while +current+ holds only a verbatim overflow piece, i.e. it does
  # not yet end in @combining_string.
  needs_joiner = false
  pieces.each do |piece|
    if current.size + piece.size < @chunk_size
      # Without this, a piece appended to an overflow-started chunk would be
      # fused directly onto the previous piece with nothing in between.
      current << @combining_string if needs_joiner
      current << piece << @combining_string
      needs_joiner = false
    else
      chunks << current unless current.empty?
      current = piece.dup
      needs_joiner = !current.empty?
    end
  end
  chunks << current unless current.empty?
  chunks
end