Class: Documentrix::Documents::Splitters::RecursiveCharacter

Inherits:
Object
  • Object
  • Documentrix::Documents::Splitters::RecursiveCharacter
Defined in:
lib/documentrix/documents/splitters/character.rb

Constant Summary

# Separator patterns used when the constructor is not given its own list.
# #split consumes the list front to back, so the order below is the order in
# which the patterns are tried.
DEFAULT_SEPARATORS = [
  # Two or more consecutive line breaks (LF or CRLF).
  /(?:\r?\n){2,}/,
  # A single line break (LF or CRLF).
  /\r?\n/,
  # A zero-width word boundary.
  /\b/,
  # The empty pattern, which matches at every position.
  //
].freeze

Instance Method Summary

Constructor Details

#initialize(separators: DEFAULT_SEPARATORS, include_separator: false, combining_string: "\n\n", chunk_size: 4096) ⇒ RecursiveCharacter

Returns a new instance of RecursiveCharacter.



45
46
47
48
49
50
# File 'lib/documentrix/documents/splitters/character.rb', line 45

# Stores the configuration used by #split.
#
# @param separators [Array] separator patterns (Regexps by default); #split
#   consumes them front to back. Must not be empty.
# @param include_separator [Boolean] forwarded unchanged to each Character
#   splitter created by #split
# @param combining_string [String] forwarded unchanged to each Character
#   splitter created by #split
# @param chunk_size [Integer] forwarded to each Character splitter; #split
#   also re-splits any piece whose size exceeds it
# @raise [ArgumentError] if separators is empty
def initialize(separators: DEFAULT_SEPARATORS, include_separator: false, combining_string: "\n\n", chunk_size: 4096)
  if separators.empty?
    raise ArgumentError, "non-empty array of separators required"
  end

  @separators        = separators
  @include_separator = include_separator
  @combining_string  = combining_string
  @chunk_size        = chunk_size
end

Instance Method Details

#split(text, separators: @separators) ⇒ Object



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/documentrix/documents/splitters/character.rb', line 52

# Splits text with the first separator and re-splits oversized pieces with
# the remaining separators.
#
# A Character splitter is built from the first separator and this instance's
# settings. Every piece it returns whose size exceeds @chunk_size is passed
# back into #split with the rest of the separator list. Once the list is
# exhausted a piece is returned as is, so results may still exceed
# @chunk_size.
#
# @param text [String] the text to split
# @param separators [Array] separators still available, tried front to back;
#   defaults to the list given to the constructor. The array is not modified.
# @return [Array] the resulting pieces in order; `[ text ]` when no separators
#   are left or when the Character splitter returns no pieces
def split(text, separators: @separators)
  return [ text ] if separators.empty?

  # Destructure rather than shift so the caller's array stays untouched.
  separator, *remaining = separators
  splitter = Character.new(
    separator:,
    include_separator: @include_separator,
    combining_string: @combining_string,
    chunk_size: @chunk_size
  )
  pieces = splitter.split(text)
  return [ text ] if pieces.count.zero?

  pieces.flat_map do |piece|
    piece.size > @chunk_size ? split(piece, separators: remaining) : [ piece ]
  end
end