Class: Documentrix::Documents::Splitters::Semantic

Inherits:
Object
  • Object
  • Documentrix::Documents::Splitters::Semantic
Includes:
Utils::Math
Defined in:
lib/documentrix/documents/splitters/semantic.rb

Constant Summary

DEFAULT_SEPARATOR =
/[.!?]\s*(?:\b|\z)/

Instance Method Summary

Methods included from Utils::Math

#convert_to_vector, #cosine_similarity, #norm

Constructor Details

#initialize(ollama:, model:, model_options: nil, separator: DEFAULT_SEPARATOR, chunk_size: 4096) ⇒ Semantic

Returns a new instance of Semantic.



7
8
9
10
# File 'lib/documentrix/documents/splitters/semantic.rb', line 7

# Stores the splitter configuration for later calls.
#
# @param ollama [Object] kept in @ollama (presumably the client used to
#   request embeddings; its use is not visible in this method)
# @param model [Object] kept in @model
# @param model_options [Object, nil] kept in @model_options; defaults to nil
# @param separator [Regexp] kept in @separator; defaults to DEFAULT_SEPARATOR
# @param chunk_size [Integer] kept in @chunk_size; defaults to 4096
def initialize(ollama:, model:, model_options: nil, separator: DEFAULT_SEPARATOR, chunk_size: 4096)
  @ollama        = ollama
  @model         = model
  @model_options = model_options
  @separator     = separator
  @chunk_size    = chunk_size
end

Instance Method Details

#split(text, batch_size: 100, breakpoint: :percentile, **opts) ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/documentrix/documents/splitters/semantic.rb', line 12

# Splits +text+ into chunks whose boundaries fall where consecutive pieces
# are least similar according to their embeddings.
#
# The text is first cut by the Character splitter (using @separator and
# chunk_size 1), the pieces are embedded in batches, and the cosine distance
# between each neighbouring pair is computed. Every distance above the
# threshold returned by calculate_breakpoint_threshold marks the end of a
# group; within a group, pieces are concatenated while the accumulated size
# stays below @chunk_size.
#
# @param text [String] the text to split
# @param batch_size [Integer] number of pieces embedded per call to
#   sentence_embeddings
# @param breakpoint [Symbol] forwarded to calculate_breakpoint_threshold
# @param opts [Hash] :include_separator (default true) is handed to the
#   Character splitter; the whole hash is also forwarded to
#   calculate_breakpoint_threshold
# @return [Array<String>] the resulting chunks; the unmerged pieces are
#   returned as-is when fewer than two embeddings exist or when no distance
#   exceeds the threshold
def split(text, batch_size: 100, breakpoint: :percentile, **opts)
  splitter = Documentrix::Documents::Splitters::Character.new(
    separator: @separator,
    include_separator: opts.fetch(:include_separator, true),
    chunk_size: 1,
  )
  sentences = splitter.split(text)

  # Embed batch by batch, advancing the progress bar after each batch.
  embeddings = sentences.with_infobar(label: 'Split').each_slice(batch_size).
    each_with_object([]) do |batch, collected|
      collected.concat sentence_embeddings(batch)
      infobar.progress by: batch.size
    end
  infobar.newline
  return sentences if embeddings.size < 2

  # distances[i] is the cosine distance between piece i and piece i + 1.
  distances = embeddings.each_cons(2).map do |left, right|
    1.0 - cosine_similarity(a: left, b: right)
  end
  threshold = calculate_breakpoint_threshold(breakpoint, distances, **opts)

  # Indices after which a new group starts.
  gaps = distances.each_index.select { |index| distances[index] > threshold }
  return sentences if gaps.empty?

  # Make sure the final group extends to the last piece.
  gaps << distances.size if gaps.last < distances.size
  last_index = sentences.size - 1
  gaps << last_index if gaps.last < last_index

  chunks  = []
  buffer  = +''
  first   = 0
  gaps.each do |gap|
    (first..gap).each do |index|
      sentence = sentences[index]
      if buffer.size + sentence.size >= @chunk_size
        # Adding this piece would reach the size limit: flush and restart.
        chunks << buffer unless buffer.empty?
        buffer = sentence
      else
        # Non-mutating append so the elements of +sentences+ stay untouched.
        buffer += sentence
      end
    end
    # A gap always closes the current chunk, regardless of its size.
    if !buffer.empty?
      chunks << buffer
      buffer = +''
    end
    first = gap + 1
  end
  chunks << buffer unless buffer.empty?
  chunks
end