Class: Splitta::Doc

Inherits:
Object
  • Object
show all
Defined in:
lib/splitta/doc.rb

Constant Summary collapse

# Pattern passed to String#split by the constructor to cut raw text into
# candidate fragments. It matches one sentence-ending punctuation mark,
# any run of tags / closing quotes or brackets that follow it, and the
# mandatory whitespace after that. The whole match sits inside a capture
# group, so String#split returns the matched delimiter as its own array
# element between the surrounding pieces of text.
FRAG_SPLITTER =
/
  (
    [.!?]         # sentence end punctuation
    (?:
      (?:<.*>)    # extra tag
      |
      [”"')\]}]   # right-handed punctuation to retain
    )*
    \s+           # must have whitespace
  )
/ux
# Cut-off used by #segments: a fragment ends the current segment only when
# its pred value is strictly greater than this number.
SEGMENT_THRESHOLD =
0.5

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(text, model:) ⇒ Doc

Returns a new instance of Doc.


23
24
25
26
27
28
29
30
# File 'lib/splitta/doc.rb', line 23

# Builds the document's fragments from +text+ and then asks +model+ to
# classify the document.
#
# FRAG_SPLITTER wraps its match in a capture group, so the split result
# interleaves text pieces with the delimiters that followed them. Taking
# the pieces two at a time and joining each pair re-attaches every
# delimiter to the text it terminated.
#
# @param text [String] raw text to break into fragments
# @param model [#classify] receives this document once the fragments exist
def initialize(text, model:)
  @frags = []
  pieces = text.split(FRAG_SPLITTER)
  pieces.each_slice(2) do |pair|
    # @frags.last is nil for the very first fragment.
    @frags.push(Frag.new(pair.join, previous_frag: @frags.last))
  end
  model.classify(self)
end

Instance Attribute Details

#frags ⇒ Object (readonly)

Returns the value of attribute frags


21
22
23
# File 'lib/splitta/doc.rb', line 21

# Returns the fragments created by the constructor, in the order they were
# built from the text.
#
# @return [Array<Frag>]
def frags
  @frags
end

Instance Method Details

#featurize(model) ⇒ Object


32
33
34
35
36
37
38
# File 'lib/splitta/doc.rb', line 32

# Computes a feature set for every fragment and stores it on the fragment.
#
# Walks @frags, the array the constructor fills. Starting from @frag (as
# this method once did) reads an instance variable that the visible code
# never assigns, which is nil, so no fragment was ever featurized.
#
# NOTE(review): get_features is not defined in the visible part of this
# class — confirm where it comes from before relying on this method.
#
# @param model [Object] forwarded unchanged to get_features
# @return [nil]
def featurize(model)
  @frags.each do |frag|
    frag.features = get_features(frag, model)
  end
  nil
end

#segments ⇒ Object

Outputs all of the text, split into segments according to the predictions.


43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/splitta/doc.rb', line 43

# Lazily yields the document text as a sequence of segments.
#
# Fragment text is accumulated in a buffer; whenever a fragment's pred is
# present and strictly above SEGMENT_THRESHOLD the buffer is emitted
# (whitespace-stripped) and reset. Whatever remains after the last
# fragment is emitted as a final segment, unless the buffer is empty.
#
# @return [Enumerator] yields one stripped String per segment
def segments
  Enumerator.new do |yielder|
    buffer = StringIO.new
    frags.each do |frag|
      buffer << frag.orig
      # Keep accumulating until a fragment is predicted to end a segment.
      next unless frag.pred && frag.pred > SEGMENT_THRESHOLD

      yielder << buffer.string.strip
      buffer.string = ''
    end
    leftover = buffer.string
    yielder << leftover.strip unless leftover.empty?
  end
end