Class: Splitta::Frag

Inherits:
Object
  • Object
show all
Includes:
WordTokenizer
Defined in:
lib/splitta/frag.rb

Constant Summary

Constants included from WordTokenizer

WordTokenizer::TOKENIZE_REGEXPS

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from WordTokenizer

#tokenize

Constructor Details

#initialize(orig, previous_frag: nil) ⇒ Frag

Returns a new instance of Frag.


12
13
14
15
16
17
# File 'lib/splitta/frag.rb', line 12

# Builds a fragment from a piece of raw text and, when a preceding fragment
# is supplied, tells that fragment which word follows it.
#
# @param orig [String] the raw text of this fragment; stored unchanged
# @param previous_frag [#next_word=, nil] the fragment that comes right
#   before this one; its +next_word+ is set to this fragment's first token
def initialize(orig, previous_frag: nil)
  # Tokens are the whitespace-separated pieces of the cleaned text.
  tokens = clean(orig).split
  head = tokens[0]
  tail = tokens[-1]

  # Link backwards: the previous fragment's "next word" is our first token
  # (nil when the cleaned text has no tokens).
  previous_frag.next_word = head if previous_frag

  @orig = orig
  @last_word = tail
end

Instance Attribute Details

#last_word ⇒ Object (readonly)

Returns the value of attribute last_word


9
10
11
# File 'lib/splitta/frag.rb', line 9

# Reader for the +@last_word+ instance variable.
#
# @return [Object, nil] the stored value, or nil when it has not been set
def last_word = @last_word

#next_word ⇒ Object

Returns the value of attribute next_word


9
10
11
# File 'lib/splitta/frag.rb', line 9

# Reader for the +@next_word+ instance variable.
#
# @return [Object, nil] the stored value, or nil when it has not been set
def next_word = @next_word

#orig ⇒ Object (readonly)

Returns the value of attribute orig


9
10
11
# File 'lib/splitta/frag.rb', line 9

# Reader for the +@orig+ instance variable.
#
# @return [Object, nil] the stored value, or nil when it has not been set
def orig = @orig

#pred ⇒ Object

Returns the value of attribute pred


10
11
12
# File 'lib/splitta/frag.rb', line 10

# Reader for the +@pred+ instance variable.
# NOTE(review): presumably holds a prediction assigned elsewhere for this
# fragment - confirm against the code that writes it.
#
# @return [Object, nil] the stored value, or nil when it has not been set
def pred = @pred

Instance Method Details

#features(model) ⇒ Object

… w1. (sb?) w2 … Features, listed roughly in order of importance:

(1) w1: the word that includes a period; (2) w2: the next word, if it exists; (3) w1length: number of alphabetic characters in w1; (4) w2cap: true if w2 is capitalized; (5) both: w1 and w2; (6) w1abbr: log count of w1 in training without a final period; (7) w2lower: log count of w2 in training as lowercased; (8) w1w2upper: w1 and whether w2 is capitalized


30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/splitta/frag.rb', line 30

# Lazily produces the features describing the candidate boundary
# "... w1. (sb?) w2 ...", where w1 is derived from +last_word+ and w2 from
# +next_word+.
#
# Each yielded element is an Array whose first entry is the feature name:
#   [:w1, c1], [:w2, c2], [:both, c1, c2]              - always yielded
#   [:w1length, Integer], [:w1abbr, Integer]           - only if alphabetic?(c1)
#   [:w2cap, 'True'|'False'], [:w2lower, Integer],
#   [:w1w2upper, c1, 'True'|'False']                   - only if alphabetic?(c2)
#
# @param model [#non_abbrs, #lower_words] object exposing two Hash-like
#   count tables that are read with +fetch(key, 0.0)+
# @return [Enumerator] enumerator over the feature arrays
def features(model)
  Enumerator.new do |y|
    # c1: w1 with everything up to and including its first hyphen removed.
    # c2: w2 with everything from its first hyphen to the end removed.
    # Either is nil when the corresponding word is nil.
    c1 = last_word&.sub(/(^.+?\-)/, '')
    c2 = next_word&.sub(/(\-.+?)$/, '')

    y << [:w1, c1]
    y << [:w2, c2]
    y << [:both, c1, c2]

    if alphabetic?(c1)
      # gsub (not sub): every non-word character must be stripped, otherwise
      # only the first period is removed and "U.S.A." counts 5 instead of 3.
      # The length is capped at 10.
      y << [:w1length, [10, c1.gsub(/\W/, '').length].min]
      # Truncated log count of w1 minus its final character.
      y << [:w1abbr, Math.log(1 + model.non_abbrs.fetch(c1.chop, 0.0)).to_i]
    end

    if alphabetic?(c2)
      # Computed once and shared by :w2cap and :w1w2upper.
      w2_capitalized = upcase?(c2.chars.first) ? 'True' : 'False'

      y << [:w2cap, w2_capitalized]
      # Truncated log count of the lowercased w2.
      y << [:w2lower, Math.log(1 + model.lower_words.fetch(c2.downcase, 0.0)).to_i]
      y << [:w1w2upper, c1, w2_capitalized]
    end
  end
end