Class: SegmentRuby::Analyzer

Inherits:
Object
  • Object
show all
Defined in:
lib/segment_ruby.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(model_name = :small, max_word_length = 20) ⇒ Analyzer

Returns a new instance of Analyzer.



61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/segment_ruby.rb', line 61

# Builds an analyzer from the model files located under #model_path.
#
# The unigram distribution is always loaded. The bigram distribution is
# optional: it is loaded only when both '2_'-prefixed files exist, and
# @blp is set to false otherwise (callers such as #log_CPr test it for
# truthiness).
#
# @param model_name [Symbol, #to_s] model directory name used by #model_path
# @param max_word_length [Integer] bound used by #splits when enumerating
#   candidate first words
def initialize(model_name=:small, max_word_length=20)
  @model_name = model_name
  @max_word_length = max_word_length

  # unigram log probabilities
  @ulp = ProbabilityDistribution.new(total_file_name, freq_file_name)

  # bigram log probabilities
  bigram_total = total_file_name('2_')
  bigram_freq = freq_file_name('2_')

  # File.exist? rather than File.exists?: the latter was deprecated and then
  # removed in Ruby 3.2, where calling it raises NoMethodError.
  @blp = if File.exist?(bigram_total) && File.exist?(bigram_freq)
    ProbabilityDistribution.new(bigram_total, bigram_freq)
  else
    false
  end
end

Instance Attribute Details

#blp ⇒ Object (readonly)

Returns the value of attribute blp.



74
75
76
# File 'lib/segment_ruby.rb', line 74

# Reader for @blp, which the constructor sets to the bigram
# ProbabilityDistribution, or to false when the bigram model files are absent.
def blp
  @blp
end

#max_word_length ⇒ Object (readonly)

Returns the value of attribute max_word_length.



74
75
76
# File 'lib/segment_ruby.rb', line 74

# Reader for @max_word_length as passed to the constructor (default 20).
# #splits uses it to bound how many candidate first words it enumerates.
def max_word_length
  @max_word_length
end

#model_name ⇒ Object (readonly)

Returns the value of attribute model_name.



74
75
76
# File 'lib/segment_ruby.rb', line 74

# Reader for @model_name as passed to the constructor (default :small).
# #model_path converts it with to_s and uses it as the model directory name.
def model_name
  @model_name
end

#ulp ⇒ Object (readonly)

Returns the value of attribute ulp.



74
75
76
# File 'lib/segment_ruby.rb', line 74

# Reader for @ulp, the unigram ProbabilityDistribution built by the
# constructor from the unprefixed total and frequency files.
def ulp
  @ulp
end

Instance Method Details

#combine(pFirst, first, segmented) ⇒ Object



103
104
105
106
107
# File 'lib/segment_ruby.rb', line 103

# Prepends one scored word to an already-segmented remainder.
#
# @param pFirst [Numeric] log probability of +first+
# @param first [String] the word being prepended
# @param segmented [Array(Numeric, Array)] pair of
#   [log probability of the remainder, words of the remainder]
# @return [Array(Numeric, Array)] pair of [summed log probability, new word
#   list starting with +first+]; the remainder's word list is not mutated
def combine(pFirst, first, segmented)
  log_p_rest, rest_words = segmented
  total_log_p = pFirst + log_p_rest
  words = [first].concat(rest_words)

  [total_log_p, words]
end

#freq_file_name(prefix = '') ⇒ Object



90
91
92
# File 'lib/segment_ruby.rb', line 90

# Path of the frequency table inside the model directory.
#
# @param prefix [String] prepended to the base name; the constructor passes
#   '2_' to address the bigram file
# @return [String] #model_path joined with "<prefix>frequencies.tsv"
def freq_file_name(prefix='')
  directory = model_path
  basename = prefix + 'frequencies.tsv'

  File.join(directory, basename)
end

#log_CPr(w, prev) ⇒ Object



80
81
82
83
84
# File 'lib/segment_ruby.rb', line 80

# Log probability of +w+ given the preceding word +prev+.
#
# Looks up the space-joined pair "prev w" in the bigram distribution when one
# is loaded and contains that key; otherwise falls back to the unigram log
# probability of +w+ alone.
#
# @param w [String] the current word
# @param prev [String] the preceding word
# @return [Object] whatever the selected distribution's log_prob returns
def log_CPr(w, prev)
  bigram = [prev, w].join(' ')

  # blp is false (not nil) when no bigram model exists, so test it explicitly.
  return blp.log_prob(bigram) if blp && blp.has_key?(bigram)

  ulp.log_prob(w)
end

#log_Pr(w) ⇒ Object



76
77
78
# File 'lib/segment_ruby.rb', line 76

# Log probability of +w+ on its own: delegates to the unigram distribution's
# log_prob and returns its result unchanged.
def log_Pr(w)
  ulp.log_prob(w)
end

#model_path ⇒ Object



94
95
96
# File 'lib/segment_ruby.rb', line 94

# Directory holding the files of the selected model.
#
# Computed once as <this file's dir>/../data/segment_ruby/<model_name> and
# cached in @model_path for subsequent calls.
#
# @return [String] the (non-normalized) model directory path
def model_path
  return @model_path if @model_path

  model_dir = model_name.to_s
  @model_path = File.join(__dir__, "..", "data", "segment_ruby", model_dir)
end

#segment(text, prev = '<S>') ⇒ Object



123
124
125
126
127
# File 'lib/segment_ruby.rb', line 123

# Segments +text+ into words.
#
# Runs the recursive search (#segment_r) with a fresh memo table and depth 0,
# then discards the score and returns only the word list.
#
# @param text [String] the unsegmented input
# @param prev [String] the word assumed to precede +text+; '<S>' by default
# @return [Array<String>] the words of the best segmentation found
def segment(text, prev='<S>')
  best = segment_r(text, prev, 0, {})
  _log_prob, words = best

  words
end

#segment_r(text, prev, n, memo) ⇒ Object



109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/segment_ruby.rb', line 109

# Recursive, memoized search for the most probable segmentation of +text+
# given that the word before it was +prev+.
#
# Every split produced by #splits is scored as
# log_CPr(first, prev) + best score of the remainder (with +first+ as the new
# predecessor), and the maximum [score, words] pair is kept.
#
# The memo is keyed on BOTH +text+ and +prev+: the score of a suffix depends
# on its predecessor through #log_CPr, so caching on +text+ alone would reuse
# a result computed for a different preceding word.
#
# @param text [String, nil] remaining text to segment
# @param prev [String] the word preceding +text+
# @param n [Integer] recursion depth; only passed along, not otherwise used
# @param memo [Hash] cache shared across the recursion
# @return [Array(Numeric, Array<String>)] [log probability, words];
#   [0.0, []] for nil or empty text
def segment_r(text, prev, n, memo)
  return [0.0, []] if !text || text.empty?

  memo_key = [text, prev]
  return memo[memo_key] if memo.key?(memo_key)

  best = splits(text).map do |first, rem|
    log_p = log_CPr(first, prev)
    combine(log_p, first, segment_r(rem, first, n + 1, memo))
  end.max

  memo[memo_key] = best
end

#splits(text) ⇒ Object

Returns all the splits of a string up to a given length



99
100
101
# File 'lib/segment_ruby.rb', line 99

# Returns every [first, remainder] split of +text+ whose first part is between
# 1 and max_word_length characters long (and no longer than +text+ itself).
#
# The bound is applied to the LENGTH of the first part; bounding the end index
# by max_word_length instead would admit words one character too long.
#
# @param text [String] the text to split
# @return [Array<Array(String, String)>] pairs ordered by increasing length of
#   the first part; the remainder is "" when the first part is the whole text;
#   empty array for empty text
def splits(text)
  longest = [max_word_length, text.size].min

  (1..longest).map { |len| [text[0, len], text[len..-1]] }
end

#total_file_name(prefix = '') ⇒ Object



86
87
88
# File 'lib/segment_ruby.rb', line 86

# Path of the totals file inside the model directory.
#
# @param prefix [String] prepended to the base name; the constructor passes
#   '2_' to address the bigram file
# @return [String] #model_path joined with "<prefix>total.tsv"
def total_file_name(prefix='')
  directory = model_path
  basename = prefix + 'total.tsv'

  File.join(directory, basename)
end