Class: SegmentRuby::Analyzer
- Inherits:
-
Object
- Object
- SegmentRuby::Analyzer
- Defined in:
- lib/segment_ruby.rb
Instance Attribute Summary collapse
-
#blp ⇒ Object
readonly
Returns the value of attribute blp.
-
#max_word_length ⇒ Object
readonly
Returns the value of attribute max_word_length.
-
#model_name ⇒ Object
readonly
Returns the value of attribute model_name.
-
#ulp ⇒ Object
readonly
Returns the value of attribute ulp.
Instance Method Summary collapse
- #combine(pFirst, first, segmented) ⇒ Object
- #freq_file_name(prefix = '') ⇒ Object
-
#initialize(model_name = :small, max_word_length = 20) ⇒ Analyzer
constructor
A new instance of Analyzer.
- #log_CPr(w, prev) ⇒ Object
- #log_Pr(w) ⇒ Object
- #model_path ⇒ Object
- #segment(text, prev = '<S>') ⇒ Object
- #segment_r(text, prev, n, memo) ⇒ Object
-
#splits(text) ⇒ Object
Returns all the splits of a string up to a given length.
- #total_file_name(prefix = '') ⇒ Object
Constructor Details
#initialize(model_name = :small, max_word_length = 20) ⇒ Analyzer
Returns a new instance of Analyzer.
61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/segment_ruby.rb', line 61
# Builds an analyzer for the named model.
#
# Loads the unigram distribution unconditionally and the bigram distribution
# only when both of its data files are present.
#
# @param model_name [Symbol, String] model identifier; #model_path turns it into a directory name via #to_s
# @param max_word_length [Integer] stored for #splits, which uses it to bound candidate words
def initialize(model_name=:small, max_word_length=20)
  @model_name = model_name
  @max_word_length = max_word_length

  # unigram log probabilities
  @ulp = ProbabilityDistribution.new(total_file_name, freq_file_name)

  # bigram log probabilities (optional: @blp is false when either file is missing)
  btf = total_file_name('2_')
  bff = freq_file_name('2_')
  # File.exist? instead of File.exists?: the plural alias was deprecated and
  # then removed in Ruby 3.2, where calling it raises NoMethodError.
  @blp = (File.exist?(btf) && File.exist?(bff) ? ProbabilityDistribution.new(btf, bff) : false)
end
Instance Attribute Details
#blp ⇒ Object (readonly)
Returns the value of attribute blp.
74 75 76 |
# File 'lib/segment_ruby.rb', line 74
# Read-only accessor for @blp.
#
# @return [Object] the value currently held in @blp
def blp
  @blp
end
#max_word_length ⇒ Object (readonly)
Returns the value of attribute max_word_length.
74 75 76 |
# File 'lib/segment_ruby.rb', line 74
# Read-only accessor for @max_word_length.
#
# @return [Object] the value currently held in @max_word_length
def max_word_length
  @max_word_length
end
#model_name ⇒ Object (readonly)
Returns the value of attribute model_name.
74 75 76 |
# File 'lib/segment_ruby.rb', line 74
# Read-only accessor for @model_name.
#
# @return [Object] the value currently held in @model_name
def model_name
  @model_name
end
#ulp ⇒ Object (readonly)
Returns the value of attribute ulp.
74 75 76 |
# File 'lib/segment_ruby.rb', line 74
# Read-only accessor for @ulp.
#
# @return [Object] the value currently held in @ulp
def ulp
  @ulp
end
Instance Method Details
#combine(pFirst, first, segmented) ⇒ Object
103 104 105 106 107 |
# File 'lib/segment_ruby.rb', line 103
# Puts a scored word in front of an already-scored segmentation.
#
# @param pFirst [Numeric] score of the leading word
# @param first [Object] the leading word
# @param segmented [Array] two-element array of [score, words] for the rest
# @return [Array] [pFirst + rest score, [first] followed by the rest's words]
def combine(pFirst, first, segmented)
  rest_score, rest_words = segmented
  total_score = pFirst + rest_score
  [total_score, [first] + rest_words]
end
#freq_file_name(prefix = '') ⇒ Object
90 91 92 |
# File 'lib/segment_ruby.rb', line 90
# Path of the frequencies table inside the model directory.
#
# @param prefix [String] text placed directly before 'frequencies.tsv'
# @return [String] model_path joined with the prefixed file name
def freq_file_name(prefix='')
  directory = model_path
  File.join(directory, prefix + 'frequencies.tsv')
end
#log_CPr(w, prev) ⇒ Object
80 81 82 83 84 |
# File 'lib/segment_ruby.rb', line 80
# Score of word w given the preceding word prev.
#
# Looks up the space-joined pair "prev w" in blp; when blp is falsy or has no
# entry for the pair, falls back to the score of w alone from ulp.
#
# @param w [String] the word being scored
# @param prev [String] the word that precedes w
# @return [Object] whatever log_prob returns for the chosen distribution
def log_CPr(w, prev)
  pair = [prev, w].join(' ')
  return blp.log_prob(pair) if blp && blp.has_key?(pair)

  ulp.log_prob(w)
end
#log_Pr(w) ⇒ Object
76 77 78 |
# File 'lib/segment_ruby.rb', line 76
# Score of word w on its own; delegates to ulp.log_prob.
#
# @param w [String] the word being scored
# @return [Object] whatever ulp.log_prob returns for w
def log_Pr(w)
  unigrams = ulp
  unigrams.log_prob(w)
end
#model_path ⇒ Object
94 95 96 |
# File 'lib/segment_ruby.rb', line 94
# Directory holding the data files for the current model.
#
# Computed on first use and cached in @model_path, so later calls return the
# same string.
#
# @return [String] <this file's dir>/../data/segment_ruby/<model_name>
def model_path
  return @model_path if @model_path

  @model_path = File.join(__dir__, "..", "data", "segment_ruby", model_name.to_s)
end
#segment(text, prev = '<S>') ⇒ Object
123 124 125 126 127 |
# File 'lib/segment_ruby.rb', line 123
# Segments text into words.
#
# Starts the recursive search at depth 0 with a fresh cache and returns only
# the word list, discarding the score.
#
# @param text [String] the text to segment
# @param prev [String] word assumed to precede text; defaults to the '<S>' marker
# @return [Array] the second element of segment_r's result
def segment(text, prev='<S>')
  _score, words = segment_r(text, prev, 0, {})
  words
end
#segment_r(text, prev, n, memo) ⇒ Object
109 110 111 112 113 114 115 116 117 118 119 120 121 |
# File 'lib/segment_ruby.rb', line 109
# Recursively finds the highest-scoring segmentation of text.
#
# Every split of text from #splits is scored as log_CPr(first, prev) plus the
# best segmentation of the remainder, and the maximum is returned.
#
# @param text [String, nil] text still to be segmented
# @param prev [String] the word immediately before text, passed to #log_CPr
# @param n [Integer] recursion depth; incremented and passed on, not otherwise used
# @param memo [Hash] cache of solved sub-problems shared across the recursion
# @return [Array] two-element [score, words]; [0.0, []] for nil or empty text
def segment_r(text, prev, n, memo)
  return [0.0, []] if !text || text.empty?

  # With bigram data loaded, the score of a suffix depends on the word before
  # it, so the cache key has to include prev. Keying on text alone would reuse
  # a result that was computed for a different predecessor. Without bigram
  # data prev has no effect on the score, so the cheaper text-only key is kept.
  key = blp ? [text, prev] : text
  return memo[key] if memo.key?(key)

  candidates = splits(text).map do |first, rem|
    log_p = log_CPr(first, prev)
    combine(log_p, first, segment_r(rem, first, n + 1, memo))
  end
  memo[key] = candidates.max
end
#splits(text) ⇒ Object
Returns all the ways of splitting a string into a leading word and the remaining text, with the leading word's length capped via max_word_length.
99 100 101 |
# File 'lib/segment_ruby.rb', line 99
# Returns all the splits of a string whose first part is at most
# max_word_length characters long.
#
# @param text [String] the text to split
# @return [Array<Array<String>>] [first, remainder] pairs ordered by growing
#   first part; the remainder is "" when first covers the whole text, and the
#   result is empty for ""
def splits(text)
  # The previous inclusive index range 0..max_word_length let the first part
  # reach max_word_length + 1 characters. Iterating over lengths caps it at
  # max_word_length. The floor of 1 keeps the single-character split
  # available, as it was before, so a caller can always consume some text.
  longest = [[max_word_length, 1].max, text.size].min
  (1..longest).map { |len| [text[0, len], text[len..-1]] }
end
#total_file_name(prefix = '') ⇒ Object
86 87 88 |
# File 'lib/segment_ruby.rb', line 86
# Path of the totals table inside the model directory.
#
# @param prefix [String] text placed directly before 'total.tsv'
# @return [String] model_path joined with the prefixed file name
def total_file_name(prefix='')
  directory = model_path
  File.join(directory, prefix + 'total.tsv')
end