Class: Maxixe::Segmenter
- Inherits:
- Object
- Inheritance hierarchy: Object → Maxixe::Segmenter
- Defined in:
- lib/maxixe.rb
Instance Attribute Summary
-
#t ⇒ Object
Returns the value of attribute t.
Instance Method Summary
- #all_n_grams(str) ⇒ Object
- #average_votes(votes) ⇒ Object
- #compute_vote(non_strad, strad, n) ⇒ Object
- #compute_votes(positions_with_ngrams, n) ⇒ Object
-
#initialize(index, t = 0.5) ⇒ Segmenter
constructor
A new instance of Segmenter.
- #non_straddling(n_grams, pos) ⇒ Object
- #segment(str, t = nil) ⇒ Object
- #split_with_votes(votes, str, t = nil) ⇒ Object
- #straddling(n_grams, pos) ⇒ Object
- #straddling_and_non_straddling(n_grams, str) ⇒ Object
- #token_count(n_gram) ⇒ Object
Constructor Details
#initialize(index, t = 0.5) ⇒ Segmenter
Returns a new instance of Segmenter.
7 8 9 10 11 |
# File 'lib/maxixe.rb', line 7

# Creates a segmenter backed by an n-gram count index.
#
# @param index [#keys] count index; its keys are converted with +to_i+ to
#   obtain the n-gram orders this segmenter works with
# @param t [Numeric] default vote threshold, used by the splitting step when
#   no per-call threshold is supplied
def initialize(index, t = 0.5)
  @index = index
  @t = t
  # Every key of the index names one n-gram order (for example "2" => 2).
  @n = index.keys.map { |order| order.to_i }
end
Instance Attribute Details
#t ⇒ Object
Returns the value of attribute t.
5 6 7 |
# File 'lib/maxixe.rb', line 5

# Reader for the default vote threshold stored at construction time.
#
# @return [Object] the current value of +@t+
def t
  @t
end
Instance Method Details
#all_n_grams(str) ⇒ Object
47 48 49 |
# File 'lib/maxixe.rb', line 47

# Lists every character n-gram of +str+ for each order held in +@n+.
#
# @param str [String] text to cut into n-grams
# @return [Array<Array<Array<String>>>] one list per order, in the same
#   sequence as +@n+; each n-gram is an array of single characters. An order
#   longer than +str+ yields an empty list.
def all_n_grams(str)
  @n.map do |width|
    str.chars.each_cons(width).to_a
  end
end
#average_votes(votes) ⇒ Object
92 93 94 95 96 |
# File 'lib/maxixe.rb', line 92

# Averages several vote rows position by position.
#
# @param votes [Array<Array<Numeric>>] one row of votes per n-gram order;
#   rows must have equal length because they are transposed
# @return [Array<Float>] the mean vote for each position
def average_votes(votes)
  votes.transpose.map do |column|
    total = column.reduce(:+)
    total.to_f / column.length
  end
end
#compute_vote(non_strad, strad, n) ⇒ Object
83 84 85 86 87 88 89 90 |
# File 'lib/maxixe.rb', line 83

# Scores one candidate boundary for a single n-gram order.
#
# Each pair (non-straddling n-gram, straddling n-gram) contributes one point
# when the non-straddling n-gram is strictly more frequent in the index. The
# point total is then divided by 2 * (n - 1).
#
# @param non_strad [Array<String>] n-grams that lie entirely on one side of
#   the boundary
# @param strad [Array<String>] n-grams that cross the boundary
# @param n [Integer] n-gram order
# @return [Float] the normalised vote; 0.0 when +n+ is 1 or less
def compute_vote(non_strad, strad, n)
  # For n <= 1 the divisor 2 * (n - 1) is zero or negative. The original
  # returned NaN there, which then poisoned every averaged vote; such an order
  # carries no boundary evidence, so it contributes a neutral 0.0.
  return 0.0 if n <= 1

  wins = non_strad.inject(0) do |total, outside|
    # Look the outside count up once instead of once per straddling n-gram.
    outside_count = token_count(outside)
    total + strad.count { |crossing| outside_count > token_count(crossing) }
  end

  wins / (2.0 * (n - 1))
end
#compute_votes(positions_with_ngrams, n) ⇒ Object
77 78 79 80 81 |
# File 'lib/maxixe.rb', line 77

# Computes the vote for every candidate boundary of one n-gram order.
#
# @param positions_with_ngrams [Array<Array>] one
#   [non_straddling, straddling] pair per boundary position
# @param n [Integer] n-gram order, forwarded to +compute_vote+
# @return [Array] one vote per position, in input order
def compute_votes(positions_with_ngrams, n)
  positions_with_ngrams.map do |pair|
    outside, crossing = pair
    compute_vote(outside, crossing, n)
  end
end
#non_straddling(n_grams, pos) ⇒ Object
61 62 63 64 65 66 67 |
# File 'lib/maxixe.rb', line 61

# Returns the n-grams that touch the boundary after character +pos+ without
# crossing it: the one ending at +pos+ and the one starting at +pos + 1+.
#
# The two indices are computed directly rather than found by scanning the
# whole list. This assumes all n-grams have the same width, as they do when
# produced by +each_cons+.
#
# @param n_grams [Array<Array<String>>] n-grams of one order, in text order
# @param pos [Integer] index of the character just before the boundary
# @return [Array<String>] zero, one or two joined n-grams, left one first
def non_straddling(n_grams, pos)
  return [] if n_grams.empty?

  width = n_grams.first.size
  left = pos - (width - 1) # n-gram whose last character is at pos
  right = pos + 1          # n-gram whose first character follows the boundary

  # Either neighbour may fall off the end of the list near the string edges.
  [left, right]
    .select { |i| i.between?(0, n_grams.size - 1) }
    .map { |i| n_grams[i].join }
end
#segment(str, t = nil) ⇒ Object
13 14 15 16 17 18 19 20 21 22 23 |
# File 'lib/maxixe.rb', line 13

# Segments +str+ by inserting a space at every boundary selected by the
# averaged n-gram votes.
#
# @param str [String] unsegmented text
# @param t [Numeric, nil] vote threshold for this call; passed through to
#   +split_with_votes+
# @return [String] the result of +split_with_votes+ on the averaged votes
def segment(str, t = nil)
  # An order longer than the string yields no n-grams. The original called
  # `first.size` on such an empty list and raised NoMethodError, so any string
  # shorter than the largest indexed order (including "") crashed. Orders
  # with no n-grams carry no evidence and are skipped.
  usable = all_n_grams(str).reject { |grams| grams.empty? }

  votes_for_all = usable.map do |grams|
    order = grams.first.size
    compute_votes(straddling_and_non_straddling(grams, str), order)
  end

  averaged = average_votes(votes_for_all)
  split_with_votes(averaged, str, t)
end
#split_with_votes(votes, str, t = nil) ⇒ Object
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# File 'lib/maxixe.rb', line 25

# Inserts a space into a copy of +str+ after every character whose vote marks
# a boundary.
#
# Position +i+ is a boundary when its vote exceeds the threshold, or when it is
# an interior position whose vote is strictly greater than both neighbours.
# The first and last positions can only qualify through the threshold.
#
# @param votes [Array<Numeric>] vote for the boundary after each character
# @param str [String] text to split; it is not modified
# @param t [Numeric, nil] threshold for this call; falls back to +@t+
# @return [String] copy of +str+ with spaces inserted at the boundaries
def split_with_votes(votes, str, t = nil)
  threshold = t || @t # loop-invariant, so resolved once
  last_index = votes.size - 1

  points = votes.each_index.select do |i|
    vote = votes[i]
    above_threshold = vote > threshold
    local_maximum = i > 0 && i < last_index &&
                    vote > votes[i - 1] && vote > votes[i + 1]
    above_threshold || local_maximum
  end

  res = str.dup
  # Each earlier insertion shifts the later insertion points right by one.
  points.each_with_index do |point, inserted|
    res.insert(point + inserted + 1, " ")
  end
  res
end
#straddling(n_grams, pos) ⇒ Object
69 70 71 72 73 74 75 |
# File 'lib/maxixe.rb', line 69

# Returns the n-grams that cross the boundary after character +pos+, that is,
# those starting at an index +i+ with pos - (width - 1) < i <= pos.
#
# The index range is computed directly rather than found by scanning the whole
# list. This assumes all n-grams have the same width, as they do when produced
# by +each_cons+.
#
# @param n_grams [Array<Array<String>>] n-grams of one order, in text order
# @param pos [Integer] index of the character just before the boundary
# @return [Array<String>] joined n-grams in text order; empty for unigrams
def straddling(n_grams, pos)
  return [] if n_grams.empty?

  width = n_grams.first.size
  # Clamp the window to the list; near the edges fewer n-grams cross.
  first = [pos - (width - 1) + 1, 0].max
  last = [pos, n_grams.size - 1].min

  # An empty range (first > last) maps to [].
  (first..last).map { |i| n_grams[i].join }
end
#straddling_and_non_straddling(n_grams, str) ⇒ Object
55 56 57 58 59 |
# File 'lib/maxixe.rb', line 55

# Collects, for every boundary between two adjacent characters of +str+, the
# n-grams beside the boundary and the n-grams crossing it.
#
# @param n_grams [Array<Array<String>>] n-grams of one order
# @param str [String] the text the n-grams were taken from; only its length
#   is used
# @return [Array<Array>] one [non_straddling, straddling] pair per position
#   from 0 to str.length - 2; empty for strings shorter than two characters
def straddling_and_non_straddling(n_grams, str)
  last_boundary = str.length - 2
  0.upto(last_boundary).map do |boundary|
    outside = non_straddling(n_grams, boundary)
    crossing = straddling(n_grams, boundary)
    [outside, crossing]
  end
end
#token_count(n_gram) ⇒ Object
51 52 53 |
# File 'lib/maxixe.rb', line 51

# Looks up how often +n_gram+ occurs according to the index.
#
# The index is addressed first by the n-gram's length as a string (for example
# "2"), then by the n-gram itself.
#
# @param n_gram [String] the n-gram to look up
# @return [Object] the stored count, or 0 when the n-gram or its length is
#   not in the index
def token_count(n_gram)
  counts = @index[n_gram.length.to_s]
  # The original raised NoMethodError on nil when no table existed for this
  # length. An unseen length is treated like an unseen n-gram.
  return 0 unless counts

  counts[n_gram] || 0
end