Class: Maxixe::Trainer
- Inherits: Object
  - Object
    - Maxixe::Trainer
- Defined in:
- lib/maxixe.rb
Class Method Summary
- .check_recognition(index, samples) ⇒ Object
- .generate_corpus_from_io(n, io) ⇒ Object
- .optimize(index, samples) ⇒ Object
Class Method Details
.check_recognition(index, samples) ⇒ Object
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
# File 'lib/maxixe.rb', line 128
#
# Scores every non-empty subset of the index's keys, at each threshold from
# 0.1 to 1.0, by how far the segmenter's output is from the expected output.
#
# index   - Hash; each subset of its keys selects the entries handed to
#           Maxixe::Segmenter.new.
# samples - Enumerable of [not_split, split] pairs: the text passed to
#           Segmenter#segment and the expected result it is compared with.
#
# Returns a Hash of the form { subset_of_keys => { threshold => total } },
# where total is the sum of Text::Levenshtein.distance over all samples
# (lower means closer to the expected output).
def self.check_recognition(index, samples)
  keys = index.keys

  # Every non-empty combination of the index's keys.
  subsets = 1.upto(keys.size).flat_map { |size| keys.combination(size).to_a }

  # Derived from integers so the keys are exactly 0.1, 0.2, ..., 1.0.
  # Stepping a Float range instead yields values such as
  # 0.30000000000000004, which cannot be looked up with the literal 0.3.
  thresholds = (1..10).map { |tenth| tenth / 10.0 }

  subsets.each_with_object({}) do |subset, results|
    sub_index = index.select { |key, _value| subset.include?(key) }
    segmenter = Maxixe::Segmenter.new(sub_index)

    results[subset] = thresholds.each_with_object({}) do |t, scores|
      scores[t] = samples.inject(0) do |total, (not_split, split)|
        total + Text::Levenshtein.distance(segmenter.segment(not_split, t), split)
      end
    end
  end
end
.generate_corpus_from_io(n, io) ⇒ Object
102 103 104 105 106 107 108 109 110 111 112 113 |
# File 'lib/maxixe.rb', line 102
#
# Counts the character n-grams of every line read from an IO.
#
# n  - The n-gram sizes to count: a collection of Integers (Array, Range,
#      ...) or a single Integer.
# io - Any object responding to #each_line (File, StringIO, ...).
#
# Returns a Hash of the form { size.to_s => { n_gram => count } }. The
# inner hashes default to 0 for n-grams that were never seen and use a
# plain default value (no default proc), so the result can be passed to
# Marshal.dump.
#
# Lines are used exactly as #each_line yields them, so n-grams that include
# the line terminator are counted as well.
def self.generate_corpus_from_io(n, io)
  sizes = Array(n)
  corpus = sizes.each_with_object({}) { |size, acc| acc[size.to_s] = Hash.new(0) }

  io.each_line do |line|
    # Split once per line; every requested size slides over the same chars.
    chars = line.each_char.to_a

    sizes.each do |size|
      counts = corpus[size.to_s]
      chars.each_cons(size) { |gram| counts[gram.join] += 1 }
    end
  end

  corpus
end
.optimize(index, samples) ⇒ Object
115 116 117 118 119 120 121 122 123 124 125 126 |
# File 'lib/maxixe.rb', line 115
#
# Picks the key subset and threshold with the lowest score reported by
# check_recognition for the given index and samples.
#
# index   - Passed through unchanged to check_recognition.
# samples - Passed through unchanged to check_recognition.
#
# Returns a Hash { :n => subset, :t => threshold, :score => score } for the
# lowest score; when several candidates share it, the first one in
# iteration order is returned. Returns nil when check_recognition produces
# no candidates at all (for example, for an empty index).
def self.optimize(index, samples)
  candidates = check_recognition(index, samples).flat_map do |n, ts|
    ts.map { |t, score| [n, t, score] }
  end

  # min_by keeps the earliest of equally scored candidates.
  best = candidates.min_by(&:last)
  return unless best

  { :n => best[0], :t => best[1], :score => best[2] }
end