Class: LanguageDetector::Profile
- Inherits:
-
Object
- Object
- LanguageDetector::Profile
- Defined in:
- lib/language_detector.rb
Constant Summary collapse
- LIMIT =
1500
- PUNCTUATION_REGEX =
/[\W^_\d]+/
Instance Attribute Summary collapse
-
#name ⇒ Object
Returns the value of attribute name.
-
#ngrams ⇒ Object
Returns the value of attribute ngrams.
Instance Method Summary collapse
- #compute_distance(other_profile) ⇒ Object
- #count_ngram(token, n, counts) ⇒ Object
- #generate_ngrams(str, ngram_count) ⇒ Object
- #init_with_file(filename) ⇒ Object
- #init_with_string(str) ⇒ Object
-
#initialize(*args) ⇒ Profile
constructor
A new instance of Profile.
- #is_punctuation?(char) ⇒ Boolean
- #tokenize(str) ⇒ Object
Constructor Details
#initialize(*args) ⇒ Profile
Returns a new instance of Profile.
150 151 152 153 154 155 156 157 158 |
# File 'lib/language_detector.rb', line 150
# Builds a profile and optionally trains it right away.
#
# Only the first positional argument is consulted; it is read as an options
# hash with these keys:
#   :name - label stored in @name (defaults to "")
#   :text - when present, passed to #init_with_string
#   :file - when present, passed to #init_with_file
#
# @param args [Array] optional; args.first is the options hash
# @return [Profile] a new instance of Profile
def initialize(*args)
  # With no arguments args.first is nil; fall back to an empty options hash
  # so Profile.new does not raise NoMethodError on nil[:name].
  options = args.first || {}
  @name = options[:name] || ""
  @ngrams = {}
  init_with_string(options[:text]) if options[:text]
  init_with_file(options[:file]) if options[:file]
end
Instance Attribute Details
#name ⇒ Object
Returns the value of attribute name.
148 149 150 |
# File 'lib/language_detector.rb', line 148
# Reader for the value held in @name.
#
# @return [Object] the current value of @name
def name
  @name
end
#ngrams ⇒ Object
Returns the value of attribute ngrams.
148 149 150 |
# File 'lib/language_detector.rb', line 148
# Reader for the value held in @ngrams.
#
# @return [Object] the current value of @ngrams
def ngrams
  @ngrams
end
Instance Method Details
#compute_distance(other_profile) ⇒ Object
160 161 162 163 164 165 166 167 168 169 170 171 172 |
# File 'lib/language_detector.rb', line 160
# Sums, over every n-gram of other_profile, how far its value there is from
# the value stored for the same n-gram in this profile. An n-gram this
# profile does not contain contributes LIMIT instead.
#
# @param other_profile [#ngrams] object whose #ngrams yields n-gram => number pairs
# @return [Integer] the accumulated distance (0 when other_profile has no n-grams)
def compute_distance(other_profile)
  other_profile.ngrams.inject(0) do |distance, (ngram, other_rank)|
    # Single lookup; nil means this profile has no entry for the n-gram.
    own_rank = @ngrams[ngram]
    distance + (own_rank ? (other_rank - own_rank).abs : LIMIT)
  end
end
#count_ngram(token, n, counts) ⇒ Object
207 208 209 210 211 212 213 214 215 216 217 218 |
# File 'lib/language_detector.rb', line 207
# Tallies every substring of length n of token into counts.
#
# When n > 1 and the token has at least n characters, the token is first
# wrapped as "_" + token + ("_" * (n - 1)). Tokens shorter than n are left
# unpadded and therefore add nothing.
#
# @param token [String] the word to slice
# @param n [Integer] n-gram length; expected to be positive
# @param counts [Hash] n-gram => occurrence count, updated in place
# @return [Hash] the same counts hash
def count_ngram(token, n, counts)
  padded = n > 1 && token.length >= n ? "_#{token}#{'_' * (n - 1)}" : token
  # One window per start offset; the range is empty when padded is shorter than n.
  0.upto(padded.length - n) do |start|
    gram = padded[start, n]
    counts[gram] = counts.fetch(gram, 0) + 1
  end
  counts
end
#generate_ngrams(str, ngram_count) ⇒ Object
197 198 199 200 201 202 |
# File 'lib/language_detector.rb', line 197
# Splits str with #tokenize and, for each resulting token, calls
# #count_ngram for the lengths 2, 3, 4 and 5 against ngram_count.
#
# @param str [String] text to analyse
# @param ngram_count [Hash] tally handed through to #count_ngram
# @return [Array] the token list produced by #tokenize
def generate_ngrams(str, ngram_count)
  tokenize(str).each do |word|
    (2..5).each { |size| count_ngram(word, size, ngram_count) }
  end
end
#init_with_file(filename) ⇒ Object
174 175 176 177 178 179 180 181 182 183 184 185 |
# File 'lib/language_detector.rb', line 174
# Trains this profile from a file located in the "training_data" directory
# next to this source file.
#
# Every line is fed through #generate_ngrams; the n-grams are then ordered by
# descending count and the first LIMIT of them are stored in @ngrams with
# their 1-based position as the value. Prints "training with <path>" to
# standard output.
#
# @param filename [String] file name relative to the training_data directory
def init_with_file(filename)
  ngram_count = Hash.new(0)
  path = File.expand_path(File.join(File.dirname(__FILE__), "training_data/" + filename))
  # File.foreach closes the file once iteration ends; the previous
  # File.open(path).each_line left the handle open.
  File.foreach(path) { |line| generate_ngrams(line, ngram_count) }
  puts "training with " + path
  ranked = ngram_count.sort { |a, b| b[1] <=> a[1] }
  # Keep exactly LIMIT entries so no stored rank can differ from another by
  # LIMIT or more, i.e. a present n-gram never costs more than a missing one.
  ranked.first(LIMIT).each_with_index do |(ngram, _count), index|
    @ngrams[ngram] = index + 1
  end
end
#init_with_string(str) ⇒ Object
187 188 189 190 191 192 193 194 195 |
# File 'lib/language_detector.rb', line 187
# Trains this profile from an in-memory string.
#
# The string is fed through #generate_ngrams; the n-grams are then ordered by
# descending count and the first LIMIT of them are stored in @ngrams with
# their 1-based position as the value.
#
# @param str [String] training text
def init_with_string(str)
  ngram_count = {}
  generate_ngrams(str, ngram_count)
  ranked = ngram_count.sort { |a, b| b[1] <=> a[1] }
  # The old `break if i > LIMIT` ran after the store and so kept LIMIT + 2
  # entries; take exactly LIMIT so a present n-gram never costs more than the
  # LIMIT penalty charged for a missing one.
  ranked.first(LIMIT).each_with_index do |(ngram, _count), index|
    @ngrams[ngram] = index + 1
  end
end
#is_punctuation?(char) ⇒ Boolean
205 |
# File 'lib/language_detector.rb', line 205
# Tells whether PUNCTUATION_REGEX matches anywhere in char.
#
# @param char [String, nil] text to test
# @return [Boolean] true on a match, false otherwise
def is_punctuation?(char)
  # =~ yields a match offset (possibly 0) or nil; convert that to a strict
  # true/false so the method returns the Boolean its signature advertises.
  !(char =~ PUNCTUATION_REGEX).nil?
end
#tokenize(str) ⇒ Object
204 |
# File 'lib/language_detector.rb', line 204
# Breaks str into pieces, using every match of PUNCTUATION_REGEX as a
# separator.
#
# NOTE(review): Ruby's \W is ASCII-based by default, so non-ASCII letters
# would also act as separators here — confirm that is intended.
#
# @param str [String] text to split
# @return [Array<String>] the pieces between separators
def tokenize(str)
  separator = PUNCTUATION_REGEX
  str.split(separator)
end