Class: LanguageDetector::Profile
- Inherits:
-
Object
- Object
- LanguageDetector::Profile
- Defined in:
- lib/language_detector/profile.rb
Constant Summary collapse
- IGNORE_CHARACTERS =
[?., ?\,, ?:, ?;, ?\w, ?\n]
- LIMIT =
2000
Instance Attribute Summary collapse
-
#name ⇒ Object
readonly
Returns the value of attribute name.
-
#ngrams ⇒ Object
readonly
Returns the value of attribute ngrams.
Instance Method Summary collapse
- #_init_with_string(str, ngram_count) ⇒ Object
- #append_next_token(tokens, new_token) ⇒ Object
- #compute_distance(profile) ⇒ Object
- #count_ngram(token, n, counts) ⇒ Object
- #init_with_string(str) ⇒ Object
- #init_with_training_file(filename) ⇒ Object
-
#initialize(name) ⇒ Profile
constructor
A new instance of Profile.
- #is_valid_character?(char) ⇒ Boolean
- #tokenize(line) ⇒ Object
Constructor Details
#initialize(name) ⇒ Profile
Returns a new instance of Profile.
21 22 23 24 25 26 |
# File 'lib/language_detector/profile.rb', line 21
#
# Sets up an empty profile: records the name, indexes every entry of
# IGNORE_CHARACTERS as a key (value 1) in @ignore_characters, and starts
# @ngrams as an empty hash.
#
# @param name [Object] value later exposed through the name reader
def initialize(name)
  @name = name
  @ngrams = {}
  @ignore_characters = IGNORE_CHARACTERS.each_with_object({}) do |char, lookup|
    lookup[char] = 1
  end
end
Instance Attribute Details
#name ⇒ Object (readonly)
Returns the value of attribute name.
19 20 21 |
# File 'lib/language_detector/profile.rb', line 19
#
# Reader for the name attribute.
#
# @return [Object] the current value of @name
def name
  @name
end
#ngrams ⇒ Object (readonly)
Returns the value of attribute ngrams.
19 20 21 |
# File 'lib/language_detector/profile.rb', line 19
#
# Reader for the ngrams attribute.
#
# @return [Object] the current value of @ngrams
def ngrams
  @ngrams
end
Instance Method Details
#_init_with_string(str, ngram_count) ⇒ Object
88 89 90 91 92 93 94 95 96 |
# File 'lib/language_detector/profile.rb', line 88
#
# Tokenizes +str+ and, for every token, counts its 2-, 3-, 4- and 5-grams
# into +ngram_count+ (in that order) via count_ngram.
#
# @param str [String] text handed to tokenize
# @param ngram_count [Hash] accumulator passed through to count_ngram
# @return [Array] the token list produced by tokenize
def _init_with_string(str, ngram_count)
  tokenize(str).each do |token|
    (2..5).each { |size| count_ngram(token, size, ngram_count) }
  end
end
#append_next_token(tokens, new_token) ⇒ Object
45 46 47 48 49 |
# File 'lib/language_detector/profile.rb', line 45
#
# Pushes +new_token+ onto +tokens+ unless it is empty.
#
# @param tokens [Array] collection that receives the token
# @param new_token [String] candidate token
# @return [Array, nil] +tokens+ when the token was appended, nil otherwise
def append_next_token(tokens, new_token)
  return if new_token.empty?

  tokens << new_token
end
#compute_distance(profile) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 |
# File 'lib/language_detector/profile.rb', line 6
#
# Sums, over every n-gram of +profile+, the absolute difference between its
# value there and the value stored for the same n-gram in this object's
# @ngrams. An n-gram with no (truthy) entry in @ngrams contributes
# LanguageDetector::Profile::LIMIT instead.
#
# @param profile [#ngrams] object whose ngrams yields key/value pairs
# @return [Integer] the accumulated distance (0 when +profile+ has no n-grams)
def compute_distance(profile)
  profile.ngrams.inject(0) do |total, (ngram, rank)|
    own_rank = @ngrams[ngram]
    penalty = own_rank ? (rank - own_rank).abs : LanguageDetector::Profile::LIMIT
    total + penalty
  end
end
#count_ngram(token, n, counts) ⇒ Object
98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
# File 'lib/language_detector/profile.rb', line 98
#
# Counts every substring of length +n+ in +token+ and adds the tallies to
# +counts+.
#
# When +n+ is greater than 1 and the token is at least +n+ characters long,
# the token is first wrapped as "_" + token + ("_" * (n - 1)) so that
# n-grams touching the word boundaries are counted too. Tokens shorter than
# +n+ contribute nothing.
#
# @param token [String] the token to slice; the caller's string is not modified
# @param n [Integer] n-gram length; values below 1 leave +counts+ untouched
# @param counts [Hash] n-gram => occurrence count, updated in place
# @return [Hash] the same +counts+ object
def count_ngram(token, n, counts)
  # A non-positive length has no meaningful n-grams; previously this case
  # only added empty-string keys to the table.
  return counts if n < 1

  padded = token
  padded = "_#{token}#{'_' * (n - 1)}" if n > 1 && token.length >= n

  # The range is empty when the (possibly padded) token is shorter than n.
  (0..padded.length - n).each do |start|
    gram = padded[start, n]
    counts[gram] = (counts[gram] || 0) + 1
  end
  counts
end
#init_with_string(str) ⇒ Object
74 75 76 77 78 79 80 81 82 83 84 85 86 |
# File 'lib/language_detector/profile.rb', line 74
#
# Counts the n-grams of +str+ (via _init_with_string), orders them by
# descending count, and stores each n-gram's 1-based position in @ngrams.
# Ranking stops once position LIMIT has been assigned.
#
# @param str [String] text to build the ranking from
# @return [Array, nil] the sorted pairs when fewer than LIMIT were ranked,
#   nil when the loop stopped at LIMIT
def init_with_string(str)
  ngram_count = {}
  _init_with_string(str, ngram_count)

  ranked = ngram_count.sort { |left, right| right[1] <=> left[1] }
  ranked.each_with_index do |(ngram, _count), position|
    rank = position + 1
    @ngrams[ngram] = rank
    break if rank >= LIMIT
  end
end
#init_with_training_file(filename) ⇒ Object
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/language_detector/profile.rb', line 55
#
# Builds the n-gram ranking from a training file. The file is looked up
# under the "training_data" directory that sits next to this source file.
# Every line is fed to _init_with_string; the collected n-grams are then
# ordered by descending count and their 1-based positions stored in @ngrams,
# stopping once position LIMIT has been assigned.
#
# @param filename [String] file name relative to the training_data directory
# @return [Array, nil] the sorted pairs when fewer than LIMIT were ranked,
#   nil when the loop stopped at LIMIT
# @raise [SystemCallError] if the file cannot be opened (e.g. Errno::ENOENT)
def init_with_training_file(filename)
  ngram_count = {}
  path = File.expand_path(File.join(File.dirname(__FILE__), 'training_data/' + filename))

  # File.foreach closes the handle when iteration finishes; the earlier
  # File.open(path).each_line form left it open.
  File.foreach(path) { |line| _init_with_string(line, ngram_count) }

  ranked = ngram_count.sort { |left, right| right[1] <=> left[1] }
  ranked.each_with_index do |(ngram, _count), position|
    rank = position + 1
    @ngrams[ngram] = rank
    break if rank >= LIMIT
  end
end
#is_valid_character?(char) ⇒ Boolean
51 52 53 |
# File 'lib/language_detector/profile.rb', line 51
#
# Tells whether +char+ contains nothing outside the lowercase range a-z.
# An empty string therefore counts as valid.
#
# @param char [String] text to check (tokenize passes one character at a time)
# @return [Boolean] true when no character other than a-z is present
def is_valid_character?(char)
  char !~ /[^a-z]/
end
#tokenize(line) ⇒ Object
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
# File 'lib/language_detector/profile.rb', line 28
#
# Splits +line+ into tokens. The line is downcased and scanned character by
# character: characters accepted by is_valid_character? extend the current
# token, any other character ends it. Finished tokens are handed to
# append_next_token, including the one still pending at end of line.
#
# @param line [String] text to split
# @return [Array] the collected tokens, in order of appearance
def tokenize(line)
  tokens = []
  pending = ''

  line.downcase.each_char do |char|
    if is_valid_character?(char)
      pending << char
      next
    end

    # Separator reached: flush what was collected and start a fresh token.
    append_next_token(tokens, pending)
    pending = ''
  end

  append_next_token(tokens, pending)
  tokens
end