Class: WordBloom::Scorer
- Inherits:
-
Object
- Object
- WordBloom::Scorer
- Defined in:
- lib/word-bloom/scorer.rb
Constant Summary collapse
- OPTIMISM =
3.5
- MIN_CONFIDENCE =
15
- @@filters =
{}
- @@all_languages =
nil
Class Method Summary collapse
- .all_languages ⇒ Object
- .filter_for(language) ⇒ Object
- .load_filter(name) ⇒ Object
- .loaded_with(*languages) ⇒ Object
Instance Method Summary collapse
- #add_all_languages ⇒ Object
- #add_language(name, weight = nil) ⇒ Object
- #apply_weights(results) ⇒ Object
- #confidence(considered, results) ⇒ Object
-
#initialize ⇒ Scorer
constructor
A new instance of Scorer.
- #language(text) ⇒ Object
-
#process_text(text) ⇒ Object
Very inefficient method for now.
Constructor Details
#initialize ⇒ Scorer
Returns a new instance of Scorer.
39 40 41 42 43 |
# File 'lib/word-bloom/scorer.rb', line 39
# Builds a scorer with no languages registered.
#
# Every language weight defaults to 1.0; Russian starts out at 0.8.
def initialize
  @language_weights = Hash.new(1.0)
  @languages = {}
  @language_weights.store(:russian, 0.8)
end
Class Method Details
.all_languages ⇒ Object
21 22 23 24 25 |
# File 'lib/word-bloom/scorer.rb', line 21
# Lists every language that has a `.lang` file in LANGUAGE_DIR_PATH.
#
# The directory is scanned on the first call only; the result is memoized
# in @@all_languages.
#
# @return [Array<Symbol>] file names with the `.lang` extension removed
def self.all_languages
  @@all_languages ||= begin
    lang_files = Dir.entries(LANGUAGE_DIR_PATH).select { |entry| entry =~ /\.lang$/ }
    lang_files.map { |entry| entry.sub(/\.lang$/, '').to_sym }
  end
end
.filter_for(language) ⇒ Object
17 18 19 |
# File 'lib/word-bloom/scorer.rb', line 17
# Looks up the filter cached under +language+.
#
# @param language [Object] the key the filter was stored under by load_filter
# @return [Object, nil] the cached filter, or nil when nothing is stored for that key
def self.filter_for(language)
  @@filters.fetch(language, nil)
end
.load_filter(name) ⇒ Object
8 9 10 11 12 13 14 15 |
# File 'lib/word-bloom/scorer.rb', line 8
# Returns the filter for +name+, reading it from disk on first use.
#
# The dump is read in binary mode from "<LANGUAGE_DIR_PATH>/<name>.lang",
# handed to BloominSimple.from_dump together with HASHER, and cached in
# @@filters so later calls skip the file read.
#
# @param name [#to_s] language name; also used as the cache key
# @return [Object] whatever BloominSimple.from_dump returned for this language
def self.load_filter(name)
  cached = @@filters[name]
  return cached if cached

  dump_path = File.join(LANGUAGE_DIR_PATH, "#{name}.lang")
  dump = File.open(dump_path, 'rb', &:read)
  @@filters[name] = BloominSimple.from_dump(dump, &HASHER)
end
.loaded_with(*languages) ⇒ Object
27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/word-bloom/scorer.rb', line 27
# Convenience constructor: builds a scorer and registers languages on it.
#
# Passing exactly `:all` registers every available language; any other
# argument list is registered one language at a time.
#
# @param languages [Array] language names, or the single symbol :all
# @return [Scorer] the newly built scorer
def self.loaded_with(*languages)
  new.tap do |scorer|
    if languages == [:all]
      scorer.add_all_languages
    else
      languages.each { |language| scorer.add_language(language) }
    end
  end
end
Instance Method Details
#add_all_languages ⇒ Object
51 52 53 54 55 |
# File 'lib/word-bloom/scorer.rb', line 51
# Registers every language reported by Scorer.all_languages.
#
# @return [Array] the list of languages that was iterated
def add_all_languages
  available = self.class.all_languages
  available.each { |language| add_language(language) }
end
#add_language(name, weight = nil) ⇒ Object
45 46 47 48 49 |
# File 'lib/word-bloom/scorer.rb', line 45
# Registers a language with this scorer, loading its filter if needed.
#
# @param name [Object] language name, passed to Scorer.load_filter
# @param weight [Object, nil] optional weight override; nil keeps the
#   existing (or default) weight for the language
# @return [Object, nil] the weight that was stored, or nil when none was given
def add_language(name, weight = nil)
  self.class.load_filter(name)
  @languages[name] = true
  return if weight.nil?

  @language_weights[name] = weight
end
#apply_weights(results) ⇒ Object
68 69 70 71 72 73 |
# File 'lib/word-bloom/scorer.rb', line 68
# Multiplies each score by the weight configured for its language.
#
# The hash is modified in place and also returned.
#
# @param results [Hash] scores keyed by language
# @return [Hash] the same +results+ object with weighted scores
def apply_weights(results)
  results.each_pair do |lang, score|
    results[lang] = score * @language_weights[lang]
  end
  results
end
#confidence(considered, results) ⇒ Object
57 58 59 60 61 62 63 |
# File 'lib/word-bloom/scorer.rb', line 57
# Measures how far the best score stands out from all the others:
# OPTIMISM * best - (sum of every other score).
#
# Handles a single score (the "rest" is 0) and an empty result set
# (the best score is treated as 0) without raising.
#
# @param considered [Integer] number of words examined so far; accepted for
#   interface compatibility but not used in the calculation
# @param results [Hash] scores keyed by language
# @return [Numeric] the confidence value
def confidence(considered, results)
  ranked = results.values.sort
  best = ranked.last || 0
  # Seed the fold with 0 so an empty "rest" sums to 0 rather than nil.
  rest = ranked[0..-2].inject(0) { |sum, number| sum + number }
  OPTIMISM * best - rest
end
#language(text) ⇒ Object
104 105 106 |
# File 'lib/word-bloom/scorer.rb', line 104
# Picks the highest-scoring language for +text+.
#
# @param text [String] text to classify
# @return [Object, nil] the key of the best-scoring language, or nil when
#   process_text returned nil or produced no scores at all
def language(text)
  scores = process_text(text)
  # Guard explicitly instead of relying on a blanket `rescue nil`, which
  # would also hide unrelated errors.
  return nil if scores.nil? || scores.empty?

  scores.max_by { |_lang, score| score }.first
end
#process_text(text) ⇒ Object
Very inefficient method for now, but it still beats the non-Bloom alternatives. TODO: change to a better bit-comparison technique later.
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
# File 'lib/word-bloom/scorer.rb', line 78
# Counts, for each registered language, how many words of +text+ its filter
# reports as included, then applies the language weights.
#
# Very inefficient for now, but still beats the non-Bloom alternatives.
# TODO: change to a better bit-comparison technique later, and consider
# stopping early once `confidence` exceeds MIN_CONFIDENCE (that check was
# previously present only as commented-out code).
#
# @param text [String] whitespace-separated text to classify
# @return [Hash, nil] weighted hit counts keyed by language (languages with
#   no hits are absent), or nil when a StandardError was raised; the error
#   is reported on stderr
def process_text(text)
  results = Hash.new(0)
  # Resolve each language's filter once instead of once per word.
  filters = @languages.keys.map { |lang| [lang, self.class.filter_for(lang)] }

  text.split(/\s+/).each do |raw_word|
    word = raw_word.downcase
    # Skip empty tokens (from leading whitespace) and digit-only tokens.
    next if word =~ /\A\d*\z/

    filters.each do |lang, filter|
      results[lang] += 1 if filter.includes?(word)
    end
  end

  apply_weights(results)
rescue StandardError => e
  # Best effort: report the failure on stderr and signal it with nil.
  warn "#{e.class}: #{e.message}\n#{Array(e.backtrace).join("\n")}"
  nil
end