Class: WordBloom::Scorer

Inherits:
Object
  • Object
show all
Defined in:
lib/word-bloom/scorer.rb

Constant Summary collapse

OPTIMISM =
3.5
MIN_CONFIDENCE =
15
@@filters =
{}
@@all_languages =
nil

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Scorer

Returns a new instance of Scorer.



39
40
41
42
43
# File 'lib/word-bloom/scorer.rb', line 39

# Builds a scorer with no languages registered.
#
# Language weights default to 1.0 for any language that has no explicit
# entry; :russian is the only language given a non-default weight (0.8).
def initialize
  @language_weights = Hash.new(1.0)
  @languages = {}
  @language_weights.store(:russian, 0.8)
end

Class Method Details

.all_languages ⇒ Object



21
22
23
24
25
# File 'lib/word-bloom/scorer.rb', line 21

# Lists every language that has a "<name>.lang" file in LANGUAGE_DIR_PATH.
#
# The directory is scanned on the first call only; the resulting array of
# symbols is memoized in @@all_languages and returned on later calls.
def self.all_languages
  @@all_languages ||= begin
    lang_files = Dir.entries(LANGUAGE_DIR_PATH).grep(/\.lang$/)
    lang_files.map { |entry| entry.sub(/\.lang$/,'').to_sym }
  end
end

.filter_for(language) ⇒ Object



17
18
19
# File 'lib/word-bloom/scorer.rb', line 17

# Looks up the filter cached under +language+ in the class-level filter
# table. Returns nil when nothing has been cached under that key.
def self.filter_for(language)
  @@filters.fetch(language, nil)
end

.load_filter(name) ⇒ Object



8
9
10
11
12
13
14
15
# File 'lib/word-bloom/scorer.rb', line 8

# Returns the filter for +name+, loading it on first use.
#
# The filter is rebuilt from the binary contents of "<name>.lang" inside
# LANGUAGE_DIR_PATH via BloominSimple.from_dump (with HASHER passed as the
# block) and cached in @@filters, so later calls reuse the cached object.
def self.load_filter(name)
  @@filters[name] ||= begin
    dump_path = File.join(LANGUAGE_DIR_PATH, "#{name}.lang")
    BloominSimple.from_dump(File.binread(dump_path), &HASHER)
  end
end

.loaded_with(*languages) ⇒ Object



27
28
29
30
31
32
33
34
35
36
37
# File 'lib/word-bloom/scorer.rb', line 27

# Builds a scorer and registers the requested languages on it.
#
# Passing exactly :all (and nothing else) registers every available
# language; any other argument list is registered one language at a time.
# Returns the new scorer.
def self.loaded_with(*languages)
  new.tap do |scorer|
    if languages == [:all]
      scorer.add_all_languages
    else
      languages.each { |language| scorer.add_language(language) }
    end
  end
end

Instance Method Details

#add_all_languages ⇒ Object



51
52
53
54
55
# File 'lib/word-bloom/scorer.rb', line 51

# Registers every language reported by the class-level all_languages list,
# one add_language call per entry. Returns that list.
def add_all_languages
  self.class.all_languages.each { |lang_name| add_language(lang_name) }
end

#add_language(name, weight = nil) ⇒ Object



45
46
47
48
49
# File 'lib/word-bloom/scorer.rb', line 45

# Registers +name+ with this scorer.
#
# The language's filter is loaded through the class first, then the
# language is marked as active. When +weight+ is given (anything other
# than nil) it replaces the weight used for this language; otherwise the
# existing/default weight is left alone.
#
# Returns +weight+ (nil when none was supplied).
def add_language(name, weight = nil)
  self.class.load_filter(name)
  @languages[name] = true
  return if weight.nil?

  @language_weights[name] = weight
end

#apply_weights(results) ⇒ Object



68
69
70
71
72
73
# File 'lib/word-bloom/scorer.rb', line 68

# Scales each score in +results+ by the weight configured for its language.
#
# +results+ is modified in place and the same hash is returned.
def apply_weights(results)
  results.each_pair do |lang, score|
    # Only existing keys are reassigned, which is safe during iteration.
    results[lang] = score * @language_weights[lang]
  end
end

#confidence(considered, results) ⇒ Object



57
58
59
60
61
62
63
# File 'lib/word-bloom/scorer.rb', line 57

# Measures how far the best score stands out from all the others.
#
# The value is OPTIMISM times the highest score, minus the sum of every
# other score.
#
# considered - accepted for interface compatibility; not used in the
#              calculation.
# results    - hash whose values are the numeric per-language scores.
#
# Returns the confidence figure, or 0.0 when +results+ holds no scores.
def confidence(considered, results)
  scores = results.values.sort
  return 0.0 if scores.empty?

  best = scores.last
  # Seed the fold with 0 so a single-language result has a "rest" of 0
  # instead of nil (which made the subtraction below raise TypeError).
  rest = scores[0..-2].inject(0) { |sum, score| sum + score }

  OPTIMISM * best - rest
end

#language(text) ⇒ Object



104
105
106
# File 'lib/word-bloom/scorer.rb', line 104

# Picks the language with the highest score for +text+.
#
# Returns the key of the top-scoring entry produced by process_text, or nil
# when process_text yields nil or no language scored at all. The nil cases
# are handled explicitly rather than with a blanket inline rescue, so
# unrelated errors are no longer silently hidden here.
def language(text)
  results = process_text(text)
  return nil if results.nil? || results.empty?

  results.max { |a, b| a[1] <=> b[1] }.first
end

#process_text(text) ⇒ Object

Very inefficient method for now, but it still beats the non-Bloom alternatives. Change to a better bit-comparison technique later.



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/word-bloom/scorer.rb', line 78

# Scores +text+ against every registered language.
#
# The text is split on whitespace; each token is downcased, and tokens that
# are empty or consist only of digits are ignored. Every remaining word
# adds one point to each language whose filter reports that it includes the
# word. The totals are then passed through apply_weights.
#
# Returns the weighted results hash (language => score), or nil if any
# StandardError is raised while processing; the error is reported on
# $stderr instead of being printed to stdout.
def process_text(text)
  results = Hash.new(0)
  # Resolve each registered language's filter once, not once per word.
  filters = @languages.keys.map { |lang| [lang, self.class.filter_for(lang)] }

  text.split(/\s+/).each do |word|
    word = word.downcase
    next if /^\d*$/ =~ word

    filters.each do |lang, filter|
      results[lang] += 1 if filter.includes?(word)
    end
  end

  # TODO: early exit is not implemented. A periodic
  # `confidence(...) > MIN_CONFIDENCE` check and a word-count cap were
  # sketched here but left disabled, so every word is always scored.
  apply_weights(results)
rescue => ex
  $stderr.puts "process_text failed: #{ex.class}: #{ex.message}", *Array(ex.backtrace)
  nil
end