Class: Rlid::NaiveBayesModels

Inherits:
Object
  • Object
show all
Defined in:
lib/rlid/models/naive_bayes_models.rb

Overview

Example output:

    > prova di una stringa molto lunga lunghissima davvero lunga yyyy
    default = 10   ita(99.97)   : cat(0.026)   : spa(0.0023)
    default = 1    ita(99.995)  : cat(0.0045)  : por(0.00019)
    default = 0.1  ita(99.9990) : cat(0.00086) : rum(3.7e-05)

Constant Summary collapse

CUTOFF =

top ngrams kept for every language

3000
OTHER =

Special feature key: the counts of all n-grams that are not among the stored n-grams are accumulated under it.

nil
MAX_STRING_LENGTH =
20
FILENAME =
"naive_bayes_models"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(n = 3) ⇒ NaiveBayesModels

Returns a new instance of NaiveBayesModels.



29
30
31
32
# File 'lib/rlid/models/naive_bayes_models.rb', line 29

# Creates an untrained model set.
#
# @param n [Integer] n-gram size, stored in @n (defaults to 3)
def initialize(n = 3)
  @n = n
  # Every new instance starts with a default count of 1.
  @default_count = 1
end

Instance Attribute Details

#default_count ⇒ Object

Returns the value of attribute default_count.



19
20
21
# File 'lib/rlid/models/naive_bayes_models.rb', line 19

# Reader for the @default_count instance variable (the constructor sets it to 1).
# NOTE(review): presumably the count assumed for n-grams never seen in
# training -- confirm against frequency_of, which is not visible here.
def default_count
  @default_count
end

#n ⇒ Object

Returns the value of attribute n.



19
20
21
# File 'lib/rlid/models/naive_bayes_models.rb', line 19

# Reader for the @n instance variable: the n-gram size given to the
# constructor (3 by default) and passed to each_ngram when scoring strings.
def n
  @n
end

Class Method Details

.generate_models(file = nil, n = 3) ⇒ Object



34
35
36
37
38
39
40
41
42
43
# File 'lib/rlid/models/naive_bayes_models.rb', line 34

# Trains a fresh NaiveBayesModels instance and serializes it with Marshal
# into DATA_DIRECTORY.
#
# @param file [String, nil] file name inside DATA_DIRECTORY; FILENAME is used when nil
# @param n [Integer] n-gram size handed to the constructor
# @return [nil]
def self.generate_models(file=nil, n=3)
  file ||= FILENAME
  path = "#{DATA_DIRECTORY}/#{file}"
  models = NaiveBayesModels.new(n)
  puts "Training started.."
  models.train
  # Marshal output is binary data: open in binary mode ("wb") so no newline
  # translation or encoding conversion can corrupt the dump on any platform.
  File.open(path, "wb") { |f| f.write(Marshal.dump(models)) }
  # Reported only once the file has been written and closed.
  puts "Models saved to #{path}"
end

.load(file = nil) ⇒ Object



45
46
47
48
# File 'lib/rlid/models/naive_bayes_models.rb', line 45

# Reads a Marshal-serialized object from DATA_DIRECTORY and returns it.
#
# @param file [String, nil] file name inside DATA_DIRECTORY; FILENAME is used when nil
# @return [Object] whatever object was serialized into the file
def self.load(file=nil)
  file ||= FILENAME
  # Marshal data is binary: binread avoids text-mode newline/EOF handling and
  # encoding tagging that a plain File.read would apply.
  # NOTE: Marshal.load can instantiate arbitrary objects -- only point this at
  # model files produced by a trusted source.
  Marshal.load(File.binread("#{DATA_DIRECTORY}/#{file}"))
end

Instance Method Details

#probabilities(string) ⇒ Object



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/rlid/models/naive_bayes_models.rb', line 50

# Scores +string+ against every language in @ngram_frequency.
#
# For each language the score starts at 1 and is multiplied by
# frequency_of(lang, ngram) for every n-gram (of size @n) that each_ngram
# yields from the first MAX_STRING_LENGTH characters of +string+.
#
# @param string [String] text to score
# @yield [lang, prob] once per language key of @ngram_frequency
# @return [Array] the language keys when a block is given
# @return [Enumerator] over [lang, prob] pairs when no block is given
# @raise [InvalidArgument] if +string+ is not a String
#   NOTE(review): InvalidArgument is not defined in the visible code --
#   confirm it exists in the project (Ruby's built-in class is ArgumentError).
def probabilities(string)
  raise InvalidArgument unless string.is_a?(String)
  return enum_for(:probabilities, string) unless block_given?

  # String#[start, length] keeps at most MAX_STRING_LENGTH characters; the
  # former inclusive range 0..MAX_STRING_LENGTH kept one character too many.
  head = string[0, MAX_STRING_LENGTH]
  @ngram_frequency.keys.each do |lang|
    prob = 1
    head.each_ngram(@n) { |ngram| prob *= frequency_of(lang, ngram) }
    yield lang, prob
  end
end

#probabilities_h(string) ⇒ Object

Returns a hash mapping each language to the probability that #probabilities yields for the given string.



67
68
69
70
71
72
73
74
# File 'lib/rlid/models/naive_bayes_models.rb', line 67

# Collects the (language, probability) pairs yielded by #probabilities
# into a hash.
#
# @param string [String] text handed unchanged to #probabilities
# @return [Hash] language => probability
def probabilities_h(string)
  by_language = {}
  probabilities(string) { |language, score| by_language[language] = score }
  by_language
end

#train ⇒ Object



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/rlid/models/naive_bayes_models.rb', line 76

# Builds the per-language n-gram tables from the counts returned by
# get_ngram_counts.
#
# Populates:
# * @stored_ngrams          - result of top_ngrams(ngram_counts): the n-grams
#                             tracked individually (shared by all languages)
# * @ngram_frequency        - @ngram_frequency[lang][ngram] = count; counts of
#                             n-grams that are not stored are summed under OTHER
# * @total_ngrams_found     - per language, the sum of all n-gram counts
# * @total_ngrams_not_found - per language, how many stored n-grams have no
#                             entry for that language
#
# Finally adds a synthetic entry keyed by Language::NO_LANGUAGE_CODE whose only
# feature is OTHER, with 1.5 times the largest OTHER count of any language.
#
# @return [Integer] the number of stored n-grams (value of the last assignment)
def train
  ngram_counts = get_ngram_counts
  # n-grams for which we want to store information (all languages)
  @stored_ngrams = top_ngrams(ngram_counts)
  # Hash lookup so the membership test below is O(1) per n-gram instead of a
  # scan over the whole stored collection.
  tracked = {}
  @stored_ngrams.each { |ngram| tracked[ngram] = true }

  puts "- processing ngrams"
  @ngram_frequency = {}
  @total_ngrams_found = {}
  @total_ngrams_not_found = {}

  ngram_counts.each do |lang, counts|
    frequencies = Hash.new(0)
    total = 0
    counts.each do |ngram, count|
      if tracked.key?(ngram)
        frequencies[ngram] = count
      else
        frequencies[OTHER] += count
      end
      total += count
    end

    @ngram_frequency[lang] = frequencies
    @total_ngrams_found[lang] = total
    @total_ngrams_not_found[lang] = (@stored_ngrams - frequencies.keys).size

    puts_info(lang)
  end

  # Add the "no language" entry. All three tables use the same key; the
  # literal :nnn was previously used for two of them and the constant for
  # the third, which only agreed while the constant happened to be :nnn.
  no_language = Language::NO_LANGUAGE_CODE
  # Integer arithmetic for "* 1.5".
  other_count = @ngram_frequency.values.map { |freqs| freqs[OTHER] }.max * 3 / 2
  @ngram_frequency[no_language] = { OTHER => other_count }
  @total_ngrams_found[no_language] = other_count
  @total_ngrams_not_found[no_language] = @stored_ngrams.size
end