Class: Rlid::NaiveBayesModels
- Inherits:
-
Object
- Object
- Rlid::NaiveBayesModels
- Defined in:
- lib/rlid/models/naive_bayes_models.rb
Overview
> prova di una stringa molto lunga lunghissima davvero lunga yyyy default = 10 ita(99.97) : cat(0.026) : spa(0.0023) default = 1 ita(99.995) : cat(0.0045) : por(0.00019) default = 0.1 ita(99.9990) : cat(0.00086) : rum(3.7e-05)
Constant Summary collapse
- CUTOFF =
number of top ngrams kept for every language
3000
- OTHER =
special feature key under which training accumulates the counts of ngrams that are not among the stored ngrams
nil
- MAX_STRING_LENGTH =
20
- FILENAME =
"naive_bayes_models"
Instance Attribute Summary collapse
-
#default_count ⇒ Object
Returns the value of attribute default_count.
-
#n ⇒ Object
Returns the value of attribute n.
Class Method Summary collapse
Instance Method Summary collapse
-
#initialize(n = 3) ⇒ NaiveBayesModels
constructor
A new instance of NaiveBayesModels.
- #probabilities(string) ⇒ Object
-
#probabilities_h(string) ⇒ Object
Returns a hash mapping each language to the probability computed for the string.
- #train ⇒ Object
Constructor Details
#initialize(n = 3) ⇒ NaiveBayesModels
Returns a new instance of NaiveBayesModels.
29 30 31 32 |
# File 'lib/rlid/models/naive_bayes_models.rb', line 29
# Builds an untrained model.
#
# @param n [Integer] ngram size stored in @n (3 when omitted)
def initialize(n = 3)
  @n = n
  # Every new model starts with a default count of 1.
  @default_count = 1
end
Instance Attribute Details
#default_count ⇒ Object
Returns the value of attribute default_count.
19 20 21 |
# File 'lib/rlid/models/naive_bayes_models.rb', line 19
# Reader for the @default_count instance variable.
#
# @return [Object] the current value of @default_count
def default_count
  @default_count
end
#n ⇒ Object
Returns the value of attribute n.
19 20 21 |
# File 'lib/rlid/models/naive_bayes_models.rb', line 19
# Reader for the @n instance variable (the ngram size given to the constructor).
#
# @return [Object] the current value of @n
def n
  @n
end
Class Method Details
.generate_models(file = nil, n = 3) ⇒ Object
34 35 36 37 38 39 40 41 42 43 |
# File 'lib/rlid/models/naive_bayes_models.rb', line 34
# Trains a fresh model and saves its Marshal dump inside DATA_DIRECTORY.
#
# @param file [String, nil] file name inside DATA_DIRECTORY; FILENAME is used
#   when nil
# @param n [Integer] ngram size passed to the constructor
# @return [nil]
def self.generate_models(file=nil, n=3)
  file ||= FILENAME
  path = "#{DATA_DIRECTORY}/#{file}"
  models = NaiveBayesModels.new(n)
  puts "Training started.."
  models.train
  # Marshal output is binary data: open with "wb" so that no newline or
  # encoding translation can alter the bytes on platforms where text mode
  # differs from binary mode.
  File.open(path, "wb") do |f|
    f.write Marshal.dump(models)
    puts "Models saved to #{path}"
  end
end
.load(file = nil) ⇒ Object
45 46 47 48 |
# File 'lib/rlid/models/naive_bayes_models.rb', line 45
# Restores an object from a Marshal dump stored inside DATA_DIRECTORY.
#
# @param file [String, nil] file name inside DATA_DIRECTORY; FILENAME is used
#   when nil
# @return [Object] whatever object the dump contains
def self.load(file=nil)
  file ||= FILENAME
  # The dump is binary, so read it with binread to avoid newline/encoding
  # translation of the bytes.
  # NOTE(review): Marshal.load can instantiate arbitrary objects; only use
  # this on model files from a trusted source.
  Marshal.load(File.binread("#{DATA_DIRECTORY}/#{file}"))
end
Instance Method Details
#probabilities(string) ⇒ Object
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
# File 'lib/rlid/models/naive_bayes_models.rb', line 50
# Yields, for every language in the model, the product of the frequencies of
# the ngrams found in the beginning of +string+.
#
# Only the first MAX_STRING_LENGTH characters of the string are considered.
#
# @param string [String] text to evaluate
# @yield [lang, prob] language key and the product of frequency_of(lang, ngram)
#   over the ngrams of size @n
# @return [Enumerator] when no block is given
# @raise [InvalidArgument] when +string+ is not a String
def probabilities(string)
  raise InvalidArgument unless string.is_a?(String)
  # Without a block there is nothing to yield to; hand back an enumerator
  # instead of failing with LocalJumpError.
  return enum_for(:probabilities, string) unless block_given?

  # string[0, len] keeps at most MAX_STRING_LENGTH characters; the inclusive
  # range 0..MAX_STRING_LENGTH would keep one character more than the limit.
  # The slice does not depend on the language, so it is taken once.
  sample = string[0, MAX_STRING_LENGTH]

  @ngram_frequency.keys.each do |lang|
    prob = 1
    sample.each_ngram(@n) do |ngram|
      prob *= frequency_of(lang, ngram)
    end
    yield lang, prob
  end
end
#probabilities_h(string) ⇒ Object
Returns a hash mapping each language to the probability computed for the string.
67 68 69 70 71 72 73 74 |
# File 'lib/rlid/models/naive_bayes_models.rb', line 67
# Collects everything #probabilities yields for +string+ into a Hash.
#
# @param string [String] text to evaluate (passed through to #probabilities)
# @return [Hash] language => probability, in the order they were yielded
def probabilities_h(string)
  {}.tap do |by_language|
    probabilities(string) { |lang, prob| by_language[lang] = prob }
  end
end
#train ⇒ Object
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
# File 'lib/rlid/models/naive_bayes_models.rb', line 76
# Builds the per-language frequency tables of the model.
#
# Fills these instance variables:
# * @stored_ngrams          - result of top_ngrams(ngram_counts)
# * @ngram_frequency        - ngram_frequency[lang][ngram] = count; counts of
#                             ngrams that are not stored are summed under OTHER
# * @total_ngrams_found     - total_ngrams_found[lang] = sum of all counts
# * @total_ngrams_not_found - total_ngrams_not_found[lang] = number of stored
#                             ngrams that have no entry for the language
#
# A final entry keyed by Language::NO_LANGUAGE_CODE is added whose only
# feature is OTHER, with 3/2 of the largest OTHER count among the languages.
#
# Relies on get_ngram_counts, top_ngrams and puts_info, which are defined
# outside this listing.
def train
  ngram_counts = get_ngram_counts
  # ngrams for which we want to store information (all languages)
  @stored_ngrams = top_ngrams(ngram_counts)

  # Hash index for membership tests, so each lookup in the inner loop does not
  # have to scan the whole collection.
  # NOTE(review): assumes ngrams compare the same under == and eql?
  # (true for Strings) - confirm against top_ngrams.
  stored = {}
  @stored_ngrams.each { |ngram| stored[ngram] = true }

  puts "- processing ngrams"
  # content: ngram_frequency[lang][ngram] = freq
  @ngram_frequency = {}
  # content: total_ngrams_found[lang] = total count of ngrams encountered
  @total_ngrams_found = {}
  # content: total_ngrams_not_found[lang] = n of ngrams not found
  @total_ngrams_not_found = {}

  ngram_counts.each do |lang, counts|
    @ngram_frequency[lang] = Hash.new(0)
    @total_ngrams_found[lang] = 0
    counts.each do |ngram, count|
      if stored.key?(ngram)
        @ngram_frequency[lang][ngram] = count
      else
        @ngram_frequency[lang][OTHER] += count
      end
      @total_ngrams_found[lang] += count
    end
    not_found = (@stored_ngrams - @ngram_frequency[lang].keys).size
    @total_ngrams_not_found[lang] = not_found
    puts_info(lang)
  end

  # Add the "no language" entry. All three tables use the same constant as key
  # so they cannot drift apart if the code ever differs from :nnn.
  no_language = Language::NO_LANGUAGE_CODE
  # Integer arithmetic on purpose: max * 3 / 2 (i.e. * 1.5, truncated)
  no_language_count = @ngram_frequency.values.map { |freqs| freqs[OTHER] }.max * 3 / 2
  @total_ngrams_found[no_language] = no_language_count
  @ngram_frequency[no_language] = { OTHER => no_language_count }
  @total_ngrams_not_found[no_language] = @stored_ngrams.size
end