Class: NBayes::Base

Inherits: Object
Defined in:
lib/nbayes.rb

Instance Attribute Summary

Class Method Summary

Instance Method Summary

Constructor Details

#initialize(options = {}) ⇒ Base

Returns a new instance of Base.



# File 'lib/nbayes.rb', line 20

def initialize(options = {})
  @debug = false
  @k = 1
  @binarized = options[:binarized] || false
  @log_vocab = false      # for smoothing, use log of vocab size rather than raw vocab size
  @assume_uniform = false
  @vocab = Hash.new       # used to calculate vocab size (@vocab.keys.length)
  @data = Hash.new
  @data.default_proc = get_default_proc()
  # @data is structured as:
  # {
  #   "category1" => {
  #     :tokens => Hash.new(0),
  #     :total_tokens => 0,
  #     :examples => 0
  #   },
  #   ...
  # }
end
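
A minimal usage sketch (:binarized is the only option read from the hash here; the remaining settings are exposed through attribute writers):

require 'nbayes'

nb = NBayes::Base.new(:binarized => true)  # count each token at most once per example
nb.assume_uniform = true                   # assume uniform class priors
nb.k = 1                                   # Laplacian smoothing constant (the default)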

Instance Attribute Details

#assume_uniform ⇒ Object

Returns the value of attribute assume_uniform.



# File 'lib/nbayes.rb', line 17

def assume_uniform
  @assume_uniform
end

#binarized ⇒ Object (readonly)

Returns the value of attribute binarized.



# File 'lib/nbayes.rb', line 18

def binarized
  @binarized
end

#data ⇒ Object

Returns the value of attribute data.



# File 'lib/nbayes.rb', line 17

def data
  @data
end

#debug ⇒ Object

Returns the value of attribute debug.



# File 'lib/nbayes.rb', line 17

def debug
  @debug
end

#k ⇒ Object

Returns the value of attribute k.



# File 'lib/nbayes.rb', line 17

def k
  @k
end

#log_vocab ⇒ Object

Returns the value of attribute log_vocab.



# File 'lib/nbayes.rb', line 17

def log_vocab
  @log_vocab
end

#vocab ⇒ Object

Returns the value of attribute vocab.



# File 'lib/nbayes.rb', line 17

def vocab
  @vocab
end

Class Method Details

.from(yml_file) ⇒ Object

Loads a class instance from a data file (e.g., YAML)



# File 'lib/nbayes.rb', line 163

def self.from(yml_file)
  nbayes = YAML.load_file(yml_file)
  nbayes.reset_after_import()   # YAML does not properly set the defaults on the Hashes
  nbayes
end
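
A round-trip sketch, assuming a hypothetical file name classifier.yml:

nb = NBayes::Base.new
nb.train(%w[hello world], 'greeting')
nb.dump('classifier.yml')                  # serialize the trained classifier to disk
nb2 = NBayes::Base.from('classifier.yml')  # restore it, with Hash defaults reset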

Instance Method Details

#calculate_probabilities(tokens) ⇒ Object

Calculates the probability of each class given the tokens (this is the workhorse of the library)



# File 'lib/nbayes.rb', line 119

def calculate_probabilities(tokens)
  # P(class|words) = P(w1,...,wn|class) * P(class) / P(w1,...,wn)
  #                ∝ P(w1,...,wn|class) * P(class)
  #
  # P(wi|class) = (count(wi, class) + k) / (total_tokens(class) + k*V)
  prob_numerator = {}
  v_size = vocab_size
  @data.keys.each do |category|
    cat_data = @data[category]
    cat_prob = Math.log(cat_data[:examples] / total_examples().to_f)
    cat_prob = Math.log(1 / @data.keys.length.to_f) if assume_uniform
    log_probs = 0
    cat_denominator = (cat_data[:total_tokens] + @k * v_size).to_f
    tokens.each do |token|
      log_probs += Math.log((cat_data[:tokens][token] + @k) / cat_denominator)
    end
    prob_numerator[category] = log_probs + cat_prob
  end
  # Calculate the normalizer, which turns the scores into probabilities;
  # it is just the sum of all the numerators from above.
  normalizer = 0
  prob_numerator.each { |cat, numerator| normalizer += numerator }
  # One more caveat:
  # We're using log probabilities, so the numbers are negative and the smallest
  # negative number is actually the largest probability.
  # To convert while keeping the relative distances between the probabilities:
  # - divide the normalizer by each log prob: this keeps the ratios the same but reverses the ordering
  # - re-normalize based on the new values
  # Ex: -1, -1, -2  =>  -4/-1, -4/-1, -4/-2  =>  4, 4, 2
  #     renormalize => 4/10, 4/10, 2/10
  intermed = {}
  renormalizer = 0
  prob_numerator.each do |cat, numerator|
    intermed[cat] = normalizer / numerator.to_f
    renormalizer += intermed[cat]
  end
  # calculate final probabilities
  final_probs = {}
  intermed.each do |cat, value|
    final_probs[cat] = value / renormalizer.to_f
  end
  final_probs
end
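
A standalone sketch of the renormalization trick described in the comments above, using the example values -1, -1, -2:

log_probs = { :a => -1.0, :b => -1.0, :c => -2.0 }
normalizer = log_probs.values.inject(0) { |sum, lp| sum + lp }   # => -4.0
intermed = {}
log_probs.each { |cat, lp| intermed[cat] = normalizer / lp }     # => {:a=>4.0, :b=>4.0, :c=>2.0}
renormalizer = intermed.values.inject(0) { |sum, v| sum + v }    # => 10.0
final = {}
intermed.each { |cat, v| final[cat] = v / renormalizer }         # => {:a=>0.4, :b=>0.4, :c=>0.2}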

#classify(tokens) ⇒ Object



# File 'lib/nbayes.rb', line 93

def classify(tokens)
  print "classify: #{tokens.join(', ')}\n" if @debug
  tokens = tokens.uniq if binarized
  probs = calculate_probabilities(tokens)
  print "results: #{probs.to_yaml}\n" if @debug
  probs.extend(NBayes::Result)
  probs
end
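
A usage sketch: the return value is a plain Hash of category => probability, extended with NBayes::Result:

nb = NBayes::Base.new
nb.train(%w[free money offer], 'spam')
nb.train(%w[meeting agenda minutes], 'ham')

probs = nb.classify(%w[free offer])
probs['spam']   # => the (larger) probability assigned to 'spam'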

#dump(arg) ⇒ Object

Dumps the class instance to a data file (e.g., YAML), or dumps a given object to a YAML string



# File 'lib/nbayes.rb', line 183

def dump(arg)
  if arg.instance_of? String
    # a String argument is treated as a file name; write self to it
    File.open(arg, "w") { |f| YAML.dump(self, f) }
  else
    # otherwise, return the YAML serialization of the given object as a string
    YAML.dump(arg)
  end
end
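
One sketch per branch (classifier.yml is a hypothetical file name):

nb.dump('classifier.yml')   # String argument: writes self to that file
yaml_str = nb.dump(nb)      # non-String argument: returns its YAML serialization as a string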

#get_default_procObject

Returns the default proc used by the data hash. This is a separate method so that it can be reused after data import.



# File 'lib/nbayes.rb', line 66

def get_default_proc
  return lambda do |hash, category|
    hash[category] = {
      :tokens => Hash.new(0),   # holds frequency counts
      :total_tokens => 0,
      :examples => 0
    }
  end
end
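
A short sketch of what the default proc buys: referencing an unseen category key initializes its bookkeeping hash on first access:

data = Hash.new
data.default_proc = NBayes::Base.new.get_default_proc

data['sports'][:examples] += 1       # the 'sports' entry is created on demand
data['sports'][:tokens]['goal'] += 1
data['sports']                       # => {:tokens=>{"goal"=>1}, :total_tokens=>0, :examples=>1}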

#load(yml) ⇒ Object

Loads a class instance from a YAML string or a file name; returns a fresh instance when given nil



# File 'lib/nbayes.rb', line 170

def load(yml)
  if yml.nil?
    return NBayes::Base.new
  elsif yml[0..2] == "---"      # a YAML document string rather than a file name
    nbayes = YAML.load(yml)
  else
    nbayes = YAML.load_file(yml)
  end
  nbayes.reset_after_import()   # YAML does not properly set the defaults on the Hashes
  nbayes
end
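
A sketch of the three branches (yaml_str and classifier.yml are hypothetical):

nb = NBayes::Base.new
nb.load(nil)                # => a fresh NBayes::Base
nb.load(yaml_str)           # any string starting with "---" is parsed as YAML
nb.load('classifier.yml')   # anything else is treated as a file name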

#purge_less_than(x) ⇒ Object

Allows removal of low-frequency words that increase processing time and may cause overfitting.

  • tokens with a count less than x (summed across all classes) are removed

Ex: nb.purge_less_than(2)

NOTE: this does not decrement the “examples” count, so purging is not always identical to never having added the tokens in the first place, though it usually is.


# File 'lib/nbayes.rb', line 46

def purge_less_than(x)
  remove_list = {}
  @vocab.keys.each do |token|
    count = @data.keys.inject(0) { |sum, cat| sum + @data[cat][:tokens][token] }
    next if count >= x
    @data.each do |cat, cat_data|
      count = cat_data[:tokens][token]   # retrieve this category's count...
      cat_data[:tokens].delete(token)    # ...delete the token...
      cat_data[:total_tokens] -= count   # ...and subtract the count from the category total
    end  # each category hash
    remove_list[token] = 1
  end  # each vocab word
  remove_list.keys.each { |token| @vocab.delete(token) }
end
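
A usage sketch: train on a tiny corpus, then drop every token seen fewer than two times across all classes:

nb = NBayes::Base.new
nb.train(%w[alpha alpha beta], 'cat1')
nb.train(%w[alpha gamma], 'cat2')

nb.purge_less_than(2)   # 'beta' and 'gamma' each have a total count of 1, so both go
nb.vocab.keys           # => ["alpha"]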

#reset_after_importObject

Called internally after YAML import to reset Hash defaults



# File 'lib/nbayes.rb', line 77

def reset_after_import
  @data.default_proc = get_default_proc()
  @data.each {|cat, cat_hash| cat_hash[:tokens].default=0 }
end

#total_examplesObject

Total number of training instances



# File 'lib/nbayes.rb', line 104

def total_examples
  sum = 0
  @data.each {|cat, cat_data| sum += cat_data[:examples] }
  sum
end

#train(tokens, category) ⇒ Object



# File 'lib/nbayes.rb', line 82

def train(tokens, category)
  cat_data = @data[category]
  cat_data[:examples] += 1
  tokens = tokens.uniq if binarized   # binarized mode counts each token once per example
  tokens.each do |w|
    @vocab[w] = 1                     # record the token in the global vocabulary
    cat_data[:tokens][w] += 1         # per-category token frequency
    cat_data[:total_tokens] += 1      # per-category token total
  end
end
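
A sketch of how the :binarized option changes training: duplicate tokens within a single example are counted once:

nb = NBayes::Base.new(:binarized => true)
nb.train(%w[spam spam spam], 'spam')
nb.data['spam'][:tokens]['spam']   # => 1, duplicates collapsed by tokens.uniq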

#vocab_size ⇒ Object

Returns the size of the “vocab”, i.e., the number of unique tokens found in the text. This is used in the Laplacian smoothing.



# File 'lib/nbayes.rb', line 112

def vocab_size
  return Math.log(@vocab.keys.length)  if @log_vocab
  @vocab.keys.length
end