Class: NBayes::Base

Inherits: Object
Defined in:
lib/nbayes.rb

Instance Attribute Summary

Class Method Summary

Instance Method Summary

Constructor Details

#initialize(options = {}) ⇒ Base

Returns a new instance of Base.



# File 'lib/nbayes.rb', line 20

def initialize(options = {})
  @debug = false
  @k = 1
  @binarized = options[:binarized] || false
  @log_vocab = false      # for smoothing, use log of vocab size rather than raw vocab size
  @assume_uniform = false
  @vocab = Hash.new       # used to calculate vocab size (@vocab.keys.length)
  @data = Hash.new
  @data.default_proc = get_default_proc()
  # @data is structured as:
  # {
  #   "category1" => {
  #     :tokens => Hash.new(0),
  #     :total_tokens => 0,
  #     :examples => 0
  #   },
  #   ...
  # }
end
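
A minimal usage sketch (:binarized is the only option read from the hash here; the remaining settings are exposed through attribute writers):

require 'nbayes'

nb = NBayes::Base.new(:binarized => true)  # count each token at most once per example
nb.assume_uniform = true                   # assume uniform class priors
nb.k = 1                                   # Laplacian smoothing constant (the default)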

Instance Attribute Details

#assume_uniform ⇒ Object

Returns the value of attribute assume_uniform.



# File 'lib/nbayes.rb', line 17

def assume_uniform
  @assume_uniform
end

#binarized ⇒ Object (readonly)

Returns the value of attribute binarized.



# File 'lib/nbayes.rb', line 18

def binarized
  @binarized
end

#data ⇒ Object

Returns the value of attribute data.



# File 'lib/nbayes.rb', line 17

def data
  @data
end

#debug ⇒ Object

Returns the value of attribute debug.



# File 'lib/nbayes.rb', line 17

def debug
  @debug
end

#k ⇒ Object

Returns the value of attribute k.



# File 'lib/nbayes.rb', line 17

def k
  @k
end

#log_vocab ⇒ Object

Returns the value of attribute log_vocab.



# File 'lib/nbayes.rb', line 17

def log_vocab
  @log_vocab
end

#vocab ⇒ Object

Returns the value of attribute vocab.



# File 'lib/nbayes.rb', line 17

def vocab
  @vocab
end

Class Method Details

.from(yml_file) ⇒ Object

Loads a class instance from a data file (e.g., YAML)



# File 'lib/nbayes.rb', line 163

def self.from(yml_file)
  nbayes = YAML.load_file(yml_file)
  nbayes.reset_after_import()   # YAML does not properly set the defaults on the Hashes
  nbayes
end
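
A round-trip sketch, assuming a hypothetical file name classifier.yml:

nb = NBayes::Base.new
nb.train(%w[hello world], 'greeting')
nb.dump('classifier.yml')                  # serialize the trained classifier to disk
nb2 = NBayes::Base.from('classifier.yml')  # restore it, with Hash defaults reset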

Instance Method Details

#calculate_probabilities(tokens) ⇒ Object

Calculates the probability of each class given the tokens (this is the workhorse of the library)



# File 'lib/nbayes.rb', line 119

def calculate_probabilities(tokens)
  # P(class|words) = P(w1,...,wn|class) * P(class) / P(w1,...,wn)
  #                ∝ P(w1,...,wn|class) * P(class)
  #
  # P(wi|class) = (count(wi, class) + k) / (total_tokens(class) + k*V)
  prob_numerator = {}
  v_size = vocab_size
  @data.keys.each do |category|
    cat_data = @data[category]
    cat_prob = Math.log(cat_data[:examples] / total_examples().to_f)
    cat_prob = Math.log(1 / @data.keys.length.to_f) if assume_uniform
    log_probs = 0
    cat_denominator = (cat_data[:total_tokens] + @k * v_size).to_f
    tokens.each do |token|
      log_probs += Math.log((cat_data[:tokens][token] + @k) / cat_denominator)
    end
    prob_numerator[category] = log_probs + cat_prob
  end
  # Calculate the normalizer, which turns the scores into probabilities;
  # it is just the sum of all the numerators from above.
  normalizer = 0
  prob_numerator.each { |cat, numerator| normalizer += numerator }
  # One more caveat:
  # We're using log probabilities, so the numbers are negative and the smallest
  # negative number is actually the largest probability.
  # To convert while keeping the relative distances between the probabilities:
  # - divide the normalizer by each log prob: this keeps the ratios the same but reverses the ordering
  # - re-normalize based on the new values
  # Ex: -1, -1, -2  =>  -4/-1, -4/-1, -4/-2  =>  4, 4, 2
  #     renormalize => 4/10, 4/10, 2/10
  intermed = {}
  renormalizer = 0
  prob_numerator.each do |cat, numerator|
    intermed[cat] = normalizer / numerator.to_f
    renormalizer += intermed[cat]
  end
  # calculate final probabilities
  final_probs = {}
  intermed.each do |cat, value|
    final_probs[cat] = value / renormalizer.to_f
  end
  final_probs
end
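
A standalone sketch of the renormalization trick described in the comments above, using the example values -1, -1, -2:

log_probs = { :a => -1.0, :b => -1.0, :c => -2.0 }
normalizer = log_probs.values.inject(0) { |sum, lp| sum + lp }   # => -4.0
intermed = {}
log_probs.each { |cat, lp| intermed[cat] = normalizer / lp }     # => {:a=>4.0, :b=>4.0, :c=>2.0}
renormalizer = intermed.values.inject(0) { |sum, v| sum + v }    # => 10.0
final = {}
intermed.each { |cat, v| final[cat] = v / renormalizer }         # => {:a=>0.4, :b=>0.4, :c=>0.2}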

#classify(tokens) ⇒ Object



# File 'lib/nbayes.rb', line 93

def classify(tokens)
  print "classify: #{tokens.join(', ')}\n" if @debug
  tokens = tokens.uniq if binarized
  probs = calculate_probabilities(tokens)
  print "results: #{probs.to_yaml}\n" if @debug
  probs.extend(NBayes::Result)
  probs
end
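
A usage sketch: the return value is a plain Hash of category => probability, extended with NBayes::Result:

nb = NBayes::Base.new
nb.train(%w[free money offer], 'spam')
nb.train(%w[meeting agenda minutes], 'ham')

probs = nb.classify(%w[free offer])
probs['spam']   # => the (larger) probability assigned to 'spam'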

#dump(arg) ⇒ Object

Dumps the class instance to a data file (e.g., YAML), or dumps a given object to a YAML string



# File 'lib/nbayes.rb', line 183

def dump(arg)
  if arg.instance_of? String
    # a String argument is treated as a file name; write self to it
    File.open(arg, "w") { |f| YAML.dump(self, f) }
  else
    # otherwise, return the YAML serialization of the given object as a string
    YAML.dump(arg)
  end
end
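
One sketch per branch (classifier.yml is a hypothetical file name):

nb.dump('classifier.yml')   # String argument: writes self to that file
yaml_str = nb.dump(nb)      # non-String argument: returns its YAML serialization as a string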

#get_default_procObject

Returns the default proc used by the data hash. This is a separate method so that it can be reused after data import.



# File 'lib/nbayes.rb', line 66

def get_default_proc
  return lambda do |hash, category|
    hash[category] = {
      :tokens => Hash.new(0),   # holds frequency counts
      :total_tokens => 0,
      :examples => 0
    }
  end
end
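
A short sketch of what the default proc buys: referencing an unseen category key initializes its bookkeeping hash on first access:

data = Hash.new
data.default_proc = NBayes::Base.new.get_default_proc

data['sports'][:examples] += 1       # the 'sports' entry is created on demand
data['sports'][:tokens]['goal'] += 1
data['sports']                       # => {:tokens=>{"goal"=>1}, :total_tokens=>0, :examples=>1}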

#load(yml) ⇒ Object

Loads a class instance from a YAML string or a file name; returns a fresh instance when given nil



# File 'lib/nbayes.rb', line 170

def load(yml)
  if yml.nil?
    return NBayes::Base.new
  elsif yml[0..2] == "---"      # a YAML document string rather than a file name
    nbayes = YAML.load(yml)
  else
    nbayes = YAML.load_file(yml)
  end
  nbayes.reset_after_import()   # YAML does not properly set the defaults on the Hashes
  nbayes
end
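
A sketch of the three branches (yaml_str and classifier.yml are hypothetical):

nb = NBayes::Base.new
nb.load(nil)                # => a fresh NBayes::Base
nb.load(yaml_str)           # any string starting with "---" is parsed as YAML
nb.load('classifier.yml')   # anything else is treated as a file name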

#purge_less_than(x) ⇒ Object

Allows removal of low-frequency words that increase processing time and may cause overfitting.

  • tokens with a count less than x (summed across all classes) are removed

Ex: nb.purge_less_than(2)

NOTE: this does not decrement the “examples” count, so purging is not always identical to never having added the tokens in the first place, though it usually is.


# File 'lib/nbayes.rb', line 46

def purge_less_than(x)
  remove_list = {}
  @vocab.keys.each do |token|
    count = @data.keys.inject(0) { |sum, cat| sum + @data[cat][:tokens][token] }
    next if count >= x
    @data.each do |cat, cat_data|
      count = cat_data[:tokens][token]   # retrieve this category's count...
      cat_data[:tokens].delete(token)    # ...delete the token...
      cat_data[:total_tokens] -= count   # ...and subtract the count from the category total
    end  # each category hash
    remove_list[token] = 1
  end  # each vocab word
  remove_list.keys.each { |token| @vocab.delete(token) }
end
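
A usage sketch: train on a tiny corpus, then drop every token seen fewer than two times across all classes:

nb = NBayes::Base.new
nb.train(%w[alpha alpha beta], 'cat1')
nb.train(%w[alpha gamma], 'cat2')

nb.purge_less_than(2)   # 'beta' and 'gamma' each have a total count of 1, so both go
nb.vocab.keys           # => ["alpha"]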

#reset_after_importObject

Called internally after YAML import to reset Hash defaults



# File 'lib/nbayes.rb', line 77

def reset_after_import
  @data.default_proc = get_default_proc()
  @data.each {|cat, cat_hash| cat_hash[:tokens].default=0 }
end

#total_examplesObject

Total number of training instances



# File 'lib/nbayes.rb', line 104

def total_examples
  sum = 0
  @data.each {|cat, cat_data| sum += cat_data[:examples] }
  sum
end

#train(tokens, category) ⇒ Object



# File 'lib/nbayes.rb', line 82

def train(tokens, category)
  cat_data = @data[category]
  cat_data[:examples] += 1
  tokens = tokens.uniq if binarized   # binarized mode counts each token once per example
  tokens.each do |w|
    @vocab[w] = 1                     # record the token in the global vocabulary
    cat_data[:tokens][w] += 1         # per-category token frequency
    cat_data[:total_tokens] += 1      # per-category token total
  end
end
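
A sketch of how the :binarized option changes training: duplicate tokens within a single example are counted once:

nb = NBayes::Base.new(:binarized => true)
nb.train(%w[spam spam spam], 'spam')
nb.data['spam'][:tokens]['spam']   # => 1, duplicates collapsed by tokens.uniq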

#vocab_size ⇒ Object

Returns the size of the “vocab”, i.e., the number of unique tokens found in the text. This is used in the Laplacian smoothing.



# File 'lib/nbayes.rb', line 112

def vocab_size
  return Math.log(@vocab.keys.length)  if @log_vocab
  @vocab.keys.length
end