Class: NaiveBayes

Inherits:
Object
  • Object
show all
Defined in:
lib/twss-classifier/naive-bayes.rb

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(ngram_size, pos_training_examples, neg_training_examples) ⇒ NaiveBayes

Returns a new instance of NaiveBayes.



14
15
16
17
18
# File 'lib/twss-classifier/naive-bayes.rb', line 14

# Build a naive Bayes classifier over ngrams of the given size.
#
# ngram_size            - Integer width of the ngrams used as features.
# pos_training_examples - Array of positive (TWSS) training sentences.
# neg_training_examples - Array of negative training sentences.
def initialize(ngram_size, pos_training_examples, neg_training_examples)
  @ngram_size            = ngram_size
  @pos_training_examples = pos_training_examples
  @neg_training_examples = neg_training_examples
end

Class Method Details

.load_yaml(filename) ⇒ Object



77
78
79
# File 'lib/twss-classifier/naive-bayes.rb', line 77

# Deserialize a classifier previously saved with #yamlize.
#
# SECURITY NOTE: this deserializes arbitrary Ruby objects — only ever call
# it on trusted files. Psych 4+ (Ruby 3.1+) made YAML.load safe by default,
# which would refuse to rebuild the custom class that #yamlize writes, so
# use the explicit unsafe loader when the Psych version provides one.
def self.load_yaml(filename)
  yaml = File.read(filename)
  if YAML.respond_to?(:unsafe_load)
    YAML.unsafe_load(yaml)
  else
    YAML.load(yaml)
  end
end

Instance Method Details

#classify(sentence) ⇒ Object

Returns the probability that `sentence` is a TWSS sentence.



63
64
65
66
67
68
69
# File 'lib/twss-classifier/naive-bayes.rb', line 63

# Returns the probability (0.0..1.0) that +sentence+ is a TWSS sentence,
# as a naive Bayes product over its ngrams. Ngrams unseen during training
# fall back to the uninformative prior [0.5, 0.5].
#
# Fix: the original called `.product` on the per-class probability arrays;
# builtin Array#product (no args) is a CARTESIAN product, not multiplication,
# so that relied on a monkey-patch defined elsewhere. Multiply explicitly.
# NOTE(review): long sentences can underflow both products to 0.0 — a
# log-space formulation would be more robust; left as-is to preserve scores.
def classify(sentence)
  ngram_probs = to_ngrams(sentence, @ngram_size).map { |ngram| @probs[ngram] || [0.5, 0.5] }
  pos_product = ngram_probs.reduce(1.0) { |acc, pair| acc * pair[0] }
  neg_product = ngram_probs.reduce(1.0) { |acc, pair| acc * pair[1] }

  denominator = pos_product + neg_product
  # Guard the 0/0 case (total underflow): return the uninformative 0.5
  # instead of NaN, which would poison threshold comparisons in #test.
  return 0.5 if denominator.zero?

  pos_product / denominator
end

#puts_best_neg_predictors ⇒ Object



89
90
91
92
93
94
95
# File 'lib/twss-classifier/naive-bayes.rb', line 89

# Print the (up to) 500 ngrams whose probability mass leans most toward the
# negative class, one per line: ngram, negative share (v[1] / v.sum), and
# positive-corpus count. Only ngrams counted more than 10 times in
# @pos_counts are shown.
#
# Fix: removed the unused local `total_pos_count` and a dead commented-out
# line copied from puts_best_pos_predictors.
# NOTE(review): both the filter and the printed count use @pos_counts,
# mirroring puts_best_pos_predictors — possibly @neg_counts was intended
# here; behavior preserved as-is pending confirmation.
def puts_best_neg_predictors
  ranked = @probs.to_a.sort_by { |_ngram, pair| pair[1] / pair.sum }
  ranked.select { |ngram, _pair| @pos_counts[ngram] > 10 }
        .reverse
        .first(500)
        .each do |ngram, pair|
    puts [ngram.ljust(20), pair[1] / pair.sum, @pos_counts[ngram]].join("\t")
  end
end

#puts_best_pos_predictors ⇒ Object



81
82
83
84
85
86
87
# File 'lib/twss-classifier/naive-bayes.rb', line 81

# Print the (up to) 500 ngrams whose probability mass leans most toward the
# positive (TWSS) class, one per line: ngram, positive share
# (v[0] / v.sum), and positive-corpus count. Only ngrams counted more than
# 10 times in @pos_counts are shown.
#
# Fix: removed the unused local `total_pos_count` and a dead commented-out
# duplicate of the puts line.
def puts_best_pos_predictors
  ranked = @probs.to_a.sort_by { |_ngram, pair| pair[0] / pair.sum }
  ranked.select { |ngram, _pair| @pos_counts[ngram] > 10 }
        .reverse
        .first(500)
        .each do |ngram, pair|
    puts [ngram.ljust(20), pair[0] / pair.sum, @pos_counts[ngram]].join("\t")
  end
end

#test(threshold, pos_test_examples, neg_test_examples) ⇒ Object



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/twss-classifier/naive-bayes.rb', line 37

# Evaluate the classifier against labeled test sets.
#
# threshold         - Float decision boundary; a sentence is predicted
#                     positive when classify(sentence) > threshold.
# pos_test_examples - Array of sentences whose true label is positive.
# neg_test_examples - Array of sentences whose true label is negative.
#
# Returns [true_positives, false_positives, true_negatives, false_negatives].
#
# Fix: the original used `> threshold` for positive examples but
# `< threshold` for negative ones, so a score exactly equal to the
# threshold was predicted negative or positive depending on the TRUE
# label. Both loops now apply the same decision rule (> threshold).
def test(threshold, pos_test_examples, neg_test_examples)
  tp = fp = tn = fn = 0

  pos_test_examples.each do |line|
    if classify(line) > threshold
      tp += 1
    else
      fn += 1
    end
  end

  neg_test_examples.each do |line|
    if classify(line) > threshold
      fp += 1
    else
      tn += 1
    end
  end

  return tp, fp, tn, fn
end

#train ⇒ Object



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/twss-classifier/naive-bayes.rb', line 20

# Fit the classifier: estimate, for every ngram seen in either training
# corpus, its relative frequency in the positive and negative corpora.
# Populates @pos_counts, @neg_counts, and @probs (ngram => [pos_p, neg_p]).
def train
  # Hashes from ngram to its (smoothed) occurrence count in each corpus.
  @pos_counts = get_ngram_counts(@pos_training_examples)
  @neg_counts = get_ngram_counts(@neg_training_examples)

  pos_total = @pos_counts.values.sum
  neg_total = @neg_counts.values.sum

  # Per-corpus proportion of each ngram; .to_f also maps a missing count
  # (nil) to 0.0, matching the original behavior.
  vocabulary = @pos_counts.keys | @neg_counts.keys
  @probs = vocabulary.each_with_object({}) do |ngram, table|
    table[ngram] = [@pos_counts[ngram].to_f / pos_total,
                    @neg_counts[ngram].to_f / neg_total]
  end
end

#yamlize(filename) ⇒ Object



71
72
73
74
75
# File 'lib/twss-classifier/naive-bayes.rb', line 71

# Serialize this classifier to +filename+ as YAML (reload with .load_yaml).
def yamlize(filename)
  yaml = self.to_yaml
  # Match the original `puts` behavior: ensure exactly one trailing newline.
  yaml += "\n" unless yaml.end_with?("\n")
  File.write(filename, yaml)
end