Class: ClassifierReborn::Bayes

Inherits:
Object
  • Object
show all
Defined in:
lib/classifier-reborn/bayes.rb

Constant Summary collapse

CategoryNotFoundError =
Class.new(StandardError)

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(*args) ⇒ Bayes

The class can be created with one or more categories, each of which will be initialized and given a training method. E.g.,

b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'

Options available are:

language:         'en'                    Used to select language specific stop words
auto_categorize:  false                   When true, enables ability to dynamically declare a category; the default is true if no initial categories are provided
enable_threshold: false                   When true, enables a threshold requirement for classification
threshold:        0.0                     Default threshold, only used when enabled
enable_stemmer:   true                    When false, disables word stemming
stopwords:        nil                     Accepts path to a text file or an array of words, when supplied, overwrites the default stopwords; assign empty string or array to disable stopwords
backend:          BayesMemoryBackend.new  Alternatively, BayesRedisBackend.new for persistent storage


32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/classifier-reborn/bayes.rb', line 32

# Builds a classifier from a mix of category names and option hashes.
#
# +args+ is flattened; every Hash in it is merged over the defaults in the
# order given, and every other element is kept as an initial category.
# When +:auto_categorize+ is not supplied it defaults to +true+ only if no
# initial categories were given.
#
# @param args [Array] category names and/or option hashes
def initialize(*args)
  option_hashes, @initial_categories = args.flatten.partition { |arg| arg.is_a?(Hash) }
  options = { language: 'en',
              enable_threshold: false,
              threshold: 0.0,
              enable_stemmer: true,
              backend: BayesMemoryBackend.new }
  option_hashes.each { |overrides| options.merge!(overrides) }

  options[:auto_categorize] = @initial_categories.empty? unless options.key?(:auto_categorize)

  @language            = options[:language]
  @auto_categorize     = options[:auto_categorize]
  @enable_threshold    = options[:enable_threshold]
  @threshold           = options[:threshold]
  @enable_stemmer      = options[:enable_stemmer]
  @backend             = options[:backend]
  @tokenizer           = options[:tokenizer] || Tokenizer::Whitespace
  # Work on a copy: appending the stemmer below must neither modify the
  # caller's array nor fail when that array is frozen.
  @token_filters       = (options[:token_filters] || [TokenFilter::Stopword]).dup
  @token_filters << TokenFilter::Stemmer if @enable_stemmer && !@token_filters.include?(TokenFilter::Stemmer)
  TokenFilter::Stopword.language = @language if @token_filters.include?(TokenFilter::Stopword)

  populate_initial_categories

  # Only touch the stopword list when the caller explicitly passed the option
  # (even an empty value counts as "passed").
  custom_stopwords options[:stopwords] if options.key?(:stopwords)
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(name, *args) ⇒ Object

Provides training and untraining methods for the categories specified in Bayes#new. For example:

b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
b.train_this "This text"
b.train_that "That text"
b.untrain_that "That text"
b.train_the_other "The other text"


214
215
216
217
218
219
220
221
222
223
224
# File 'lib/classifier-reborn/bayes.rb', line 214

# Dispatches the dynamic +train_<category>+ / +untrain_<category>+ helpers.
#
# The category is derived by stripping the optional +un+ and the +train_+
# prefix from the method name and passing the remainder through
# +CategoryNamer.prepare_name+. Each argument is then trained (or untrained)
# against that category.
#
# @param name [Symbol] the missing method name
# @param args [Array] texts to train/untrain with
# @return [Array] +args+, when the call was dispatched
# @raise [StandardError] when the name looks like a (un)train helper but the
#   category does not exist
# @raise [NoMethodError] (via +super+) for any other unknown method
def method_missing(name, *args)
  method_name = name.to_s
  cleaned_name = method_name.gsub(/(un)?train_([\w]+)/, '\2')
  # +gsub+ leaves its last match behind; remember the optional "un" prefix
  # right away so dispatch does not depend on match state later on.
  untraining = Regexp.last_match(1)
  category = CategoryNamer.prepare_name(cleaned_name)
  if category_keys.include?(category)
    # Call the target directly instead of assembling source text for +eval+.
    args.each { |text| untraining ? untrain(category, text) : train(category, text) }
  elsif method_name =~ /(un)?train_([\w]+)/
    raise StandardError, "No such category: #{category}"
  else
    super # raise StandardError, "No such method: #{name}"
  end
end

# Keeps +respond_to?+ truthful for the dynamic (un)train helpers.
#
# The name pattern is checked before the category list is consulted, so
# queries about unrelated method names fall straight through to +super+.
#
# @param name [Symbol] method name being queried
# @param include_private [Boolean] forwarded to +super+
# @return [Boolean]
def respond_to_missing?(name, include_private = false)
  method_name = name.to_s
  if method_name =~ /(un)?train_([\w]+)/
    category = CategoryNamer.prepare_name(method_name.gsub(/(un)?train_([\w]+)/, '\2'))
    return true if category_keys.include?(category)
  end
  super
end

Instance Attribute Details

#threshold ⇒ Object

Retrieve the current threshold value



172
173
174
# File 'lib/classifier-reborn/bayes.rb', line 172

# Reader for the threshold value stored in +@threshold+.
#
# @return [Object] whatever is currently held in +@threshold+
def threshold
  @threshold
end

Instance Method Details

#add_category(category) ⇒ Object Also known as: append_category

Allows you to add categories to the classifier. For example:

b.add_category "Not spam"

WARNING: Adding categories to a trained classifier will result in an undertrained category that will tend to match more criteria than the trained selective categories. In short, try to initialize your categories at initialization.



250
251
252
253
# File 'lib/classifier-reborn/bayes.rb', line 250

# Registers a category with the backend.
#
# The given name is first passed through +CategoryNamer.prepare_name+ and
# the resulting key is handed to the backend's +add_category+.
#
# @param category [Object] category name as supplied by the caller
# @return [Object] whatever the backend's +add_category+ returns
def add_category(category)
  @backend.add_category(CategoryNamer.prepare_name(category))
end

#categories ⇒ Object

Provides a list of category names. For example:

b.categories
=>   ["This", "That", "The other"]


230
231
232
# File 'lib/classifier-reborn/bayes.rb', line 230

# Lists the known categories as strings.
#
# @return [Array<String>] each entry of +category_keys+ converted with +to_s+
def categories
  category_keys.map { |key| key.to_s }
end

#category_keys ⇒ Object

Provides a list of category keys as symbols. For example:

b.category_keys
=>   [:This, :That, :"The other"]


238
239
240
# File 'lib/classifier-reborn/bayes.rb', line 238

# Returns the category keys exactly as reported by the backend.
#
# @return [Object] the backend's +category_keys+ result (presumably an Array
#   of Symbols, per the surrounding documentation — confirm against the backend)
def category_keys
  @backend.category_keys
end

#classifications(text) ⇒ Object

Returns the score of the provided text for each category. E.g.,

b.classifications "I hate bad words and you"
=>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}

The largest of these scores (the one closest to 0) is the one picked out by #classify



132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/classifier-reborn/bayes.rb', line 132

# Scores +text+ against every category.
#
# For each category the score is the sum of the log-likelihoods of the
# text's words plus the log of the category's prior. Words not present in a
# category, and categories without trainings, use a stand-in count of 0.1.
# When the text yields no words at all, every category is given
# +Float::INFINITY+.
#
# NOTE(review): +|| 1+ only guards against a nil word count. A count of 0
# becomes 0.0 and the division then produces Infinity; the same applies to a
# zero +total_trainings+. Confirm whether the backends can return 0 here.
#
# @param text [String] text to score
# @return [Hash{String => Numeric}] category name => score
def classifications(text)
  word_hash = Hasher.word_hash(text, @enable_stemmer,
                               tokenizer: @tokenizer, token_filters: @token_filters)

  if word_hash.empty?
    return category_keys.each_with_object({}) do |category, scores|
      scores[category.to_s] = Float::INFINITY
    end
  end

  category_keys.each_with_object({}) do |category, scores|
    total = (@backend.category_word_count(category) || 1).to_f

    # Accumulate left to right starting from 0 so the floating point
    # additions happen in word order.
    likelihood = word_hash.inject(0) do |running, (word, _count)|
      frequency = if @backend.word_in_category?(category, word)
                    @backend.category_word_frequency(category, word)
                  else
                    0.1
                  end
      running + Math.log(frequency / total)
    end

    # Prior probability of the category.
    trainings = if @backend.category_has_trainings?(category)
                  @backend.category_training_count(category)
                else
                  0.1
                end
    scores[category.to_s] = likelihood + Math.log(trainings / @backend.total_trainings.to_f)
  end
end

#classify(text) ⇒ Object

Return the classification without the score



165
166
167
168
169
# File 'lib/classifier-reborn/bayes.rb', line 165

# Returns the best matching category for +text+, without its score.
#
# When thresholding is enabled the result is +nil+ if the winning score is
# below +@threshold+ or is +Float::INFINITY+ (the value +classifications+
# assigns when the text produced no words).
#
# @param text [String] text to classify
# @return [String, nil] the winning category, or +nil+ when there is no
#   candidate or the threshold rejects it
def classify(text)
  result, score = classify_with_score(text)
  # With no categories there is no candidate and +score+ is nil; comparing
  # it against the threshold below would raise NoMethodError.
  return nil if score.nil?
  return nil if threshold_enabled? && (score < @threshold || score == Float::INFINITY)

  result
end

#classify_with_score(text) ⇒ Object

Returns the classification of the provided text, which is one of the categories given in the initializer along with the score. E.g.,

b.classify_with_score "I hate bad words and you"
=>  ['Uninteresting', -4.852030263919617]


160
161
162
# File 'lib/classifier-reborn/bayes.rb', line 160

# Picks the highest scoring category for +text+ together with its score.
#
# @param text [String] text to classify
# @return [Array(String, Numeric), nil] +[category, score]+ for the largest
#   score, or +nil+ when +classifications+ returns no entries
def classify_with_score(text)
  ranked = classifications(text).sort_by { |_category, score| -score }
  ranked.first
end

#disable_threshold ⇒ Object

Dynamically disable threshold for classify results



183
184
185
# File 'lib/classifier-reborn/bayes.rb', line 183

# Turns threshold processing off by setting +@enable_threshold+ to +false+.
#
# @return [false] the value just assigned
def disable_threshold
  @enable_threshold = false
end

#enable_threshold ⇒ Object

Dynamically enable threshold for classify results



178
179
180
# File 'lib/classifier-reborn/bayes.rb', line 178

# Turns threshold processing on by setting +@enable_threshold+ to +true+.
#
# @return [true] the value just assigned
def enable_threshold
  @enable_threshold = true
end

#reset ⇒ Object



257
258
259
260
# File 'lib/classifier-reborn/bayes.rb', line 257

# Resets the backend and then re-runs +populate_initial_categories+.
#
# NOTE(review): +populate_initial_categories+ is not visible here; presumably
# it re-registers the categories passed to the constructor — confirm.
#
# @return [Object] the result of +populate_initial_categories+
def reset
  @backend.reset
  populate_initial_categories
end

#stemmer_disabled? ⇒ Boolean

Is word stemming disabled?

Returns:

  • (Boolean)


203
204
205
# File 'lib/classifier-reborn/bayes.rb', line 203

# Reports whether word stemming is switched off.
#
# @return [Boolean] the negation of +stemmer_enabled?+
def stemmer_disabled?
  !stemmer_enabled?
end

#stemmer_enabled? ⇒ Boolean

Is word stemming enabled?

Returns:

  • (Boolean)


198
199
200
# File 'lib/classifier-reborn/bayes.rb', line 198

# Reports whether word stemming is switched on.
#
# @return [Object] the raw value of +@enable_stemmer+ (truthy when enabled)
def stemmer_enabled?
  @enable_stemmer
end

#threshold_disabled? ⇒ Boolean

Is threshold processing disabled?

Returns:

  • (Boolean)


193
194
195
# File 'lib/classifier-reborn/bayes.rb', line 193

# Reports whether threshold processing is switched off.
#
# @return [Boolean] the negation of +threshold_enabled?+
def threshold_disabled?
  !threshold_enabled?
end

#threshold_enabled? ⇒ Boolean

Is threshold processing enabled?

Returns:

  • (Boolean)


188
189
190
# File 'lib/classifier-reborn/bayes.rb', line 188

# Reports whether threshold processing is switched on.
#
# @return [Object] the raw value of +@enable_threshold+ (truthy when enabled)
def threshold_enabled?
  @enable_threshold
end

#train(category, text) ⇒ Object

Provides a general training method for all categories specified in Bayes#new. For example:

b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
b.train :this, "This text"
b.train "that", "That text"
b.train "The other", "The other text"


73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/classifier-reborn/bayes.rb', line 73

# Trains +category+ with the words found in +text+.
#
# Nothing happens when the text yields no words. Otherwise the category
# name is normalised with +CategoryNamer.prepare_name+; an unknown category
# is added on the fly when +@auto_categorize+ is set and rejected otherwise.
# Every word then updates the per-category frequency, the per-category word
# count and the global word total, and finally both training counters are
# incremented by one.
#
# @param category [Object] category name
# @param text [String] training text
# @return [Object, nil] +nil+ when the text has no words, otherwise the
#   result of the backend's +update_category_training_count+
# @raise [CategoryNotFoundError] for an unknown category when
#   auto-categorisation is off
def train(category, text)
  word_hash = Hasher.word_hash(text, @enable_stemmer,
                               tokenizer: @tokenizer, token_filters: @token_filters)
  return if word_hash.empty?

  key = CategoryNamer.prepare_name(category)

  unless category_keys.include?(key)
    raise CategoryNotFoundError, "Cannot train; category #{key} does not exist" unless @auto_categorize

    add_category(key)
  end

  word_hash.each do |word, occurrences|
    @backend.update_category_word_frequency(key, word, occurrences)
    @backend.update_category_word_count(key, occurrences)
    @backend.update_total_words(occurrences)
  end

  @backend.update_total_trainings(1)
  @backend.update_category_training_count(key, 1)
end

#untrain(category, text) ⇒ Object

Provides an untraining method for all categories specified in Bayes#new. Be very careful with this method.

For example:

b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
b.train :this, "This text"
b.untrain :this, "This text"


105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/classifier-reborn/bayes.rb', line 105

# Reverses a training of +category+ with +text+.
#
# Nothing happens when the text yields no words. For each word the
# category frequency is lowered by the word's count; if that leaves the
# frequency at or below zero the word is deleted from the category and only
# the frequency it had beforehand is subtracted from the totals. Words are
# skipped while the backend reports a negative global word total. Both
# training counters are decremented by one at the end.
#
# NOTE(review): unlike +train+, no check is made that the category exists,
# and the training counters are decremented unconditionally.
#
# @param category [Object] category name
# @param text [String] text to remove
# @return [Object, nil] +nil+ when the text has no words, otherwise the
#   result of the backend's +update_category_training_count+
def untrain(category, text)
  word_hash = Hasher.word_hash(text, @enable_stemmer,
                               tokenizer: @tokenizer, token_filters: @token_filters)
  return if word_hash.empty?

  key = CategoryNamer.prepare_name(category)

  word_hash.each do |word, occurrences|
    next if @backend.total_words < 0

    previous = @backend.category_word_frequency(key, word) || 0
    @backend.update_category_word_frequency(key, word, -occurrences)

    # Amount actually taken off the totals: the requested count, unless the
    # word dropped out of the category, in which case only what was there.
    removed = occurrences
    if @backend.category_word_frequency(key, word) <= 0
      @backend.delete_category_word(key, word)
      removed = previous
    end

    if @backend.category_word_count(key) >= removed
      @backend.update_category_word_count(key, -removed)
    end
    @backend.update_total_words(-removed)
  end

  @backend.update_total_trainings(-1)
  @backend.update_category_training_count(key, -1)
end