Class: Classifier::Bayes

Inherits:
Object show all
Includes:
Streaming, Mutex_m
Defined in:
lib/classifier/bayes.rb

Overview

rubocop:disable Metrics/ClassLength

Constant Summary

Constants included from Streaming

Streaming::DEFAULT_BATCH_SIZE

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Streaming

#delete_checkpoint, #list_checkpoints, #save_checkpoint

Constructor Details

#initialize(*categories) ⇒ Bayes

The class can be created with one or more categories, each of which will be initialized and given a training method. E.g.,

b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
b = Classifier::Bayes.new ['Interesting', 'Uninteresting', 'Spam']


31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/classifier/bayes.rb', line 31

# Builds an empty, untrained classifier.
#
# Category names may be given as separate arguments or inside arrays; the
# argument list is flattened and every name is passed through
# +prepare_category_name+ before being registered with its own empty word
# table.
#
#   b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
#   b = Classifier::Bayes.new ['Interesting', 'Uninteresting', 'Spam']
def initialize(*categories)
  super()
  # One fresh Hash per category so the word tables are never shared.
  @categories = categories.flatten.to_h { |name| [name.prepare_category_name, {}] }
  @total_words = 0
  # Counters default to 0 for categories that have not been trained yet.
  @category_counts = Hash.new(0)
  @category_word_count = Hash.new(0)
  # Memoised values start out empty.
  @cached_training_count = nil
  @cached_vocab_size = nil
  # A new classifier has nothing to persist and no storage attached.
  @dirty = false
  @storage = nil
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(name, *args) ⇒ Object

Provides training and untraining methods for the categories specified in Bayes#new. For example:

b = Classifier::Bayes.new 'This', 'That', 'the_other'
b.train_this "This text"
b.train_that "That text"
b.untrain_that "That text"
b.train_the_other "The other text"

Raises:

  • (StandardError)


230
231
232
233
234
235
236
237
238
# File 'lib/classifier/bayes.rb', line 230

# Dispatches the dynamic +train_<category>+ / +untrain_<category>+ helpers.
#
#   b.train_this "This text"
#   b.untrain_that "That text"
#
# Only names that are *exactly* +train_<word>+ or +untrain_<word>+ are handled.
# The pattern is anchored so that unrelated names which merely contain
# "train_" (e.g. +constrain_foo+, +retrain_foo+) fall through to +super+ and
# raise the usual NoMethodError instead of being treated as a training call
# on a mangled category name.
#
# @param name [Symbol] the missing method name
# @param args [Array] texts; each one is trained/untrained individually
# @return [Array] the +args+ array
# @raise [StandardError] if the name is well-formed but the category is unknown
def method_missing(name, *args)
  match = /\A(un)?train_(\w+)\z/.match(name.to_s)
  return super unless match

  category = match[2].prepare_category_name
  raise StandardError, "No such category: #{category}" unless @categories.key?(category)

  # Capture group 1 is only present for the "untrain_" prefix.
  method = match[1] ? :untrain : :train
  args.each { |text| send(method, category, text) }
end

Instance Attribute Details

#storage ⇒ Object

Returns the value of attribute storage.



24
25
26
# File 'lib/classifier/bayes.rb', line 24

# Reader for the storage backend attached to this classifier.
# Returns the current value of @storage (nil when none has been assigned).
def storage
  @storage
end

Class Method Details

.from_json(json) ⇒ Object

Loads a classifier from a JSON string or a Hash created by #to_json or #as_json.

Raises:

  • (ArgumentError)


135
136
137
138
139
140
141
142
# File 'lib/classifier/bayes.rb', line 135

# Loads a classifier from a JSON string or from a Hash produced by
# #as_json (or by parsing the output of #to_json).
#
# #as_json emits symbol keys at the top level while parsed JSON has string
# keys, so the top-level keys are normalised to strings before the payload is
# inspected; both shapes are therefore accepted.
#
# The instance is created with +allocate+ (the constructor is not run) and
# populated through the private +restore_state+ hook.
#
# @param json [String, Hash] serialized classifier state
# @return [Bayes] the restored classifier
# @raise [ArgumentError] if the payload is not a Hash/JSON object or its
#   "type" is not "bayes"
def self.from_json(json)
  data = json.is_a?(String) ? JSON.parse(json) : json
  raise ArgumentError, 'Invalid classifier data: expected a JSON object' unless data.is_a?(Hash)

  # Accept the symbol-keyed Hash returned by #as_json as well as parsed JSON.
  data = data.transform_keys(&:to_s)
  raise ArgumentError, "Invalid classifier type: #{data['type']}" unless data['type'] == 'bayes'

  instance = allocate
  instance.send(:restore_state, data)
  instance
end

.load(storage:) ⇒ Object

Loads a classifier from the configured storage. The storage is set on the returned instance.

Raises:



206
207
208
209
210
211
212
213
# File 'lib/classifier/bayes.rb', line 206

# Restores a classifier from +storage+ and attaches that storage to it.
#
# @param storage [#read] backend whose +read+ returns the serialized state
# @return [Bayes] the restored classifier, with +storage+ assigned
# @raise [StorageError] if +storage.read+ returns nothing
def self.load(storage:)
  serialized = storage.read
  raise StorageError, 'No saved state found' unless serialized

  from_json(serialized).tap { |classifier| classifier.storage = storage }
end

.load_checkpoint(storage:, checkpoint_id:) ⇒ Object

Loads a classifier from a checkpoint.

Raises:

  • (ArgumentError)


372
373
374
375
376
377
378
379
380
381
382
383
384
# File 'lib/classifier/bayes.rb', line 372

# Restores a classifier from a checkpoint file that sits next to the main
# storage file.
#
# The checkpoint path is derived from +storage.path+ as
# "<dir>/<basename>_checkpoint_<checkpoint_id><ext>".
#
# @param storage [Storage::File] the main file storage
# @param checkpoint_id [#to_s] identifier embedded in the checkpoint file name
# @return [Bayes] the restored classifier, with its storage set to +storage+
# @raise [ArgumentError] if +storage+ is not a Storage::File
def self.load_checkpoint(storage:, checkpoint_id:)
  unless storage.is_a?(Storage::File)
    raise ArgumentError, 'Storage must be File storage for checkpoints'
  end

  main_path = storage.path
  stem = File.basename(main_path, '.*')
  suffix = File.extname(main_path)
  checkpoint_file = File.join(File.dirname(main_path), "#{stem}_checkpoint_#{checkpoint_id}#{suffix}")

  load(storage: Storage::File.new(path: checkpoint_file)).tap do |classifier|
    # Re-point the instance at the caller's storage rather than the checkpoint file.
    classifier.storage = storage
  end
end

.load_from_file(path) ⇒ Object

Loads a classifier from a file (legacy API).



218
219
220
# File 'lib/classifier/bayes.rb', line 218

# Legacy loader: reads the whole file at +path+ and hands its contents to
# +from_json+.
#
# @param path [String] location of a file holding serialized classifier JSON
# @return [Object] whatever +from_json+ returns for the file contents
def self.load_from_file(path)
  serialized = File.read(path)
  from_json(serialized)
end

Instance Method Details

#add_category(category) ⇒ Object Also known as: append_category

Allows you to add categories to the classifier. For example:

b.add_category "Not spam"

WARNING: Adding categories to a trained classifier will result in an undertrained category that will tend to match more criteria than the trained selective categories. In short, try to initialize your categories at initialization.



265
266
267
268
269
270
271
# File 'lib/classifier/bayes.rb', line 265

# Registers a new, empty category.
#
#   b.add_category "Not spam"
#
# The name is normalised with +prepare_category_name+ *before* any state is
# touched, so an invalid argument cannot leave the classifier flagged dirty.
#
# Adding a category that already exists is a no-op: its word table is left
# intact. Replacing it with an empty table would discard the trained words
# while the per-category and total word counters kept the old totals.
#
# WARNING: Adding categories to a trained classifier will result in an
# undertrained category that will tend to match more criteria than the
# trained selective categories. Prefer declaring categories up front.
#
# @param category [String, Symbol] the category name
# @return [Hash] the word table registered for the category
def add_category(category)
  name = category.prepare_category_name
  synchronize do
    # Already known: keep the existing training data untouched.
    next @categories[name] if @categories.key?(name)

    invalidate_caches
    @dirty = true
    @categories[name] = {}
  end
end

#as_json(_options = nil) ⇒ Object

Returns a hash representation of the classifier state. This can be converted to JSON or used directly.



113
114
115
116
117
118
119
120
121
122
# File 'lib/classifier/bayes.rb', line 113

# Builds a plain-Hash snapshot of the classifier state.
#
# Top-level keys are symbols; category names and the words inside each
# category table are converted to strings.
#
# @param _options [Object] accepted and ignored
# @return [Hash] version, type, categories, total_words, category_counts and
#   category_word_count
def as_json(_options = nil)
  string_keyed = ->(table) { table.transform_keys(&:to_s) }

  {
    version: 1,
    type: 'bayes',
    # Stringify both the category names and the words within each category.
    categories: string_keyed.call(@categories).transform_values(&string_keyed),
    total_words: @total_words,
    category_counts: string_keyed.call(@category_counts),
    category_word_count: string_keyed.call(@category_word_count)
  }
end

#categories ⇒ Object

Provides a list of category names. For example:

b.categories
=>   ['This', 'That', 'the_other']


251
252
253
# File 'lib/classifier/bayes.rb', line 251

# Lists the registered category names as strings, in registration order.
#
#   b.categories
#   =>   ['This', 'That', 'the_other']
#
# @return [Array<String>]
def categories
  synchronize { @categories.each_key.map(&:to_s) }
end

#classifications(text) ⇒ Object

Returns the scores in each category for the provided text. E.g.,

b.classifications "I hate bad words and you"
=>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}

The largest of these scores (the one closest to 0) is the one picked out by #classify



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# File 'lib/classifier/bayes.rb', line 78

# Scores +text+ against every category.
#
#   b.classifications "I hate bad words and you"
#   =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
#
# Each score is a sum of logarithms (word likelihoods plus the category
# prior); the score closest to 0 is the one #classify picks.
#
# @param text [#word_hash] the document; only the keys of +word_hash+ are used
# @return [Hash{String => Float}] category name => log score
def classifications(text)
  tokens = text.word_hash.keys

  synchronize do
    total_documents = cached_training_count
    vocabulary = cached_vocab_size

    @categories.each_with_object({}) do |(category, word_counts), scores|
      denominator = ((@category_word_count[category] || 0) + vocabulary).to_f

      # Laplace smoothing: P(word|category) = (count + α) / (total + α * V)
      likelihood = tokens.sum { |token| Math.log(((word_counts[token] || 0) + 1) / denominator) }
      # 0.1 is used only when no count entry is returned at all for the category.
      prior = Math.log((@category_counts[category] || 0.1) / total_documents)

      scores[category.to_s] = likelihood + prior
    end
  end
end

#classify(text) ⇒ Object

Returns the classification of the provided text, which is one of the categories given in the initializer. E.g.,

b.classify "I hate bad words and you"
=>  'Uninteresting'

Raises:

  • (StandardError)


102
103
104
105
106
107
# File 'lib/classifier/bayes.rb', line 102

# Picks the best-scoring category for +text+.
#
#   b.classify "I hate bad words and you"
#   =>  'Uninteresting'
#
# The winner is the entry of #classifications with the highest score; on a
# tie the entry that comes first wins.
#
# @param text [Object] passed straight to #classifications
# @return [String] the winning category name
# @raise [StandardError] if #classifications yields no entries
def classify(text)
  top = classifications(text).max_by { |_category, score| score }
  raise StandardError, 'No classifications available' if top.nil?

  winner, _score = top
  winner.to_s
end

#dirty? ⇒ Boolean

Returns true if there are unsaved changes.

Returns:

  • (Boolean)


198
199
200
# File 'lib/classifier/bayes.rb', line 198

# Reports whether the classifier has unsaved changes.
# Returns the current value of the @dirty flag.
def dirty?
  @dirty
end

#marshal_dump ⇒ Object

Custom marshal serialization to exclude mutex state



277
278
279
# File 'lib/classifier/bayes.rb', line 277

# Custom Marshal serialization: only the training data and the dirty flag are
# dumped, in a fixed order. Nothing else held by the object is included.
#
# @return [Array] [@categories, @total_words, @category_counts,
#   @category_word_count, @dirty]
def marshal_dump
  %i[@categories @total_words @category_counts @category_word_count @dirty].map do |ivar|
    instance_variable_get(ivar)
  end
end

#marshal_load(data) ⇒ Object

Custom marshal deserialization to recreate mutex



283
284
285
286
287
288
289
# File 'lib/classifier/bayes.rb', line 283

# Custom Marshal deserialization, the counterpart of #marshal_dump.
#
# Re-runs +mu_initialize+ (the mutex is not part of the dump), restores the
# five dumped fields in order, and resets the memoised values and the
# storage reference to nil.
#
# @param data [Array] the array produced by #marshal_dump
def marshal_load(data)
  mu_initialize

  # Derived values are not dumped; clear them.
  @cached_training_count = nil
  @cached_vocab_size = nil

  @categories, @total_words, @category_counts, @category_word_count, @dirty = data

  # Storage is not part of the dump either.
  @storage = nil
end

#reload ⇒ Object

Reloads the classifier from the configured storage. Raises UnsavedChangesError if there are unsaved changes. Use reload! to force reload and discard changes.

Raises:

  • (ArgumentError)


169
170
171
172
173
174
175
176
177
178
179
# File 'lib/classifier/bayes.rb', line 169

# Re-reads the classifier state from the configured storage, refusing to do
# so while there are unsaved changes. Use #reload! to discard them instead.
#
# @return [self]
# @raise [ArgumentError] if no storage is configured
# @raise [UnsavedChangesError] if the dirty flag is set
# @raise [StorageError] if the storage has nothing to read
def reload
  backend = storage
  raise ArgumentError, 'No storage configured' unless backend
  raise UnsavedChangesError, 'Unsaved changes would be lost. Call save first or use reload!' if @dirty

  snapshot = backend.read
  raise StorageError, 'No saved state found' unless snapshot

  tap do
    restore_from_json(snapshot)
    @dirty = false
  end
end

#reload! ⇒ Object

Force reloads the classifier from storage, discarding any unsaved changes.

Raises:

  • (ArgumentError)


184
185
186
187
188
189
190
191
192
193
# File 'lib/classifier/bayes.rb', line 184

# Re-reads the classifier state from the configured storage unconditionally;
# any unsaved changes are discarded and the dirty flag is cleared.
#
# @return [self]
# @raise [ArgumentError] if no storage is configured
# @raise [StorageError] if the storage has nothing to read
def reload!
  backend = storage
  raise ArgumentError, 'No storage configured' unless backend

  snapshot = backend.read
  raise StorageError, 'No saved state found' unless snapshot

  tap do
    restore_from_json(snapshot)
    @dirty = false
  end
end

#remove_category(category) ⇒ Object

Allows you to remove categories from the classifier. For example:

b.remove_category "Spam"

WARNING: Removing categories from a trained classifier will result in the loss of all training data for that category. Make sure you really want to do this before calling this method.



300
301
302
303
304
305
306
307
308
309
310
311
312
313
# File 'lib/classifier/bayes.rb', line 300

# Drops a category together with everything recorded for it.
#
#   b.remove_category "Spam"
#
# The category's word count is subtracted from the running word total, then
# its entries are deleted from the category table and both counters.
#
# WARNING: all training data for the category is lost.
#
# @param category [String, Symbol] the category name (normalised with
#   +prepare_category_name+)
# @return [Object, nil] the value deleted from the per-category word counter
# @raise [StandardError] if the category is not registered
def remove_category(category)
  name = category.prepare_category_name

  synchronize do
    raise StandardError, "No such category: #{name}" unless @categories.key?(name)

    invalidate_caches
    @dirty = true
    @total_words -= @category_word_count[name].to_i

    # Delete from all three tables; the last deletion supplies the return value.
    [@categories, @category_counts, @category_word_count].map { |table| table.delete(name) }.last
  end
end

#respond_to_missing?(name, include_private = false) ⇒ Boolean

Returns:

  • (Boolean)


241
242
243
# File 'lib/classifier/bayes.rb', line 241

# Companion to #method_missing: reports the dynamic +train_<category>+ and
# +untrain_<category>+ helpers as available.
#
# The pattern is anchored so that only names that are exactly
# +train_<word>+ / +untrain_<word>+ qualify; names that merely contain
# "train_" (e.g. +constrain_foo+) are left to +super+. Only the shape of the
# name is checked here, not whether the category is registered.
#
# @param name [Symbol, String] the method name being queried
# @param include_private [Boolean] forwarded to +super+
# @return [Boolean]
def respond_to_missing?(name, include_private = false)
  name.to_s.match?(/\A(?:un)?train_\w+\z/) || super
end

#save ⇒ Object

Saves the classifier to the configured storage. Raises ArgumentError if no storage is configured.

Raises:

  • (ArgumentError)


148
149
150
151
152
153
# File 'lib/classifier/bayes.rb', line 148

# Writes the serialized classifier (#to_json) to the configured storage and
# clears the dirty flag.
#
# @return [false] the value assigned to the dirty flag
# @raise [ArgumentError] if no storage is configured
def save
  backend = storage
  raise ArgumentError, 'No storage configured. Use save_to_file(path) or set storage=' unless backend

  backend.write(to_json)
  @dirty = false
end

#save_to_file(path) ⇒ Object

Saves the classifier state to a file (legacy API).



158
159
160
161
162
# File 'lib/classifier/bayes.rb', line 158

# Legacy saver: writes the serialized classifier (#to_json) to +path+ and
# clears the dirty flag once the write has returned.
#
# @param path [String] destination file
# @return [Integer] the value returned by File.write (bytes written)
def save_to_file(path)
  File.write(path, to_json).tap { @dirty = false }
end

#to_json(_options = nil) ⇒ Object

Serializes the classifier state to a JSON string. This can be saved to a file and later loaded with Bayes.from_json.



128
129
130
# File 'lib/classifier/bayes.rb', line 128

# Serializes the classifier state to a JSON string by converting the Hash
# from #as_json. The result can be saved and later loaded with Bayes.from_json.
#
# The +_options+ argument is accepted but not used.
def to_json(_options = nil)
  as_json.to_json
end

#train(category = nil, text = nil, **categories) ⇒ Object

Trains the classifier with text for a category.

b.train(spam: "Buy now!", ham: ["Hello", "Meeting tomorrow"])
b.train(:spam, "legacy positional API")


50
51
52
53
54
55
56
# File 'lib/classifier/bayes.rb', line 50

# Trains the classifier with text for one or more categories.
#
#   b.train(spam: "Buy now!", ham: ["Hello", "Meeting tomorrow"])
#   b.train(:spam, "legacy positional API")
#
# When both positional arguments are given, only that single
# category/text pair is trained and any keyword arguments are ignored.
# Otherwise each keyword names a category whose value is one text or an
# Array of texts.
#
# @param category [String, Symbol, nil] category for the positional form
# @param text [String, nil] text for the positional form
# @param categories [Hash] category => text or Array of texts
# @return [Object] the result of +train_single+ (positional form) or the
#   keyword Hash (keyword form)
def train(category = nil, text = nil, **categories)
  if category && text
    train_single(category, text)
  else
    categories.each do |name, payload|
      documents = payload.is_a?(Array) ? payload : [payload]
      documents.each { |document| train_single(name, document) }
    end
  end
end

#train_batch(category = nil, documents = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &block) ⇒ Object

Trains the classifier with an array of documents in batches. Reduces lock contention by processing multiple documents per synchronize call.

Examples:

Positional style

classifier.train_batch(:spam, documents, batch_size: 100)

Keyword style

classifier.train_batch(spam: documents, ham: other_docs, batch_size: 100)

With progress tracking

classifier.train_batch(:spam, documents, batch_size: 100) do |progress|
  puts "#{progress.percent}% complete"
end


359
360
361
362
363
364
365
366
367
# File 'lib/classifier/bayes.rb', line 359

# Trains the classifier with collections of documents, delegating each
# category to +train_batch_for_category+.
#
#   classifier.train_batch(:spam, documents, batch_size: 100)
#   classifier.train_batch(spam: documents, ham: other_docs, batch_size: 100)
#
# When both positional arguments are given, only that pair is processed and
# +documents+ is forwarded as-is. In the keyword form each value is wrapped
# with Array() before being forwarded. The optional block is passed along
# unchanged.
#
# @param category [String, Symbol, nil] category for the positional form
# @param documents [Object, nil] documents for the positional form
# @param batch_size [Integer] forwarded to +train_batch_for_category+
# @param categories [Hash] category => documents (keyword form)
# @return [Object] the delegate's result (positional form) or the keyword
#   Hash (keyword form)
def train_batch(category = nil, documents = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &block)
  return train_batch_for_category(category, documents, batch_size: batch_size, &block) if category && documents

  categories.each do |name, docs|
    train_batch_for_category(name, Array(docs), batch_size: batch_size, &block)
  end
end

#train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE) ⇒ Object

Trains the classifier from an IO stream. Each line in the stream is treated as a separate document. This is memory-efficient for large corpora.

Examples:

Train from a file

classifier.train_from_stream(:spam, File.open('spam_corpus.txt'))

With progress tracking

classifier.train_from_stream(:spam, io, batch_size: 500) do |progress|
  puts "#{progress.completed} documents processed"
end

Raises:

  • (StandardError)


328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
# File 'lib/classifier/bayes.rb', line 328

# Trains one category from an IO stream, batch by batch.
#
#   classifier.train_from_stream(:spam, File.open('spam_corpus.txt'))
#
#   classifier.train_from_stream(:spam, io, batch_size: 500) do |progress|
#     puts "#{progress.completed} documents processed"
#   end
#
# The stream is read through Streaming::LineReader; every batch it yields is
# handed to +train_batch_internal+. After each batch the progress object is
# updated and, if a block was given, yielded to it.
#
# @param category [String, Symbol] target category (normalised with
#   +prepare_category_name+)
# @param io [IO] source handed to Streaming::LineReader
# @param batch_size [Integer] forwarded to Streaming::LineReader
# @yieldparam progress [Streaming::Progress] updated after every batch
# @raise [StandardError] if the category is not registered
def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
  category = category.prepare_category_name
  raise StandardError, "No such category: #{category}" unless @categories.key?(category)

  reader = Streaming::LineReader.new(io, batch_size: batch_size)
  # The total comes from the reader's own estimate, taken before any batch is read.
  progress = Streaming::Progress.new(total: reader.estimate_line_count)

  reader.each_batch do |lines|
    train_batch_internal(category, lines)
    progress.completed += lines.size
    progress.current_batch += 1
    yield progress if block_given?
  end
end

#untrain(category = nil, text = nil, **categories) ⇒ Object

Removes training data. Be careful with this method.

b.untrain(spam: "Buy now!")
b.untrain(:spam, "legacy positional API")


64
65
66
67
68
69
70
# File 'lib/classifier/bayes.rb', line 64

# Removes training data for one or more categories. Be careful with this
# method.
#
#   b.untrain(spam: "Buy now!")
#   b.untrain(:spam, "legacy positional API")
#
# When both positional arguments are given, only that single
# category/text pair is untrained and any keyword arguments are ignored.
# Otherwise each keyword names a category whose value is one text or an
# Array of texts.
#
# @param category [String, Symbol, nil] category for the positional form
# @param text [String, nil] text for the positional form
# @param categories [Hash] category => text or Array of texts
# @return [Object] the result of +untrain_single+ (positional form) or the
#   keyword Hash (keyword form)
def untrain(category = nil, text = nil, **categories)
  if category && text
    untrain_single(category, text)
  else
    categories.each do |name, payload|
      documents = payload.is_a?(Array) ? payload : [payload]
      documents.each { |document| untrain_single(name, document) }
    end
  end
end