Class: Classifier::KNN

Inherits:
Object show all
Includes:
Streaming, Mutex_m
Defined in:
lib/classifier/knn.rb

Overview

Instance-based classification: stores examples and classifies by similarity.

Example:

knn = Classifier::KNN.new(k: 3)
knn.add("spam" => ["Buy now!", "Limited offer!"])
knn.add("ham" => ["Meeting tomorrow", "Project update"])
knn.classify("Special discount!") # => "spam"

Constant Summary

Constants included from Streaming

Streaming::DEFAULT_BATCH_SIZE

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Streaming

#delete_checkpoint, #list_checkpoints, #save_checkpoint

Constructor Details

#initialize(k: 5, weighted: false) ⇒ KNN

Creates a new kNN classifier.



35
36
37
38
39
40
41
42
43
# File 'lib/classifier/knn.rb', line 35

# Builds an empty classifier.
#
# @param k [Integer] neighbor count; passed through validate_k! before being stored
# @param weighted [Boolean] stored unchanged in @weighted
def initialize(k: 5, weighted: false) # rubocop:disable Naming/MethodParameterName
  # Explicit empty parens: forward no arguments up the ancestor chain
  # (presumably reaching Mutex_m's initializer, since the class includes Mutex_m).
  super()
  validate_k!(k)

  # Nothing has been trained or persisted yet.
  @storage = nil
  @dirty = false
  @lsi = LSI.new(auto_rebuild: true)
  @weighted = weighted
  @k = k
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(name, *args) ⇒ Object

Provides dynamic training methods for categories. For example:

knn.train_spam "Buy now!"
knn.train_ham "Meeting tomorrow"


121
122
123
124
125
126
127
# File 'lib/classifier/knn.rb', line 121

# Handles dynamically named +train_<category>+ calls: every argument is
# forwarded to #add under the category taken from the method name.
#
#   knn.train_spam "Buy now!"   # calls add(spam: "Buy now!")
#
# Names that do not match the pattern are handed to +super+.
# Returns the argument list (the result of +args.each+).
def method_missing(name, *args)
  category_name = name.to_s[/\Atrain_(\w+)\z/, 1]
  return super unless category_name

  label = category_name.to_sym
  args.each do |text|
    add(label => text)
  end
end

Instance Attribute Details

#k ⇒ Object

Returns the value of attribute k.



30
31
32
# File 'lib/classifier/knn.rb', line 30

# Reader for @k, the neighbor count assigned in #initialize after validate_k! has accepted it.
def k
  @k
end

#storage ⇒ Object

Returns the value of attribute storage.



31
32
33
# File 'lib/classifier/knn.rb', line 31

# Reader for @storage. The value is nil after #initialize and #marshal_load;
# .load and .load_checkpoint assign it through the storage= writer.
def storage
  @storage
end

#weighted ⇒ Object

Returns the value of attribute weighted.



31
32
33
# File 'lib/classifier/knn.rb', line 31

# Reader for @weighted, the flag passed to #initialize (false by default).
def weighted
  @weighted
end

Class Method Details

.from_json(json) ⇒ Object

Loads a classifier from a JSON string or Hash.

Raises:

  • (ArgumentError)


152
153
154
155
156
157
158
159
160
161
162
163
# File 'lib/classifier/knn.rb', line 152

# Rebuilds a classifier from its serialized form.
#
# @param json [String, Hash] JSON text (parsed with JSON.parse) or an
#   already-parsed Hash; entries are read with the string keys
#   'type', 'k', 'weighted' and 'lsi'
# @return [Object] a new instance whose @lsi comes from LSI.from_json and
#   whose @dirty flag is false
# @raise [ArgumentError] if the 'type' entry is not 'knn'
def self.from_json(json)
  payload = json
  payload = JSON.parse(json) if json.is_a?(String)

  kind = payload['type']
  raise ArgumentError, "Invalid classifier type: #{kind}" unless kind == 'knn'

  # Tag a copy of the nested hash as 'lsi' so the caller's hash is left untouched.
  lsi_payload = payload['lsi'].merge('type' => 'lsi')

  new(k: payload['k'], weighted: payload['weighted']).tap do |knn|
    knn.instance_variable_set(:@lsi, LSI.from_json(lsi_payload))
    knn.instance_variable_set(:@dirty, false)
  end
end

.load(storage:) ⇒ Object

Loads a classifier from configured storage.

Raises:

  • (StorageError)

216
217
218
219
220
221
222
223
# File 'lib/classifier/knn.rb', line 216

# Restores a classifier from +storage+ and attaches that storage to it.
#
# @param storage [#read] object whose #read returns the serialized state,
#   or a falsy value when nothing has been saved
# @return [Object] the instance built by from_json, with storage assigned
# @raise [StorageError] if storage.read returns nil/false
def self.load(storage:)
  serialized = storage.read
  raise StorageError, 'No saved state found' unless serialized

  from_json(serialized).tap { |knn| knn.storage = storage }
end

.load_checkpoint(storage:, checkpoint_id:) ⇒ Object

Loads a classifier from a checkpoint.

Raises:

  • (ArgumentError)


246
247
248
249
250
251
252
253
254
255
256
257
258
# File 'lib/classifier/knn.rb', line 246

# Restores a classifier from a checkpoint file that sits next to the main
# storage file, named "<basename>_checkpoint_<checkpoint_id><ext>".
#
# @param storage [Storage::File] main storage; only its #path is used to
#   derive the checkpoint location
# @param checkpoint_id [#to_s] identifier interpolated into the file name
# @return [Object] the loaded instance, with its storage pointing at the
#   main +storage+ rather than the checkpoint file
# @raise [ArgumentError] if +storage+ is not a Storage::File
def self.load_checkpoint(storage:, checkpoint_id:)
  unless storage.is_a?(Storage::File)
    raise ArgumentError, 'Storage must be File storage for checkpoints'
  end

  main_path = storage.path
  extension = File.extname(main_path)
  stem = File.basename(main_path, '.*')
  checkpoint_file = "#{stem}_checkpoint_#{checkpoint_id}#{extension}"
  checkpoint_path = File.join(File.dirname(main_path), checkpoint_file)

  # Load through a temporary storage for the checkpoint file, then re-point
  # the instance at the main storage.
  load(storage: Storage::File.new(path: checkpoint_path)).tap do |knn|
    knn.storage = storage
  end
end

.load_from_file(path) ⇒ Object

Loads a classifier from a file.



227
228
229
# File 'lib/classifier/knn.rb', line 227

# Reads the whole file at +path+ with File.read and hands the contents to
# from_json. Unlike .load, no storage is assigned to the returned instance.
def self.load_from_file(path)
  from_json(File.read(path))
end

Instance Method Details

#add(**items) ⇒ Object Also known as: train

Adds labeled examples. Keys are categories, values are items or arrays. Also aliased as `train` for API consistency with Bayes and LogisticRegression.

knn.add(spam: "Buy now!", ham: "Meeting tomorrow")
knn.train(spam: "Buy now!", ham: "Meeting tomorrow")  # equivalent


52
53
54
55
# File 'lib/classifier/knn.rb', line 52

# Adds labeled examples by forwarding +items+ to the underlying LSI index.
#
# @param items [Hash] category => item (or array of items), passed to @lsi.add unchanged
# @return [Object] whatever @lsi.add returns
def add(**items)
  # Only the dirty flag is set under the lock; the @lsi call runs outside it.
  synchronize do
    @dirty = true
  end
  @lsi.add(**items)
end

#as_json(_options = nil) ⇒ Object



135
136
137
138
139
140
141
142
143
# File 'lib/classifier/knn.rb', line 135

# Builds the serializable Hash for this classifier: format version 1,
# type 'knn', the k and weighted settings, and the LSI index's own as_json.
#
# @param _options [Object] accepted but unused
# @return [Hash] with symbol keys :version, :type, :k, :weighted, :lsi
def as_json(_options = nil)
  lsi_state = @lsi.as_json
  { version: 1, type: 'knn', k: @k, weighted: @weighted, lsi: lsi_state }
end

#categories ⇒ Object

Returns all unique categories as strings.



105
106
107
108
109
# File 'lib/classifier/knn.rb', line 105

# Returns every distinct category attached to the stored items, as Strings,
# in first-seen order.
#
# Categories are converted to Strings *before* de-duplication, so a category
# stored both as a Symbol and as a String (e.g. :spam via train_spam and
# "spam" via add("spam" => ...)) appears only once in the result.
#
# @return [Array<String>]
def categories
  synchronize do
    @lsi.items.flat_map { |item| @lsi.categories_for(item) }.map(&:to_s).uniq
  end
end

#categories_for(item) ⇒ Object



88
89
90
# File 'lib/classifier/knn.rb', line 88

# Delegates to @lsi.categories_for(item) and returns its result unchanged.
# NOTE(review): unlike #categories, this call is not wrapped in synchronize.
def categories_for(item)
  @lsi.categories_for(item)
end

#classify(text) ⇒ Object

Classifies text using k nearest neighbors with majority voting. Returns the category as a String for API consistency with Bayes and LogisticRegression.



62
63
64
65
# File 'lib/classifier/knn.rb', line 62

# Classifies +text+ and returns just the winning category.
#
# @param text [Object] passed through to #classify_with_neighbors
# @return [String, nil] the :category entry converted with to_s, or nil when
#   that entry is nil
def classify(text)
  label = classify_with_neighbors(text)[:category]
  label.nil? ? nil : label.to_s
end

#classify_with_neighbors(text) ⇒ Object

Classifies text and returns a Hash with the keys :category, :neighbors, :votes, and :confidence.



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/classifier/knn.rb', line 69

# Classifies +text+ and returns the full voting breakdown.
#
# Runs entirely inside +synchronize+. Falls back to +empty_result+ when the
# index has no items, when no neighbors are found, or when no winner emerges.
#
# @param text [Object] passed to find_neighbors
# @return [Hash] { category:, neighbors:, votes:, confidence: } where
#   confidence is the winner's share of the vote total (0.0 if the total is
#   not positive)
def classify_with_neighbors(text)
  synchronize do
    return empty_result if @lsi.items.empty?

    nearest = find_neighbors(text)
    return empty_result if nearest.empty?

    tally = tally_votes(nearest)
    top_category, = tally.max_by { |_category, score| score }
    return empty_result unless top_category

    vote_total = tally.values.sum
    share =
      if vote_total.positive?
        tally[top_category] / vote_total.to_f
      else
        0.0
      end

    { category: top_category, neighbors: nearest, votes: tally, confidence: share }
  end
end

#dirty? ⇒ Boolean

Returns:

  • (Boolean)


210
211
212
# File 'lib/classifier/knn.rb', line 210

# Returns the @dirty flag: set to true by the mutating methods (#add,
# #remove_item, #train_batch, #train_from_stream) and reset to false by
# #save, #save_to_file, #reload and #reload!.
def dirty?
  @dirty
end

#items ⇒ Object



99
100
101
# File 'lib/classifier/knn.rb', line 99

# Delegates to @lsi.items and returns its result unchanged.
# NOTE(review): unlike #categories, this call is not wrapped in synchronize.
def items
  @lsi.items
end

#marshal_dump ⇒ Object



232
233
234
# File 'lib/classifier/knn.rb', line 232

# Returns the state array that #marshal_load consumes, in the order
# [@k, @weighted, @lsi, @dirty]. @storage is deliberately absent from the
# array; #marshal_load resets it to nil.
def marshal_dump
  [@k, @weighted, @lsi, @dirty]
end

#marshal_load(data) ⇒ Object



237
238
239
240
241
# File 'lib/classifier/knn.rb', line 237

# Restores state from the array produced by #marshal_dump.
#
# @param data [Array] [k, weighted, lsi, dirty] in that order
def marshal_load(data)
  # #initialize is not run on this path, so the Mutex_m state is set up by
  # hand (presumably what makes synchronize usable on the restored object).
  mu_initialize

  k, weighted, lsi, dirty = data
  @k = k
  @weighted = weighted
  @lsi = lsi
  @dirty = dirty

  # Storage is not part of the dumped state.
  @storage = nil
end

#reload ⇒ Object

Reloads the classifier from configured storage.

Raises:

  • (ArgumentError)


184
185
186
187
188
189
190
191
192
193
194
# File 'lib/classifier/knn.rb', line 184

# Reloads the classifier from the configured storage, refusing to do so
# when there are unsaved changes.
#
# After the two guards below pass, the actual work (read, restore, clear the
# dirty flag, return self) is delegated to #reload! so the two methods share
# one implementation.
#
# @return [self]
# @raise [ArgumentError] if no storage is configured
# @raise [UnsavedChangesError] if the dirty flag is set
# @raise [StorageError] (from #reload!) if the storage has no saved state
def reload
  raise ArgumentError, 'No storage configured' unless storage
  raise UnsavedChangesError, 'Unsaved changes would be lost. Call save first or use reload!' if @dirty

  reload!
end

#reload! ⇒ Object

Force reloads, discarding unsaved changes.

Raises:

  • (ArgumentError)


198
199
200
201
202
203
204
205
206
207
# File 'lib/classifier/knn.rb', line 198

# Reloads from the configured storage without checking the dirty flag, so
# any unsaved changes are discarded.
#
# @return [self]
# @raise [ArgumentError] if no storage is configured
# @raise [StorageError] if storage.read returns nil/false
def reload!
  backend = storage
  raise ArgumentError, 'No storage configured' unless backend

  snapshot = backend.read
  raise StorageError, 'No saved state found' unless snapshot

  restore_from_json(snapshot)
  @dirty = false
  self
end

#remove_item(item) ⇒ Object



93
94
95
96
# File 'lib/classifier/knn.rb', line 93

# Removes +item+ from the underlying LSI index.
#
# @param item [Object] passed to @lsi.remove_item unchanged
# @return [Object] whatever @lsi.remove_item returns
def remove_item(item)
  # Only the dirty flag is set under the lock; the @lsi call runs outside it.
  synchronize do
    @dirty = true
  end
  @lsi.remove_item(item)
end

#respond_to_missing?(name, include_private = false) ⇒ Boolean

Returns:

  • (Boolean)


130
131
132
# File 'lib/classifier/knn.rb', line 130

# Companion to #method_missing: reports true for any +train_<category>+
# name, and otherwise defers to +super+.
#
# @param name [Symbol, String] method name being queried
# @param include_private [Boolean] forwarded to +super+ (implicitly)
# @return [Boolean]
def respond_to_missing?(name, include_private = false)
  return true if name.to_s.match?(/\Atrain_(\w+)\z/)

  super
end

#save ⇒ Object

Saves the classifier to the configured storage.

Raises:

  • (ArgumentError)


167
168
169
170
171
172
# File 'lib/classifier/knn.rb', line 167

# Writes the JSON form of the classifier to the configured storage and
# clears the dirty flag.
#
# @return [false] the value of the final +@dirty = false+ assignment
# @raise [ArgumentError] if no storage is configured
def save
  target = storage
  raise ArgumentError, 'No storage configured. Use save_to_file(path) or set storage=' unless target

  target.write(to_json)
  @dirty = false
end

#save_to_file(path) ⇒ Object

Saves the classifier to a file.



176
177
178
179
180
# File 'lib/classifier/knn.rb', line 176

# Writes the JSON form of the classifier to +path+ and clears the dirty flag.
# The configured storage (if any) is not involved.
#
# @param path [String] destination file
# @return [Integer] the value returned by File.write
def save_to_file(path)
  # tap keeps File.write's result as the return value; the flag is only
  # cleared once the write has completed without raising.
  File.write(path, to_json).tap { @dirty = false }
end

#to_json(_options = nil) ⇒ Object



146
147
148
# File 'lib/classifier/knn.rb', line 146

# Serializes the Hash built by #as_json by calling to_json on it.
# The _options argument is accepted but not used.
def to_json(_options = nil)
  as_json.to_json
end

#train_batch(category = nil, documents = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &block) ⇒ Object Also known as: add_batch

Adds items in batches.

Examples:

Positional style

knn.train_batch(:spam, documents, batch_size: 100)

Keyword style

knn.train_batch(spam: documents, ham: other_docs)


286
287
288
289
290
# File 'lib/classifier/knn.rb', line 286

# Adds documents in batches by forwarding every argument (and the optional
# block) to @lsi.train_batch, then marks the classifier dirty.
#
# Accepts either the positional form (category, documents) or the keyword
# form (category: documents, ...).
#
# @return [true] the value of the +@dirty = true+ block
def train_batch(category = nil, documents = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &block)
  # @type var categories: Hash[Symbol, Array[String]]
  @lsi.train_batch(category, documents, batch_size: batch_size, **categories, &block) # steep:ignore

  # The flag is set only after the LSI call has returned.
  synchronize do
    @dirty = true
  end
end

#train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE, &block) ⇒ Object

Trains the classifier from an IO stream. Each line in the stream is treated as a separate document.

Examples:

Train from a file

knn.train_from_stream(:spam, File.open('spam_corpus.txt'))

With progress tracking

knn.train_from_stream(:spam, io, batch_size: 500) do |progress|
  puts "#{progress.completed} documents processed"
end


272
273
274
275
# File 'lib/classifier/knn.rb', line 272

# Trains from an IO stream by forwarding every argument (and the optional
# block) to @lsi.train_from_stream, then marks the classifier dirty.
#
# @param category [Object] category for the streamed documents
# @param io [Object] stream handed to the LSI index unchanged
# @param batch_size [Integer] defaults to Streaming::DEFAULT_BATCH_SIZE
# @return [true] the value of the +@dirty = true+ block
def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE, &block)
  @lsi.train_from_stream(category, io, batch_size: batch_size, &block)

  # The flag is set only after the LSI call has returned.
  synchronize do
    @dirty = true
  end
end