Class: Hoatzin::Classifier

Inherits:
Object
  • Object
show all
Defined in:
lib/classifier.rb

Defined Under Namespace

Classes: InvalidFormat, ReadOnly

Constant Summary collapse

FORMAT_VERSION =
2

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Classifier

Returns a new instance of Classifier.



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/classifier.rb', line 11

# Sets up an empty, trainable classifier. When both a metadata path and a
# model path are supplied, +load+ is called straight away.
#
# The options hash is only read, never modified, so a frozen or shared
# hash can be passed safely.
#
# @param options [Hash] optional settings; only the keys below are read
# @option options [Object] :metadata location of the metadata file
# @option options [Object] :model location of the model file
# @return [Classifier] a new instance of Classifier
def initialize(options = {})
  @documents = []
  @classifications = []
  @labels = []

  @problem = @model = nil
  # Number of documents the model was last built from (compared in #sync)
  @cache = 0
  @readonly = false

  # Read the paths without deleting them from the caller's hash;
  # falsy values are normalised to nil
  @metadata_file = options[:metadata] || nil
  @model_file = options[:model] || nil

  @builder = FeatureVector::Builder.new(:parser => Hoatzin::Parser.new)

  # If we have model and metadata files then load them
  load if @metadata_file && @model_file

  # Define kernel parameters for libsvm
  # NOTE(review): this is assigned after +load+ has run, so nothing done
  # inside +load+ can see @parameters - confirm that ordering is intended.
  @parameters = Parameter.new(:C => 100,
                              :degree => 1,
                              :coef0 => 0,
                              :eps => 0.001)
end

Instance Attribute Details

#classifications ⇒ Object (readonly)

Returns the value of attribute classifications.



9
10
11
# File 'lib/classifier.rb', line 9

# Reader for the @classifications instance variable, which starts out as an
# empty Array in the constructor and gains an entry in #train the first time
# each classification is seen.
#
# @return [Object] the current value of @classifications
def classifications
  @classifications
end

Instance Method Details

#classify(text) ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/classifier.rb', line 53

# Predicts the classification of a piece of text.
#
# NOTE(review): if no model has been built or loaded, @model is presumably
# still nil here and the prediction call raises NoMethodError - confirm.
#
# @param text [Object] the text to classify; passed unchanged to the builder
# @return [Object, nil] the entry of @classifications at the predicted
#   index, or nil when that index has no entry
def classify(text)
  # Rebuild the model first if documents were trained since the last build
  sync

  # Turn the text into a feature vector for the query
  query_vector = @builder.build_query_vector(text)

  # Only the predicted label is used; the second returned value is discarded
  label, _probabilities = @model.predict_probability(query_vector)
  @classifications[label.to_i]
end

#save(options = {}) ⇒ Object



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/classifier.rb', line 75

# Writes the classifier's metadata and model to disk.
#
# The metadata is a Marshal dump of a Hash holding the classifications, the
# format version and the builder's keyword index. By default the dump is
# flagged read-only; with :update the training documents and the cache
# counter are included as well and the flag is cleared.
#
# @param options [Hash] optional settings
# @option options [Object] :metadata path to write the metadata to; replaces
#   the stored path when the key is present
# @option options [Object] :model path to write the model to; replaces the
#   stored path when the key is present
# @option options [Boolean] :update also store the documents and cache
# @return [false, Object] false when either path is missing, otherwise
#   whatever the model's own save call returns
def save(options = {})
  @metadata_file = options[:metadata] if options.key?(:metadata)
  @model_file = options[:model] if options.key?(:model)
  return false unless @metadata_file && @model_file

  data = { :classifications => @classifications,
           :version => FORMAT_VERSION,
           :dictionary => @builder.vector_keyword_index,
           :readonly => true }
  data.merge!(:documents => @documents,
              :cache => @cache,
              :readonly => false) if options[:update]

  # Marshal produces binary data, so the file must be opened in binary mode
  # to avoid newline translation on platforms that perform it
  File.open(@metadata_file, 'wb') { |f| Marshal.dump(data, f) }

  # Make sure there is a model to write before saving it
  assign_model if @model.nil?
  @model.save(@model_file)
end

#sync ⇒ Object



66
67
68
69
70
71
72
73
# File 'lib/classifier.rb', line 66

# Rebuilds the model when documents have been trained since the last build.
#
# Does nothing for a read-only classifier, when the document count has not
# grown past the cached count, or when there are no documents at all.
#
# @return [Object, nil] the result of assign_model when a rebuild happens,
#   otherwise nil
def sync
  return if @readonly

  document_count = @documents.length
  return unless document_count > @cache
  return nil if document_count == 0

  # Remember how many documents this build covers, then rebuild
  @cache = document_count
  assign_model
end

#train(classification, text) ⇒ Object

Raises:

  • (ReadOnly)



38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/classifier.rb', line 38

# Adds one labelled document to the training corpus.
#
# The classification is registered the first time it is seen; the document
# text and the index of its classification are appended to parallel lists.
#
# @param classification [Object] the class the text belongs to
# @param text [Object] the document to learn from
# @return [Array] the list of label indices, including the new one
# @raise [ReadOnly] when the classifier is flagged read-only
def train(classification, text)
  # Retraining is only possible when the classifier is not read-only
  raise ReadOnly if @readonly

  # Look the classification up once, registering it if it is new
  label = @classifications.index(classification)
  if label.nil?
    @classifications << classification
    label = @classifications.length - 1
  end

  @documents << text
  @labels << label
end