Class: Clusterer::Document

Inherits:
DocumentBase show all
Includes:
Tokenizer
Defined in:
lib/clusterer/document.rb

Overview

Document tokenizes the text and stores the count of each token in the document.

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Tokenizer

#simple_ngram_tokenizer, #simple_tokenizer

Methods inherited from DocumentBase

#vector_length

Methods included from DocumentSimilarity

#cosine_similarity

Constructor Details

#initialize(object, options = { }) ⇒ Document

Returns a new instance of Document.



37
38
39
40
41
42
43
44
45
46
47
# File 'lib/clusterer/document.rb', line 37

# Builds a document by tokenizing +object+ and counting each token.
#
# The text to tokenize is the result of the given block (called with
# +object+) when a block is supplied, otherwise +object.to_s+.
#
# @param object [Object] the original text, or the object it is derived from;
#   kept in @object
# @param options [Hash] optional settings
# @option options [Symbol] :tokenizer name of the tokenizer method to invoke
#   on this document (defaults to :simple_tokenizer)
# @option options [Hash] :tokenizer_options passed through to the tokenizer
#   (defaults to an empty hash)
# @option options [Object] :idf when present, it is told about this document
#   (+increment_documents_count+) and about each distinct term (+<<+)
# @yield [object] optional block that extracts the text to tokenize
# @yieldreturn [String] the text to tokenize
def initialize(object, options = {})
  @object = object

  tokenizer = options[:tokenizer] || :simple_tokenizer
  text = block_given? ? yield(object) : object.to_s
  tokenizer_options = options[:tokenizer_options] || {}

  # The tokenizer yields one term at a time; each occurrence is counted.
  send(tokenizer, text, tokenizer_options) { |term| self << term }

  idf = options[:idf]
  return unless idf

  idf.increment_documents_count
  # Keys are the distinct terms, so the idf sees each term once per document.
  each_key { |term| idf << term }
end

Instance Attribute Details

#object ⇒ Object (readonly)

Reference to the original text or the object from which the text is derived.



29
30
31
# File 'lib/clusterer/document.rb', line 29

# Reference to the original text, or to the object from which the text is
# derived, as stored in @object by the constructor.
#
# @return [Object] the value held in @object
def object
  @object
end

Class Method Details

.centroid_class ⇒ Object

Reference to the centroid class that is used by the K-means algorithm.



33
34
35
# File 'lib/clusterer/document.rb', line 33

# Centroid type associated with this document class; the accompanying
# documentation describes it as the class used by the K-means algorithm.
#
# @return [Object] the DocumentsCentroid constant (presumably a class; it is
#   defined outside this excerpt)
def self.centroid_class
  DocumentsCentroid
end

Instance Method Details

#<<(term) ⇒ Object



49
50
51
# File 'lib/clusterer/document.rb', line 49

# Records one more occurrence of +term+ in this document.
#
# A term with no stored count (nil) is treated as having been seen zero times.
#
# @param term [Object] the token to count
# @return [Integer] the updated count for +term+
def <<(term)
  seen_so_far = self[term] || 0
  self[term] = seen_so_far + 1
end

#normalize!(idf = nil, add_term = false) ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/clusterer/document.rb', line 53

# Turns the raw term counts into a unit-length weighted vector, in place, and
# freezes the document.
#
# Each count is replaced by log(1 + count), multiplied by +idf[term]+ when an
# +idf+ is supplied (otherwise by 1.0). Every weight is then divided by the
# square root of the sum of squared weights.
#
# @param idf [Object, nil] object answering +[]+ (and, when +add_term+ is
#   true, +<<+ and +increment_documents_count+); nil weights every term by 1.0
# @param add_term [Boolean] when true, registers this document and each of
#   its terms with +idf+ before weighting; this path calls methods on +idf+,
#   so +idf+ must not be nil
# @return [self] the frozen document
def normalize!(idf = nil, add_term = false)
  idf.increment_documents_count if add_term

  sum_of_squares = 0.0
  each do |term, count|
    idf << term if add_term
    idf_weight = idf ? idf[term] : 1.0
    scaled = Math.log(1 + count) * idf_weight
    self[term] = scaled
    sum_of_squares += scaled ** 2
  end

  norm = Math.sqrt(sum_of_squares)
  # An empty or all-zero document has norm 0; divide by 1 instead.
  norm = 1 if norm.zero?
  each { |term, scaled| self[term] = scaled / norm }

  @vector_length = 1
  freeze
end