Class: Clusterer::DocumentArray

Inherits:
Array
  • Object
show all
Includes:
Tokenizer
Defined in:
lib/clusterer/document_array.rb

Constant Summary collapse

@@term_array_position_mapper =
{}

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Tokenizer

#simple_ngram_tokenizer, #simple_tokenizer

Constructor Details

#initialize(object = "", options = { }) ⇒ DocumentArray

Returns a new instance of DocumentArray.



31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/clusterer/document_array.rb', line 31

# Builds a term-frequency vector for +object+.
#
# The vector starts with one 0.0 slot per term already present in the shared
# @@term_array_position_mapper, then every term emitted by the tokenizer is
# counted through #<<.
#
# object  - the item to index. Its text is the result of the given block
#           (called with +object+) when a block is passed, otherwise
#           +object.to_s+.
# options - Hash of optional settings:
#           :tokenizer         - name of the tokenizer method to invoke
#                                (defaults to :simple_tokenizer).
#           :tokenizer_options - passed as the tokenizer's second argument
#                                (defaults to {}).
#           :idf               - when present, its document count is
#                                incremented and every term with a positive
#                                frequency in this vector is appended to it.
def initialize(object = "", options = {})
  @object = object
  super(@@term_array_position_mapper.size, 0.0)

  text = block_given? ? yield(object) : object.to_s
  send(options[:tokenizer] || :simple_tokenizer,
       text,
       options[:tokenizer_options] || {}) { |term| self << term }

  if (idf = options[:idf])
    idf.increment_documents_count
    # Reverse lookup (position => term), built once instead of scanning the
    # mapper for every element.
    terms_by_position = @@term_array_position_mapper.invert
    # each_with_index yields (element, index): the element is the frequency
    # and the index is the term's position. Slots may be nil when the array
    # was extended past its end, hence the nil check.
    each_with_index do |frequency, position|
      idf << terms_by_position[position] if frequency && frequency > 0.0
    end
  end
end

Instance Attribute Details

#object ⇒ Object (readonly)

Stores the text in an array format; used with LSI or SVD.



26
27
28
# File 'lib/clusterer/document_array.rb', line 26

# Reader for the @object instance variable, which #initialize sets to the
# item this vector was built from. Returns nil when it was never assigned.
def object
  instance_variable_get(:@object)
end

Instance Method Details

#<<(term) ⇒ Object



44
45
46
# File 'lib/clusterer/document_array.rb', line 44

# Counts one more occurrence of +term+ in this vector.
#
# The slot is the position #term_array_position_mapper reports for the term;
# an empty (nil) slot is treated as a count of zero.
#
# Returns the updated count for +term+ (not the receiver).
def <<(term)
  position = term_array_position_mapper(term)
  self[position] = (self[position] || 0) + 1
end

#normalize!(idf = nil, add_term = false) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/clusterer/document_array.rb', line 48

# Weights every slot, scales the vector to unit length in place and freezes it.
#
# idf      - optional weighting object. When given (and +add_term+ is false)
#            each frequency is multiplied by +idf[term]+; without it the
#            weight is 1.0.
# add_term - when true, +idf+ must be given: its document count is
#            incremented and each term is appended with +idf << term+, whose
#            return value is used as the weight.
#
# Returns the receiver, frozen. @vector_length is set to 1.0 afterwards.
def normalize!(idf = nil, add_term = false)
  idf.increment_documents_count if add_term

  # Pad the vector out to the number of known terms. Skipped when no term is
  # known yet, because index -1 on an empty array raises IndexError.
  last_position = @@term_array_position_mapper.size - 1
  self[last_position] ||= 0.0 if last_position >= 0

  # Reverse lookup (position => term), built once instead of scanning the
  # mapper for every element; only needed when an idf is involved.
  terms_by_position = idf ? @@term_array_position_mapper.invert : nil

  sum_of_squares = 0.0
  each_with_index do |frequency, position|
    weight =
      if add_term
        # Register the term that lives at this position with the idf.
        idf << terms_by_position[position]
      elsif idf
        idf[terms_by_position[position]]
      else
        1.0
      end
    # Padding may have left nil slots; they count as zero occurrences.
    self[position] = (frequency || 0) * weight
    sum_of_squares += self[position]**2
  end

  norm = Math.sqrt(sum_of_squares)
  # An all-zero vector is left as zeros rather than dividing by zero.
  norm = 1 if norm.zero?
  each_with_index { |weighted, position| self[position] = weighted / norm }
  @vector_length = 1.0
  freeze
end

#term_array_position_mapper(term) ⇒ Object



71
72
73
74
75
76
77
# File 'lib/clusterer/document_array.rb', line 71

# Returns the array position assigned to +term+ in the shared
# @@term_array_position_mapper. A term seen for the first time is stored with
# the mapper's current size as its position, and that position is returned.
def term_array_position_mapper(term)
  @@term_array_position_mapper[term] ||= @@term_array_position_mapper.size
end

#vector_length ⇒ Object



67
68
69
# File 'lib/clusterer/document_array.rb', line 67

# Euclidean length of the vector: the square root of the sum of squared
# elements. The result is memoized in @vector_length on first call.
#
# nil slots (left behind when the array is extended past its end) are
# counted as zero instead of raising.
def vector_length
  @vector_length ||= Math.sqrt(inject(0) { |sum, weight| weight ? sum + weight * weight : sum })
end