Class: Clusterer::DocumentArray
- Inherits:
-
Array
- Object
- Array
- Clusterer::DocumentArray
show all
- Includes:
- Tokenizer
- Defined in:
- lib/clusterer/document_array.rb
Constant Summary
collapse
- @@term_array_position_mapper =
{}
Instance Attribute Summary collapse
-
#object ⇒ Object
readonly
stores the text in an array format, used with LSI or SVD.
Instance Method Summary
collapse
Methods included from Tokenizer
#simple_ngram_tokenizer, #simple_tokenizer
Constructor Details
#initialize(object = "", options = { }) ⇒ DocumentArray
Returns a new instance of DocumentArray.
31
32
33
34
35
36
37
38
39
40
41
42
|
# File 'lib/clusterer/document_array.rb', line 31
# Builds a term-count vector for +object+.
#
# The array is pre-sized to the number of terms currently held in the
# class-wide @@term_array_position_mapper and filled with 0.0; every token
# the tokenizer yields is then counted through #<<.
#
# @param object [Object] source to tokenize. When a block is given, the
#   block's result for +object+ is tokenized; otherwise +object.to_s+ is.
# @param options [Hash]
#   :tokenizer         - name of the tokenizer method to invoke
#                        (defaults to :simple_tokenizer)
#   :tokenizer_options - hash handed to the tokenizer (defaults to {})
#   :idf               - optional object; it receives
#                        +increment_documents_count+ once and then +<<+ for
#                        each term whose count in this document is positive
def initialize(object = "", options = {})
  @object = object
  super(@@term_array_position_mapper.size, 0.0)

  text = block_given? ? yield(object) : object.to_s
  send(options[:tokenizer] || :simple_tokenizer,
       text,
       options[:tokenizer_options] || {}) { |term| self << term }

  if (idf = options[:idf])
    idf.increment_documents_count
    # Invert the term => slot table once so each slot can be mapped back to
    # its term without a linear search per element.
    terms_by_position = @@term_array_position_mapper.invert
    # each_with_index yields (element, index): the element is the count and
    # the index is the slot. Slots may be nil when #<< wrote past the end.
    each_with_index do |frequency, position|
      idf << terms_by_position[position] if frequency && frequency > 0.0
    end
  end
end
|
Instance Attribute Details
#object ⇒ Object
stores the text in an array format, used with LSI or SVD
26
27
28
|
# File 'lib/clusterer/document_array.rb', line 26
# Reader for @object, which #initialize sets to the +object+ argument it was
# given (unmodified).
#
# @return [Object] the stored source object
def object; @object; end
|
Instance Method Details
#<<(term) ⇒ Object
44
45
46
|
# File 'lib/clusterer/document_array.rb', line 44
# Counts one occurrence of +term+ in this document.
#
# The slot for the term comes from #term_array_position_mapper; an empty (nil)
# slot is treated as zero before incrementing.
#
# @param term [Object] the token to count
# @return [Numeric] the updated count for +term+ (not self, unlike Array#<<)
def <<(term)
  position = term_array_position_mapper(term)
  occurrences = self[position] || 0
  self[position] = occurrences + 1
end
|
#normalize!(idf = nil, add_term = false) ⇒ Object
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
|
# File 'lib/clusterer/document_array.rb', line 48
# Weights every slot, scales the vector to unit length, and freezes it.
#
# Each slot becomes (count or 0) * factor, where the factor is:
# * 1.0 when no +idf+ is given,
# * +idf[term]+ when +idf+ is given and +add_term+ is false,
# * the result of +idf << term+ when +add_term+ is true and the term occurs
#   in this document.
# The weights are then divided by their Euclidean norm (or by 1 when the
# norm is zero, leaving an all-zero vector).
#
# @param idf [Object, nil] optional object responding to +[]+, +<<+ and
#   +increment_documents_count+
# @param add_term [Boolean] when true, first counts this document in +idf+
#   and registers its terms there; requires +idf+
# @return [self] the frozen receiver
def normalize!(idf = nil, add_term = false)
  idf.increment_documents_count if add_term

  # Pad the vector out to the current size of the class-wide term table.
  # With an empty table the last slot would be -1, and assigning to index -1
  # of an empty array raises IndexError, so skip the padding in that case.
  last_position = @@term_array_position_mapper.size - 1
  self[last_position] ||= 0.0 if last_position >= 0

  # Only needed when an idf is involved: slot => term, built once.
  terms_by_position = idf ? @@term_array_position_mapper.invert : nil

  sum_of_squares = 0.0
  each_with_index do |frequency, position|
    count = frequency || 0 # padding may have left nil slots
    term = terms_by_position && terms_by_position[position]
    factor =
      if add_term
        # Register only terms this document actually contains, mirroring what
        # #initialize does for options[:idf]. Absent terms end up with weight
        # 0 whatever the factor is.
        count > 0 ? (idf << term) : 1.0
      elsif idf
        idf[term]
      else
        1.0
      end
    self[position] = count * factor
    sum_of_squares += self[position]**2
  end

  norm = Math.sqrt(sum_of_squares)
  norm = 1 if norm.zero? # avoid dividing an all-zero vector by zero
  map! { |weight| weight / norm }

  # Seed the value #vector_length memoizes; the receiver is frozen next, so
  # it could not be stored lazily afterwards.
  @vector_length = 1.0
  freeze
end
|
#term_array_position_mapper(term) ⇒ Object
71
72
73
74
75
76
77
|
# File 'lib/clusterer/document_array.rb', line 71
# Looks up the array slot assigned to +term+ in the class-wide term table.
# A term seen for the first time is given the next free slot, which is the
# table's size at that moment.
#
# @param term [Object] the token to look up
# @return [Integer] the slot for +term+
def term_array_position_mapper(term)
  @@term_array_position_mapper[term] ||= @@term_array_position_mapper.size
end
|
#vector_length ⇒ Object
67
68
69
|
# File 'lib/clusterer/document_array.rb', line 67
# Euclidean length of the vector, computed on first call and cached in
# @vector_length (which #normalize! also sets directly to 1.0).
#
# nil slots, which #<< can leave behind when it writes past the current end
# of the array, count as zero, the same way #normalize! treats them.
#
# @return [Float] square root of the sum of squared slot values
def vector_length
  @vector_length ||= Math.sqrt(
    inject(0) { |sum, weight| weight ? sum + weight * weight : sum }
  )
end
|