Class: Eluka::Document
- Inherits:
- Object
- Inheritance chain: Object → Eluka::Document
- Defined in:
- lib/eluka/document.rb
Instance Method Summary
- #bag_of_words ⇒ Object
-
#initialize(field, text, analyzer) ⇒ Document
constructor
A new instance of Document.
- #vector ⇒ Object
Constructor Details
#initialize(field, text, analyzer) ⇒ Document
Returns a new instance of Document.
5 6 7 8 9 10 11 |
# File 'lib/eluka/document.rb', line 5

# Builds a document from a piece of text belonging to a field.
#
# Stores the three arguments, clears any cached bag of words and then
# invokes #bag_of_words once so the term positions are computed up front.
#
# @param field    the field this text belongs to (used as a key prefix by #vector)
# @param text     the raw text handed to the analyzer
# @param analyzer an object responding to +token_stream(field, text)+
def initialize(field, text, analyzer)
  @field, @text, @analyzer = field, text, analyzer
  @bag_of_words = nil

  # Eagerly populate @bag_of_words; the call's return value is not used here.
  bag_of_words
end
Instance Method Details
#bag_of_words ⇒ Object
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
# File 'lib/eluka/document.rb', line 13

# Tokenizes the document text and records, for every distinct token text,
# the list of positions at which it occurs.
#
# Positions are cumulative: a running counter starts at 0 and is advanced by
# each token's +pos_inc+ before the token is recorded. The hash is rebuilt
# from scratch on every call and stored in @bag_of_words.
#
# The analyzer must respond to +token_stream(field, text)+ and return an
# object whose +next+ yields tokens (responding to +text+ and +pos_inc+)
# and a falsy value once the stream is exhausted.
#
# @return [Hash] token text => Array of positions. (Previously this method
#   fell off the end of the +while+ loop and always returned nil.)
def bag_of_words
  # Position counter for the document
  pos = 0
  @bag_of_words = {}

  # NOTE(review): the literal symbol :field is passed here rather than @field.
  # Kept as-is to preserve behaviour; confirm whether @field was intended.
  token_stream = @analyzer.token_stream(:field, @text)

  while (token = token_stream.next)
    pos += token.pos_inc
    (@bag_of_words[token.text] ||= []) << pos
  end

  @bag_of_words
end
#vector ⇒ Object
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/eluka/document.rb', line 30

# Converts the bag of words into a length-normalised term-frequency vector.
#
# Each term's weight is its occurrence count divided by the Euclidean norm
# of all occurrence counts. Keys are the field and the term joined by "||".
#
# @return [Hash] "field||term" => Float weight; empty when there are no terms
def vector
  # Sum of squared occurrence counts across every term.
  sum_of_squares = @bag_of_words.inject(0) do |acc, (_term, positions)|
    acc + positions.size**2
  end
  norm = sum_of_squares.to_f**0.5

  weights = Hash.new
  @bag_of_words.each do |term, positions|
    key = [@field, term].join("||")
    weights[key] = positions.size.to_f / norm
  end
  weights
end