Class: FuzzyTools::TfIdfIndex
- Inherits:
-
Index
- Object
- Index
- FuzzyTools::TfIdfIndex
show all
- Defined in:
- lib/fuzzy_tools/tf_idf_index.rb
Defined Under Namespace
Classes: Token
Instance Attribute Summary collapse
Attributes inherited from Index
#indexed_attribute, #source
Class Method Summary
collapse
Instance Method Summary
collapse
Methods inherited from Index
#all, #all_with_scores, #find
Constructor Details
#initialize(options = {}) ⇒ TfIdfIndex
Returns a new instance of TfIdfIndex.
21
22
23
24
|
# File 'lib/fuzzy_tools/tf_idf_index.rb', line 21
# Builds the index, remembering which tokenizer to use.
#
# @param options [Hash] construction options; +:tokenizer+ is read here and
#   the same arguments are then handed on to the superclass initializer.
def initialize(options = {})
  custom_tokenizer = options[:tokenizer]
  # Fall back to the class-level default when the caller supplies none.
  @tokenizer = custom_tokenizer || self.class.default_tokenizer
  # Bare +super+ forwards this method's arguments unchanged.
  super
end
|
Instance Attribute Details
#tokenizer ⇒ Object
Returns the value of attribute tokenizer.
19
20
21
|
# File 'lib/fuzzy_tools/tf_idf_index.rb', line 19
# Reader for the tokenizer stored in +@tokenizer+.
#
# @return [Object] the current value of +@tokenizer+ (nil when it was never set)
def tokenizer
  @tokenizer
end
|
Class Method Details
.default_tokenizer ⇒ Object
Instance Method Details
#score(weighted_tokens_1, weighted_tokens_2) ⇒ Object
44
45
46
|
# File 'lib/fuzzy_tools/tf_idf_index.rb', line 44
# Scores two weighted token collections against each other.
#
# @param weighted_tokens_1 [#cosine_similarity] receiver of the comparison
# @param weighted_tokens_2 [Object] passed as the argument to +cosine_similarity+
# @return [Object] whatever +weighted_tokens_1.cosine_similarity+ returns
def score(weighted_tokens_1, weighted_tokens_2)
  weighted_tokens_1.cosine_similarity(weighted_tokens_2)
end
|
#select_candidate_documents(query, query_weighted_tokens) ⇒ Object
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
|
# File 'lib/fuzzy_tools/tf_idf_index.rb', line 48
# Gathers the documents worth scoring for a query.
#
# For each query token, looks up its entry in +@tf_idf_tokens+. Tokens with no
# entry, or whose +idf+ is below +@idf_cutoff+, contribute nothing. The
# documents of the remaining tokens are unioned into a Set. As soon as that
# Set grows beyond 75% of +@source_count+, the method gives up on narrowing
# and returns +source+ itself.
#
# @param query [Object] the original query; not used by this implementation
# @param query_weighted_tokens [#tokens] object exposing the query's tokens
# @return [Set, Object] the Set of candidate documents, or +source+ when the
#   candidate Set exceeded the threshold
def select_candidate_documents(query, query_weighted_tokens)
  candidates = Set.new
  check_all_threshold = @source_count * 0.75

  query_weighted_tokens.tokens.each do |query_token|
    tf_idf_token = @tf_idf_tokens[query_token]
    next unless tf_idf_token
    next if tf_idf_token.idf < @idf_cutoff

    candidates.merge(tf_idf_token.documents)
    next unless candidates.size > check_all_threshold

    # Too many candidates to be worth filtering: fall back to everything.
    candidates = source
    break
  end

  candidates
end
|
#tokenize(str) ⇒ Object
26
27
28
|
# File 'lib/fuzzy_tools/tf_idf_index.rb', line 26
# Splits a value into tokens using the configured tokenizer.
#
# @param str [#to_s] value to tokenize; converted with +to_s+ first
# @return [Object] whatever +tokenizer.call+ returns for the string
def tokenize(str)
  text = str.to_s
  tokenizer.call(text)
end
|
#unsorted_scored_results(query) ⇒ Object
30
31
32
33
34
35
36
37
38
39
40
41
42
|
# File 'lib/fuzzy_tools/tf_idf_index.rb', line 30
# Scores every candidate document against the query, in no particular order.
#
# The query is tokenized and wrapped in a WeightedDocumentTokens built with
# this index's +weight_function+. Each candidate returned by
# +select_candidate_documents+ is then paired with its score.
#
# @param query [Object] the query to match
# @return [Object] result of mapping the candidates to
#   +[candidate, score]+ pairs
def unsorted_scored_results(query)
  query_tokens = WeightedDocumentTokens.new(tokenize(query), :weight_function => weight_function)

  select_candidate_documents(query, query_tokens).map do |document|
    # Token data is looked up by the document's indexed attribute value.
    document_tokens = @document_tokens[document_attribute(document)]
    [document, score(query_tokens, document_tokens)]
  end
end
|