Class: FuzzyTools::TfIdfIndex

Inherits:
Index
  • Object
show all
Defined in:
lib/fuzzy_tools/tf_idf_index.rb

Defined Under Namespace

Classes: Token

Instance Attribute Summary collapse

Attributes inherited from Index

#indexed_attribute, #source

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Index

#all, #all_with_scores, #find

Constructor Details

#initialize(options = {}) ⇒ TfIdfIndex

Returns a new instance of TfIdfIndex.



21
22
23
24
# File 'lib/fuzzy_tools/tf_idf_index.rb', line 21

# Builds the index, remembering which tokenizer to use before handing the
# same arguments on to the superclass constructor.
#
# @param options [Hash] construction options; +:tokenizer+ is read here and
#   the hash is then forwarded unchanged via the bare +super+
# @option options [#call] :tokenizer tokenizer to use; when absent (or
#   nil/false) the class-level +default_tokenizer+ is used instead
def initialize(options = {})
  requested_tokenizer = options[:tokenizer]
  @tokenizer = requested_tokenizer || self.class.default_tokenizer
  # Bare +super+ forwards +options+ as received; @tokenizer is assigned first
  # so it is already set while the superclass constructor runs.
  super
end

Instance Attribute Details

#tokenizer ⇒ Object (readonly)

Returns the value of attribute tokenizer.



19
20
21
# File 'lib/fuzzy_tools/tf_idf_index.rb', line 19

# Reader for the tokenizer held by this index.
#
# @return [Object] whatever is currently stored in the +@tokenizer+ instance
#   variable
def tokenizer
  @tokenizer
end

Class Method Details

.default_tokenizer ⇒ Object



15
16
17
# File 'lib/fuzzy_tools/tf_idf_index.rb', line 15

# Class-level default tokenizer.
#
# @return [Object] the +FuzzyTools::Tokenizers::HYBRID+ constant
def self.default_tokenizer
  FuzzyTools::Tokenizers::HYBRID
end

Instance Method Details

#score(weighted_tokens_1, weighted_tokens_2) ⇒ Object



44
45
46
# File 'lib/fuzzy_tools/tf_idf_index.rb', line 44

# Compares two token collections by delegating to the first argument's
# +cosine_similarity+ method.
#
# @param weighted_tokens_1 [#cosine_similarity] receiver of the comparison
# @param weighted_tokens_2 [Object] passed through as the comparison's argument
# @return [Object] whatever +cosine_similarity+ returns (presumably a numeric
#   similarity; confirm against the weighted-tokens implementation)
def score(weighted_tokens_1, weighted_tokens_2)
  weighted_tokens_1.cosine_similarity(weighted_tokens_2)
end

#select_candidate_documents(query, query_weighted_tokens) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/fuzzy_tools/tf_idf_index.rb', line 48

# Collects the documents worth scoring for a query.
#
# Walks the query's tokens, looks each one up in +@tf_idf_tokens+, and unions
# the documents attached to every known token whose idf is at least
# +@idf_cutoff+. As soon as the union grows past 75% of +@source_count+ the
# whole +source+ is returned instead.
#
# @param query [Object] not used by this method
# @param query_weighted_tokens [#tokens] weighted tokens built from the query
# @return [Set, Object] a Set of matched documents, or +source+ itself once
#   the threshold is exceeded
def select_candidate_documents(query, query_weighted_tokens)
  matched_documents = Set.new
  # 0.75 is the cutoff that works best on the accuracy data.
  scan_all_limit = @source_count * 0.75

  query_weighted_tokens.tokens.each do |token|
    indexed_token = @tf_idf_tokens[token]
    next unless indexed_token                      # token never seen in the index
    next if indexed_token.idf < @idf_cutoff        # idf below the configured cutoff

    matched_documents.merge(indexed_token.documents)
    return source if matched_documents.size > scan_all_limit
  end

  matched_documents
end

#tokenize(str) ⇒ Object



26
27
28
# File 'lib/fuzzy_tools/tf_idf_index.rb', line 26

# Runs this index's tokenizer over the string form of +str+.
#
# @param str [#to_s] value to tokenize; it is converted with +to_s+ first, so
#   non-String input (including nil, which becomes "") is accepted
# @return [Object] whatever the tokenizer's +call+ returns
def tokenize(str)
  tokenizer.call(str.to_s)
end

#unsorted_scored_results(query) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/fuzzy_tools/tf_idf_index.rb', line 30

# Scores every candidate document against the query, in no particular order.
#
# The query is tokenized and wrapped in a WeightedDocumentTokens using this
# index's +weight_function+; candidates come from +select_candidate_documents+
# and each one is scored against its entry in +@document_tokens+.
#
# @param query [Object] the query to match; handed to +tokenize+
# @return [Array<Array>] one +[candidate, score]+ pair per candidate
def unsorted_scored_results(query)
  query_tokens = WeightedDocumentTokens.new(tokenize(query), :weight_function => weight_function)

  select_candidate_documents(query, query_tokens).map do |document|
    # Stored tokens are keyed by the document's indexed attribute value.
    stored_tokens = @document_tokens[document_attribute(document)]
    [document, score(query_tokens, stored_tokens)]
  end
end