Class: Reckon::CosineSimilarity

Inherits:
Object
  • Object
show all
Defined in:
lib/reckon/cosine_similarity.rb

Overview

Calculates the cosine similarity between documents using tf-idf weighting.

Defined Under Namespace

Classes: DocumentInfo

Instance Method Summary collapse

Constructor Details

#initialize(options) ⇒ CosineSimilarity

Returns a new instance of CosineSimilarity.



19
20
21
# File 'lib/reckon/cosine_similarity.rb', line 19

# Creates a similarity index with no documents recorded yet.
#
# @param options [Object] accepted for the caller's benefit; this constructor
#   does not read it
def initialize(options)
  # Start from an empty DocumentInfo built from two empty hashes; other methods
  # reach them through @docs.tokens and @docs.accounts.
  @docs = DocumentInfo.new({}, {})
end

Instance Method Details

#add_document(account, doc) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/reckon/cosine_similarity.rb', line 23

# Records the token counts of +doc+ under +account+.
#
# Two indexes held in @docs are updated in step:
# token => { account => count } and account => { token => count }.
# Counts accumulate across calls for the same account/token pair.
#
# @param account [Object] key identifying the account the document belongs to
# @param doc [Object] document handed to #tokenize (presumably a String —
#   confirm against #tokenize)
# @return [Object] whatever +tokens.each+ returns (the tokenized collection)
def add_document(account, doc)
  tokens = tokenize(doc)
  LOGGER.info "doc tokens: #{tokens}"
  tokens.each do |token, count|
    # Inner hashes default to 0 so `+=` works for keys not seen before.
    @docs.tokens[token] ||= Hash.new(0)
    @docs.tokens[token][account] += count
    @docs.accounts[account] ||= Hash.new(0)
    @docs.accounts[account][token] += count
  end
end

#find_similar(query) ⇒ Object

Finds the documents most similar to the query.



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/reckon/cosine_similarity.rb', line 37

# Ranks candidate accounts by how similar their recorded tokens are to +query+.
#
# Candidates come from #docs_to_check; each candidate's token counts and the
# tokenized query are weighted with #tfidf and compared with #calc_similarity.
#
# @param query [Object] text to match (passed to #tokenize and #docs_to_check)
# @return [Array<Hash>] hashes with +:similarity+ and +:account+ keys, limited
#   to similarities greater than zero and ordered from highest to lowest
def find_similar(query)
  LOGGER.info "find_similar #{query}"

  # Weight every candidate account before weighting the query itself.
  candidates = docs_to_check(query).map { |name| [name, tfidf(@docs.accounts[name])] }
  query_vector = tfidf(tokenize(query))

  scored = candidates.map do |name, vector|
    { similarity: calc_similarity(query_vector, vector), account: name }
  end
  matching = scored.select { |entry| entry[:similarity] > 0 }
  # Negating the score turns the ascending sort into best-match-first.
  suggestions = matching.sort_by { |entry| -entry[:similarity] }

  LOGGER.info "most similar accounts: #{suggestions}"

  suggestions
end