Class: Reckon::CosineSimilarity
- Inherits:
-
Object
- Object
- Reckon::CosineSimilarity
- Defined in:
- lib/reckon/cosine_similarity.rb
Overview
Calculates cosine similarity for tf/idf
Defined Under Namespace
Classes: DocumentInfo
Instance Method Summary collapse
- #add_document(account, doc) ⇒ Object
-
#find_similar(query) ⇒ Object
Finds the accounts most similar to the query, ordered from most to least similar.
-
#initialize(options) ⇒ CosineSimilarity
constructor
A new instance of CosineSimilarity.
Constructor Details
#initialize(options) ⇒ CosineSimilarity
Returns a new instance of CosineSimilarity.
19 20 21 |
# File 'lib/reckon/cosine_similarity.rb', line 19
#
# Creates a similarity index with no documents in it.
#
# The index is a DocumentInfo built from two empty hashes; #add_document
# fills them with per-token and per-account counts.
#
# @param options [Object, nil] accepted so that callers following the
#   documented `initialize(options)` signature do not raise ArgumentError;
#   the value is not read by this constructor.
# @return [CosineSimilarity] a new instance of CosineSimilarity
def initialize(options = nil)
  @docs = DocumentInfo.new({}, {})
end
Instance Method Details
#add_document(account, doc) ⇒ Object
23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/reckon/cosine_similarity.rb', line 23
#
# Records the tokens of +doc+ against +account+ in the index.
#
# Every (token, count) pair produced by `tokenize` is added to two counters:
# token => account => count, and account => token => count.
#
# @param account [Object] key the document is filed under
# @param doc [Object] document passed to `tokenize` (defined elsewhere)
# @return [Object] whatever `each` returns for the tokenized collection
def add_document(account, doc)
  token_counts = tokenize(doc)
  LOGGER.info "doc tokens: #{token_counts}"

  token_counts.each do |token, count|
    # Inner hashes default to 0 so that `+=` works on first sight of a key.
    per_account = (@docs.tokens[token] ||= Hash.new(0))
    per_account[account] += count

    per_token = (@docs.accounts[account] ||= Hash.new(0))
    per_token[token] += count
  end
end
#find_similar(query) ⇒ Object
Finds the accounts most similar to the query, ordered from most to least similar.
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
# File 'lib/reckon/cosine_similarity.rb', line 37
#
# Finds the accounts most similar to +query+.
#
# Each candidate account from `docs_to_check` is scored against the query
# with `calc_similarity`; both sides are first passed through `tfidf`
# (helpers defined elsewhere). Accounts whose similarity is not greater
# than zero are dropped, and the rest are ordered from highest to lowest
# similarity.
#
# @param query [Object] query passed to `docs_to_check` and `tokenize`
# @return [Array<Hash>] hashes with :similarity and :account keys,
#   most similar first
def find_similar(query)
  LOGGER.info "find_similar #{query}"

  candidates = docs_to_check(query).map do |account|
    [account, tfidf(@docs.accounts[account])]
  end
  query_weights = tfidf(tokenize(query))

  scored = candidates.map do |account, doc_weights|
    { similarity: calc_similarity(query_weights, doc_weights), account: account }
  end
  matching = scored.select { |entry| entry[:similarity] > 0 }
  # Negating the score sorts in descending order of similarity.
  suggestions = matching.sort_by { |entry| -entry[:similarity] }

  LOGGER.info "most similar accounts: #{suggestions}"
  suggestions
end