Module: Simhilarity::Candidates

Included in:
Matcher
Defined in:
lib/simhilarity/candidates.rb

Constant Summary collapse

DEFAULT_NGRAM_OVERLAPS =

Default minimum number of ngram overlaps required when using :ngrams.

3
DEFAULT_SIMHASH_MAX_HAMMING =

Default maximum Hamming distance allowed when using :simhash.

7

Instance Method Summary collapse

Instance Method Details

#candidates_all(needles) ⇒ Object

Return ALL candidates. This only works for small datasets.



49
50
51
# File 'lib/simhilarity/candidates.rb', line 49

# Pair every needle with every haystack entry (the full cross product).
# The result holds needles.length * haystack.length pairs, so it grows
# quickly with input size.
#
# @param needles [Array] items to pair against the haystack
# @return [Array<Array>] [needle, haystack_item] pairs, grouped by needle
#   in needle order, each group in haystack order
def candidates_all(needles)
  hay_items = haystack
  needles.flat_map do |needle|
    hay_items.map { |hay| [needle, hay] }
  end
end

#candidates_for(needles) ⇒ Object

Find candidates from needles & haystack. The method used depends on the value of the candidates setting; see #candidates_method.



11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/simhilarity/candidates.rb', line 11

# Build the Candidate list for the given needles.
#
# The pair-finding method is chosen by #candidates_method and invoked by
# name. When needles and haystack compare equal, pairs whose two members
# are equal are dropped so an item is never paired with itself.
#
# @param needles [Array] items to look up in the haystack
# @return [Array<Candidate>] one Candidate per surviving [needle, haystack] pair
def candidates_for(needles)
  finder = candidates_method(needles)
  pairs = send(finder, needles)

  # identical collections: drop self-matches
  pairs = pairs.reject { |needle, hay| needle == hay } if needles == haystack

  pairs.map { |needle, hay| Candidate.new(needle, hay) }
end

#candidates_method(needles) ⇒ Object

Select the method for finding candidates, based on the value of the candidates setting (or on the dataset size when it is not set).



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/simhilarity/candidates.rb', line 26

# Work out which candidates_* method should be used for these needles.
#
# The configured +candidates+ value wins. When it is unset, :all is used
# if needles.length * haystack.length is below 200000, otherwise :simhash.
# Values of the form "ngrams=N" / "simhash=N" select that method and also
# store N in ngram_overlaps / simhash_max_hamming respectively.
#
# @param needles [Array] items that will be searched for
# @return [Symbol] name of the method to call, e.g. :candidates_simhash
# @raise [RuntimeError] if no matching candidates_* method exists
def candidates_method(needles)
  requested = candidates || (needles.length * haystack.length < 200000 ? :all : :simhash)

  # "name=N" forms also carry a tuning value that is stored on self
  strategy =
    case requested
    when /^ngrams=(\d+)$/
      self.ngram_overlaps = Regexp.last_match(1).to_i
      :ngrams
    when /^simhash=(\d+)$/
      self.simhash_max_hamming = Regexp.last_match(1).to_i
      :simhash
    else
      requested
    end

  finder = :"candidates_#{strategy}"
  # second argument true: private methods count as well
  raise "unsupported candidates #{candidates.inspect}" unless respond_to?(finder, true)

  vputs "Using #{finder} with needles=#{needles.length} haystack=#{haystack.length}..."
  finder
end

#candidates_ngrams(needles) ⇒ Object

Return candidates that share at least ngram_overlaps matching ngrams (DEFAULT_NGRAM_OVERLAPS, i.e. three, when not set). Only works for small datasets.



55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/simhilarity/candidates.rb', line 55

# Pair each needle with every haystack item that shares enough ngrams.
#
# A haystack item qualifies once the number of its ngrams found in the
# needle's ngram set reaches the threshold (ngram_overlaps, or
# DEFAULT_NGRAM_OVERLAPS when that is unset). Each haystack ngram is
# checked in turn, so repeated ngrams count once per occurrence, and the
# scan stops as soon as the threshold is reached.
#
# @param needles [Array] items responding to #ngrams
# @return [Array<Array>] [needle, haystack_item] pairs
def candidates_ngrams(needles)
  threshold = ngram_overlaps || DEFAULT_NGRAM_OVERLAPS

  pairs = []
  veach(" ngrams #{threshold}", needles) do |needle|
    needle_ngrams = Set.new(needle.ngrams)
    haystack.each do |hay|
      hits = 0
      # any? short-circuits on the ngram that brings hits up to the threshold
      enough = hay.ngrams.any? do |ngram|
        needle_ngrams.include?(ngram) && (hits += 1) == threshold
      end
      pairs << [needle, hay] if enough
    end
  end
  pairs
end

#candidates_simhash(needles) ⇒ Object

Find candidates that are close based on hamming distance between the simhashes.



78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/simhilarity/candidates.rb', line 78

# Pair each needle with every item that bk_tree.query returns for it.
#
# The query is given the needle and the maximum hamming distance
# (simhash_max_hamming, or DEFAULT_SIMHASH_MAX_HAMMING when that is unset).
# Each query result is treated as an (item, distance) pair; the distance is
# not used.
#
# @param needles [Array] items to look up in the BK-tree
# @return [Array<Array>] [needle, haystack_item] pairs
def candidates_simhash(needles)
  limit = simhash_max_hamming || DEFAULT_SIMHASH_MAX_HAMMING

  pairs = []
  veach(" hamming #{limit}", needles) do |needle|
    bk_tree.query(needle, limit).each { |hay, _distance| pairs << [needle, hay] }
  end
  pairs
end