Class: Simhilarity::Matcher

Inherits:
Object
  • Object
show all
Includes:
Candidates, Score
Defined in:
lib/simhilarity/matcher.rb

Constant Summary

Constants included from Candidates

Candidates::DEFAULT_NGRAM_OVERLAPS, Candidates::DEFAULT_SIMHASH_MAX_HAMMING

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Score

#score, #winners

Methods included from Candidates

#candidates_all, #candidates_for, #candidates_method, #candidates_ngrams, #candidates_simhash

Instance Attribute Details

#candidates ⇒ Object

Specifies which method to use for finding candidates. See the README for more details.



27
28
29
# File 'lib/simhilarity/matcher.rb', line 27

# Reader for the candidate-finding method setting held in @candidates.
# The accepted values are described in the project README.
def candidates
  @candidates
end

#ngram_overlaps ⇒ Object

Minimum number of ngram overlaps, defaults to 3 (for candidates :ngrams).



31
32
33
# File 'lib/simhilarity/matcher.rb', line 31

# Reader for the minimum number of ngram overlaps (documented default: 3).
# Only relevant when candidates are found via :ngrams.
def ngram_overlaps
  @ngram_overlaps
end

#ngrammer ⇒ Object

Proc for generating ngrams.



20
21
22
# File 'lib/simhilarity/matcher.rb', line 20

# Reader for the optional user-supplied Proc that generates ngrams.
# When set, #ngrams delegates to it instead of using the built-in bigram logic.
def ngrammer
  @ngrammer
end

#normalizer ⇒ Object

Proc for normalizing strings.



17
18
19
# File 'lib/simhilarity/matcher.rb', line 17

# Reader for the optional user-supplied Proc that normalizes strings.
# When set, #normalize delegates to it instead of the built-in normalization.
def normalizer
  @normalizer
end

#reader ⇒ Object

Proc for turning opaque items into strings.



14
15
16
# File 'lib/simhilarity/matcher.rb', line 14

# Reader for the optional user-supplied Proc that turns opaque items into strings.
# When set, #read delegates to it instead of requiring a String.
def reader
  @reader
end

#scorer ⇒ Object

Proc for scoring ngrams.



23
24
25
# File 'lib/simhilarity/matcher.rb', line 23

# Reader for the Proc used for scoring ngrams, held in @scorer.
def scorer
  @scorer
end

#simhash_max_hamming ⇒ Object

Maximum simhash hamming distance, defaults to 7 (for candidates :simhash).



35
36
37
# File 'lib/simhilarity/matcher.rb', line 35

# Reader for the maximum simhash hamming distance (documented default: 7).
# Only relevant when candidates are found via :simhash.
def simhash_max_hamming
  @simhash_max_hamming
end

#verbose ⇒ Object

If true, show progress bars and timing.



11
12
13
# File 'lib/simhilarity/matcher.rb', line 11

# Reader for the verbosity flag; when true, progress bars and timing are shown.
def verbose
  @verbose
end

Instance Method Details

#freq ⇒ Object

Ngram frequency weights from the haystack, or 1 if the ngram isn’t in the haystack. Lazily calculated.



55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/simhilarity/matcher.rb', line 55

# Inverse-frequency weights for every ngram seen in the haystack.
#
# The result is a Hash whose default value is 1, so ngrams that never
# appear in the haystack weigh 1. The table is built on first use and
# memoized in @freq.
def freq
  return @freq if @freq

  # how many haystack elements' ngram lists mention each ngram
  tally = Hash.new(0)
  veach("Haystack", @haystack) do |element|
    element.ngrams.each { |gram| tally[gram] += 1 }
  end

  # rarer ngrams get larger weights: (total / count) * 10, rounded
  grand_total = tally.values.inject(&:+).to_f
  weights = Hash.new(1)
  tally.each { |gram, seen| weights[gram] = ((grand_total / seen) * 10).round }

  @freq = weights
end

#haystack ⇒ Object

The current haystack.



49
50
51
# File 'lib/simhilarity/matcher.rb', line 49

# Returns whatever is currently stored in @haystack (assigned via #haystack=).
def haystack
  @haystack
end

#haystack=(haystack) ⇒ Object

Set the haystack.



38
39
40
41
42
43
44
45
46
# File 'lib/simhilarity/matcher.rb', line 38

# Replace the haystack with the imported form of +haystack+.
#
# Everything derived lazily from the haystack (bitsum cache, BK tree,
# frequency table) is discarded so it is rebuilt against the new data.
def haystack=(haystack)
  @haystack = import_list(haystack)

  # drop caches computed from the previous haystack
  @bitsums = {}
  @bk_tree = @freq = nil
end

#inspect ⇒ Object

:nodoc:



155
156
157
# File 'lib/simhilarity/matcher.rb', line 155

# Returns the fixed string "Matcher" rather than Ruby's default inspect output.
# NOTE(review): presumably this keeps the haystack and cached state out of
# console and error output — confirm intent.
def inspect #:nodoc:
  "Matcher"
end

#matches(needles) ⇒ Object

Match each item in needles to an item in #haystack. Returns an array of tuples, [needle, haystack, score]. Scores range from 0 to 1, with 1 being a perfect match and 0 being a terrible match.



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/simhilarity/matcher.rb', line 79

# Match every needle against the haystack.
#
# Imports the needles, gathers candidate pairs via candidates_for, reports
# the candidate count through vputs, and returns the result of winners.
#
# Raises RuntimeError when no haystack has been set.
def matches(needles)
  raise "can't match before setting a haystack" if haystack.nil?

  # wrap the raw needles the same way the haystack was wrapped
  elements = import_list(needles)

  pairs = candidates_for(elements)
  vputs " got #{pairs.length} candidates."

  winners(elements, pairs)
end

#ngrams(str) ⇒ Object

Generate ngrams from a normalized str.



122
123
124
125
126
127
128
129
130
131
132
# File 'lib/simhilarity/matcher.rb', line 122

# Build the list of ngrams for an already-normalized string.
#
# A custom ngrammer, when present, takes over completely. Otherwise the
# result is every two-character window of +str+ followed by every run of
# digits, with duplicates removed (first occurrence wins).
def ngrams(str)
  return ngrammer.call(str) if ngrammer

  bigrams = str.each_char.each_cons(2).map(&:join)
  digit_runs = str.scan(/\d+/)

  # array union keeps order and removes duplicates, like (a + b).uniq
  bigrams | digit_runs
end

#ngrams_sum(ngrams) ⇒ Object

Sum up the frequency weights of the ngrams.



135
136
137
# File 'lib/simhilarity/matcher.rb', line 135

# Total of the frequency weights (looked up in freq) for the given ngrams.
# An empty list sums to 0.
def ngrams_sum(ngrams)
  ngrams.inject(0) { |total, gram| total + freq[gram] }
end

#normalize(incoming_str) ⇒ Object

Normalize an incoming string from the user.



108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/simhilarity/matcher.rb', line 108

# Normalize a raw string supplied by the user.
#
# A custom normalizer, when present, takes over completely. Otherwise the
# string is lowercased, every character outside a-z / 0-9 becomes a space,
# runs of spaces collapse to one, and the ends are trimmed.
def normalize(incoming_str)
  return normalizer.call(incoming_str) if normalizer

  incoming_str
    .downcase
    .tr("^a-z0-9", " ") # anything that is not a lowercase letter or digit
    .squeeze(" ")       # only spaces remain as whitespace at this point
    .strip
end

#read(opaque) ⇒ Object

Turn an opaque item from the user into a string.



96
97
98
99
100
101
102
103
104
105
# File 'lib/simhilarity/matcher.rb', line 96

# Convert an opaque user item into a string.
#
# A custom reader, when present, takes over completely. Otherwise Strings
# are returned as-is and anything else raises a RuntimeError.
def read(opaque)
  custom = reader
  return custom.call(opaque) if custom

  raise "can't turn #{opaque.inspect} into string" unless opaque.is_a?(String)

  opaque
end

#simhash(ngrams) ⇒ Object

Calculate the frequency weighted simhash of the ngrams.



142
143
144
145
146
147
148
149
150
151
152
153
# File 'lib/simhilarity/matcher.rb', line 142

# Compute the simhash of a list of ngrams.
#
# Each ngram contributes a row of per-bit sums (from simhash_bitsums). The
# rows are added column by column, and every bit position whose total is
# positive is set in the returned Integer. An empty list yields 0.
def simhash(ngrams)
  # one column per bit position, holding that bit's value from every ngram
  columns = ngrams.map { |gram| simhash_bitsums(gram) }.transpose
  totals = columns.map { |column| column.inject(&:+) }

  totals.each_with_index.inject(0) do |hash, (total, position)|
    if total > 0
      hash | (1 << position)
    else
      hash
    end
  end
end