Class: Simhilarity::Matcher
- Inherits:
-
Object
- Object
- Simhilarity::Matcher
- Includes:
- Candidates, Score
- Defined in:
- lib/simhilarity/matcher.rb
Constant Summary
Constants included from Candidates
Candidates::DEFAULT_NGRAM_OVERLAPS, Candidates::DEFAULT_SIMHASH_MAX_HAMMING
Instance Attribute Summary collapse
-
#candidates ⇒ Object
Specifies which method to use for finding candidates.
-
#ngram_overlaps ⇒ Object
Minimum number of ngram overlaps, defaults to 3 (for candidates = :ngrams).
-
#ngrammer ⇒ Object
Proc for generating ngrams.
-
#normalizer ⇒ Object
Proc for normalizing strings.
-
#reader ⇒ Object
Proc for turning opaque items into strings.
-
#scorer ⇒ Object
Proc for scoring ngrams.
-
#simhash_max_hamming ⇒ Object
Maximum simhash hamming distance, defaults to 7.
-
#verbose ⇒ Object
If true, show progress bars and timing.
Instance Method Summary collapse
-
#freq ⇒ Object
Ngram frequency weights from the haystack, or 1 if the ngram isn’t in the haystack.
-
#haystack ⇒ Object
The current haystack.
-
#haystack=(haystack) ⇒ Object
Set the haystack.
-
#inspect ⇒ Object
:nodoc:.
-
#matches(needles) ⇒ Object
Match each item in needles to an item in #haystack.
-
#ngrams(str) ⇒ Object
Generate ngrams from a normalized str.
-
#ngrams_sum(ngrams) ⇒ Object
Sum up the frequency weights of the ngrams.
-
#normalize(incoming_str) ⇒ Object
Normalize an incoming string from the user.
-
#read(opaque) ⇒ Object
Turn an opaque item from the user into a string.
-
#simhash(ngrams) ⇒ Object
Calculate the frequency weighted simhash of the ngrams.
Methods included from Score
Methods included from Candidates
#candidates_all, #candidates_for, #candidates_method, #candidates_ngrams, #candidates_simhash
Instance Attribute Details
#candidates ⇒ Object
Specifies which method to use for finding candidates. See the README for more details.
27 28 29 |
# File 'lib/simhilarity/matcher.rb', line 27
# Reader for the candidate-finding setting. Returns whatever is currently
# stored in @candidates (nil when nothing has been assigned).
def candidates
  @candidates
end
#ngram_overlaps ⇒ Object
Minimum number of ngram overlaps, defaults to 3 (for candidates = :ngrams).
31 32 33 |
# File 'lib/simhilarity/matcher.rb', line 31
# Reader for the minimum number of ngram overlaps. Returns @ngram_overlaps
# exactly as stored; the documented default of 3 is not applied here.
def ngram_overlaps
  @ngram_overlaps
end
#ngrammer ⇒ Object
Proc for generating ngrams.
20 21 22 |
# File 'lib/simhilarity/matcher.rb', line 20
# Reader for the user-supplied proc that generates ngrams. Returns
# @ngrammer, which is nil when no custom proc has been set.
def ngrammer
  @ngrammer
end
#normalizer ⇒ Object
Proc for normalizing strings.
17 18 19 |
# File 'lib/simhilarity/matcher.rb', line 17
# Reader for the user-supplied proc that normalizes strings. Returns
# @normalizer, which is nil when no custom proc has been set.
def normalizer
  @normalizer
end
#reader ⇒ Object
Proc for turning opaque items into strings.
14 15 16 |
# File 'lib/simhilarity/matcher.rb', line 14
# Reader for the user-supplied proc that turns opaque items into strings.
# Returns @reader, which is nil when no custom proc has been set.
def reader
  @reader
end
#scorer ⇒ Object
Proc for scoring ngrams.
23 24 25 |
# File 'lib/simhilarity/matcher.rb', line 23
# Reader for the user-supplied proc that scores ngrams. Returns @scorer,
# which is nil when no custom proc has been set.
def scorer
  @scorer
end
#simhash_max_hamming ⇒ Object
Maximum simhash hamming distance, defaults to 7 (for candidates = :simhash).
35 36 37 |
# File 'lib/simhilarity/matcher.rb', line 35
# Reader for the maximum simhash hamming distance. Returns
# @simhash_max_hamming exactly as stored; the documented default of 7 is
# not applied here.
def simhash_max_hamming
  @simhash_max_hamming
end
#verbose ⇒ Object
If true, show progress bars and timing.
11 12 13 |
# File 'lib/simhilarity/matcher.rb', line 11
# Reader for the verbose flag (progress bars and timing when true).
# Returns @verbose, which is nil when the flag has never been set.
def verbose
  @verbose
end
Instance Method Details
#freq ⇒ Object
Ngram frequency weights from the haystack, or 1 if the ngram isn’t in the haystack. Lazily calculated.
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
# File 'lib/simhilarity/matcher.rb', line 55
# Ngram frequency weights for the current haystack, memoized in @freq.
#
# Returns a Hash mapping each ngram seen in the haystack to an Integer
# weight of ((total occurrences / occurrences of this ngram) * 10).round.
# Looking up an ngram that never appeared yields the Hash default, 1.
def freq
  @freq ||= begin
    # count every ngram occurrence across the haystack elements
    occurrences = Hash.new(0)
    veach("Haystack", @haystack) do |item|
      item.ngrams.each { |gram| occurrences[gram] += 1 }
    end

    # nil.to_f is 0.0, so an empty haystack simply produces an empty map
    grand_total = occurrences.values.inject(&:+).to_f

    # invert the counts: the rarer the ngram, the larger its weight
    occurrences.each_with_object(Hash.new(1)) do |(gram, seen), weights|
      weights[gram] = ((grand_total / seen) * 10).round
    end
  end
end
#haystack ⇒ Object
The current haystack.
49 50 51 |
# File 'lib/simhilarity/matcher.rb', line 49
# The current haystack. Returns @haystack, which is nil until a haystack
# has been assigned.
def haystack
  @haystack
end
#haystack=(haystack) ⇒ Object
Set the haystack.
38 39 40 41 42 43 44 45 46 |
# File 'lib/simhilarity/matcher.rb', line 38
# Set the haystack. The incoming list is passed through import_list and the
# result is stored in @haystack.
def haystack=(haystack)
  @haystack = import_list(haystack)

  # Cached state derived lazily from the haystack is stale once the
  # haystack changes, so drop it here and let it be rebuilt on demand.
  @bitsums = {}
  @bk_tree = @freq = nil
end
#inspect ⇒ Object
:nodoc:
155 156 157 |
# File 'lib/simhilarity/matcher.rb', line 155 def inspect #:nodoc: "Matcher" end |
#matches(needles) ⇒ Object
Match each item in needles to an item in #haystack. Returns an array of tuples, [needle, haystack, score]. Scores range from 0 to 1, with 1 being a perfect match and 0 being a terrible match.
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
# File 'lib/simhilarity/matcher.rb', line 79
# Match each item in needles to an item in the haystack.
#
# needles - list of items; it is converted with import_list before matching.
#
# Returns the result of winners(needles, candidates). The page above
# documents this as an array of [needle, haystack, score] tuples with
# scores ranging from 0 to 1.
# Raises RuntimeError if no haystack has been set yet.
def matches(needles)
  raise "can't match before setting a haystack" if haystack.nil?

  # create Elements
  needles = import_list(needles)

  # get candidate matches; the local is deliberately not called `candidates`
  # so it does not shadow the attribute reader of that name
  found = candidates_for(needles)
  vputs " got #{found.length} candidates."

  # pick winners
  winners(needles, found)
end
#ngrams(str) ⇒ Object
Generate ngrams from a normalized str.
122 123 124 125 126 127 128 129 130 131 132 |
# File 'lib/simhilarity/matcher.rb', line 122
# Generate ngrams from a normalized str.
#
# When an ngrammer proc is set, its result for str is returned untouched.
# Otherwise returns an Array of unique Strings: every pair of adjacent
# characters (bigrams) followed by every run of digits found in str.
def ngrams(str)
  return ngrammer.call(str) if ngrammer

  bigrams = str.each_char.each_cons(2).map(&:join)
  digit_runs = str.scan(/\d+/)
  (bigrams + digit_runs).uniq
end
#ngrams_sum(ngrams) ⇒ Object
Sum up the frequency weights of the ngrams.
135 136 137 |
# File 'lib/simhilarity/matcher.rb', line 135
# Sum up the frequency weights of the ngrams.
#
# ngrams - list of ngrams; each one is looked up in freq.
#
# Returns the total weight, or 0 for an empty list.
def ngrams_sum(ngrams)
  ngrams.inject(0) { |total, gram| total + freq[gram] }
end
#normalize(incoming_str) ⇒ Object
Normalize an incoming string from the user.
108 109 110 111 112 113 114 115 116 117 118 119 |
# File 'lib/simhilarity/matcher.rb', line 108
# Normalize an incoming string from the user.
#
# When a normalizer proc is set, its result for incoming_str is returned
# untouched. Otherwise the string is downcased, every character outside
# a-z / 0-9 is replaced with a space, runs of whitespace are squished to a
# single space, and leading/trailing whitespace is stripped.
def normalize(incoming_str)
  return normalizer.call(incoming_str) if normalizer

  incoming_str.downcase.gsub(/[^a-z0-9]/, " ").gsub(/\s+/, " ").strip
end
#read(opaque) ⇒ Object
Turn an opaque item from the user into a string.
96 97 98 99 100 101 102 103 104 105 |
# File 'lib/simhilarity/matcher.rb', line 96
# Turn an opaque item from the user into a string.
#
# When a reader proc is set, its result for opaque is returned. Without a
# reader, a String is returned as-is.
# Raises RuntimeError for any other kind of object when no reader is set.
def read(opaque)
  return reader.call(opaque) if reader
  return opaque if opaque.is_a?(String)

  raise "can't turn #{opaque.inspect} into string"
end
#simhash(ngrams) ⇒ Object
Calculate the frequency weighted simhash of the ngrams.
142 143 144 145 146 147 148 149 150 151 152 153 |
# File 'lib/simhilarity/matcher.rb', line 142
# Calculate the frequency weighted simhash of the ngrams.
#
# ngrams - list of ngrams; each is converted with simhash_bitsums, which
#          presumably yields one number per bit position (it must return
#          equal-length arrays for the transpose below to work).
#
# Returns an Integer whose bit N is set when the summed value for position
# N is positive; 0 for an empty list.
def simhash(ngrams)
  # one row of bitsums per ngram
  rows = ngrams.map { |gram| simhash_bitsums(gram) }

  # add the rows up column by column to get one total per bit position
  totals = rows.transpose.map { |column| column.inject(&:+) }

  # fold the positive positions into the resulting integer
  totals.each_with_index.inject(0) do |hash, (total, position)|
    total > 0 ? hash | (1 << position) : hash
  end
end