Module: Eco::Data::FuzzyMatch::Pairing
- Included in:
- ClassMethods
- Defined in:
- lib/eco/data/fuzzy_match/pairing.rb
Instance Method Summary collapse
-
#paired_words(str1, str2, normalized: false) {|needle, item| ... } ⇒ Hash
Pair words using some algorithm.
Instance Method Details
#paired_words(str1, str2, normalized: false) {|needle, item| ... } ⇒ Hash
Pair words using some algorithm. It does the following:
- It splits both strings into words.
- Pairs all words by using `block` to score the best match.
- Gives `0` score to those words of `str2` that lost their pair (a word of `str1` cannot be paired twice).
- Merges the `Score` of all the paired words of `str2` against their `str1` word pair.
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
# File 'lib/eco/data/fuzzy_match/pairing.rb', line 20

# Pairs every word of `str1` (the needles) with at most one word of `str2`
# (the haystack), using the given block to score each candidate pair.
#
# Steps:
#   1. Both strings are normalized (unless `normalized: true`) and split into
#      words via `get_words`.
#   2. Every needle/item combination is scored by the block; combinations whose
#      `ratio` is above 0.05 are ranked per item.
#   3. Each item, in haystack order, claims its best still-unpaired needle.
#   4. Needles left without a pair are offered the items nobody claimed.
#   5. Needles that remain unpaired get `[nil, Score.new(0, needle.length)]`.
#
# `normalize_string`, `get_words` and `Score` are collaborators defined outside
# of this method.
#
# @param str1 [String, nil] source of the needles.
# @param str2 [String, nil] source of the haystack items.
# @param normalized [Boolean] when `true`, `normalize_string` is skipped and the
#   strings are used as given.
# @yield [needle, item] scores one candidate pair.
# @yieldparam needle [String] a word of `str1`.
# @yieldparam item [String] a word of `str2`.
# @yieldreturn [Eco::Data::FuzzyMatch::Score] must respond to `ratio`.
# @raise [RuntimeError] if the score selected for a pair is not an
#   `Eco::Data::FuzzyMatch::Score`.
# @return [Hash] `{needle => [item_or_nil, score]}`. When either string is
#   missing the result is `{nil => [nil, Score.new(0, 0)]}`; when either string
#   is shorter than 2 characters it is `{str1 => [nil, Score.new(0, 0)]}`.
def paired_words(str1, str2, normalized: false)
  str1, str2 = normalize_string([str1, str2]) unless normalized
  return {nil => [nil, Score.new(0, 0)]} if !str1 || !str2
  # Both strings must be long enough (the original tested `str1` twice).
  return {str1 => [nil, Score.new(0, 0)]} if str1.length < 2 || str2.length < 2

  min_ratio = 0.05 # scores at or below this ratio are never considered a match
  needles   = get_words(str1, normalized: true)
  haystack  = get_words(str2, normalized: true)

  # ranking: item   => [{needle:, score:}] for scores above `min_ratio`
  # faceted: needle => [{pair:, score:}] for every item, best ratio first
  ranking = {}
  faceted = needles.each_with_object({}) do |needle, by_needle|
    candidates = haystack.map do |item|
      score = yield(needle, item)
      ranking[item] ||= []
      ranking[item] << {needle: needle, score: score} if score.ratio > min_ratio
      {pair: item, score: score}
    end
    sorted_candidates = candidates.sort_by { |candidate| candidate[:score].ratio }
    by_needle[needle] = sorted_candidates.reverse
  end

  # First pass: each item claims its best needle that is still free.
  paired = {}
  ranking.each do |item, results|
    free   = results.reject { |result| paired.key?(result[:needle]) }
    sorted = free.sort_by { |result| result[:score].ratio }
    best   = sorted.reverse.first
    next unless best

    unless best[:score].is_a?(Eco::Data::FuzzyMatch::Score)
      raise "Parining ('#{str1}' vs '#{str2}') -> Something got sour with needle '#{best[:needle]}' and item #{item}"
    end

    paired[best[:needle]] = {pair: item, score: best[:score]}
  end

  # Items nobody claimed. `paired` values are `{pair:, score:}` hashes, so the
  # items must be extracted before subtracting them from the haystack.
  taken_items   = paired.values.map { |data| data[:pair] }
  pending_items = haystack - taken_items

  # Second pass: offer unclaimed items to the needles left without a pair.
  # NOTE(review): with claimed items excluded this pass looks unable to add
  # pairs (a qualifying item would have claimed the needle above); it is kept
  # as a safeguard -- confirm before removing.
  faceted.each do |needle, results|
    next if paired.key?(needle)

    best = results.find do |result|
      pending_items.include?(result[:pair]) && result[:score].ratio > min_ratio
    end
    next unless best

    unless best[:score].is_a?(Eco::Data::FuzzyMatch::Score)
      raise "Parining ('#{str1}' vs '#{str2}') -> Something got sour with needle '#{needle}' and item #{best[:pair]}"
    end

    paired[needle] = best
    pending_items.delete(best[:pair])
  end

  # Needles that lost their pair score 0 against their own length.
  (needles - paired.keys).each do |needle|
    paired[needle] = {pair: nil, score: Score.new(0, needle.length)}
  end

  paired.each_with_object({}) do |(needle, data), out|
    out[needle] = data.values_at(:pair, :score)
  end
end