Class: FamilyReunion::TaxamatchPreprocessor
- Inherits:
-
Object
- Object
- FamilyReunion::TaxamatchPreprocessor
- Defined in:
- lib/family-reunion/taxamatch_preprocessor.rb
Instance Method Summary
- #get_letters(word) ⇒ Object
- #get_match_candidates(list1, list2) ⇒ Object
-
#initialize(cache) ⇒ TaxamatchPreprocessor
constructor
A new instance of TaxamatchPreprocessor.
- #partition_canonicals(canonicals) ⇒ Object
- #process_binomials(names1, names2) ⇒ Object
- #process_trinomials(names1, names2) ⇒ Object
- #process_uninomials(names1, names2) ⇒ Object
- #similar_words?(word1, word2) ⇒ Boolean
Constructor Details
#initialize(cache) ⇒ TaxamatchPreprocessor
Returns a new instance of TaxamatchPreprocessor.
4 5 6 |
# File 'lib/family-reunion/taxamatch_preprocessor.rb', line 4 def initialize(cache) @cache = cache end |
Instance Method Details
#get_letters(word) ⇒ Object
93 94 95 96 97 98 99 100 |
# File 'lib/family-reunion/taxamatch_preprocessor.rb', line 93
#
# Returns the distinct characters of +word+, in order of first appearance,
# memoizing the result in +@cache.word_letters+ under the word itself.
#
# @param word [String] the word to break into unique letters
# @return [Array<String>] unique single-character strings of +word+
def get_letters(word)
  # `||=` performs the lookup and, on a miss, computes and stores the value.
  # An empty array (for an empty word) is truthy, so it is cached as well.
  @cache.word_letters[word] ||= word.split('').uniq
end
#get_match_candidates(list1, list2) ⇒ Object
8 9 10 11 12 13 14 15 16 17 |
# File 'lib/family-reunion/taxamatch_preprocessor.rb', line 8
#
# Splits both name lists by word count and collects, for uninomials,
# binomials and trinomials separately, the candidate matches produced by the
# corresponding +process_*+ method. The :multinomials partition is not
# processed.
#
# @param list1 [#each] canonical name strings to find candidates for
# @param list2 [#each] canonical name strings to draw candidates from
# @return [Hash{Symbol => Hash}] keys :uninomials, :binomials, :trinomials,
#   each holding the hash returned by the matching +process_*+ method
def get_match_candidates(list1, list2)
  first_partitions = partition_canonicals(list1)
  second_partitions = partition_canonicals(list2)

  [:uninomials, :binomials, :trinomials].each_with_object({}) do |bucket, found|
    # Dispatches to process_uninomials / process_binomials / process_trinomials.
    matches = send("process_#{bucket}", first_partitions[bucket], second_partitions[bucket])
    found[bucket] = {}.merge(matches)
  end
end
#partition_canonicals(canonicals) ⇒ Object
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
# File 'lib/family-reunion/taxamatch_preprocessor.rb', line 19
#
# Groups canonical names by the number of space-separated words they contain.
#
# @param canonicals [#each] canonical name strings
# @return [Hash{Symbol => Array}] keys :uninomials, :binomials, :trinomials
#   and :multinomials; every entry is a +[name, words]+ pair. Names with a
#   word count other than 1, 2 or 3 (including 0) land in :multinomials.
def partition_canonicals(canonicals)
  bucket_for_size = { 1 => :uninomials, 2 => :binomials, 3 => :trinomials }
  grouped = { :uninomials => [], :binomials => [], :trinomials => [], :multinomials => [] }

  canonicals.each do |canonical|
    parts = canonical.split(' ')
    bucket = bucket_for_size.fetch(parts.size, :multinomials)
    grouped[bucket] << [canonical, parts]
  end

  grouped
end
#process_binomials(names1, names2) ⇒ Object
49 50 51 52 53 54 55 56 57 58 |
# File 'lib/family-reunion/taxamatch_preprocessor.rb', line 49
#
# For every two-word name in +names1+, collects the entries of +names2+ whose
# first and second words are both similar (per +similar_words?+) to the
# corresponding words of that name.
#
# @param names1 [Array<Array(String, Array<String>)>] +[name, words]+ pairs
# @param names2 [Array<Array(String, Array<String>)>] +[name, words]+ pairs
# @return [Hash{String => Hash}] name from +names1+ mapped to
#   +{ :words => words, :candidates => [[name2, words2], ...] }+; names
#   without any candidate are absent
def process_binomials(names1, names2)
  names1.each_with_object({}) do |(name, words), res|
    names2.each do |candidate|
      # Words are compared position by position; `all?` stops at the first miss.
      next unless (0..1).all? { |i| similar_words?(words[i], candidate[1][i]) }

      # The first hit for a name creates its entry; later hits only append.
      entry = (res[name] ||= { :words => words, :candidates => [] })
      entry[:candidates] << candidate
    end
  end
end
#process_trinomials(names1, names2) ⇒ Object
60 61 62 63 64 65 66 67 68 69 |
# File 'lib/family-reunion/taxamatch_preprocessor.rb', line 60
#
# For every three-word name in +names1+, collects the entries of +names2+
# whose first, second and third words are all similar (per +similar_words?+)
# to the corresponding words of that name.
#
# @param names1 [Array<Array(String, Array<String>)>] +[name, words]+ pairs
# @param names2 [Array<Array(String, Array<String>)>] +[name, words]+ pairs
# @return [Hash{String => Hash}] name from +names1+ mapped to
#   +{ :words => words, :candidates => [[name2, words2], ...] }+; names
#   without any candidate are absent
def process_trinomials(names1, names2)
  names1.each_with_object({}) do |(name, words), res|
    names2.each do |candidate|
      # Words are compared position by position; `all?` stops at the first miss.
      next unless (0..2).all? { |i| similar_words?(words[i], candidate[1][i]) }

      # The first hit for a name creates its entry; later hits only append.
      entry = (res[name] ||= { :words => words, :candidates => [] })
      entry[:candidates] << candidate
    end
  end
end
#process_uninomials(names1, names2) ⇒ Object
38 39 40 41 42 43 44 45 46 47 |
# File 'lib/family-reunion/taxamatch_preprocessor.rb', line 38
#
# For every one-word name in +names1+, collects the entries of +names2+ whose
# first word is similar (per +similar_words?+) to that name's first word.
#
# @param names1 [Array<Array(String, Array<String>)>] +[name, words]+ pairs
# @param names2 [Array<Array(String, Array<String>)>] +[name, words]+ pairs
# @return [Hash{String => Hash}] name from +names1+ mapped to
#   +{ :words => words, :candidates => [[name2, words2], ...] }+; names
#   without any candidate are absent
def process_uninomials(names1, names2)
  names1.each_with_object({}) do |(name, words), res|
    names2.each do |candidate|
      next unless similar_words?(words[0], candidate[1][0])

      # The first hit for a name creates its entry; later hits only append.
      entry = (res[name] ||= { :words => words, :candidates => [] })
      entry[:candidates] << candidate
    end
  end
end
#similar_words?(word1, word2) ⇒ Boolean
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# File 'lib/family-reunion/taxamatch_preprocessor.rb', line 71
#
# Tells whether two words are similar: either identical, or both
#   * the letters found in only one of the words make up at most 30% of the
#     combined count of distinct letters of the two words, and
#   * the words differ in length by at most 20% of the longer word.
#
# The verdict is memoized in +@cache.similar_words+ under a key built from the
# two words in sorted order, so the answer must not depend on argument order.
# The length ratio is therefore taken against the longer word; dividing by
# +word1.size+ alone would let (a, b) and (b, a) disagree while sharing one
# cache slot.
#
# @param word1 [String]
# @param word2 [String]
# @return [Boolean] true when the words are considered similar
# @raise [RuntimeError] if either argument is not a String
def similar_words?(word1, word2)
  unless word1.is_a?(String) && word2.is_a?(String)
    raise RuntimeError, "similar_words? expects two Strings, got #{word1.class} and #{word2.class}"
  end

  key = [word1, word2].sort.join(':')
  cached = @cache.similar_words[key]
  # A cached `false` is a valid answer, so only a nil (missing) entry falls through.
  return cached unless cached.nil?

  are_similar = word1 == word2
  unless are_similar
    letters1 = get_letters(word1)
    letters2 = get_letters(word2)
    symmetric_difference = (letters1 - letters2) + (letters2 - letters1)
    similar_letters = symmetric_difference.size.to_f / (letters1.size + letters2.size) <= 0.3

    # The words differ here, so at least one is non-empty and the divisor is >= 1.
    longest = [word1.size, word2.size].max
    similar_length = (word1.size - word2.size).abs.to_f / longest <= 0.2

    are_similar = similar_letters && similar_length
  end

  @cache.similar_words[key] = are_similar
  are_similar
end