Class: SynonymFinder::DuplicateFinder
- Inherits: Object
- Inheritance chain: Object → SynonymFinder::DuplicateFinder
- Defined in:
- lib/synonym-finder/duplicate_finder.rb
Instance Method Summary collapse
- #canonical_duplicates ⇒ Object
- #find_pairs(names, threshold = 0) ⇒ Object
- #get_pairs(names) ⇒ Object
- #get_total_distance(path1, path2) ⇒ Object
- #initialize(synonym_finder) ⇒ DuplicateFinder (constructor): a new instance of DuplicateFinder.
- #species_epithet_duplicates(threshold_distance) ⇒ Object
Constructor Details
#initialize(synonym_finder) ⇒ DuplicateFinder
Returns a new instance of DuplicateFinder.
4 5 6 7 8 |
# File 'lib/synonym-finder/duplicate_finder.rb', line 4
# Builds a duplicate finder bound to the given synonym finder.
#
# Stores the synonym finder, keeps a reference to the object returned by its
# +db+ method, and starts with an empty match table.
#
# @param synonym_finder [#db] owner object; only +db+ is called on it here
def initialize(synonym_finder)
  @matches = {}
  @synonym_finder = synonym_finder
  @db = synonym_finder.db
end
Instance Method Details
#canonical_duplicates ⇒ Object
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# File 'lib/synonym-finder/duplicate_finder.rb', line 10
# Collects pairs of names that share a canonical form and assigns a type to
# every match that does not have one yet.
#
# For each canonical form occurring more than once in +name_parts+, the
# matching (name_id, path) rows are handed to #find_pairs with no distance
# threshold. Afterwards each untyped match becomes +:chresonym+ when its
# +:total_distance+ is 0 and +:alt_placement+ otherwise.
#
# Matches that already carry a +:type+ are left alone, mirroring the guard in
# #species_epithet_duplicates, so the result does not depend on which of the
# two passes runs first.
#
# NOTE(review): @db is presumably an SQLite3::Database; only #execute with
# "?" bind variables is relied on here -- confirm against the caller.
#
# @return [Hash] the accumulated matches, keyed by a pair of name ids
def canonical_duplicates
  SynonymFinder.logger_write(@synonym_finder.object_id, "Processing canonical forms")
  duplicated = @db.execute("select canonical from name_parts group by canonical having count(*) > 1")
  duplicated.each_with_index do |canonical, index|
    # Progress is reported with a 1-based counter, once every 100 candidates.
    candidate_number = index + 1
    if candidate_number % 100 == 0
      SynonymFinder.logger_write(@synonym_finder.object_id, "Processing canonical form candidate %s" % candidate_number)
    end
    # +canonical+ is the result row itself; it is passed on as the bind variables.
    names = @db.execute("select name_id, path from name_parts where canonical = ?", canonical)
    find_pairs(names)
  end
  @matches.each do |_key, value|
    # Never overwrite a classification made by an earlier pass.
    next if value.key?(:type)
    value[:type] = value[:total_distance] == 0 ? :chresonym : :alt_placement
  end
  @matches
end
#find_pairs(names, threshold = 0) ⇒ Object
28 29 30 31 32 33 34 35 36 |
# File 'lib/synonym-finder/duplicate_finder.rb', line 28
# Records every not-yet-known pair from +names+ in @matches.
#
# Pairs come from #get_pairs; the key of a match is the two name ids of the
# pair and its value is a hash holding the +:total_distance+ between the two
# paths (see #get_total_distance). A pair whose key is already present is
# skipped before any distance is computed, so existing entries are never
# touched and no work is wasted on them.
#
# @param names [Array] rows of (name_id, path), as accepted by #get_pairs
# @param threshold [Integer] largest distance to accept; 0 means "no limit"
# @return [Array] the pairs produced by #get_pairs
def find_pairs(names, threshold = 0)
  get_pairs(names).each do |pair|
    key = [pair[0][0], pair[1][0]]
    # First match wins; checking before measuring avoids recomputing
    # distances for pairs found by an earlier pass.
    next if @matches.key?(key)
    total_distance = get_total_distance(pair[0][1], pair[1][1])
    next unless threshold == 0 || total_distance <= threshold
    @matches[key] = { :total_distance => total_distance }
  end
end
#get_pairs(names) ⇒ Object
48 49 50 51 52 53 54 55 56 |
# File 'lib/synonym-finder/duplicate_finder.rb', line 48
# Produces every unordered pair of the given rows.
#
# Each row is first reduced to [row[0], row[1] split on "|"] (a nil second
# element becomes an empty array). Rows are then paired starting from the
# last one: the last row with every earlier row in order, then the
# second-to-last with every row before it, and so on. Each pair is sorted.
# The +names+ argument itself is not modified.
#
# @param names [Array] rows whose first two elements are an id and a
#   "|"-separated path
# @return [Array] sorted two-element pairs of [id, path_segments]
def get_pairs(names)
  entries = names.map { |row| [row[0], row[1].to_s.split("|")] }
  (entries.size - 1).downto(1).flat_map do |position|
    current = entries[position]
    entries.first(position).map { |earlier| [current, earlier].sort }
  end
end
#get_total_distance(path1, path2) ⇒ Object
38 39 40 41 42 43 44 45 46 |
# File 'lib/synonym-finder/duplicate_finder.rb', line 38
# Measures how far apart two paths are.
#
# The result is the number of elements of both paths combined that lie
# beyond their common leading run: the sum of the two sizes minus twice the
# length of the shared prefix. Identical paths therefore give 0.
#
# @param path1 [Array] first path, as a list of segments
# @param path2 [Array] second path, as a list of segments
# @return [Integer] count of non-shared elements across both paths
def get_total_distance(path1, path2)
  shared_prefix = path1.zip(path2).take_while { |left, right| left == right }.size
  path1.size + path2.size - 2 * shared_prefix
end
#species_epithet_duplicates(threshold_distance) ⇒ Object
58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
# File 'lib/synonym-finder/duplicate_finder.rb', line 58
# Collects pairs of names that share an epithet stem and assigns a type to
# every match that does not have one yet.
#
# For each epithet stem occurring more than once in +name_parts+, the
# matching (name_id, path) rows are handed to #find_pairs together with
# +threshold_distance+. Untyped matches are then classified:
#
# * non-zero +:total_distance+                          -> +:homotypic+
# * zero distance, a single distinct epithet            -> +:misplaced_synonym+
# * zero distance, several epithets, one first word in
#   the canonical forms                                 -> +:lexical_variant+
# * zero distance, several epithets, several first words -> +:misplaced_synonym+
#
# The name ids of a match are passed to the database as bind variables
# rather than being spliced into the SQL text, so ids that are not plain
# numbers cannot break or alter the query.
#
# NOTE(review): @db is presumably an SQLite3::Database; only #execute with
# "?" bind variables is relied on here -- confirm against the caller.
#
# @param threshold_distance [Integer] forwarded to #find_pairs as its
#   threshold (there, 0 means "no limit")
# @return [Hash] the accumulated matches, keyed by a pair of name ids
def species_epithet_duplicates(threshold_distance)
  SynonymFinder.logger_write(@synonym_finder.object_id, "Processing species epithets")
  stems = @db.execute("select epithet_stem from name_parts group by epithet_stem having count(*) > 1")
  stems.each_with_index do |stem, index|
    # Progress is reported with a 1-based counter, once every 100 candidates.
    candidate_number = index + 1
    if candidate_number % 100 == 0
      SynonymFinder.logger_write(@synonym_finder.object_id, "Processing species epithet candidate %s" % candidate_number)
    end
    # +stem+ is the result row itself; it is passed on as the bind variables.
    names = @db.execute("select name_id, path from name_parts where epithet_stem = ?", stem)
    find_pairs(names, threshold_distance)
  end
  count = 0
  SynonymFinder.logger_write(@synonym_finder.object_id, "Assigning type to found matches")
  @matches.each do |key, value|
    # Matches typed by an earlier pass keep their classification.
    next if value.key?(:type)
    count += 1
    SynonymFinder.logger_write(@synonym_finder.object_id, "Processing match %s" % count) if count % 10000 == 0
    if value[:total_distance] == 0
      # One "?" per name id in the key; the ids themselves are bound, not interpolated.
      placeholders = (["?"] * key.size).join(", ")
      epithets = @db.execute("select distinct epithet from name_parts where name_id in (#{placeholders})", key)
      if epithets.size == 1
        value[:type] = :misplaced_synonym
      else
        canonicals = @db.execute("select canonical from name_parts where name_id in (#{placeholders})", key)
        # The first word of each canonical form is compared across the pair.
        genera = canonicals.map { |row| row[0].split(" ")[0] }.uniq
        value[:type] = genera.size == 1 ? :lexical_variant : :misplaced_synonym
      end
    else
      value[:type] = :homotypic
    end
  end
  @matches
end