Class: SynonymFinder::DuplicateFinder

Inherits:
Object
  • Object
show all
Defined in:
lib/synonym-finder/duplicate_finder.rb

Instance Method Summary collapse

Constructor Details

#initialize(synonym_finder) ⇒ DuplicateFinder

Returns a new instance of DuplicateFinder.



4
5
6
7
8
# File 'lib/synonym-finder/duplicate_finder.rb', line 4

# Builds a finder bound to the given synonym finder.
#
# @param synonym_finder [#db] owner object; its +db+ handle is cached in @db
#   and the object itself is kept in @synonym_finder
def initialize(synonym_finder)
  @synonym_finder = synonym_finder
  # Cache the handle so the query methods do not go through the owner each time.
  @db = synonym_finder.db
  # Accumulated matches; starts empty and is filled by the duplicate searches.
  @matches = {}
end

Instance Method Details

#canonical_duplicates ⇒ Object



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/synonym-finder/duplicate_finder.rb', line 10

# Finds names that share a canonical form and classifies every recorded match.
#
# Each canonical form that occurs more than once in +name_parts+ is looked up,
# its names are handed to #find_pairs (no distance threshold), and afterwards
# every entry in @matches is tagged: +:chresonym+ when its +:total_distance+ is
# 0, +:alt_placement+ otherwise. Progress is logged every 100 candidates.
#
# @return [Hash] the @matches hash, with +:type+ set on every value
def canonical_duplicates
  SynonymFinder.logger_write(@synonym_finder.object_id, "Processing canonical forms")
  candidates = @db.execute("select canonical from name_parts group by canonical having count(*) > 1")
  candidates.each_with_index do |row, index|
    ordinal = index + 1 # 1-based position, used only for progress logging
    if (ordinal % 100).zero?
      SynonymFinder.logger_write(@synonym_finder.object_id, "Processing canonical form candidate #{ordinal}")
    end
    # The result row itself is passed as the bind values for the placeholder.
    find_pairs(@db.execute("select name_id, path from name_parts where canonical = ?", row))
  end
  # Note: this tags every entry in @matches, including ones that already have a type.
  @matches.each_value do |match|
    match[:type] = match[:total_distance] == 0 ? :chresonym : :alt_placement
  end
  @matches
end

#find_pairs(names, threshold = 0) ⇒ Object



28
29
30
31
32
33
34
35
36
# File 'lib/synonym-finder/duplicate_finder.rb', line 28

# Records every not-yet-seen pair of +names+ in @matches.
#
# Pairs come from #get_pairs; each is keyed by the two name ids and stored with
# the distance returned by #get_total_distance. A pair whose key is already in
# @matches is skipped before its distance is computed, so an existing entry is
# never overwritten and no work is spent on it.
#
# @param names [Array] rows of (name_id, path) as accepted by #get_pairs
# @param threshold [Integer] maximum distance to record; 0 disables the filter
# @return [Array] the list of pairs produced by #get_pairs
def find_pairs(names, threshold = 0)
  get_pairs(names).each do |first, second|
    key = [first[0], second[0]]
    # Check the key first: the distance of a known pair would be thrown away.
    next if @matches.key?(key)

    total_distance = get_total_distance(first[1], second[1])
    next unless threshold == 0 || total_distance <= threshold

    @matches[key] = { :total_distance => total_distance }
  end
end

#get_pairs(names) ⇒ Object



48
49
50
51
52
53
54
55
56
# File 'lib/synonym-finder/duplicate_finder.rb', line 48

# Builds every unordered pair of the given names.
#
# Each row is first reduced to [id, path_parts], where path_parts is the row's
# second element converted to a string and split on "|". Pairs are produced
# starting from the last entry (paired with every earlier entry, in order) and
# working backwards; the two members of each pair are sorted.
#
# @param names [Array] rows whose element 0 is an id and element 1 a "|"-joined path
# @return [Array] list of two-element arrays of [id, path_parts]
def get_pairs(names)
  entries = names.map { |row| [row[0], row[1].to_s.split("|")] }
  # Walk pivots from the end so the output order matches a pop-driven loop.
  (entries.size - 1).downto(1).flat_map do |pivot|
    entries.first(pivot).map { |other| [entries[pivot], other].sort }
  end
end

#get_total_distance(path1, path2) ⇒ Object



38
39
40
41
42
43
44
45
46
# File 'lib/synonym-finder/duplicate_finder.rb', line 38

# Measures how far apart two paths are.
#
# The result is the combined length of both paths minus twice the number of
# leading positions at which they hold equal elements, i.e. the number of
# elements in either path that lie beyond their common prefix.
#
# @param path1 [Array] first path
# @param path2 [Array] second path
# @return [Integer] 0 when the paths are identical
def get_total_distance(path1, path2)
  shared = path1.zip(path2).take_while { |left, right| left == right }.size
  path1.size + path2.size - 2 * shared
end

#species_epithet_duplicates(threshold_distance) ⇒ Object



58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/synonym-finder/duplicate_finder.rb', line 58

# Finds names that share a species epithet stem and classifies the new matches.
#
# Each epithet stem that occurs more than once in +name_parts+ is looked up and
# its names are handed to #find_pairs with +threshold_distance+. Matches that
# already carry a +:type+ are left alone; the rest are tagged as follows:
# * non-zero +:total_distance+ -> +:homotypic+
# * zero distance and a single distinct epithet -> +:misplaced_synonym+
# * zero distance, several epithets, one genus -> +:lexical_variant+
# * zero distance, several epithets, several genera -> +:misplaced_synonym+
# The genus is taken as the first space-separated word of +canonical+.
#
# @param threshold_distance [Integer] forwarded to #find_pairs as its threshold
# @return [Hash] the @matches hash
def species_epithet_duplicates(threshold_distance)
  owner_id = @synonym_finder.object_id
  SynonymFinder.logger_write(owner_id, "Processing species epithets")
  stems = @db.execute("select epithet_stem from name_parts group by epithet_stem having count(*) > 1")
  stems.each_with_index do |row, index|
    ordinal = index + 1 # 1-based position, used only for progress logging
    SynonymFinder.logger_write(owner_id, "Processing species epithet candidate #{ordinal}") if (ordinal % 100).zero?
    # The result row itself is passed as the bind values for the placeholder.
    find_pairs(@db.execute("select name_id, path from name_parts where epithet_stem = ?", row), threshold_distance)
  end

  SynonymFinder.logger_write(owner_id, "Assigning type to found matches")
  processed = 0
  @matches.each do |ids, match|
    next if match.key?(:type)

    processed += 1
    SynonymFinder.logger_write(owner_id, "Processing match #{processed}") if (processed % 10000).zero?

    match[:type] =
      if match[:total_distance] != 0
        :homotypic
      else
        # NOTE(review): ids are interpolated into the SQL text; presumably they
        # are numeric name_ids read from name_parts -- confirm before reuse.
        id_list = ids.join(",")
        epithets = @db.execute("select distinct epithet from name_parts where name_id in (#{id_list})")
        if epithets.size == 1
          :misplaced_synonym
        else
          canonicals = @db.execute("select canonical from name_parts where name_id in (#{id_list})")
          genera = canonicals.map { |c| c[0].split(" ")[0] }.uniq
          genera.size == 1 ? :lexical_variant : :misplaced_synonym
        end
      end
  end
  @matches
end