Class: Taxamatch::Authmatch

Inherits:
Object
  • Object
show all
Defined in:
lib/taxamatch_rb/authmatch.rb

Class Method Summary collapse

Class Method Details

.authmatch(authors1, authors2, years1, years2) ⇒ Object


7
8
9
10
11
12
13
# File 'lib/taxamatch_rb/authmatch.rb', line 7

# Scores how well two authorship strings agree.
#
# Strips the authors that pair up across the two lists, measures the gap
# between the year lists, and hands everything to get_score.
#
# @param authors1 [Array] authors of the first name
# @param authors2 [Array] authors of the second name
# @param years1 [Array] years attached to the first name
# @param years2 [Array] years attached to the second name
# @return [Object] whatever get_score returns for these inputs
def self.authmatch(authors1, authors2, years1, years2)
  leftovers = remove_duplicate_authors(authors1, authors2)
  year_gap = compare_years(years1, years2)
  get_score(authors1, leftovers[0], authors2, leftovers[1], year_gap)
end

.compare_years(years1, years2) ⇒ Object


94
95
96
97
98
99
100
# File 'lib/taxamatch_rb/authmatch.rb', line 94

# Measures how far apart two year lists are.
#
# @param years1 [Array] years of the first name
# @param years2 [Array] years of the second name
# @return [Integer, nil] 0 when both lists are empty, the absolute
#   difference when each list holds exactly one year (entries are
#   converted with to_i), and nil in every other case
def self.compare_years(years1, years2)
  if years1 == [] && years2 == []
    0
  elsif years1.size == 1 && years2.size == 1
    first_year = years1[0].to_i
    second_year = years2[0].to_i
    (first_year - second_year).abs
  end
end

.fuzzy_match_authors(author1, author2) ⇒ Object


84
85
86
87
88
89
90
91
92
# File 'lib/taxamatch_rb/authmatch.rb', line 84

# Decides whether two author strings are close enough to be treated as
# the same author.
#
# A pair matches when all of the following hold:
# * the edit distance is at most 3;
# * the shorter string is longer than twice the edit distance;
# * the edit distance is below 2, or both strings start with the same
#   character.
#
# @param author1 [String] first author string
# @param author2 [String] second author string
# @return [Boolean] true when the pair passes every check above
def self.fuzzy_match_authors(author1, author2)
  shortest = [author1.size, author2.size].min
  # NOTE: the original author flagged this call as working around a bug in
  # the C code of the distance library (presumably via the trailing 1, 3
  # arguments -- confirm); the underlying bug still needs a real fix.
  distance = DamerauLevenshtein.distance(author1, author2, 1, 3)
  return false if distance > 3
  return false unless shortest > distance * 2

  distance < 2 || author1[0] == author2[0]
end

.get_score(authors1, unique_authors1, authors2, unique_authors2, year_diff) ⇒ Object


15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/taxamatch_rb/authmatch.rb', line 15

# Turns the result of author de-duplication and the year difference into
# a score.
#
# * No unmatched authors on either side: 94 when the year difference is
#   nil, 100 for a difference of 0, 54 for a difference of 1, else 0.
# * Unmatched authors on one side only: 90 when the year difference is
#   nil, 91 for a difference of 0, 51 for a difference of 1, else 0.
# * Unmatched authors on both sides: the rounded percentage of authors
#   that were matched away, kept only when the year difference is nil
#   or 0.
#
# Any score that is not above 50 is reported as 0.
#
# @param authors1 [Array] all authors of the first name
# @param unique_authors1 [Array] authors of the first name left unmatched
# @param authors2 [Array] all authors of the second name
# @param unique_authors2 [Array] authors of the second name left unmatched
# @param year_diff [Integer, nil] year difference, nil when not comparable
# @return [Integer] 0, or a score above 50
def self.get_score(authors1, unique_authors1,
                   authors2, unique_authors2, year_diff)
  total = authors1.size + authors2.size
  left_unmatched = unique_authors1.size
  right_unmatched = unique_authors2.size
  unmatched = left_unmatched + right_unmatched

  raw_score =
    if unmatched.zero?
      case year_diff
      when nil then 94
      when 0 then 100
      when 1 then 54
      else 0
      end
    elsif left_unmatched.zero? || right_unmatched.zero?
      case year_diff
      when nil then 90
      when 0 then 91
      when 1 then 51
      else 0
      end
    else
      # The ratio is computed before looking at the years, as in the
      # original, so arithmetic failures surface identically.
      matched_share = ((1 - unmatched.to_f / total.to_f) * 100).round
      year_diff.nil? || year_diff == 0 ? matched_share : 0
    end

  raw_score > 50 ? raw_score : 0
end

.remove_duplicate_authors(authors1, authors2) ⇒ Object


47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/taxamatch_rb/authmatch.rb', line 47

# Removes from each author list the entries that have a counterpart in
# the other list and returns what is left.
#
# Every author of the first list is compared with every author of the
# second list:
# * equal strings are removed from both results;
# * when one string is a leading part of the other, the shorter one is
#   removed, and the longer one too if the shorter one has at least
#   3 characters;
# * otherwise, when both strings are longer than 1 character and
#   fuzzy_match_authors accepts the pair, both are removed.
#
# The input arrays are not modified; the work is done on copies.
#
# @param authors1 [Array<String>] authors of the first name
# @param authors2 [Array<String>] authors of the second name
# @return [Array(Array, Array)] unmatched authors of the first and of the
#   second list
def self.remove_duplicate_authors(authors1, authors2)
  remaining1 = authors1.dup
  remaining2 = authors2.dup

  authors1.each do |left|
    authors2.each do |right|
      left_hit, right_hit =
        if left == right
          [true, true]
        elsif left == right[0...left.size]
          [true, false]
        elsif left[0...right.size] == right
          [false, true]
        else
          [false, false]
        end

      if left_hit || right_hit
        drop_both = (left.size >= 3 && left_hit) ||
                    (right.size >= 3 && right_hit) ||
                    (left_hit && right_hit)
        remaining1.delete(left) if drop_both || left_hit
        remaining2.delete(right) if drop_both || right_hit
      elsif left.size > 1 && right.size > 1 &&
            fuzzy_match_authors(left, right)
        # TODO: the size guards mask a bug in the Damerau-Levenshtein
        # module that shows up when a one-letter string is compared to a
        # longer one.
        remaining1.delete(left)
        remaining2.delete(right)
      end
    end
  end

  [remaining1, remaining2]
end