Module: Simhilarity::Score

Included in:
Matcher
Defined in:
lib/simhilarity/score.rb

Instance Method Summary collapse

Instance Method Details

#score(candidate) ⇒ Object

Score a Candidate. The default implementation is the Dice coefficient, (2*c)/(a+b), where:

  • a: the frequency-weighted sum of the ngrams in candidate.a

  • b: the frequency-weighted sum of the ngrams in candidate.b

  • c: the frequency-weighted sum of the ngrams common to both (candidate.a.ngrams & candidate.b.ngrams)

[View source]

42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/simhilarity/score.rb', line 42

# Score a single candidate pairing.
#
# When a custom +scorer+ is configured it is called with the candidate and
# its result is returned untouched. Otherwise the score is the Dice
# coefficient (2 * c) / (a + b), where a and b are the precomputed
# +ngrams_sum+ of each side and c is +ngrams_sum+ of the ngrams both sides
# have in common.
#
# @param candidate [#a, #b] pairing whose sides respond to +ngrams+ and
#   +ngrams_sum+
# @return [Object] whatever the custom scorer returns, the Integer 0 when
#   the two sides share no ngrams, or the Dice coefficient as a Float
def score(candidate)
  return scorer.call(candidate) if scorer

  shared = candidate.a.ngrams & candidate.b.ngrams
  # Nothing in common: skip the weighting work entirely.
  return 0 if shared.empty?

  denominator = candidate.a.ngrams_sum + candidate.b.ngrams_sum
  (2.0 * ngrams_sum(shared)) / denominator
end

#winners(needles, candidates) ⇒ Object

Walk the candidates in descending score order and pick the best-scoring candidate for each needle.

[View source]

4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/simhilarity/score.rb', line 4

# Score every candidate and choose the highest-scoring one for each needle.
#
# Each candidate's +score+ attribute is assigned as a side effect.
#
# @param needles [Array] needles, each responding to +opaque+
# @param candidates [Array] candidate pairings responding to +a+, +b+,
#   +score+ and +score=+; +a+ is the needle the pairing belongs to
# @return [Array<Array>] one triple per needle, in the order of +needles+:
#   [needle.opaque, match.opaque, score], or [needle.opaque, nil, nil] when
#   the needle has no candidate
def winners(needles, candidates)
  # Score everything up front, in its own pass, so the progress display
  # covers the scoring work.
  veach("Scoring", candidates) do |candidate|
    candidate.score = score(candidate)
  end

  # Highest score first.
  ranked = candidates.sort_by { |candidate| -candidate.score }

  # The first candidate seen for a needle is its best one; later (lower
  # scoring) candidates for the same needle are ignored.
  best_for = {}
  ranked.each do |candidate|
    best_for[candidate.a] ||= candidate
  end

  # Report results in the caller's original needle order.
  needles.map do |needle|
    best = best_for[needle]
    if best
      [needle.opaque, best.b.opaque, best.score]
    else
      [needle.opaque, nil, nil]
    end
  end
end