Class: Matching::Matcher

Inherits:
Object
  • Object
show all
Defined in:
lib/matching/matcher.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(opts = {}) ⇒ Matcher

Returns a new instance of Matcher.



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/matching/matcher.rb', line 16

# Builds a matcher over a left and a right record store.
#
# opts - Hash of options; nil is treated as an empty hash.
#        :left_store  - store whose records are iterated by #match
#        :right_store - store whose records are indexed and fetched by id
#        :min_score   - minimum score for a pair to be kept (default 1.0)
#        :redis_db    - when its to_i is >= 1, a RedisIndex is created with
#                       this value; otherwise a HashIndex is used
def initialize(opts={})
  # Matcher.define forwards nil when it is called without options, so
  # guard here to keep the lookups below from raising NoMethodError.
  opts ||= {}

  @left_store = opts[:left_store]
  @right_store = opts[:right_store]
  @min_score = opts[:min_score] || 1.0

  @join_pairs = []
  @compare_pairs = []
  @custom_functions = []
  @filter_functions = []
  @right_matches = {} #hash keyed on right_class records, used during main rec loop
  @left_matches = {} #hash keyed on left_class records, created after main rec loop from reverse of @right_matches
  @left_losers = [] #array of left objects that were matched to right records then unmatched, requiring re-match attempt

  # Create @right_index using either a hash or Redis as the backing store
  if opts[:redis_db] && opts[:redis_db].to_i >= 1
    @right_index = RedisIndex.new(opts[:redis_db])
  else
    @right_index = HashIndex.new
  end
end

Instance Attribute Details

#compare_pairsObject (readonly)

Returns the value of attribute compare_pairs.



6
7
8
# File 'lib/matching/matcher.rb', line 6

# Read-only accessor for the comparison rules: the array that #compare
# appends AttributePair objects to and that #score_pair iterates over.
def compare_pairs
  @compare_pairs
end

#custom_functionsObject (readonly)

Returns the value of attribute custom_functions.



6
7
8
# File 'lib/matching/matcher.rb', line 6

# Read-only accessor for the array of callables registered through #custom.
# #score_pair calls each one with (left_obj, right_obj) and adds the result
# to the score.
def custom_functions
  @custom_functions
end

#filter_functionsObject (readonly)

Returns the value of attribute filter_functions.



6
7
8
# File 'lib/matching/matcher.rb', line 6

# Read-only accessor for the array of callables registered through #filter.
# #score_pair calls each one with (left_obj, right_obj) and resets the score
# to 0 when the result is falsy.
def filter_functions
  @filter_functions
end

#join_pairsObject (readonly)

Returns the value of attribute join_pairs.



6
7
8
# File 'lib/matching/matcher.rb', line 6

# Read-only accessor for the join rules: the array that #join appends
# AttributePair objects to. It drives both indexing and candidate lookup.
def join_pairs
  @join_pairs
end

#left_matchesObject (readonly)

Returns the value of attribute left_matches.



7
8
9
# File 'lib/matching/matcher.rb', line 7

# Read-only accessor for the hash of matches keyed on left records.
# It starts empty and is filled at the end of #match as the mirror of
# right_matches.
def left_matches
  @left_matches
end

#left_storeObject (readonly)

Returns the value of attribute left_store.



5
6
7
# File 'lib/matching/matcher.rb', line 5

# Read-only accessor for the left-hand store supplied as opts[:left_store]
# to #initialize; #match iterates it with #each.
def left_store
  @left_store
end

#min_scoreObject

Returns the value of attribute min_score.



4
5
6
# File 'lib/matching/matcher.rb', line 4

# Minimum score a candidate pair must reach in #find_matches to be kept.
# Set from opts[:min_score] in #initialize, defaulting to 1.0.
def min_score
  @min_score
end

#right_indexObject (readonly)

Returns the value of attribute right_index.



8
9
10
# File 'lib/matching/matcher.rb', line 8

# Read-only accessor for the index used for join lookups: either a
# RedisIndex or a HashIndex, chosen in #initialize.
def right_index
  @right_index
end

#right_matchesObject (readonly)

Returns the value of attribute right_matches.



7
8
9
# File 'lib/matching/matcher.rb', line 7

# Read-only accessor for the hash of Match objects keyed on right records.
# #pair_matches writes to it while matching is in progress.
def right_matches
  @right_matches
end

#right_storeObject (readonly)

Returns the value of attribute right_store.



5
6
7
# File 'lib/matching/matcher.rb', line 5

# Read-only accessor for the right-hand store supplied as opts[:right_store]
# to #initialize; it is iterated for indexing and queried with #find.
def right_store
  @right_store
end

Class Method Details

.define(opts = nil, &block) ⇒ Object



10
11
12
13
14
# File 'lib/matching/matcher.rb', line 10

# Convenience constructor: creates a matcher and evaluates the given block
# in the context of the new instance (via the instance-level #define), so
# rules can be declared with join/compare/custom/filter.
#
# opts  - options Hash forwarded to the constructor; nil (the default)
#         is replaced by an empty hash.
# block - rule definitions to evaluate against the new matcher.
#
# Returns the new matcher.
def self.define(opts=nil, &block)
  m = new(opts || {})
  # The instance method takes the block as a block, not as a positional
  # argument, so it has to be re-passed with &.
  m.define(&block)
  m
end

Instance Method Details

#compare(left_attr, right_attr, weight, is_fuzzy = false) ⇒ Object

For records matched via join attributes, comparisons may be applied to adjust the score.



64
65
66
# File 'lib/matching/matcher.rb', line 64

# Registers a comparison rule. The rule is stored as an AttributePair built
# from the four arguments and appended to @compare_pairs, which #score_pair
# walks when scoring a candidate pair.
#
# Returns the @compare_pairs array.
def compare(left_attr, right_attr, weight, is_fuzzy = false)
  rule = AttributePair.new(left_attr, right_attr, weight, is_fuzzy)
  @compare_pairs.push(rule)
end

#compare_values(left, right, opts = {}) ⇒ Object

Compare left and right arguments and return similarity as a floating point value where 0.0 represents no similarity and 1.0 represents equality.

Raises:

  • (ArgumentError)


39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/matching/matcher.rb', line 39

# Compares two values and returns their similarity as a Float, where 0.0
# means no similarity and 1.0 means equality.
#
# left, right - values to compare; if either is nil or false the result
#               is 0.0.
# opts        - options Hash; :fuzzy requests a fuzzy comparison through
#               left.similarity_to(right, opts). A non-Hash value is
#               treated as the :fuzzy flag itself.
#
# Raises ArgumentError when the two values are of different classes, or
# when a fuzzy comparison is requested for a type that does not respond to
# similarity_to.
def compare_values(left,right,opts={})
  return 0.0 unless left && right

  raise ArgumentError, "Cannot compare values of dissimilar type - left = #{left}, right = #{right}" unless left.class == right.class

  # Accept a bare boolean in place of the options hash (for example an
  # AttributePair's is_fuzzy flag) instead of failing on `true[:fuzzy]`.
  opts = { :fuzzy => opts } unless opts.is_a?(Hash)

  if opts[:fuzzy]
    raise ArgumentError, "Cannot calculate fuzzy comparison for type #{left.class}" unless left.respond_to?(:similarity_to)
    left.similarity_to(right,opts)
  else
    (left == right ? 1.0 : 0.0)
  end
end

#custom(lmbda) ⇒ Object

Custom functions may adjust the score beyond the simple comparisons performed via @compare_pairs.



70
71
72
# File 'lib/matching/matcher.rb', line 70

# Registers a custom scoring callable. #score_pair invokes every registered
# callable with (left_obj, right_obj) and adds the result to the score.
#
# Returns the @custom_functions array.
def custom(lmbda)
  @custom_functions.push(lmbda)
end

#define(&block) ⇒ Object



52
53
54
# File 'lib/matching/matcher.rb', line 52

# Evaluates the given block with this matcher as self, so the block can
# call join, compare, custom and filter directly.
#
# Returns the value of the block.
def define(&block)
  self.instance_eval(&block)
end

#evaluate_left_losersObject

Attempt to find matches while any left losers remain



212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
# File 'lib/matching/matcher.rb', line 212

# Re-runs matching for left objects that lost their right record to a
# better-scoring rival, repeating until no losers remain.
#
# Returns nil.
def evaluate_left_losers
  while @left_losers.any?
    # Swap in a fresh array before iterating: pair_matches may push new
    # losers while this batch is being processed, and those belong to the
    # next pass.
    batch, @left_losers = @left_losers, []

    batch.each do |left_obj|
      pair_matches(left_obj, find_matches(left_obj))
    end
  end
end

#exceptions(side) ⇒ Object



242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
# File 'lib/matching/matcher.rb', line 242

# Collects the records of one side that ended up without a match.
#
# side - :left selects @left_store/@left_matches; any other value selects
#        @right_store/@right_matches.
#
# Returns a new Array of the store's objects that have no truthy entry in
# the corresponding matches hash, in the order the store yields them.
def exceptions(side)
  if side == :left
    store, matches = @left_store, @left_matches
  else
    store, matches = @right_store, @right_matches
  end

  # A previous ArrayStore fast path tested the class of the freshly created
  # result array instead of the store, so it could never run, and it
  # subtracted a Hash from an Array, which would have raised. The generic
  # walk below is what has always been executed.
  unmatched = []
  store.each do |obj|
    unmatched << obj unless matches[obj]
  end
  unmatched
end

#filter(lmbda) ⇒ Object

Filter lambdas must return a boolean. Returning false (or nil) resets the pair's score to 0, which prevents a match whenever min_score is greater than 0.



75
76
77
# File 'lib/matching/matcher.rb', line 75

# Registers a filter callable. #score_pair invokes every registered filter
# with (left_obj, right_obj) and resets the pair's score to 0 when the
# filter returns a falsy value.
#
# Returns the @filter_functions array.
def filter(lmbda)
  @filter_functions.push(lmbda)
end

#find_matches(left_obj) ⇒ Object

Returns the scored matches for the left_obj argument. Results are in an array ordered from highest to lowest score, of the form [[right_obj_a, score_a], [right_obj_b, score_b], …]



151
152
153
154
155
156
157
158
159
160
161
162
# File 'lib/matching/matcher.rb', line 151

# Scores every join candidate for left_obj and keeps those whose score is
# at least @min_score.
#
# Returns an Array of [right_obj, score] pairs ordered from highest to
# lowest score.
def find_matches(left_obj)
  qualifying = find_potential_matches(left_obj).each_with_object([]) do |right_obj, kept|
    score = score_pair(left_obj, right_obj)
    kept << [right_obj, score] if score >= @min_score
  end

  # Sort ascending, then flip, so the best candidate comes first.
  qualifying.sort { |a, b| a[1] <=> b[1] }.reverse
end

#find_potential_matches(left_obj) ⇒ Object

Return an array of right_objects that match the left_object by join criteria. This is equivalent to an index lookup. No scoring is done by this method.



167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# File 'lib/matching/matcher.rb', line 167

# Looks up candidate right objects for left_obj through the join rules.
# No scoring happens here; this is purely an index lookup.
#
# Returns an Array of right objects fetched from @right_store, one per
# distinct id found in @right_index.
def find_potential_matches(left_obj)
  right_ids = @join_pairs.inject([]) do |ids, jp|
    left_val = left_obj.send(jp.left_attr)

    if left_val.nil? || left_val == ''
      # Blank join values cannot identify a counterpart.
      ids
    else
      found = @right_index.get(jp.right_attr, left_val)
      found ? (ids | found) : ids
    end
  end

  # The index holds ids only; resolve them to the actual objects.
  right_ids.map { |r_id| @right_store.find(r_id) }
end

#index_right_objectsObject

Indexes attributes from the right objects in @right_index (either hash or Redis, see initialize). For each join_pair, store the attribute’s values in the form:

attr:val -> [array_of_ids]


139
140
141
142
143
144
145
146
147
# File 'lib/matching/matcher.rb', line 139

# Populates @right_index from @right_store: for every right object and
# every join rule, the object's value for the rule's right attribute is
# stored against the object's id.
#
# Raises RuntimeError when no join pair has been defined; without a join
# attribute every left record would have to be compared to every right one.
def index_right_objects
  if @join_pairs.none?
    raise 'Matcher requires at least one join pair to be defined'
  end

  @right_store.each do |right_obj, id|
    @join_pairs.each do |jp|
      attr_name = jp.right_attr
      @right_index.put(attr_name, right_obj.send(attr_name), id)
    end
  end
end

#join(left_attr, right_attr, weight) ⇒ Object

One or more join attributes are required for a match between two records to occur. Attributes must be equal.



58
59
60
# File 'lib/matching/matcher.rb', line 58

# Registers a join rule. The rule is stored as an AttributePair built from
# the three arguments and appended to @join_pairs, which is used for
# indexing, candidate lookup and scoring.
#
# Returns the @join_pairs array.
def join(left_attr, right_attr, weight)
  rule = AttributePair.new(left_attr, right_attr, weight)
  @join_pairs.push(rule)
end

#left_exceptionsObject

Returns array of non-matched left objects



229
230
231
232
233
# File 'lib/matching/matcher.rb', line 229

# Returns the array of left objects that have no match. The result of
# exceptions(:left) is computed on first use and cached in
# @left_exceptions for later calls.
def left_exceptions
  @left_exceptions ||= exceptions(:left)
end

#matchObject

Perform matching



104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# File 'lib/matching/matcher.rb', line 104

# Runs the full matching pass.
#
# Yields each left object before it is evaluated, when a block is given.
#
# Raises ArgumentError unless both @left_store and @right_store are set.
#
# Returns the @right_matches hash.
def match
  raise ArgumentError, "Matcher requires left_store and right_store attributes" unless @left_store && @right_store

  # Build the lookup index over the right-hand records first.
  index_right_objects

  # First pass: each left record claims its best available right record.
  # A better-scoring left record may displace an earlier claim; displaced
  # records are queued as losers by pair_matches.
  @left_store.each do |left_obj|
    yield left_obj if block_given?

    # find_matches returns candidates best-first.
    pair_matches(left_obj, find_matches(left_obj))
  end

  # Give displaced left records another chance until none remain.
  evaluate_left_losers

  # Mirror the right-keyed matches into the left-keyed hash.
  @right_matches.each_value { |found| @left_matches[found.left_obj] = found }
end

#matchesObject



260
261
262
263
264
# File 'lib/matching/matcher.rb', line 260

# Returns a new Array holding the Match objects stored as the values of
# @left_matches, in the hash's insertion order.
def matches
  @left_matches.values
end

#pair_matches(left_obj, ranked_matches) ⇒ Object

Evaluate and possibly create Match objects to join the left_object to one of the right_objects from the ranked_matches array



186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# File 'lib/matching/matcher.rb', line 186

# Tries to pair left_obj with one of its ranked candidates, walking the
# candidates in the given order and stopping at the first one it can claim.
#
# left_obj       - the left record looking for a partner.
# ranked_matches - Array of [right_obj, score] pairs, best first.
#
# A right record that is already taken is claimed only when the new score
# is strictly greater than the current holder's; the displaced left record
# is then queued in @left_losers for another attempt.
#
# Returns nil when a pairing was made, otherwise ranked_matches.
def pair_matches(left_obj, ranked_matches)
  ranked_matches.each do |right_obj, score|
    holder = @right_matches[right_obj]

    # Taken by an equal or better fit: try the next candidate.
    next if holder && !(score > holder.score)

    # Either free, or we beat the current holder, who becomes a loser.
    @left_losers << holder.left_obj if holder
    @right_matches[right_obj] = Match.new(left_obj, right_obj, score)
    break
  end
end

#right_exceptionsObject

Returns array of non-matched right objects



236
237
238
239
240
# File 'lib/matching/matcher.rb', line 236

# Returns the array of right objects that have no match. The result of
# exceptions(:right) is computed on first use and cached in
# @right_exceptions for later calls.
def right_exceptions
  @right_exceptions ||= exceptions(:right)
end

#score_pair(left_obj, right_obj) ⇒ Object

Given join, compare, and custom rules, return the floating point matching score of two objects.



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/matching/matcher.rb', line 81

# Computes the matching score of a left/right pair from the join, compare,
# custom and filter rules.
#
# Join and compare rules each contribute weight * similarity, where the
# similarity comes from compare_values. Custom callables contribute their
# return value. Finally, any filter callable that returns a falsy value
# resets the score to 0.
#
# Returns the numeric score (0 when no rule contributes).
def score_pair(left_obj, right_obj)
  score = 0

  @join_pairs.each do |pair|
    score += pair.weight * compare_values(left_obj.send(pair.left_attr), right_obj.send(pair.right_attr))
  end

  @compare_pairs.each do |pair|
    # compare_values expects an options hash; pass the pair's flag under
    # :fuzzy, not as the bare boolean.
    score += pair.weight * compare_values(left_obj.send(pair.left_attr), right_obj.send(pair.right_attr), :fuzzy => pair.is_fuzzy)
  end

  @custom_functions.each do |lmbda|
    score += lmbda.call(left_obj, right_obj)
  end

  @filter_functions.each do |lmbda|
    score = 0 unless lmbda.call(left_obj, right_obj)
  end

  score
end