Class: Matching::Deduplicator

Inherits:
Object
  • Object
show all
Defined in:
lib/matching/deduplicator.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(store, opts = {}) ⇒ Deduplicator

Hash of all ids present in @groups, stored in the form { id => index_of_group_within_@groups }. Eventually every id from @store is added to it.



9
10
11
12
13
14
15
16
17
18
19
20
21
# File 'lib/matching/deduplicator.rb', line 9

# Sets up a deduplicator over +store+ with an empty list of match criteria.
#
# @param store the record source; must be truthy
# @param opts [Hash] options; when +opts[:redis_db]+ is present and its
#   +to_i+ is at least 1, a RedisIndex built from that value backs the
#   index, otherwise a HashIndex is used
# @raise [RuntimeError] 'Store parameter required' when +store+ is nil/false
def initialize(store, opts = {})
  raise 'Store parameter required' unless store

  @store = store
  @criteria = []

  # The index is backed either by Redis or by an in-memory hash.
  redis_db = opts[:redis_db]
  use_redis = redis_db && redis_db.to_i >= 1
  @index = use_redis ? RedisIndex.new(redis_db) : HashIndex.new
end

Instance Attribute Details

#criteriaObject

Returns the value of attribute criteria.



4
5
6
# File 'lib/matching/deduplicator.rb', line 4

# Returns the list of match criteria. It starts as an empty array in
# #initialize, and #match_attrs appends one array of attributes per call.
def criteria
  @criteria
end

#groupedObject

Returns the value of attribute grouped.



6
7
8
# File 'lib/matching/deduplicator.rb', line 6

# Returns the hash built by #deduplicate that maps an object's id to the
# index of the array within #groups that contains it. #initialize does not
# set it, so the value is nil until #deduplicate has run.
def grouped
  @grouped
end

#groupsObject

Array of arrays of the ids of duplicate records, in the form [[1, 5], [2, 3], [6]].



5
6
7
# File 'lib/matching/deduplicator.rb', line 5

# Returns the array of groups built by #deduplicate; each group is an array
# of ids. #initialize does not set it, so the value is nil until
# #deduplicate has run.
def groups
  @groups
end

#indexObject

Returns the value of attribute index.



4
5
6
# File 'lib/matching/deduplicator.rb', line 4

# Returns the index object chosen in #initialize (a RedisIndex or a
# HashIndex) that #create_index fills and #deduplicate queries.
def index
  @index
end

#storeObject

Returns the value of attribute store.



4
5
6
# File 'lib/matching/deduplicator.rb', line 4

# Returns the store object that was passed to #initialize.
def store
  @store
end

Instance Method Details

#create_indexObject



106
107
108
109
110
111
112
113
114
# File 'lib/matching/deduplicator.rb', line 106

# Populates @index with one entry per (attribute, value, id) for every record
# in @store and every distinct attribute named in the match criteria.
#
# @store.each is expected to yield (object, id) pairs; the attribute value is
# read from the object with +send+.
#
# @raise [RuntimeError] when no criteria have been defined
# @return whatever @store.each returns
def create_index
  raise 'Deduplicator requires at least one match attribute be defined' unless @criteria.any?

  # The attribute list does not change while indexing, so build it once
  # rather than flattening and de-duplicating the criteria for every record.
  attrs = unique_attrs

  @store.each do |obj, id|
    attrs.each do |attr|
      @index.put(attr, obj.send(attr), id)
    end
  end
end

#deduplicateObject



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/matching/deduplicator.rb', line 35

# Partitions the ids of all records in @store into groups of matching records.
#
# Two records match on a criterion (one array in @criteria) when they share
# the value of every attribute in it for which the record being examined has
# a non-nil value. Matching is transitive across criteria: groups that turn
# out to be linked are merged.
#
# After the call:
#   @groups  - array of arrays of ids; a trailing array holds the ids that
#              never matched anything because their criterion values were nil
#   @grouped - hash of id => index in @groups (nil-group ids are not included)
#
# Ids are always the ids yielded by @store.each, which are the same ids
# #create_index puts into @index.
#
# @return [Array, nil] @groups when a nil group was appended, otherwise nil
def deduplicate
  @groups = []      # Array of arrays containing ids of grouped objects
  @nil_group = []   # Ids seen with an all-nil criterion; filtered once grouping is complete
  @grouped = {}     # Hash of each object's id to the index of @groups in which it is found

  # Index all records in the store to speed search
  create_index

  @store.each do |obj, store_idx|
    # Progress output for long runs.
    puts "On #{store_idx}" if store_idx % 100 == 0 && store_idx > 0

    # With a single criterion, an object that is already grouped cannot
    # match anything further, so skip the lookups.
    next if @criteria.size == 1 && @grouped.key?(store_idx)

    @criteria.each do |attrs|
      matched_ids = matching_ids_for(obj, attrs)

      if matched_ids.nil?
        @nil_group << store_idx
      else
        merge_into_group(matched_ids)
      end
    end
  end

  finalize_groups
end

# Returns the ids whose indexed values equal +obj+'s for every non-nil
# attribute in +attrs+, or nil when all of those attributes are nil on +obj+.
def matching_ids_for(obj, attrs)
  attrs.inject(nil) do |found, attr|
    val = obj.send(attr)
    next found if val.nil?

    hits = @index.get(attr, val)
    found ? found & hits : hits
  end
end

# Puts every id in +ids+ into one group, merging any groups they already
# belong to and adding the ids that are not grouped yet.
def merge_into_group(ids)
  group_idxs = ids.map { |id| @grouped[id] }.compact.uniq

  target = group_idxs.first
  if target.nil?
    # A fresh array is used so the array handed back by the index is never
    # mutated by later merges.
    target = @groups.size
    @groups << []
  end
  group = @groups[target]

  group_idxs.drop(1).each do |idx|
    @groups[idx].each do |id|
      group << id
      @grouped[id] = target
    end
    # Leave a placeholder instead of deleting: deleting would shift every
    # later group and invalidate the indexes stored in @grouped.
    @groups[idx] = nil
  end

  ids.each do |id|
    next if @grouped.key?(id)

    group << id
    @grouped[id] = target
  end
end

# Drops merged-away placeholders, re-indexes @grouped to match, and appends
# the nil group (ids that never ended up in a real group) as the last group.
def finalize_groups
  @groups.compact!

  @grouped = {}
  @groups.each_with_index do |group, idx|
    group.each { |id| @grouped[id] = idx }
  end

  @nil_group = @nil_group.uniq.reject { |id| @grouped.key?(id) }

  # Add the contents of nil group as a single group
  @groups << @nil_group unless @nil_group.empty?
end

private :matching_ids_for, :merge_into_group, :finalize_groups

#define(&block) ⇒ Object



31
32
33
# File 'lib/matching/deduplicator.rb', line 31

# Evaluates +block+ in the context of this object, so configuration calls
# such as #match_attrs can be written without an explicit receiver.
#
# @yield the configuration block, run via instance_eval
# @return the value of the block
# @raise [ArgumentError] when no block is given
def define(&block)
  # Without this guard instance_eval raises an ArgumentError about argument
  # counts, which does not tell the caller what is actually missing.
  raise ArgumentError, 'define requires a block' unless block

  instance_eval(&block)
end

#each_with_groupsObject

Yields each object in the store along with the index of its group and the object's index within that group. For example:

group_idx | idx | name

0 |   0 | Fred Smith
0 |   1 | Fred Smith
1 |   0 | Jane Green
2 |   0 | Linda Smythe
2 |   1 | Linda Smythe


124
125
126
127
128
129
130
# File 'lib/matching/deduplicator.rb', line 124

# Walks @groups and yields, for every id in every group, the object looked up
# with @store.find(id), the index of the group, and the position of the id
# within that group.
#
# @yield [object, group_index, index_in_group]
# @return [Enumerator] when called without a block
# @return @groups when called with a block
def each_with_groups
  # Follow the usual iterator convention instead of failing on yield.
  return to_enum(:each_with_groups) unless block_given?

  @groups.each_with_index do |ids, grp_idx|
    ids.each_with_index do |obj_id, obj_idx|
      yield(@store.find(obj_id), grp_idx, obj_idx)
    end
  end
end

#match_attrs(attrs) ⇒ Object



23
24
25
# File 'lib/matching/deduplicator.rb', line 23

# Registers one match criterion. +attrs+ may be a single attribute or an
# array of attributes; either way it is stored as its own array.
#
# @param attrs a single attribute name or an array of them
# @return [Array] the full criteria list
def match_attrs(attrs)
  # The splat wraps a lone value and copies an existing array.
  criterion = [*attrs]
  @criteria.push(criterion)
end

#unique_attrsObject



27
28
29
# File 'lib/matching/deduplicator.rb', line 27

# Returns every attribute named in any criterion, each listed once, in order
# of first appearance.
#
# @return [Array]
def unique_attrs
  every_attr = @criteria.flatten
  every_attr.uniq
end