Class: Matching::Deduplicator
- Inherits:
-
Object
- Object
- Matching::Deduplicator
- Defined in:
- lib/matching/deduplicator.rb
Instance Attribute Summary collapse
-
#criteria ⇒ Object
Returns the value of attribute criteria.
-
#grouped ⇒ Object
Returns the value of attribute grouped.
-
#groups ⇒ Object
Array of arrays of duplicate record ids, in the form [[1, 5], [2, 3, 4], [6]].
-
#index ⇒ Object
Returns the value of attribute index.
-
#store ⇒ Object
Returns the value of attribute store.
Instance Method Summary collapse
- #create_index ⇒ Object
- #deduplicate ⇒ Object
- #define(&block) ⇒ Object
-
#each_with_groups ⇒ Object
Returns each object in store along with its group’s index and index within the group.
-
#initialize(store, opts = {}) ⇒ Deduplicator
constructor
hash of all ids present in @groups.
- #match_attrs(attrs) ⇒ Object
- #unique_attrs ⇒ Object
Constructor Details
#initialize(store, opts = {}) ⇒ Deduplicator
Hash of all ids present in @groups (this text describes the #grouped attribute rather than the constructor itself). Eventually all ids from @store will be added. Stored in the form { id => index_of_groups_object }.
9 10 11 12 13 14 15 16 17 18 19 20 21 |
# File 'lib/matching/deduplicator.rb', line 9

# Builds a deduplicator over the given store.
#
# @param store the record store to deduplicate; must be truthy
# @param opts [Hash] options; when opts[:redis_db] is set and its to_i is
#   >= 1 a RedisIndex for that db backs the index, otherwise a HashIndex does
# @raise [RuntimeError] 'Store parameter required' when store is nil or false
def initialize(store, opts = {})
  raise 'Store parameter required' unless store

  @store = store
  @criteria = []

  # Create an index using either a hash or Redis as the backing store
  redis_db = opts[:redis_db]
  @index =
    if redis_db && redis_db.to_i >= 1
      RedisIndex.new(redis_db)
    else
      HashIndex.new
    end
end
Instance Attribute Details
#criteria ⇒ Object
Returns the value of attribute criteria.
4 5 6 |
# File 'lib/matching/deduplicator.rb', line 4

# Reader for the match criteria.
#
# @return [Object] the current value of @criteria (initialized to an empty
#   array by the constructor and appended to by #match_attrs)
def criteria
  @criteria
end
#grouped ⇒ Object
Returns the value of attribute grouped.
6 7 8 |
# File 'lib/matching/deduplicator.rb', line 6

# Reader for the id-to-group lookup.
#
# @return [Object] the current value of @grouped; #deduplicate sets it to a
#   hash mapping an object's id to the index of its group in @groups
def grouped
  @grouped
end
#groups ⇒ Object
Array of arrays of duplicate record ids, in the form [[1, 5], [2, 3, 4], [6]].
5 6 7 |
# File 'lib/matching/deduplicator.rb', line 5

# Reader for the duplicate groups.
#
# @return [Object] the current value of @groups; #deduplicate sets it to an
#   array of arrays of object ids, one inner array per group
def groups
  @groups
end
#index ⇒ Object
Returns the value of attribute index.
4 5 6 |
# File 'lib/matching/deduplicator.rb', line 4

# Reader for the lookup index.
#
# @return [Object] the current value of @index (the constructor assigns a
#   RedisIndex or a HashIndex)
def index
  @index
end
#store ⇒ Object
Returns the value of attribute store.
4 5 6 |
# File 'lib/matching/deduplicator.rb', line 4

# Reader for the record store.
#
# @return [Object] the current value of @store (the store handed to the
#   constructor)
def store
  @store
end
Instance Method Details
#create_index ⇒ Object
106 107 108 109 110 111 112 113 114 |
# File 'lib/matching/deduplicator.rb', line 106

# Feeds every record in @store into @index, once per distinct match attribute.
#
# For each (object, id) pair yielded by @store.each, and for each attribute
# returned by #unique_attrs, calls @index.put(attribute, value, id) where
# value is read from the object via send.
#
# @raise [RuntimeError] when no match criteria have been defined
# @return [Object] whatever @store.each returns
def create_index
  raise 'Deduplicator requires at least one match attribute be defined' unless @criteria.any?

  # The attribute list does not depend on the record, so compute it once
  # instead of re-flattening @criteria for every object in the store.
  attrs = unique_attrs

  @store.each do |obj, id|
    attrs.each { |attr| @index.put(attr, obj.send(attr), id) }
  end
end
#deduplicate ⇒ Object
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/matching/deduplicator.rb', line 35

# Partitions the ids of the records in @store into groups of duplicates.
#
# Two records land in the same group when, for at least one criterion in
# @criteria, they share every non-nil attribute value of that criterion
# (as reported by @index.get); groups that become linked are merged.
#
# After the call:
# * @groups  is an array of arrays of ids, one inner array per group;
# * @grouped maps each grouped id to the position of its group in @groups;
# * ids of objects that produced no matches under any criterion (all of
#   their indexed values were nil) are appended to @groups as one final
#   group and are not recorded in @grouped.
#
# Ids are taken from obj.id, while progress reporting uses the second value
# yielded by @store.each (expected to be an Integer).
# NOTE(review): #create_index indexes by that second value, so this assumes
# it equals obj.id -- confirm against the store implementations.
#
# @raise [RuntimeError] from #create_index when no criteria are defined
def deduplicate
  @groups = []     # Array of arrays containing ids of grouped objects
  @nil_group = []  # ids of objects whose indexed values are all nil
  @grouped = {}    # Hash of each object's id to the index of its group in @groups

  # Index all records in the store to speed search
  create_index

  @store.each do |obj, store_idx|
    puts "On #{store_idx}" if store_idx % 100 == 0 && store_idx > 0

    # Shortcut the process if there is only one array in criteria
    # and this object is already present (because it can't possibly match
    # a second time)
    next if @criteria.size == 1 && @grouped[obj.id]

    matched = false
    @criteria.each do |attrs|
      matches = matches_for_criterion(obj, attrs)
      next if matches.nil?

      matched = true
      absorb_matches(matches)
    end

    # Only objects that matched under no criterion at all belong to the nil
    # group; this also keeps an id from being listed there more than once.
    @nil_group << obj.id unless matched
  end

  # Merged-away groups were left behind as nil placeholders so that indexes
  # stayed stable while grouping; drop them now and renumber @grouped.
  @groups.compact!
  @groups.each_with_index do |ids, group_idx|
    ids.each { |id| @grouped[id] = group_idx }
  end

  # Add the contents of nil group as a single group
  @groups << @nil_group if @nil_group.any?
end

# Returns the ids matching obj on every non-nil attribute of one criterion,
# or nil when all of those attributes are nil on obj.
def matches_for_criterion(obj, attrs)
  attrs.inject(nil) do |found, attr|
    val = obj.send(attr)
    next found if val.nil?

    hits = @index.get(attr, val)
    found ? found & hits : hits
  end
end

# Places all ids in matches into a single group, merging any existing groups
# they already belong to into the first such group.
def absorb_matches(matches)
  return if matches.empty?

  group_indexes = matches.map { |id| @grouped[id] }.compact.uniq
  target = group_indexes.first
  if target.nil?
    # None of the matches is grouped yet: open a fresh group. A new array is
    # used so the array owned by the index is never mutated.
    target = @groups.size
    @groups << []
  end

  group_indexes.drop(1).each do |idx|
    @groups[idx].each do |id|
      @groups[target] << id
      @grouped[id] = target
    end
    # Leave a placeholder instead of deleting, so the indexes stored in
    # @grouped for every other group remain valid.
    @groups[idx] = nil
  end

  matches.each do |id|
    next if @grouped.key?(id)

    @groups[target] << id
    @grouped[id] = target
  end
end

private :matches_for_criterion, :absorb_matches
#define(&block) ⇒ Object
31 32 33 |
# File 'lib/matching/deduplicator.rb', line 31

# Evaluates the given block with this deduplicator as self, so the block can
# call instance methods such as #match_attrs without an explicit receiver.
#
# @yield the configuration block, run via instance_eval
# @return [Object] the value returned by the block
# @raise [ArgumentError] when called without a block
def define(&block)
  # instance_eval without a block fails with an unhelpful arity message;
  # raise the same exception class with a message that names the problem.
  raise ArgumentError, 'Deduplicator#define requires a block' unless block

  instance_eval(&block)
end
#each_with_groups ⇒ Object
Yields each object in the store along with its group's index and its index within the group. For example:
group_idx | idx | name
0 | 0 | Fred Smith
0 | 1 | Fred Smith
1 | 0 | Jane Green
2 | 0 | Linda Smythe
2 | 1 | Linda Smythe
124 125 126 127 128 129 130 |
# File 'lib/matching/deduplicator.rb', line 124

# Walks @groups and yields every member together with its position.
#
# Each id in each group is resolved through @store.find, and the block
# receives (object, group_index, index_within_group).
#
# @yield [object, group_idx, obj_idx]
# @return [Enumerator] when no block is given
# @return [Object] @groups when a block is given
def each_with_groups
  # Without a block the original raised LocalJumpError on the first yield;
  # returning an Enumerator follows the usual each-style convention.
  return enum_for(:each_with_groups) unless block_given?

  @groups.each_with_index do |member_ids, group_idx|
    member_ids.each_with_index do |obj_id, obj_idx|
      yield(@store.find(obj_id), group_idx, obj_idx)
    end
  end
end
#match_attrs(attrs) ⇒ Object
23 24 25 |
# File 'lib/matching/deduplicator.rb', line 23

# Registers one match criterion.
#
# @param attrs a single attribute name or an array of attribute names; it is
#   splatted into a fresh array, so a lone value becomes a one-element array
#   and an array is copied
# @return [Array] @criteria, with the new criterion appended
def match_attrs(attrs)
  criterion = [*attrs]
  @criteria.push(criterion)
end
#unique_attrs ⇒ Object
27 28 29 |
# File 'lib/matching/deduplicator.rb', line 27

# Lists every attribute named anywhere in @criteria, without repeats.
#
# @return [Array] the flattened contents of @criteria with duplicates
#   removed, in order of first appearance
def unique_attrs
  all_attrs = @criteria.flatten
  all_attrs.uniq
end