Module: AttrSimilar::SimilarityMatching

Defined in:
lib/attr_similar/similarity_matching.rb

Class Method Summary collapse

Class Method Details

.find_first_similar(scope, entity, threshold_or_thresholds, attributes) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/attr_similar/similarity_matching.rb', line 3

# Finds the first record in +scope+ that matches +entity+ on at least a
# threshold number of the given attributes.
#
# Attributes whose value on +entity+ is blank are ignored. For every remaining
# attribute, +scope+ is queried for records with the same value; a record is
# "similar" when it turns up in at least +threshold+ of those per-attribute
# result lists. Candidates are examined in the order they were first returned
# by the lookups, and the first one that qualifies is returned.
#
# @param scope [#where] collection to search; presumably an ActiveRecord
#   relation or model class (verify against callers)
# @param entity [Object] object to compare against; must respond to +id+ and
#   to each name in +attributes+. When +entity.id+ is set, the record with
#   that id is excluded from the search.
# @param threshold_or_thresholds [Numeric, Array<Numeric>] a single threshold,
#   or an array indexed by (number of non-blank attributes - 1)
# @param attributes [Array<Symbol, String>] attribute names to compare
# @return [Object, nil] the first similar record, or nil when there is none
#   (including when every attribute on +entity+ is blank)
# @raise [ArgumentError] when no threshold can be determined, e.g. the
#   thresholds array is shorter than the number of non-blank attributes
def self.find_first_similar(scope, entity, threshold_or_thresholds, attributes)
  # Only consider attributes on entity with non-blank values; each value is
  # read once and reused for the lookup below.
  attribute_values = attributes.map { |attribute| [attribute, entity.send(attribute)] }
  present_values = attribute_values.reject { |_attribute, value| value.blank? }
  return nil if present_values.empty?

  # Use single threshold or threshold determined by number of non-blank attributes
  threshold = if threshold_or_thresholds.is_a?(Array)
    threshold_or_thresholds[present_values.size - 1]
  else
    threshold_or_thresholds
  end

  # A nil threshold would otherwise surface later as an opaque
  # "comparison of Integer with nil failed" ArgumentError, after every
  # lookup had already run. Fail early with the same exception class.
  if threshold.nil?
    raise ArgumentError,
          "no threshold available for #{present_values.size} non-blank attribute(s)"
  end

  scope = scope.where('id != ?', entity.id) if entity.id

  # One result list per attribute that matched anything. Materialising with
  # to_a means each lookup is evaluated once instead of being sized first
  # and loaded afterwards.
  matches_per_attribute = present_values.map do |attribute, value|
    scope.where(attribute => value).to_a
  end
  matches_per_attribute = matches_per_attribute.reject(&:empty?)

  # No similar entities if count of per-attribute lists is below threshold
  return nil if matches_per_attribute.size < threshold

  # candidates_by_id keeps first-seen key order (the order candidates are
  # examined in) while holding the most recently seen instance for each id.
  # list_counts records how many per-attribute lists contain each id.
  candidates_by_id = {}
  list_counts = Hash.new(0)
  matches_per_attribute.each do |matches|
    matches.each { |match| candidates_by_id[match.id] = match }
    # uniq so a list counts at most once for any given id
    matches.map(&:id).uniq.each { |id| list_counts[id] += 1 }
  end

  # First candidate that appears in at least "threshold" lists, else nil
  _id, similar = candidates_by_id.find { |id, _candidate| list_counts[id] >= threshold }
  similar
end