Class: String

Inherits:
Object
  • Object
show all
Defined in:
lib/matching/similarity.rb

Instance Method Summary collapse

Instance Method Details

#name_similarity_to(other_string) ⇒ Object

Given two names, return a floating-point evaluation of similarity in the range 0.0 - 1.0



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/matching/similarity.rb', line 46

# Scores how alike two names are, as a Float between 0.0 and 1.0.
#
# Both strings are split with #tokenize; every token of this string is
# compared with every token of other_string via #raw_similarity_to, and the
# summed similarity is divided by the mean token count of the two names.
# The result is capped at 1.0.
#
# other_string - the name to compare against; nil is treated as "no match".
#
# Returns 0.0 when either side is nil, empty, or yields no tokens, and 1.0
# when the two strings are identical.
def name_similarity_to(other_string)
  return 0.0 if other_string.nil? || empty? || other_string.empty?
  return 1.0 if self == other_string

  l_tokens = tokenize
  r_tokens = other_string.tokenize

  # #tokenize drops single-character tokens, so a name such as "A" produces
  # no tokens at all. Without this guard two such names would compute
  # 0.0 / 0.0 and return NaN instead of a score in the 0.0 - 1.0 range.
  return 0.0 if l_tokens.empty? || r_tokens.empty?

  # Pairs are visited left-major (each left token against every right token).
  total_sim = l_tokens.product(r_tokens).inject(0.0) do |sum, (l, r)|
    sum + l.raw_similarity_to(r)
  end

  avg_tokens = (l_tokens.size + r_tokens.size) / 2.0
  [total_sim / avg_tokens, 1.0].min
end

#raw_similarity_to(other) ⇒ Object

Returns a floating point value of the similarity between this string and other. Uses ‘text’ gem, rubyforge.org/projects/text



68
69
70
71
72
73
74
75
76
# File 'lib/matching/similarity.rb', line 68

# Case-insensitive similarity between this string and other, as a Float
# between 0.0 and 1.0, derived from the Levenshtein edit distance supplied by
# the 'text' gem.
#
# other - the string to compare against.
#
# Returns 1.0 when the downcased strings are identical, 0.0 when no distance
# is available or the distance exceeds the mean length of the two strings,
# and otherwise the fraction of the mean length left after the edits.
def raw_similarity_to(other)
  edits = Text::Levenshtein.distance(downcase, other.downcase)
  return 0.0 unless edits
  return 1.0 if edits == 0

  # Lengths are taken from the strings as given, not the downcased copies.
  mean_length = (size + other.size) / 2.0

  if edits > mean_length
    0.0
  else
    (mean_length - edits.to_f) / mean_length
  end
end

#similarity_to(other_string, opts = {}) ⇒ Object



23
24
25
26
27
28
29
30
31
# File 'lib/matching/similarity.rb', line 23

# Compares this string with other_string using the strategy selected in opts.
#
# other_string - the string to compare against.
# opts         - options hash; :comparison => :name selects the token-based
#                #name_similarity_to, any other value (or none) falls back
#                to #raw_similarity_to.
#
# Returns whatever the selected comparison method returns.
def similarity_to(other_string, opts={})
  return name_similarity_to(other_string) if opts[:comparison] == :name

  # Default: plain Levenshtein-based comparison.
  raw_similarity_to(other_string)
end

#tokenize ⇒ Object

Given a string, return one or more tokens parsed with the following rules:

  1. Turn commas into spaces

  2. Split on spaces

  3. Strip periods

  4. Discard any single-character tokens



38
39
40
41
42
# File 'lib/matching/similarity.rb', line 38

# Splits this string into name tokens.
#
# Commas become spaces, periods are removed, the result is split on
# whitespace, and any token exactly one character long is dropped.
#
# Returns a new Array of the remaining tokens (possibly empty).
def tokenize
  tr(',', ' ').delete('.').split(' ').reject { |token| token.size == 1 }
end