Module: Levenshtein
- Defined in:
- lib/levenshtein.rb,
lib/levenshtein/version.rb,
lib/levenshtein/exception.rb
Defined Under Namespace
Classes: LevenshteinException
Constant Summary collapse
- VERSION =
"0.2.1"
Class Method Summary collapse
-
.distance(a1, a2, threshold = nil) ⇒ Object
Returns the Levenshtein distance between two sequences.
- .distance_fast(rb_o1, rb_o2, rb_threshold) ⇒ Object
-
.distance_fast_or_slow(a1, a2, threshold) ⇒ Object
:nodoc:.
-
.distance_slow(a1, a2, threshold) ⇒ Object
:nodoc:.
-
.normalized_distance(a1, a2, threshold = nil) ⇒ Object
Returns the Levenshtein distance as a number between 0.0 and 1.0.
Class Method Details
.distance(a1, a2, threshold = nil) ⇒ Object
Returns the Levenshtein distance between two sequences.
The two sequences can be two strings, two arrays, or two other objects. Strings, arrays and arrays of strings are handled with optimized (very fast) C code. All other sequences are handled with generic (fast) C code.
The sequences should respond to :length and :[] and all objects in the sequences (as returned by []) should response to :==.
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# File 'lib/levenshtein.rb', line 37 def self.distance(a1, a2, threshold=nil) a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one. # Handle some basic circumstances. return 0 if a1 == a2 return a2.length if a1.length == 0 if threshold return nil if (a2.length-a1.length) >= threshold a3, a4 = nil, nil a3, a4 = a1, a2 if a1.respond_to?(:-) and a2.respond_to?(:-) a3, a4 = a1.scan(/./), a2.scan(/./) if a1.respond_to?(:scan) and a2.respond_to?(:scan) if a3 and a4 return nil if (a3-a4).length >= threshold return nil if (a4-a3).length >= threshold end end distance_fast_or_slow(a1, a2, threshold) end |
.distance_fast(rb_o1, rb_o2, rb_threshold) ⇒ Object
4 5 6 7 8 9 10 11 12 13 14 15 16 |
# File 'ext/levenshtein/levenshtein_fast.c', line 4
VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2)) == T_STRING) {
return levenshtein_distance_string(self, rb_o1, rb_o2, rb_threshold);
} else if ((TYPE(rb_o1) == T_ARRAY) && (TYPE(rb_o2)) == T_ARRAY) {
if ((TYPE(rb_ary_entry(rb_o1, 0)) == T_STRING) && (TYPE(rb_ary_entry(rb_o2, 0))) == T_STRING) {
return levenshtein_distance_array_of_strings(self, rb_o1, rb_o2, rb_threshold);
} else {
return levenshtein_distance_array(self, rb_o1, rb_o2, rb_threshold);
}
} else {
return levenshtein_distance_generic(self, rb_o1, rb_o2, rb_threshold);
}
}
|
.distance_fast_or_slow(a1, a2, threshold) ⇒ Object
:nodoc:
61 62 63 64 65 66 67 |
# File 'lib/levenshtein.rb', line 61 def self.distance_fast_or_slow(a1, a2, threshold) # :nodoc: if respond_to?(:distance_fast) distance_fast(a1, a2, threshold) # Implemented in C. else distance_slow(a1, a2, threshold) # Implemented in Ruby. end end |
.distance_slow(a1, a2, threshold) ⇒ Object
:nodoc:
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
# File 'lib/levenshtein.rb', line 69 def self.distance_slow(a1, a2, threshold) # :nodoc: l1 = a1.length l2 = a2.length offset = 0 while offset < l1 and offset < l2 and a1[offset] == a2[offset] offset += 1 end while offset < l1 and offset < l2 and a1[l1-1] == a2[l2-1] l1 -= 1 l2 -= 1 end l1 -= offset l2 -= offset crow = (0..l1).to_a 1.upto(l2) do |y| prow = crow crow = [y] 1.upto(l1) do |x| crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[offset+x-1]==a2[offset+y-1] ? 0 : 1)].min end # Stop analysing this sequence as soon as the best possible # result for this sequence is bigger than the best result so far. # (The minimum value in the next row will be equal to or greater # than the minimum value in this row.) return nil if threshold and crow.min >= threshold end crow[-1] end |
.normalized_distance(a1, a2, threshold = nil) ⇒ Object
Returns the Levenshtein distance as a number between 0.0 and 1.0. It’s basically the Levenshtein distance divided by the length of the longest sequence.
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
# File 'lib/levenshtein.rb', line 9 def self.normalized_distance(a1, a2, threshold=nil) a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one. if a2.length == 0 0.0 # Since a1.length < a2.length, a1 must be empty as well. else if threshold if d = self.distance(a1, a2, (threshold*a2.length+1).to_i) d.to_f/a2.length else nil end else self.distance(a1, a2).to_f/a2.length end end end |