Module: Levenshtein
- Defined in:
- lib/levenshtein.rb,
lib/levenshtein/version.rb
Defined Under Namespace
Modules: Util
Constant Summary collapse
- VERSION =
"0.2.2"
Class Method Summary collapse
-
.distance(a1, a2, threshold = nil, options = {}) ⇒ Object
Returns the Levenshtein distance between two sequences.
- .distance_fast(rb_o1, rb_o2, rb_threshold) ⇒ Object
-
.distance_fast_or_slow(a1, a2, threshold, options) ⇒ Object
:nodoc:.
-
.distance_slow(a1, a2, threshold) ⇒ Object
:nodoc:.
-
.normalized_distance(a1, a2, threshold = nil, options = {}) ⇒ Object
Returns the Levenshtein distance as a number between 0.0 and 1.0.
Class Method Details
.distance(a1, a2, threshold = nil, options = {}) ⇒ Object
Returns the Levenshtein distance between two sequences.
The two sequences can be two strings, two arrays, or two other objects responding to :each. All sequences are by generic (fast) C code.
All objects in the sequences should respond to :hash and :eql?.
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
# File 'lib/levenshtein.rb', line 40 def self.distance(a1, a2, threshold=nil, ={}) a1, a2 = a1.scan(/./), a2.scan(/./) if String === a1 and String === a2 a1, a2 = Util.pool(a1, a2) # Handle some basic circumstances. return 0 if a1 == a2 return a2.size if a1.empty? return a1.size if a2.empty? if threshold return nil if (a1.size-a2.size) >= threshold return nil if (a2.size-a1.size) >= threshold return nil if (a1-a2).size >= threshold return nil if (a2-a1).size >= threshold end # Remove the common prefix and the common postfix. l1 = a1.size l2 = a2.size offset = 0 no_more_optimizations = true while offset < l1 and offset < l2 and a1[offset].equal?(a2[offset]) offset += 1 no_more_optimizations = false end while offset < l1 and offset < l2 and a1[l1-1].equal?(a2[l2-1]) l1 -= 1 l2 -= 1 no_more_optimizations = false end if no_more_optimizations distance_fast_or_slow(a1, a2, threshold, ) else l1 -= offset l2 -= offset a1 = a1[offset, l1] a2 = a2[offset, l2] distance(a1, a2, threshold, ) end end |
.distance_fast(rb_o1, rb_o2, rb_threshold) ⇒ Object
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
# File 'ext/levenshtein/levenshtein_fast.c', line 4
VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
VALUE *p1, *p2;
long l1, l2;
long col, row;
int threshold;
int *prev_row, *curr_row, *temp_row;
int curr_row_min, result;
int value1, value2;
/* Be sure that all equivalent objects in rb_o1 and rb_o2 (a.eql?(b) == true) are taken from a pool (a.equal?(b) == true). */
/* This is done in levenshtein.rb by means of Util.pool. */
/* Get the sizes of both arrays. */
l1 = RARRAY_LEN(rb_o1);
l2 = RARRAY_LEN(rb_o2);
/* Get the pointers of both arrays. */
p1 = RARRAY_PTR(rb_o1);
p2 = RARRAY_PTR(rb_o2);
/* Convert Ruby's threshold to C's threshold. */
if (!NIL_P(rb_threshold)) {
threshold = FIX2INT(rb_threshold);
} else {
threshold = -1;
}
/* The Levenshtein algorithm itself. */
/* s1= */
/* ERIK */
/* */
/* 01234 */
/* s2=V 11234 */
/* E 21234 */
/* E 32234 */
/* N 43334 <- prev_row */
/* S 54444 <- curr_row */
/* T 65555 */
/* R 76566 */
/* A 87667 */
/* Allocate memory for both rows */
prev_row = (int*) ALLOC_N(int, (l1+1));
curr_row = (int*) ALLOC_N(int, (l1+1));
/* Initialize the current row. */
for (col=0; col<=l1; col++) {
curr_row[col] = col;
}
for (row=1; row<=l2; row++) {
/* Copy the current row to the previous row. */
temp_row = prev_row;
prev_row = curr_row;
curr_row = temp_row;
/* Calculate the values of the current row. */
curr_row[0] = row;
curr_row_min = row;
for (col=1; col<=l1; col++) {
/* Equal (cost=0) or substitution (cost=1). */
value1 = prev_row[col-1] + ((p1[col-1] == p2[row-1]) ? 0 : 1);
/* Insertion if it's cheaper than substitution. */
value2 = prev_row[col]+1;
if (value2 < value1) {
value1 = value2;
}
/* Deletion if it's cheaper than substitution. */
value2 = curr_row[col-1]+1;
if (value2 < value1) {
value1 = value2;
}
/* Keep track of the minimum value on this row. */
if (value1 < curr_row_min) {
curr_row_min = value1;
}
curr_row[col] = value1;
}
/* Return nil as soon as we exceed the threshold. */
if (threshold > -1 && curr_row_min >= threshold) {
free(prev_row);
free(curr_row);
return Qnil;
}
}
/* The result is the last value on the last row. */
result = curr_row[l1];
free(prev_row);
free(curr_row);
/* Return the Ruby version of the result. */
return INT2FIX(result);
}
|
.distance_fast_or_slow(a1, a2, threshold, options) ⇒ Object
:nodoc:
91 92 93 94 95 96 97 |
# File 'lib/levenshtein.rb', line 91 def self.distance_fast_or_slow(a1, a2, threshold, ) # :nodoc: if respond_to?(:distance_fast) and [:force_slow] distance_fast(a1, a2, threshold) # Implemented in C. else distance_slow(a1, a2, threshold) # Implemented in Ruby. end end |
.distance_slow(a1, a2, threshold) ⇒ Object
:nodoc:
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
# File 'lib/levenshtein.rb', line 99 def self.distance_slow(a1, a2, threshold) # :nodoc: crow = (0..a1.size).to_a 1.upto(a2.size) do |y| prow = crow crow = [y] 1.upto(a1.size) do |x| crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[x-1].equal?(a2[y-1]) ? 0 : 1)].min end # Stop analysing this sequence as soon as the best possible # result for this sequence is bigger than the best result so far. # (The minimum value in the next row will be equal to or greater # than the minimum value in this row.) return nil if threshold and crow.min >= threshold end crow[-1] end |
.normalized_distance(a1, a2, threshold = nil, options = {}) ⇒ Object
Returns the Levenshtein distance as a number between 0.0 and 1.0. It’s basically the Levenshtein distance divided by the size of the longest sequence.
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
# File 'lib/levenshtein.rb', line 10 def self.normalized_distance(a1, a2, threshold=nil, ={}) size = [a1.size, a2.size].max if a1.size == 0 and a2.size == 0 0.0 elsif a1.size == 0 a2.size.to_f/size elsif a2.size == 0 a1.size.to_f/size else if threshold if d = self.distance(a1, a2, (threshold*size).to_i+1) d.to_f/size else nil end else self.distance(a1, a2).to_f/size end end end |