Module: Levenshtein

Defined in:
lib/levenshtein.rb,
lib/levenshtein/version.rb,
lib/levenshtein/exception.rb

Defined Under Namespace

Classes: LevenshteinException

Constant Summary collapse

VERSION =
"0.2.1"

Class Method Summary collapse

Class Method Details

.distance(a1, a2, threshold = nil) ⇒ Object

Returns the Levenshtein distance between two sequences.

The two sequences can be two strings, two arrays, or two other objects. Strings, arrays and arrays of strings are handled with optimized (very fast) C code. All other sequences are handled with generic (fast) C code.

The sequences should respond to :length and :[] and all objects in the sequences (as returned by []) should response to :==.



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/levenshtein.rb', line 37

def self.distance(a1, a2, threshold=nil)
  a1, a2	= a2, a1	if a1.length > a2.length	# a1 is the short one; a2 is the long one.

  # Handle some basic circumstances.

  return 0		if a1 == a2
  return a2.length	if a1.length == 0

  if threshold
    return nil	if (a2.length-a1.length) >= threshold

    a3, a4	= nil, nil
    a3, a4	= a1, a2			if a1.respond_to?(:-) and a2.respond_to?(:-)
    a3, a4	= a1.scan(/./), a2.scan(/./)	if a1.respond_to?(:scan) and a2.respond_to?(:scan)

    if a3 and a4
      return nil	if (a3-a4).length >= threshold
      return nil	if (a4-a3).length >= threshold
    end
  end

  distance_fast_or_slow(a1, a2, threshold)
end

.distance_fast(rb_o1, rb_o2, rb_threshold) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
# File 'ext/levenshtein/levenshtein_fast.c', line 4

VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
  if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2)) == T_STRING) {
    return levenshtein_distance_string(self, rb_o1, rb_o2, rb_threshold);
  } else if ((TYPE(rb_o1) == T_ARRAY) && (TYPE(rb_o2)) == T_ARRAY) {
    if ((TYPE(rb_ary_entry(rb_o1, 0)) == T_STRING) && (TYPE(rb_ary_entry(rb_o2, 0))) == T_STRING) {
      return levenshtein_distance_array_of_strings(self, rb_o1, rb_o2, rb_threshold);
    } else {
      return levenshtein_distance_array(self, rb_o1, rb_o2, rb_threshold);
    }
  } else {
    return levenshtein_distance_generic(self, rb_o1, rb_o2, rb_threshold);
  }
}

.distance_fast_or_slow(a1, a2, threshold) ⇒ Object

:nodoc:



61
62
63
64
65
66
67
# File 'lib/levenshtein.rb', line 61

def self.distance_fast_or_slow(a1, a2, threshold)	# :nodoc:
  if respond_to?(:distance_fast)
    distance_fast(a1, a2, threshold)	# Implemented in C.
  else
    distance_slow(a1, a2, threshold)	# Implemented in Ruby.
  end
end

.distance_slow(a1, a2, threshold) ⇒ Object

:nodoc:



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/levenshtein.rb', line 69

def self.distance_slow(a1, a2, threshold)	# :nodoc:
  l1	= a1.length
  l2	= a2.length

  offset	= 0
 
  while offset < l1 and offset < l2 and a1[offset] == a2[offset]
    offset += 1
  end
 
  while offset < l1 and offset < l2 and a1[l1-1] == a2[l2-1]
    l1 -= 1
    l2 -= 1
  end
 
  l1 -= offset
  l2 -= offset

  crow	= (0..l1).to_a

  1.upto(l2) do |y|
    prow	= crow
    crow	= [y]

    1.upto(l1) do |x|
      crow[x]	= [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[offset+x-1]==a2[offset+y-1] ? 0 : 1)].min
    end

    # Stop analysing this sequence as soon as the best possible
    # result for this sequence is bigger than the best result so far.
    # (The minimum value in the next row will be equal to or greater
    # than the minimum value in this row.)

    return nil	if threshold and crow.min >= threshold
  end

  crow[-1]
end

.normalized_distance(a1, a2, threshold = nil) ⇒ Object

Returns the Levenshtein distance as a number between 0.0 and 1.0. It’s basically the Levenshtein distance divided by the length of the longest sequence.



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# File 'lib/levenshtein.rb', line 9

def self.normalized_distance(a1, a2, threshold=nil)
  a1, a2	= a2, a1	if a1.length > a2.length	# a1 is the short one; a2 is the long one.

  if a2.length == 0
    0.0	# Since a1.length < a2.length, a1 must be empty as well.
  else
    if threshold
      if d = self.distance(a1, a2, (threshold*a2.length+1).to_i)
        d.to_f/a2.length
      else
        nil
      end
    else
      self.distance(a1, a2).to_f/a2.length
    end
  end
end