Module: BatchJaroWinkler

Extended by:
FFI::Library
Defined in:
lib/batch_jaro_winkler.rb,
lib/batch_jaro_winkler/version.rb,
ext/batch_jaro_winkler/batch_jaro_winkler.c

Defined Under Namespace

Classes: BjwResult, RuntimeModel

Constant Summary collapse

VERSION =
'0.1.3'

Class Method Summary collapse

Class Method Details

.allocate_c_data(nb_candidates, with_min_scores) ⇒ Object

The C buffers yielded to the block are freed automatically when the block returns.



74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/batch_jaro_winkler.rb', line 74

# Allocates the native buffers needed to build a model and yields them as a
# single four-element array:
# [exportable_model_size, c_candidates, c_candidates_lengths, c_min_scores].
# The last element is nil when +with_min_scores+ is false.
#
# Every buffer is created with the block form of FFI::MemoryPointer.new, so
# its lifetime is bounded by the nested blocks below.
#
# @param nb_candidates [Integer] number of slots in each per-candidate buffer
# @param with_min_scores [Boolean] whether to also allocate the float buffer
# @yieldparam buffers [Array] the four buffers described above
def self.allocate_c_data(nb_candidates, with_min_scores)
  FFI::MemoryPointer.new(:uint32, 1, false) do |model_size_ptr|
    FFI::MemoryPointer.new(:pointer, nb_candidates, false) do |candidates_ptr|
      FFI::MemoryPointer.new(:uint32, nb_candidates, false) do |lengths_ptr|
        shared_buffers = [model_size_ptr, candidates_ptr, lengths_ptr]
        if with_min_scores
          FFI::MemoryPointer.new(:float, nb_candidates, false) do |min_scores_ptr|
            yield(shared_buffers + [min_scores_ptr])
          end
        else
          # No per-candidate scores requested: hand the block's value straight
          # back to the caller of this method.
          return yield(shared_buffers + [nil])
        end
      end
    end
  end
end

.build_exportable_model(candidates, opts = {}) ⇒ Object



149
150
151
# File 'lib/batch_jaro_winkler.rb', line 149

# Builds an exportable model from plain Ruby strings.
#
# Delegates to build_exportable_model_bytes with a char_width of 0, which that
# method treats as "candidates are not pre-encoded".
#
# @param candidates [Array] candidates, forwarded untouched
# @param opts [Hash] options, forwarded untouched
# @return [Object] whatever build_exportable_model_bytes returns
def self.build_exportable_model(candidates, opts = {})
  not_pre_encoded = 0
  BatchJaroWinkler.build_exportable_model_bytes(not_pre_encoded, candidates, opts)
end

.build_exportable_model_bytes(char_width, candidates, opts = {}) ⇒ Object

candidates must follow one of these formats:

  • ['hi', 'hello']
  • [{ candidate: 'hi', min_score: 0.5 }, { candidate: 'hello', min_score: 0.8 }]


90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# File 'lib/batch_jaro_winkler.rb', line 90

# Builds an exportable model from +candidates+ and returns it as a String of
# raw bytes read back from the native library.
#
# @param char_width [Integer] 0 when the candidates are regular Ruby strings
#   (they are then encoded to UTF-32LE here and handled with a width of 4), or
#   1, 2 or 4 when the caller already encoded them with that many bytes per
#   character.
# @param candidates [Array<String>, Array<Hash>] either strings, or hashes of
#   the form { candidate: String, min_score: Float }. Only the first element
#   is inspected to decide which of the two formats is in use.
# @param opts [Hash] :nb_runtime_threads (Integer, defaults to 1)
# @return [String] the serialized model
# @raise [ArgumentError] if nb_runtime_threads < 1 or char_width is invalid
# @raise [RuntimeError] if a min_score is outside 0.0..1.0 or the native
#   builder reports a failure
def self.build_exportable_model_bytes(char_width, candidates, opts = {})
  current_version_has_memory_leak = version_with_memory_leak?(RUBY_VERSION)
  nb_runtime_threads = opts[:nb_runtime_threads] || 1
  raise ArgumentError, 'nb_runtime_threads must be > 0' if nb_runtime_threads < 1

  candidates_encoded = char_width != 0
  char_width = 4 unless candidates_encoded
  if char_width != 1 && char_width != 2 && char_width != 4
    raise ArgumentError, 'char_width must be 1, 2 or 4'
  end

  # float size is platform dependent, so don't rely on it being 4
  float_size = 0
  FFI::MemoryPointer.new(:float, 1, false) do |one_float|
    float_size = one_float.size
  end

  nb_candidates = candidates.size
  with_min_scores = false
  if nb_candidates > 0 && candidates[0].respond_to?(:each_pair) && candidates[0].key?(:min_score)
    with_min_scores = true
  end

  exportable_model = nil
  allocate_c_data(nb_candidates, with_min_scores) do |(exportable_model_size, c_candidates, c_candidates_lengths, c_min_scores)|
    # Keep in ruby array also to guarantee that encoded strings are not garbage collected.
    _stored_candidates = Array.new(nb_candidates)
    candidates.each_with_index do |cand, i_cand|
      cand_string = cand
      if with_min_scores
        cand_string = cand[:candidate]
        if cand[:min_score] < 0.0 || cand[:min_score] > 1.0
          raise 'min_score must be >= 0.0 and <= 1.0'
        end
        c_min_scores.put(:float, i_cand * float_size, cand[:min_score])
      end
      if candidates_encoded
        # Retag a copy: force_encoding on the caller's own string would change
        # its encoding behind their back and raise on frozen strings.
        cand_string = cand_string.dup
      else
        cand_string = current_version_has_memory_leak ? encode_utf32_le_without_memory_leak(cand_string) : cand_string.encode('utf-32le')
      end
      # Viewed as ascii, String#size is the byte count.
      cand_string.force_encoding('ascii')
      cand_length = cand_string.size / char_width
      cand_string = FFI::MemoryPointer.from_string(cand_string)
      _stored_candidates[i_cand] = cand_string
      c_candidates.put(:pointer, i_cand * FFI::Pointer.size, cand_string)
      # sizeof(uint32_t) = 4
      c_candidates_lengths.put(:uint32, i_cand * 4, cand_length)
    end

    raw_model = BatchJaroWinkler.bjw_build_exportable_model(c_candidates, char_width, c_candidates_lengths, nb_candidates, c_min_scores, nb_runtime_threads, exportable_model_size)
    # A NULL returned by C reaches Ruby as a (truthy) null FFI::Pointer, so a
    # plain truthiness test cannot detect the failure.
    next if raw_model.nil? || raw_model.null?

    # Will free the raw C exportable model when GC'd
    _gced_exportable_model = FFI::AutoPointer.new(raw_model, BatchJaroWinkler.method(:_bjw_free))
    exportable_model = raw_model.read_string(exportable_model_size.get(:uint32, 0))
  end

  raise 'batch_jaro_winkler.build_exportable_model failed' unless exportable_model
  exportable_model
end

.build_runtime_model(exportable_model) ⇒ Object



153
154
155
# File 'lib/batch_jaro_winkler.rb', line 153

# Wraps an exportable model in a RuntimeModel instance.
#
# @param exportable_model [Object] passed as-is to RuntimeModel.new
# @return [RuntimeModel] the newly created runtime model
def self.build_runtime_model(exportable_model)
  runtime_model = RuntimeModel.new(exportable_model)
  runtime_model
end

.jaro_distance(runtime_model, inp, opts = {}) ⇒ Object



239
240
241
# File 'lib/batch_jaro_winkler.rb', line 239

# Computes Jaro distances for a regular Ruby string.
#
# Delegates to jaro_distance_bytes with a char_width of 0, the value used
# throughout this module for input that has not been pre-encoded.
#
# @param runtime_model [Object] forwarded untouched
# @param inp [String] forwarded untouched
# @param opts [Hash] forwarded untouched
# @return [Object] whatever jaro_distance_bytes returns
def self.jaro_distance(runtime_model, inp, opts = {})
  not_pre_encoded = 0
  BatchJaroWinkler.jaro_distance_bytes(not_pre_encoded, runtime_model, inp, opts)
end

.jaro_distance_bytes(char_width, runtime_model, inp, opts = {}) ⇒ Object



233
234
235
236
237
# File 'lib/batch_jaro_winkler.rb', line 233

# Computes Jaro distances by delegating to jaro_winkler_distance_bytes with
# the :weight and :threshold options explicitly set to nil (that method turns
# nil values into -1.0 before calling the native library; presumably this
# switches the Winkler adjustment off -- confirm against the C library).
#
# The options are merged into a new hash so the caller's +opts+ is never
# modified and can safely be reused or frozen.
#
# @param char_width [Integer] forwarded untouched
# @param runtime_model [Object] forwarded untouched
# @param inp [String] forwarded untouched
# @param opts [Hash] caller options; :weight and :threshold are overridden
# @return [Object] whatever jaro_winkler_distance_bytes returns
def self.jaro_distance_bytes(char_width, runtime_model, inp, opts = {})
  jaro_opts = opts.merge(weight: nil, threshold: nil)
  BatchJaroWinkler.jaro_winkler_distance_bytes(char_width, runtime_model, inp, jaro_opts)
end

.jaro_winkler_distance(runtime_model, inp, opts = {}) ⇒ Object



229
230
231
# File 'lib/batch_jaro_winkler.rb', line 229

# Computes Jaro-Winkler distances for a regular Ruby string.
#
# Delegates to jaro_winkler_distance_bytes with a char_width of 0, which that
# method treats as "input is not pre-encoded".
#
# @param runtime_model [Object] forwarded untouched
# @param inp [String] forwarded untouched
# @param opts [Hash] forwarded untouched
# @return [Object] whatever jaro_winkler_distance_bytes returns
def self.jaro_winkler_distance(runtime_model, inp, opts = {})
  not_pre_encoded = 0
  BatchJaroWinkler.jaro_winkler_distance_bytes(not_pre_encoded, runtime_model, inp, opts)
end

.jaro_winkler_distance_bytes(char_width, runtime_model, inp, opts = {}) ⇒ Object



157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# File 'lib/batch_jaro_winkler.rb', line 157

# Scores +inp+ against the candidates of +runtime_model+ through the native
# library and returns the matches as [candidate, score] pairs.
#
# @param char_width [Integer] 0 when +inp+ is a regular Ruby string (it is
#   then encoded to UTF-32LE here and handled with a width of 4), or 1, 2 or 4
#   when the caller already encoded it with that many bytes per character.
# @param runtime_model [RuntimeModel] only its +model+ reader is used here
# @param inp [String] the string to compare; never modified
# @param opts [Hash] never modified. Recognized keys:
#   :min_score (0.0..1.0), :weight (0.0..0.25, default 0.1),
#   :threshold (0.0..1.0, default 0.7), :n_best_results (>= 0).
#   nil values for :min_score, :weight and :threshold are sent to the native
#   call as -1.0. An explicit :n_best_results of 0 returns [] immediately,
#   while an omitted one is sent as 0.
# @return [Array<Array>] [candidate, score] pairs; candidates are re-encoded
#   to UTF-8 when char_width was 0, raw bytes otherwise
# @raise [ArgumentError] if an option or char_width is out of range
# @raise [RuntimeError] if the native call or the result conversion fails
def self.jaro_winkler_distance_bytes(char_width, runtime_model, inp, opts = {})
  return [] if opts[:n_best_results] == 0

  # Work on a private copy: the defaults and -1.0 sentinels written below used
  # to leak into the caller's hash, so reusing that hash for a second call
  # failed the range validation.
  opts = opts.dup
  opts[:weight] = 0.1 unless opts.key?(:weight)
  opts[:threshold] = 0.7 unless opts.key?(:threshold)
  opts[:n_best_results] = 0 unless opts[:n_best_results]

  if !opts[:min_score].nil? && (opts[:min_score] < 0.0 || opts[:min_score] > 1.0)
    raise ArgumentError, 'min_score must be >= 0.0 and <= 1.0'
  end
  if !opts[:weight].nil? && (opts[:weight] < 0.0 || opts[:weight] > 0.25)
    raise ArgumentError, 'weight must be >= 0.0 and <= 0.25'
  end
  if !opts[:threshold].nil? && (opts[:threshold] < 0.0 || opts[:threshold] > 1.0)
    raise ArgumentError, 'threshold must be >= 0.0 and <= 1.0'
  end
  raise ArgumentError, 'n_best_results must be >= 0' if opts[:n_best_results] < 0

  opts[:min_score] = -1.0 if opts[:min_score].nil?
  opts[:weight] = -1.0 if opts[:weight].nil?
  opts[:threshold] = -1.0 if opts[:threshold].nil?

  inp_encoded = char_width != 0
  char_width = 4 unless inp_encoded
  if char_width != 1 && char_width != 2 && char_width != 4
    raise ArgumentError, 'char_width must be 1, 2 or 4'
  end

  if inp_encoded
    # Retag a copy: force_encoding on the caller's own string would change its
    # encoding behind their back and raise on frozen strings.
    inp = inp.dup
  else
    inp = version_with_memory_leak?(RUBY_VERSION) ? encode_utf32_le_without_memory_leak(inp) : inp.encode('utf-32le')
  end
  # Viewed as ascii, String#size is the byte count.
  inp.force_encoding('ascii')

  c_results = nil
  nb_results = nil
  FFI::MemoryPointer.new(:uint32, 1, false) do |c_nb_results|
    c_results = BatchJaroWinkler.bjw_jaro_winkler_distance(runtime_model.model, inp, inp.size / char_width, opts[:min_score], opts[:weight], opts[:threshold], opts[:n_best_results], c_nb_results)
    nb_results = c_nb_results.get(:uint32, 0)
  end
  # A NULL returned by C reaches Ruby as a (truthy) null FFI::Pointer, so a
  # plain truthiness test cannot detect the failure.
  if c_results.nil? || c_results.null?
    raise 'batch_jaro_winkler.jaro_winkler_distance failed'
  end

  # Will free the raw C results when GC'd
  _gced_results = FFI::AutoPointer.new(c_results, BatchJaroWinkler.method(:_bjw_free))

  # The converter from the C extension is optional; use it when it is defined.
  if BatchJaroWinkler.respond_to?(:rb_bjw_build_runtime_result)
    res = []
    ok = BatchJaroWinkler.rb_bjw_build_runtime_result([], res, c_results.address, nb_results, inp_encoded, char_width)
    raise 'rb_bjw_build_runtime_result failed' unless ok
    return res
  end

  # standard slow ffi version
  typed_results = FFI::Pointer.new(BjwResult, c_results)
  Array.new(nb_results) do |i_result|
    result = BjwResult.new(typed_results[i_result])
    candidate = result[:candidate].read_string(result[:candidate_length] * char_width)
    unless inp_encoded
      candidate.force_encoding('utf-32le')
      candidate = candidate.encode('utf-8')
    end
    [candidate, result[:score]]
  end
end

.rb_bjw_build_runtime_result(tmp_store, rb_results, rb_c_results, rb_nb_results, rb_inp_encoded, rb_char_width) ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'ext/batch_jaro_winkler/batch_jaro_winkler.c', line 14

/*
** Converts a C array of bjw_result into Ruby [candidate, score] pairs that
** are appended to rb_results.
**
** tmp_store       Ruby array used to keep intermediate Ruby objects reachable.
** rb_results      Ruby array receiving one [String, Float] pair per result.
** rb_c_results    Integer holding the address of the bjw_result array.
** rb_nb_results   Number of entries in that array.
** rb_inp_encoded  Truthy when candidates must be returned as raw bytes;
**                 otherwise they are converted from UTF-32LE to UTF-8.
** rb_char_width   Number of bytes per character of the stored candidates.
**
** Returns Qtrue on success, Qfalse when the temporary buffer cannot be
** allocated.
*/
VALUE	rb_bjw_build_runtime_result(VALUE self, VALUE tmp_store, VALUE rb_results, VALUE rb_c_results, VALUE rb_nb_results, VALUE rb_inp_encoded, VALUE rb_char_width)
{
	bjw_result	*results;
	uint32_t	nb_results;
	uint32_t	i_result;
	VALUE		tmp_candidate;
	rb_encoding	*utf32le_encoding;
	rb_encoding	*utf8_encoding;
	VALUE		rb_utf8_encoding;
	uint32_t	char_width;
	int			inp_encoded;
	char		*all_candidates;
	VALUE		rb_all_candidates;
	uint64_t	total_nb_bytes;
	uint64_t	decal;
	uint64_t	bytes_len;
	uint64_t	candidate_length_in_bytes;
	uint64_t	i_char;

	nb_results = (uint32_t)(NUM2ULL(rb_nb_results));
	results = (bjw_result*)(NUM2ULL(rb_c_results));
	char_width = (uint32_t)(NUM2ULL(rb_char_width));
	inp_encoded = RTEST(rb_inp_encoded);

	utf32le_encoding = rb_enc_find("UTF-32LE");
	utf8_encoding = rb_enc_find("UTF-8");
	rb_utf8_encoding = rb_enc_from_encoding(utf8_encoding);
	// We use tmp_store so that local ruby objects are marked by the GC
	rb_ary_push(tmp_store, rb_utf8_encoding);

	if (!inp_encoded)
	{
		// Concatenate every candidate so that a single UTF-32LE -> UTF-8
		// conversion handles all of them at once.
		total_nb_bytes = 0;
		for (i_result = 0; i_result < nb_results; i_result++)
			total_nb_bytes += results[i_result].candidate_length;
		total_nb_bytes *= char_width;
		// malloc(0) is allowed to return NULL, which must not be mistaken for
		// an allocation failure when there is nothing to copy.
		all_candidates = malloc(total_nb_bytes ? total_nb_bytes : 1);
		if (!all_candidates)
			return (Qfalse);
		decal = 0;
		for (i_result = 0; i_result < nb_results; i_result++)
		{
			// Widen before multiplying so the product is computed on 64 bits,
			// like total_nb_bytes above.
			bytes_len = (uint64_t)results[i_result].candidate_length * char_width;
			for (i_char = 0; i_char < bytes_len; i_char++)
				all_candidates[decal + i_char] = ((char*)results[i_result].candidate)[i_char];
			decal += bytes_len;
		}
		rb_all_candidates = rb_enc_str_new(all_candidates, total_nb_bytes, utf32le_encoding);
		// We use tmp_store so that local ruby objects are marked by the GC
		rb_ary_push(tmp_store, rb_all_candidates);
		free(all_candidates);
		rb_all_candidates = rb_str_encode(rb_all_candidates, rb_utf8_encoding, 0, Qnil);
		// We use tmp_store so that local ruby objects are marked by the GC
		rb_ary_push(tmp_store, rb_all_candidates);
		// From here on all_candidates points into the converted UTF-8 string
		all_candidates = RSTRING_PTR(rb_all_candidates);
	}

	decal = 0;
	for (i_result = 0; i_result < nb_results; i_result++)
	{
		if (!inp_encoded)
		{
			// Walk candidate_length characters of the UTF-8 buffer, using the
			// lead byte of each character to know how many bytes it spans.
			candidate_length_in_bytes = 0;
			for (i_char = 0; i_char < results[i_result].candidate_length; i_char++)
			{
				if ((all_candidates[decal + candidate_length_in_bytes] & 0xf8) == 0xf0)
					candidate_length_in_bytes += 4;
				else if ((all_candidates[decal + candidate_length_in_bytes] & 0xf0) == 0xe0)
					candidate_length_in_bytes += 3;
				else if ((all_candidates[decal + candidate_length_in_bytes] & 0xe0) == 0xc0)
					candidate_length_in_bytes += 2;
				else
					candidate_length_in_bytes += 1;
			}
			tmp_candidate = rb_enc_str_new(all_candidates + decal, candidate_length_in_bytes, utf8_encoding);
			decal += candidate_length_in_bytes;
		}
		else
			tmp_candidate = rb_str_new(results[i_result].candidate, results[i_result].candidate_length * char_width);
		rb_ary_push(rb_results, rb_ary_new_from_args(2, tmp_candidate, rb_float_new(results[i_result].score)));
	}
	return (Qtrue);
}