Class: AhoCorasick::KeywordTree

Inherits:
Object
  • Object
show all
Defined in:
ext/ruby-ahocorasick.c

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Object

Creates a new KeywordTree

require 'ahocorasick'
kwt = AhoCorasick::KeywordTree.new


58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'ext/ruby-ahocorasick.c', line 58

/*
 * Initializer listed on this page as KeywordTree#initialize.
 *
 * Allocates the per-object bookkeeping record and a fresh tree via
 * ac_alloc(), stores the record in the object's data pointer and returns
 * self. Starting state: next automatic id is 1, the dictionary is empty and
 * the tree is not frozen.
 */
static VALUE
rb_kwt_init(VALUE self)
{
  // Same allocation order as before: the record first, then the tree.
  struct kwt_struct_data *data = ALLOC(struct kwt_struct_data);
  AC_STRUCT *ac_tree = ac_alloc();

  data->is_frozen       = 0;
  data->dictionary_size = 0;
  data->last_id         = 1;
  data->tree            = ac_tree;

  // Attach the fully populated record to the Ruby object.
  DATA_PTR(self) = data;
  return self;
}

Class Method Details

.from_file ⇒ Object

Creates a new KeywordTree and loads the dictionary from a file

% cat dict0.txt
foo
bar
base

k= AhoCorasick::KeywordTree.from_file "dict0.txt"
k.search("basement").size # => 1


266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
# File 'ext/ruby-ahocorasick.c', line 266

/*
 * Creates a new KeywordTree and loads its dictionary from a file, one entry
 * per line.
 *
 * Takes exactly one argument: the path of the dictionary file.
 * Raises RuntimeError when the file cannot be opened.
 * Returns the newly created tree.
 *
 * Entries are numbered sequentially in file order, starting from 0, and the
 * tree's next automatic id is left at (number of lines read) + 1.
 */
static VALUE
rb_kwt_new_from_file(int argc, VALUE *argv, VALUE klass)
{ 

  // TODO: 
  //  * use rb_kwt_add_string
  //  * use rb_io* to handle the file

  struct kwt_struct_data *kwt_data;
  char word[1024];
  size_t length;
  int id;
  VALUE self;
  VALUE f_string;
  FILE *dictionary;

  rb_scan_args(argc, argv, "10", &f_string);
 
  id = 0;
  SafeStringValue( f_string );
  self= rb_class_new_instance( 0, NULL, klass );
  KeywordTree( self, kwt_data );

  dictionary = fopen( RSTRING( f_string )->ptr, "r" );
  if(dictionary == NULL)
    rb_raise(rb_eRuntimeError, "Cannot open `%s\". No such file?", RSTRING(f_string)->ptr);

  while(fgets(word, sizeof word, dictionary) != NULL) {
    // fgets keeps the line terminator when there is one. Strip it only if it
    // is really there: the last line of a file may have no trailing newline
    // (and an over-long line is split by the buffer), in which case blindly
    // subtracting one would cut off the entry's final character.
    length = strlen(word);
    if(length > 0 && word[length - 1] == '\n')
      length--;

    // NOTE(review): the result of ac_add_string is ignored here, while
    // rb_kwt_add_string treats a 0 result as a failure; ids also start at 0
    // here although add_string only accepts ids >= 1 - confirm both are
    // intended.
    ac_add_string(kwt_data->tree, word, (int) length, id++);
    kwt_data->dictionary_size++;
  }

  kwt_data->last_id= id+1;
  fclose(dictionary);
  return self;
}

Instance Method Details

#add_string ⇒ Object Also known as: <<

Adds a sequence to this KeywordTree.

kwt.add_string("foo1$21^ 98N3 ba>Z")
kwt << "bar" # using the alias

Note: you can also specify the id, a number between 1 and k

kwt.add_string "bar", 123 # => 123

This id should be unique in the context of the current tree.

Returns the id of the inserted object.

kwt.add_string("test", 18) # => 18
kwt.add_string("baz") # => 19


217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# File 'ext/ruby-ahocorasick.c', line 217

/*
 * Adds a string to the tree.
 *
 * Arguments: the string (mandatory, must be a String) and an optional id.
 * When the id is omitted or nil, the tree's next automatic id is used.
 *
 * Raises RuntimeError when
 *   - the tree is frozen,
 *   - the id is given but is not a Fixnum, or is not greater than zero,
 *   - ac_add_string reports failure (returns 0).
 *
 * On success the next automatic id becomes id + 1, the dictionary size is
 * incremented, and the id that was used is returned as a Fixnum.
 */
static VALUE
rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
{ 
  VALUE v_string, v_id, v_id_text;
  struct kwt_struct_data *kwt_data;
  char * string;
  int id;

  rb_scan_args(argc, argv, "11", &v_string, &v_id);
 
  Check_Type(v_string, T_STRING);
  string= RSTRING(v_string)->ptr;
  KeywordTree(self, kwt_data);

  if(kwt_data->is_frozen == 1)
    rb_raise(rb_eRuntimeError, "Cannot add `%s\" into a frozen tree.", string);

  if(v_id == Qnil) {
    id = kwt_data->last_id;
  } else if(TYPE(v_id) != T_FIXNUM) {
    // v_id may be any kind of object at this point (Float, Symbol, Array...),
    // so it must be converted to a String before its bytes are read for the
    // message; reading it as a String directly is only valid for Strings.
    v_id_text = rb_obj_as_string(v_id);
    rb_raise(rb_eRuntimeError, "Please use a number from 1 to K as id, or leave nil to auto-generate one. `%s\" given.", RSTRING(v_id_text)->ptr);
  } else if(NUM2INT(v_id) <= 0) {
    rb_raise(rb_eRuntimeError, "Please use a number from 1 to K as id, or leave nil to auto-generate one. `%d\" given.", NUM2INT(v_id));
  } else {
    id= NUM2INT(v_id);
  }
  
  // NOTE(review): the length is taken with strlen, so a String containing a
  // NUL byte is added only up to that byte - confirm this is intended.
  if(ac_add_string(kwt_data->tree, string, strlen(string), id) == 0)
    rb_raise(rb_eRuntimeError, "Failed to add `%s\", duplicate id `%d\"?", string, id);

  // the next automatic id continues right after the one just used
  kwt_data->last_id= id + 1;
  kwt_data->dictionary_size++;
  return INT2FIX(id);
}

#find_all ⇒ Object Also known as: search

Search the current tree.

It returns an array of hashes, e.g.

[ { :id => int, :value => int, :starts_at => int, :ends_at => int}, { ... } ]

Returns an empty array when the search didn’t return any result.

# assuming a valid KeywordTree kwt object:
kwt.add_string("one")
kwt.add_string("two")

kwt.search( "moved two times already" ).each  do | result |
  result[:id] # => 2
  result[:ends_at] # => 9
  result[:starts_at] # => 6
  result[:value] # => two
end # => 1


131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# File 'ext/ruby-ahocorasick.c', line 131

/*
 * Searches the given string for every entry of the tree.
 *
 * Takes one mandatory String argument. Returns an Array with one Hash per
 * match, using the keys held in sym_id, sym_starts_at, sym_ends_at and
 * sym_value; the Array is empty when nothing matched or when the dictionary
 * is empty.
 *
 * The tree is frozen (ac_prep) for the duration of the search when it is
 * not frozen already, and is marked as open again once the search is done.
 * Raises RuntimeError when ac_prep fails.
 */
static VALUE
rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
{
  char * remain;        // returned by ac_search; its first lgt bytes are stored as the match value
  int lgt, id, ends_at; // filled in by ac_search: the length of the match, its id, and the raw end position
  VALUE v_result;  // one result, as hash
  VALUE v_results; // all the results, an array

  VALUE v_search;  // search string, function argument
  struct kwt_struct_data *kwt_data;
  
  // one mandatory argument.
  rb_scan_args(argc, argv, "1", &v_search);
  // it should be string.
  Check_Type(v_search, T_STRING);
  // get the structure
  KeywordTree(self, kwt_data);
  // prepare the return value
  v_results= rb_ary_new();
  // fail quickly and return the empty array. This check comes before the
  // freeze step on purpose: returning after is_frozen was set would skip the
  // "reopen" at the end and leave an empty tree frozen, so that later
  // add_string calls would be rejected.
  if(kwt_data->dictionary_size == 0) 
    return v_results;
  // freeze the tree, if not already
  if(kwt_data->is_frozen == 0) {
    if(ac_prep( kwt_data->tree ) == 0) 
      rb_raise(rb_eRuntimeError, "Cannot freeze the tree");
    kwt_data->is_frozen = 1;
  }
  // prepare the search
  ac_search_init(kwt_data->tree, RSTRING( v_search )->ptr, RSTRING( v_search )->len);
  // loop through the results
  while((remain= ac_search(kwt_data->tree, &lgt, &id, &ends_at)) != NULL) {
    // this is an individual result as a hash
    v_result= rb_hash_new();
    rb_hash_aset( v_result, sym_id,        INT2FIX(id) );
    rb_hash_aset( v_result, sym_starts_at, INT2FIX( ends_at - lgt - 1 ) );
    rb_hash_aset( v_result, sym_ends_at,   INT2FIX( ends_at - 1 ) );
    rb_hash_aset( v_result, sym_value, rb_str_new(remain, lgt) );
    rb_ary_push( v_results, v_result );
  }
  // reopen the tree
  kwt_data->is_frozen= 0;
  return v_results;
}

#make ⇒ Object

It freezes the current KeywordTree.

Note: This method is called internally by search

require 'ahocorasick'

kwt = AhoCorasick::KeywordTree.new

kwt.add_string("one")
kwt.add_string("two")
kwt.make()


90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'ext/ruby-ahocorasick.c', line 90

/*
 * Freezes the tree.
 *
 * Returns true when the tree is already frozen, or once ac_prep() has
 * reported success (1) and the tree has been marked as frozen.
 * Raises RuntimeError ("Cannot freeze the tree") for any other ac_prep()
 * result.
 */
static VALUE
rb_kwt_make(VALUE self)
{
  struct kwt_struct_data *data;
  KeywordTree(self, data);

  if(data->is_frozen != 1) {
    // rb_raise does not return, so the flag is only set after a successful prep
    if(ac_prep( data->tree ) != 1)
      rb_raise(rb_eRuntimeError, "Cannot freeze the tree");
    data->is_frozen = 1;
  }

  return Qtrue;
}

#size ⇒ Object

Returns the size of this KeywordTree

kwt.add_string("foo")
kwt.add_string("bar")
kwt.size #=> 2


187
188
189
190
191
192
193
194
# File 'ext/ruby-ahocorasick.c', line 187

/*
 * Returns the tree's dictionary_size counter as a Fixnum.
 */
static VALUE
rb_kwt_size(VALUE self)
{
  struct kwt_struct_data *data;

  // fetch the bookkeeping record attached to this object
  KeywordTree(self, data);
  return INT2FIX(data->dictionary_size);
}