Class: Whistlepig::Index

Inherits:
Object
  • Object
show all
Defined in:
lib/whistlepig.rb,
ext/whistlepig/whistlepig.c

Overview

A full-text index. You can add entries to it, and you can run queries against it.

To add documents, create Entry objects and call add_entry. Entries represent the document before addition; add_entry will return an integer docid and the entry can be discarded at that point.

To run queries, the simplest option is to call Index#search or Index#each_result_for.

The more complex option is to use setup_query, run_query, and teardown_query, in that order. The advantage of this approach is that run_query can be called multiple times, and each call will return more results, allowing for query pagination.

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(v_pathname_base) ⇒ Object



160
161
162
163
# File 'ext/whistlepig/whistlepig.c', line 160

/*
 * Initializer for index objects.
 *
 * Records v_pathname_base in the receiver's @pathname_base instance
 * variable and returns the receiver.
 */
static VALUE index_init(VALUE self, VALUE v_pathname_base)
{
  rb_iv_set(self, "@pathname_base", v_pathname_base);

  return self;
}

Instance Attribute Details

#pathname_base ⇒ Object (readonly)

Class Method Details

.create(pathname_base) ⇒ Object

Creates a new index, raising an error if it already exists. The on-disk representation will be multiple files starting with pathname_base.



78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'ext/whistlepig/whistlepig.c', line 78

/*
 * Creates a new on-disk index and wraps it in a Ruby object of class `class`.
 *
 * v_pathname_base must be a String (checked with Check_Type) that contains no
 * embedded NUL bytes (checked by StringValueCStr). Any error returned by
 * wp_index_create is passed to RAISE_IF_NECESSARY. On success the wrapped
 * object's initializer is called with v_pathname_base and the object is
 * returned.
 */
static VALUE index_create(VALUE class, VALUE v_pathname_base) {
  Check_Type(v_pathname_base, T_STRING);

  // RSTRING_PTR would silently truncate a path at an embedded NUL byte;
  // StringValueCStr rejects such strings and guarantees NUL termination.
  const char* pathname_base = StringValueCStr(v_pathname_base);

  wp_index* index;
  // NOTE(review): a private copy of the path is handed to wp_index_create;
  // presumably the index takes ownership of it -- confirm, in particular for
  // the error path.
  wp_error* e = wp_index_create(&index, strdup(pathname_base));
  RAISE_IF_NECESSARY(e);

  VALUE o_index = Data_Wrap_Struct(class, NULL, index_free, index);
  VALUE argv[1] = { v_pathname_base };
  rb_obj_call_init(o_index, 1, argv);
  return o_index;
}

.delete!(pathname_base) ⇒ Object

Deletes the index with base pathname pathname_base from disk. Does nothing if the index does not exist. If that index is currently loaded in memory, expect to see segfaults when you try to access it.



137
138
139
140
141
142
143
144
# File 'ext/whistlepig/whistlepig.c', line 137

/*
 * Deletes the index identified by v_pathname_base via wp_index_delete.
 *
 * v_pathname_base must be a String (checked with Check_Type) that contains no
 * embedded NUL bytes (checked by StringValueCStr). Any error returned by
 * wp_index_delete is passed to RAISE_IF_NECESSARY. Returns v_pathname_base.
 */
static VALUE index_delete(VALUE class, VALUE v_pathname_base) {
  Check_Type(v_pathname_base, T_STRING);

  // StringValueCStr rejects embedded NULs, so the path that gets deleted is
  // exactly the one the caller passed rather than a truncated prefix of it.
  wp_error* e = wp_index_delete(StringValueCStr(v_pathname_base));
  RAISE_IF_NECESSARY(e);

  return v_pathname_base;
}

.exists?(pathname_base) ⇒ Boolean

Returns true iff an index with base pathname of pathname_base exists on disk.

Returns:

  • (Boolean)


122
123
124
125
126
127
# File 'ext/whistlepig/whistlepig.c', line 122

/*
 * Returns Qtrue when wp_index_exists reports a non-zero result for
 * v_pathname_base, and Qfalse otherwise.
 *
 * v_pathname_base must be a String (checked with Check_Type) that contains no
 * embedded NUL bytes (checked by StringValueCStr).
 */
static VALUE index_exists(VALUE class, VALUE v_pathname_base) {
  Check_Type(v_pathname_base, T_STRING);

  // StringValueCStr rejects embedded NULs instead of letting the C layer test
  // a silently truncated path.
  return wp_index_exists(StringValueCStr(v_pathname_base)) ? Qtrue : Qfalse;
}

.load(pathname_base) ⇒ Object

Loads a new index, raising an error if it doesn’t exist. The on-disk representation will be multiple files starting with pathname_base.



101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'ext/whistlepig/whistlepig.c', line 101

/*
 * Loads an existing on-disk index and wraps it in a Ruby object of class
 * `class`.
 *
 * v_pathname_base must be a String (checked with Check_Type) that contains no
 * embedded NUL bytes (checked by StringValueCStr). Any error returned by
 * wp_index_load is passed to RAISE_IF_NECESSARY. On success the wrapped
 * object's initializer is called with v_pathname_base and the object is
 * returned.
 */
static VALUE index_load(VALUE class, VALUE v_pathname_base) {
  Check_Type(v_pathname_base, T_STRING);

  // RSTRING_PTR would silently truncate a path at an embedded NUL byte;
  // StringValueCStr rejects such strings and guarantees NUL termination.
  const char* pathname_base = StringValueCStr(v_pathname_base);

  wp_index* index;
  // NOTE(review): a private copy of the path is handed to wp_index_load;
  // presumably the index takes ownership of it -- confirm, in particular for
  // the error path.
  wp_error* e = wp_index_load(&index, strdup(pathname_base));
  RAISE_IF_NECESSARY(e);

  VALUE o_index = Data_Wrap_Struct(class, NULL, index_free, index);
  VALUE argv[1] = { v_pathname_base };
  rb_obj_call_init(o_index, 1, argv);
  return o_index;
}

.new(pathname_base) ⇒ Object

Creates or loads a new index. The on-disk representation will be multiple files starting with pathname_base.

The index may later be explicitly closed with Index#close. It will also be automatically closed when Ruby exits.



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'ext/whistlepig/whistlepig.c', line 52

/*
 * Opens the index at v_pathname_base, loading it when wp_index_exists reports
 * that it is present and creating it otherwise, then wraps it in a Ruby
 * object of class `class`.
 *
 * v_pathname_base must be a String (checked with Check_Type) that contains no
 * embedded NUL bytes (checked by StringValueCStr). Any error returned by the
 * load or create call is passed to RAISE_IF_NECESSARY. On success the wrapped
 * object's initializer is called with v_pathname_base and the object is
 * returned.
 */
static VALUE index_new(VALUE class, VALUE v_pathname_base) {
  Check_Type(v_pathname_base, T_STRING);

  wp_index* index;
  wp_error* e;
  // RSTRING_PTR would silently truncate a path at an embedded NUL byte;
  // StringValueCStr rejects such strings and guarantees NUL termination.
  char* pathname_base = StringValueCStr(v_pathname_base);

  // NOTE(review): each branch hands a private copy of the path to the index;
  // presumably the index takes ownership of it -- confirm, in particular for
  // the error path.
  if(wp_index_exists(pathname_base)) e = wp_index_load(&index, strdup(pathname_base));
  else e = wp_index_create(&index, strdup(pathname_base));
  RAISE_IF_NECESSARY(e);

  VALUE o_index = Data_Wrap_Struct(class, NULL, index_free, index);
  VALUE argv[1] = { v_pathname_base };
  rb_obj_call_init(o_index, 1, argv);
  return o_index;
}

Instance Method Details

#add_entry(entry) ⇒ Object

Adds the entry entry to the index. Returns the document id corresponding to this entry.



275
276
277
278
279
280
281
282
283
284
285
286
287
288
# File 'ext/whistlepig/whistlepig.c', line 275

/*
 * Adds the entry wrapped by v_entry to the index wrapped by self and returns
 * the resulting document id as a Ruby Integer.
 *
 * Raises TypeError unless the class of v_entry is exactly c_entry. Any error
 * returned by wp_index_add_entry is passed to RAISE_IF_NECESSARY.
 */
static VALUE index_add_entry(VALUE self, VALUE v_entry) {
  if(CLASS_OF(v_entry) != c_entry) {
    rb_raise(rb_eTypeError, "entry must be a Whistlepig::Entry object"); // would be nice to support subclasses somehow...
    // not reached
  }

  wp_index* index; Data_Get_Struct(self, wp_index, index);
  wp_entry* entry; Data_Get_Struct(v_entry, wp_entry, entry);
  uint64_t doc_id;
  wp_error* e = wp_index_add_entry(index, entry, &doc_id);
  RAISE_IF_NECESSARY(e);

  // doc_id is 64 bits wide; INT2NUM would squeeze it through an int and
  // corrupt any id above INT_MAX.
  return ULL2NUM(doc_id);
}

#add_label(doc_id, label) ⇒ Object

Adds the label label to the document corresponding to doc id doc_id in the index. label must be a String. If the label has already been added to the document, does nothing.



297
298
299
300
301
302
303
304
305
306
# File 'ext/whistlepig/whistlepig.c', line 297

/*
 * Adds the label v_label to the document v_doc_id in the index wrapped by
 * self, via wp_index_add_label. Returns v_label.
 *
 * v_doc_id must be a Fixnum and v_label a String (both checked with
 * Check_Type); v_label must not contain embedded NUL bytes (checked by
 * StringValueCStr). Any error returned by wp_index_add_label is passed to
 * RAISE_IF_NECESSARY.
 */
static VALUE index_add_label(VALUE self, VALUE v_doc_id, VALUE v_label) {
  Check_Type(v_doc_id, T_FIXNUM);
  Check_Type(v_label, T_STRING);

  // StringValueCStr rejects labels that would be silently cut short at an
  // embedded NUL byte.
  const char* label = StringValueCStr(v_label);

  // Document ids are handed out as uint64_t by index_add_entry; NUM2INT
  // raised a RangeError for every id above INT_MAX.
  // NOTE(review): assumes wp_index_add_label accepts a 64-bit doc id -- confirm
  // against its prototype.
  uint64_t doc_id = NUM2ULL(v_doc_id);

  wp_index* index; Data_Get_Struct(self, wp_index, index);
  wp_error* e = wp_index_add_label(index, label, doc_id);
  RAISE_IF_NECESSARY(e);

  return v_label;
}

#close ⇒ Object

Closes the index, flushing all changes to disk. Future calls to this index may result in a segfault.



195
196
197
198
199
200
201
# File 'ext/whistlepig/whistlepig.c', line 195

/*
 * Unloads the index wrapped by self through wp_index_unload.
 *
 * Any error returned by wp_index_unload is passed to RAISE_IF_NECESSARY.
 * Returns nil.
 */
static VALUE index_close(VALUE self)
{
  wp_index* idx;
  wp_error* err;

  Data_Get_Struct(self, wp_index, idx);

  err = wp_index_unload(idx);
  RAISE_IF_NECESSARY(err);

  return Qnil;
}

#count(query) ⇒ Object

Returns the number of entries matched by query, which should be a Query object. Note that in the current implementation, this is almost as expensive as retrieving all the results directly.



173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# File 'ext/whistlepig/whistlepig.c', line 173

/*
 * Counts the results of the query wrapped by v_query against the index
 * wrapped by self and returns the count as a Ruby Integer.
 *
 * Raises TypeError unless the class of v_query is exactly c_query. Any error
 * returned by wp_index_count_results is passed to RAISE_IF_NECESSARY.
 */
static VALUE index_count(VALUE self, VALUE v_query) {
  if(CLASS_OF(v_query) != c_query) {
    rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object"); // would be nice to support subclasses somehow...
    // not reached
  }

  wp_index* index; Data_Get_Struct(self, wp_index, index);
  wp_query* query; Data_Get_Struct(v_query, wp_query, query);
  uint32_t num_results;
  // clone the query because we don't want to interrupt any search state
  // which may otherwise be being used for pagination.
  // NOTE(review): the clone is never released in this function; unless
  // wp_index_count_results takes ownership of it, this leaks one query per
  // call -- confirm and free it here if needed.
  wp_error* e = wp_index_count_results(index, wp_query_clone(query), &num_results);
  RAISE_IF_NECESSARY(e);

  // num_results is unsigned; INT2NUM would turn counts above INT_MAX negative.
  return UINT2NUM(num_results);
}

#each_result_for(query, chunk_size = 10) ⇒ Object

Runs a query and yields each matching doc id. Handles the mechanics of setting up and tearing down the query.



21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/whistlepig.rb', line 21

## Yields every doc id matched by +query+, fetching them from run_query in
## batches of +chunk_size+. The query is set up before the first batch and
## torn down afterwards, even if the block raises. Returns self.
def each_result_for(query, chunk_size = 10)
  setup_query query
  begin
    exhausted = false
    until exhausted
      batch = run_query query, chunk_size
      batch.each { |doc_id| yield doc_id }
      # a short batch means the query has no further results
      exhausted = batch.size < chunk_size
    end
  ensure
    teardown_query query
  end
  self
end

#remove_label(doc_id, label) ⇒ Object

Removes the label label from the document corresponding to doc id doc_id in the index. label must be a String. If the label has not been added to the document, does nothing.



315
316
317
318
319
320
321
322
323
324
# File 'ext/whistlepig/whistlepig.c', line 315

/*
 * Removes the label v_label from the document v_doc_id in the index wrapped
 * by self, via wp_index_remove_label. Returns v_label.
 *
 * v_doc_id must be a Fixnum and v_label a String (both checked with
 * Check_Type); v_label must not contain embedded NUL bytes (checked by
 * StringValueCStr). Any error returned by wp_index_remove_label is passed to
 * RAISE_IF_NECESSARY.
 */
static VALUE index_remove_label(VALUE self, VALUE v_doc_id, VALUE v_label) {
  Check_Type(v_doc_id, T_FIXNUM);
  Check_Type(v_label, T_STRING);

  // StringValueCStr rejects labels that would be silently cut short at an
  // embedded NUL byte.
  const char* label = StringValueCStr(v_label);

  // Document ids are handed out as uint64_t by index_add_entry; NUM2INT
  // raised a RangeError for every id above INT_MAX.
  // NOTE(review): assumes wp_index_remove_label accepts a 64-bit doc id --
  // confirm against its prototype.
  uint64_t doc_id = NUM2ULL(v_doc_id);

  wp_index* index; Data_Get_Struct(self, wp_index, index);
  wp_error* e = wp_index_remove_label(index, label, doc_id);
  RAISE_IF_NECESSARY(e);

  return v_label;
}

#run_query(query, max_num_results) ⇒ Object

Runs a query which has been first passed to setup_query, and returns an array of at most max_num_results doc ids. Can be called multiple times to retrieve successive results from the query. The query must have been passed to setup_query first, or terrible things will happen. The query must be passed to teardown_query when done, or memory leaks will occur.



556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
# File 'ext/whistlepig/whistlepig.c', line 556

/*
 * Fetches the next batch of results for a query and returns them as a Ruby
 * Array of doc ids holding at most v_max_num_results elements.
 *
 * v_max_num_results must be a non-negative Fixnum that fits in an int;
 * ArgumentError is raised for negative values. TypeError is raised unless the
 * class of v_query is exactly c_query. NoMemoryError is raised when the result
 * buffer cannot be allocated. Any error returned by wp_index_run_query is
 * passed to RAISE_IF_NECESSARY after the result buffer has been released.
 */
static VALUE index_run_query(VALUE self, VALUE v_query, VALUE v_max_num_results) {
  Check_Type(v_max_num_results, T_FIXNUM);
  if(CLASS_OF(v_query) != c_query) {
    rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object"); // would be nice to support subclasses somehow...
    // not reached
  }

  wp_index* index; Data_Get_Struct(self, wp_index, index);
  wp_query* query; Data_Get_Struct(v_query, wp_query, query);

  // Reject negative sizes up front: stored in a uint32_t they would wrap
  // around to a multi-gigabyte request.
  int requested = NUM2INT(v_max_num_results);
  if(requested < 0) {
    rb_raise(rb_eArgError, "max_num_results must be non-negative");
    // not reached
  }

  uint32_t max_num_results = (uint32_t)requested;
  uint32_t num_results = 0;

  // calloc checks the count * size multiplication for overflow. Always ask
  // for at least one slot, because a zero-sized allocation may return NULL
  // without that being an error.
  uint64_t* results = calloc(max_num_results > 0 ? max_num_results : 1, sizeof *results);
  if(results == NULL) {
    rb_raise(rb_eNoMemError, "unable to allocate query result buffer");
    // not reached
  }

  wp_error* e = wp_index_run_query(index, query, max_num_results, &num_results, results);
  if(e != NULL) {
    // Release the buffer before the error is raised; otherwise it would leak,
    // since raising does not come back here to run the free() below.
    free(results);
    results = NULL;
    num_results = 0;
  }
  RAISE_IF_NECESSARY(e);

  VALUE array = rb_ary_new2(num_results);
  for(uint32_t i = 0; i < num_results; i++) {
    // doc ids are 64 bits wide; INT2NUM would truncate them to an int
    rb_ary_store(array, i, ULL2NUM(results[i]));
  }
  free(results);

  return array;
}

#search(query, max_results = nil) ⇒ Object

Convenience method. Runs a query and returns up to max_results matching doc ids. Handles the mechanics of setting up and tearing down the query.



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/whistlepig.rb', line 38

## Runs +query+ and returns an Array of matching doc ids.
##
## When +max_results+ is given, a single run_query call is made with that
## limit and its results are returned. When it is nil, results are fetched
## 100 at a time until a short page signals that the query is exhausted.
##
## The query is set up before the first call and torn down afterwards, even
## if an exception is raised.
def search query, max_results=nil
  setup_query query
  ret = []
  num_per_call = max_results || 100
  begin
    while true
      results = run_query query, num_per_call
      # append in place; `ret += results` re-copied the whole accumulator on
      # every page
      ret.concat results
      break if max_results || results.size < num_per_call
    end
  ensure
    teardown_query query
  end

  ret
end

#setup_query(query) ⇒ Object

Initializes query for use with run_query. If you do not call teardown_query on this query later, you will leak memory.



510
511
512
513
514
515
516
517
518
519
520
521
522
# File 'ext/whistlepig/whistlepig.c', line 510

/*
 * Passes the query wrapped by v_query to wp_index_setup_query for the index
 * wrapped by self.
 *
 * Raises TypeError unless the class of v_query is exactly c_query. Any error
 * returned by wp_index_setup_query is passed to RAISE_IF_NECESSARY.
 * Returns self.
 */
static VALUE index_setup_query(VALUE self, VALUE v_query)
{
  wp_index* idx;
  wp_query* qry;
  wp_error* err;

  // exact class match only; subclasses are not accepted
  if(CLASS_OF(v_query) != c_query) {
    rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object");
  }

  Data_Get_Struct(self, wp_index, idx);
  Data_Get_Struct(v_query, wp_query, qry);

  err = wp_index_setup_query(idx, qry);
  RAISE_IF_NECESSARY(err);

  return self;
}

#size ⇒ Object

Returns the number of entries in the index.



150
151
152
153
154
155
156
157
158
# File 'ext/whistlepig/whistlepig.c', line 150

/*
 * Returns the document count reported by wp_index_num_docs for the index
 * wrapped by self, as a Ruby Integer.
 *
 * Any error returned by wp_index_num_docs is passed to RAISE_IF_NECESSARY.
 */
static VALUE index_size(VALUE self) {
  wp_index* index;
  Data_Get_Struct(self, wp_index, index);

  uint64_t num_docs;
  wp_error* e = wp_index_num_docs(index, &num_docs);
  RAISE_IF_NECESSARY(e);

  // num_docs is 64 bits wide; INT2NUM would squeeze it through an int and
  // misreport any count above INT_MAX.
  return ULL2NUM(num_docs);
}

#teardown_query(query) ⇒ Object

Releases any held state used by the query, if it has been first passed to setup_query. If you call run_query on this query after calling this function, terrible things will happen.



531
532
533
534
535
536
537
538
539
540
541
542
543
# File 'ext/whistlepig/whistlepig.c', line 531

/*
 * Passes the query wrapped by v_query to wp_index_teardown_query for the
 * index wrapped by self.
 *
 * Raises TypeError unless the class of v_query is exactly c_query. Any error
 * returned by wp_index_teardown_query is passed to RAISE_IF_NECESSARY.
 * Returns self.
 */
static VALUE index_teardown_query(VALUE self, VALUE v_query)
{
  wp_index* idx;
  wp_query* qry;
  wp_error* err;

  // exact class match only; subclasses are not accepted
  if(CLASS_OF(v_query) != c_query) {
    rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object");
  }

  Data_Get_Struct(self, wp_index, idx);
  Data_Get_Struct(v_query, wp_query, qry);

  err = wp_index_teardown_query(idx, qry);
  RAISE_IF_NECESSARY(err);

  return self;
}