Class: Whistlepig::Index
- Inherits:
-
Object
- Object
- Whistlepig::Index
- Defined in:
- lib/whistlepig.rb,
ext/whistlepig/whistlepig.c
Overview
A full-text index. You can add entries to it, and you can run queries against it.
To add documents, create Entry objects and call add_entry. Entries represent the document before addition; add_entry will return an integer docid and the entry can be discarded at that point.
To run queries, the simplest option is to call Index#search or Index#each_result_for.
The more complex option is to use setup_query, run_query, and teardown_query, in that order. The advantage of this approach is that run_query can be called multiple times, and each call will return more results, allowing for query pagination.
Instance Attribute Summary collapse
- #pathname_base ⇒ Object readonly
Class Method Summary collapse
-
.create(pathname_base) ⇒ Object
Creates a new index, raising an error if it already exists.
-
.delete!(pathname_base) ⇒ Object
Deletes the index with base pathname
pathname_base
from disk. -
.exists?(pathname_base) ⇒ Boolean
Returns true iff an index with base pathname of
pathname_base
exists on disk. -
.load(pathname_base) ⇒ Object
Loads a new index, raising an error if it doesn’t exist.
-
.new(pathname_base) ⇒ Object
Creates or loads a new index.
Instance Method Summary collapse
-
#add_entry(entry) ⇒ Object
Adds the entry
entry
to the index. -
#add_label(doc_id, label) ⇒ Object
Adds the label
label
to the document corresponding to doc id doc_id
in the index. -
#close ⇒ Object
Closes the index, flushing all changes to disk.
-
#count(query) ⇒ Object
Returns the number of entries matched by
query
, which should be a Query object. -
#each_result_for(query, chunk_size = 10) ⇒ Object
Runs a query and yields each matching doc id.
- #initialize(v_pathname_base) ⇒ Object constructor
-
#remove_label(doc_id, label) ⇒ Object
Removes the label
label
from the document corresponding to doc id doc_id
in the index. -
#run_query(query, max_num_results) ⇒ Object
Runs a query which has been first passed to setup_query, and returns an array of at most
max_num_results
doc ids. -
#search(query, max_results = nil) ⇒ Object
Convenience method.
-
#setup_query(query) ⇒ Object
Initializes query for use with run_query.
-
#size ⇒ Object
Returns the number of entries in the index.
-
#teardown_query(query) ⇒ Object
Releases any held state used by the query, if it has been first passed to setup_query.
Constructor Details
#initialize(v_pathname_base) ⇒ Object
160 161 162 163 |
# File 'ext/whistlepig/whistlepig.c', line 160
/*
 * Index#initialize: stores the base pathname on the receiver as the
 * @pathname_base instance variable and hands the receiver back.
 */
static VALUE index_init(VALUE self, VALUE v_pathname_base) {
  (void)rb_iv_set(self, "@pathname_base", v_pathname_base);

  return self;
}
|
Instance Attribute Details
#pathname_base ⇒ Object (readonly)
Class Method Details
.create(pathname_base) ⇒ Object
Creates a new index, raising an error if it already exists. The on-disk representation will be multiple files starting with pathname_base
.
78 79 80 81 82 83 84 85 86 87 88 89 90 |
# File 'ext/whistlepig/whistlepig.c', line 78
/*
 * Index.create(pathname_base): creates a new index via wp_index_create, wraps
 * it in a Ruby object of class +class+ (freed through index_free), and runs the
 * Ruby-level initializer with the pathname.
 *
 * Raises TypeError if +v_pathname_base+ is not a String, and whatever
 * RAISE_IF_NECESSARY raises when wp_index_create reports an error.
 */
static VALUE index_create(VALUE class, VALUE v_pathname_base) {
  Check_Type(v_pathname_base, T_STRING);

  // StringValueCStr, unlike RSTRING_PTR, guarantees a NUL-terminated buffer
  // and raises ArgumentError on an embedded NUL instead of silently
  // truncating the path.
  char* pathname_base = strdup(StringValueCStr(v_pathname_base));
  if(pathname_base == NULL) rb_memerror(); // does not return

  // NOTE(review): the copy is handed to wp_index_create; presumably the index
  // keeps it -- confirm who releases it when creation fails.
  wp_index* index;
  wp_error* e = wp_index_create(&index, pathname_base);
  RAISE_IF_NECESSARY(e);

  VALUE o_index = Data_Wrap_Struct(class, NULL, index_free, index);
  VALUE argv[1] = { v_pathname_base };
  rb_obj_call_init(o_index, 1, argv);
  return o_index;
}
|
.delete!(pathname_base) ⇒ Object
Deletes the index with base pathname pathname_base
from disk. Does nothing if the index does not exist. If that index is currently loaded in memory, expect to see segfaults when you try to access it.
137 138 139 140 141 142 143 144 |
# File 'ext/whistlepig/whistlepig.c', line 137
/*
 * Index.delete!(pathname_base): asks wp_index_delete to remove the index with
 * the given base pathname and returns the pathname argument unchanged.
 *
 * Raises TypeError if +v_pathname_base+ is not a String, and whatever
 * RAISE_IF_NECESSARY raises when wp_index_delete reports an error.
 */
static VALUE index_delete(VALUE class, VALUE v_pathname_base) {
  Check_Type(v_pathname_base, T_STRING);

  // StringValueCStr guarantees NUL termination and rejects embedded NULs,
  // which RSTRING_PTR does not.
  wp_error* e = wp_index_delete(StringValueCStr(v_pathname_base));
  RAISE_IF_NECESSARY(e);

  return v_pathname_base;
}
|
.exists?(pathname_base) ⇒ Boolean
Returns true iff an index with base pathname of pathname_base
exists on disk.
122 123 124 125 126 127 |
# File 'ext/whistlepig/whistlepig.c', line 122
/*
 * Index.exists?(pathname_base): returns Qtrue when wp_index_exists reports a
 * non-zero result for the given base pathname, Qfalse otherwise.
 *
 * Raises TypeError if +v_pathname_base+ is not a String.
 */
static VALUE index_exists(VALUE class, VALUE v_pathname_base) {
  Check_Type(v_pathname_base, T_STRING);

  // StringValueCStr guarantees NUL termination and rejects embedded NULs,
  // which RSTRING_PTR does not.
  return wp_index_exists(StringValueCStr(v_pathname_base)) ? Qtrue : Qfalse;
}
|
.load(pathname_base) ⇒ Object
Loads a new index, raising an error if it doesn’t exist. The on-disk representation will be multiple files starting with pathname_base
.
101 102 103 104 105 106 107 108 109 110 111 112 113 |
# File 'ext/whistlepig/whistlepig.c', line 101
/*
 * Index.load(pathname_base): loads an index via wp_index_load, wraps it in a
 * Ruby object of class +class+ (freed through index_free), and runs the
 * Ruby-level initializer with the pathname.
 *
 * Raises TypeError if +v_pathname_base+ is not a String, and whatever
 * RAISE_IF_NECESSARY raises when wp_index_load reports an error.
 */
static VALUE index_load(VALUE class, VALUE v_pathname_base) {
  Check_Type(v_pathname_base, T_STRING);

  // StringValueCStr, unlike RSTRING_PTR, guarantees a NUL-terminated buffer
  // and raises ArgumentError on an embedded NUL instead of silently
  // truncating the path.
  char* pathname_base = strdup(StringValueCStr(v_pathname_base));
  if(pathname_base == NULL) rb_memerror(); // does not return

  // NOTE(review): the copy is handed to wp_index_load; presumably the index
  // keeps it -- confirm who releases it when loading fails.
  wp_index* index;
  wp_error* e = wp_index_load(&index, pathname_base);
  RAISE_IF_NECESSARY(e);

  VALUE o_index = Data_Wrap_Struct(class, NULL, index_free, index);
  VALUE argv[1] = { v_pathname_base };
  rb_obj_call_init(o_index, 1, argv);
  return o_index;
}
|
.new(pathname_base) ⇒ Object
Creates or loads a new index. The on-disk representation will be multiple files starting with pathname_base
.
The index may later be explicitly closed with Index#close. It will also be automatically closed when Ruby exits.
52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
# File 'ext/whistlepig/whistlepig.c', line 52
/*
 * Index.new(pathname_base): loads the index when wp_index_exists reports one
 * at the given base pathname and creates it otherwise, then wraps the result
 * in a Ruby object of class +class+ (freed through index_free) and runs the
 * Ruby-level initializer with the pathname.
 *
 * Raises TypeError if +v_pathname_base+ is not a String, and whatever
 * RAISE_IF_NECESSARY raises when loading or creating reports an error.
 */
static VALUE index_new(VALUE class, VALUE v_pathname_base) {
  Check_Type(v_pathname_base, T_STRING);

  // StringValueCStr, unlike RSTRING_PTR, guarantees a NUL-terminated buffer
  // and raises ArgumentError on an embedded NUL instead of silently
  // truncating the path.
  char* pathname_base = StringValueCStr(v_pathname_base);

  char* pathname_copy = strdup(pathname_base);
  if(pathname_copy == NULL) rb_memerror(); // does not return

  // NOTE(review): the copy is handed to the load/create call; presumably the
  // index keeps it -- confirm who releases it on the error path.
  wp_index* index;
  wp_error* e;
  if(wp_index_exists(pathname_base)) e = wp_index_load(&index, pathname_copy);
  else e = wp_index_create(&index, pathname_copy);
  RAISE_IF_NECESSARY(e);

  VALUE o_index = Data_Wrap_Struct(class, NULL, index_free, index);
  VALUE argv[1] = { v_pathname_base };
  rb_obj_call_init(o_index, 1, argv);
  return o_index;
}
|
Instance Method Details
#add_entry(entry) ⇒ Object
Adds the entry entry
to the index. Returns the document id corresponding to this entry.
275 276 277 278 279 280 281 282 283 284 285 286 287 288 |
# File 'ext/whistlepig/whistlepig.c', line 275
/*
 * Index#add_entry(entry): passes the wrapped wp_entry to wp_index_add_entry
 * and returns the resulting document id as a Ruby Integer.
 *
 * Raises TypeError unless +v_entry+ is exactly a Whistlepig::Entry, and
 * whatever RAISE_IF_NECESSARY raises when wp_index_add_entry reports an error.
 */
static VALUE index_add_entry(VALUE self, VALUE v_entry) {
  if(CLASS_OF(v_entry) != c_entry) {
    rb_raise(rb_eTypeError, "entry must be a Whistlepig::Entry object"); // would be nice to support subclasses somehow...
    // not reached
  }

  wp_index* index; Data_Get_Struct(self, wp_index, index);
  wp_entry* entry; Data_Get_Struct(v_entry, wp_entry, entry);

  uint64_t doc_id = 0;
  wp_error* e = wp_index_add_entry(index, entry, &doc_id);
  RAISE_IF_NECESSARY(e);

  // doc_id is 64 bits wide; INT2NUM would narrow it to int first.
  return ULL2NUM(doc_id);
}
|
#add_label(doc_id, label) ⇒ Object
Adds the label label
to the document corresponding to doc id doc_id
in the index. label
must be a String. If the label has already been added to the document, does nothing.
297 298 299 300 301 302 303 304 305 306 |
# File 'ext/whistlepig/whistlepig.c', line 297
/*
 * Index#add_label(doc_id, label): forwards the label and document id to
 * wp_index_add_label and returns the label argument unchanged.
 *
 * Raises TypeError unless +v_doc_id+ is a Fixnum and +v_label+ is a String,
 * and whatever RAISE_IF_NECESSARY raises when wp_index_add_label reports an
 * error.
 */
static VALUE index_add_label(VALUE self, VALUE v_doc_id, VALUE v_label) {
  Check_Type(v_doc_id, T_FIXNUM);
  Check_Type(v_label, T_STRING);

  wp_index* index; Data_Get_Struct(self, wp_index, index);

  // Doc ids are handed out as uint64_t by add_entry and run_query, so convert
  // at full width; NUM2INT rejected every id above INT_MAX.
  // NOTE(review): assumes wp_index_add_label takes a 64-bit doc id -- confirm
  // against its prototype.
  uint64_t doc_id = NUM2ULL(v_doc_id);

  // StringValueCStr guarantees NUL termination and rejects embedded NULs,
  // which RSTRING_PTR does not.
  wp_error* e = wp_index_add_label(index, StringValueCStr(v_label), doc_id);
  RAISE_IF_NECESSARY(e);

  return v_label;
}
|
#close ⇒ Object
Closes the index, flushing all changes to disk. Future calls to this index may result in a segfault.
195 196 197 198 199 200 201 |
# File 'ext/whistlepig/whistlepig.c', line 195 static VALUE index_close(VALUE self) { wp_index* index; Data_Get_Struct(self, wp_index, index); wp_error* e = wp_index_unload(index); RAISE_IF_NECESSARY(e); return Qnil; } |
#count(query) ⇒ Object
Returns the number of entries matched by query
, which should be a Query object. Note that in the current implementation, this is almost as expensive as retrieving all the results directly.
173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 |
# File 'ext/whistlepig/whistlepig.c', line 173
/*
 * Index#count(query): counts the entries matched by +v_query+ using
 * wp_index_count_results and returns the count as a Ruby Integer.
 *
 * Raises TypeError unless +v_query+ is exactly a Whistlepig::Query, and
 * whatever RAISE_IF_NECESSARY raises when counting reports an error.
 */
static VALUE index_count(VALUE self, VALUE v_query) {
  if(CLASS_OF(v_query) != c_query) {
    rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object"); // would be nice to support subclasses somehow...
    // not reached
  }

  wp_index* index; Data_Get_Struct(self, wp_index, index);
  wp_query* query; Data_Get_Struct(v_query, wp_query, query);
  uint32_t num_results = 0;

  // clone the query because we don't want to interrupt any search state
  // which may otherwise be being used for pagination.
  // NOTE(review): the clone is never released here; confirm whether
  // wp_index_count_results takes ownership of it or whether this leaks.
  wp_error* e = wp_index_count_results(index, wp_query_clone(query), &num_results);
  RAISE_IF_NECESSARY(e);

  // num_results is unsigned; INT2NUM would misreport counts above INT_MAX.
  return UINT2NUM(num_results);
}
|
#each_result_for(query, chunk_size = 10) ⇒ Object
Runs a query and yields each matching doc id. Handles the mechanics of setting up and tearing down the query.
21 22 23 24 25 26 27 28 29 30 31 32 33 |
# File 'lib/whistlepig.rb', line 21 def each_result_for query, chunk_size=10 setup_query query begin while true results = run_query query, chunk_size results.each { |r| yield r } break if results.size < chunk_size end ensure teardown_query query end self end |
#remove_label(doc_id, label) ⇒ Object
Removes the label label
from the document corresponding to doc id doc_id
in the index. label
must be a String. If the label has not been added to the document, does nothing.
315 316 317 318 319 320 321 322 323 324 |
# File 'ext/whistlepig/whistlepig.c', line 315
/*
 * Index#remove_label(doc_id, label): forwards the label and document id to
 * wp_index_remove_label and returns the label argument unchanged.
 *
 * Raises TypeError unless +v_doc_id+ is a Fixnum and +v_label+ is a String,
 * and whatever RAISE_IF_NECESSARY raises when wp_index_remove_label reports an
 * error.
 */
static VALUE index_remove_label(VALUE self, VALUE v_doc_id, VALUE v_label) {
  Check_Type(v_doc_id, T_FIXNUM);
  Check_Type(v_label, T_STRING);

  wp_index* index; Data_Get_Struct(self, wp_index, index);

  // Doc ids are handed out as uint64_t by add_entry and run_query, so convert
  // at full width; NUM2INT rejected every id above INT_MAX.
  // NOTE(review): assumes wp_index_remove_label takes a 64-bit doc id --
  // confirm against its prototype.
  uint64_t doc_id = NUM2ULL(v_doc_id);

  // StringValueCStr guarantees NUL termination and rejects embedded NULs,
  // which RSTRING_PTR does not.
  wp_error* e = wp_index_remove_label(index, StringValueCStr(v_label), doc_id);
  RAISE_IF_NECESSARY(e);

  return v_label;
}
|
#run_query(query, max_num_results) ⇒ Object
Runs a query which has been first passed to setup_query, and returns an array of at most max_num_results
doc ids. Can be called multiple times to retrieve successive results from the query. The query must have been passed to setup_query first, or terrible things will happen. The query must be passed to teardown_query when done, or memory leaks will occur.
556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 |
# File 'ext/whistlepig/whistlepig.c', line 556
/*
 * Index#run_query(query, max_num_results): fetches the next batch of at most
 * +v_max_num_results+ doc ids for a query and returns them as a Ruby Array of
 * Integers.
 *
 * Raises TypeError unless +v_max_num_results+ is a Fixnum and +v_query+ is
 * exactly a Whistlepig::Query, ArgumentError if the limit is negative,
 * NoMemoryError if the result buffer cannot be allocated, and whatever
 * RAISE_IF_NECESSARY raises when wp_index_run_query reports an error.
 */
static VALUE index_run_query(VALUE self, VALUE v_query, VALUE v_max_num_results) {
  Check_Type(v_max_num_results, T_FIXNUM);
  if(CLASS_OF(v_query) != c_query) {
    rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object"); // would be nice to support subclasses somehow...
    // not reached
  }

  // A negative limit would wrap to a huge uint32_t and drive an enormous
  // allocation, so reject it before converting.
  int requested = NUM2INT(v_max_num_results);
  if(requested < 0) {
    rb_raise(rb_eArgError, "max_num_results must be non-negative");
    // not reached
  }
  uint32_t max_num_results = (uint32_t)requested;

  wp_index* index; Data_Get_Struct(self, wp_index, index);
  wp_query* query; Data_Get_Struct(v_query, wp_query, query);

  // calloc checks the count * size product for overflow. Always ask for at
  // least one slot so the callee receives a valid pointer even for a limit
  // of zero.
  uint64_t* results = calloc(max_num_results > 0 ? max_num_results : 1, sizeof(*results));
  if(results == NULL) rb_memerror(); // does not return

  uint32_t num_results = 0;
  wp_error* e = wp_index_run_query(index, query, max_num_results, &num_results, results);
  if(e != NULL) {
    // Release the buffer first: raising unwinds past this frame and would
    // otherwise leak it.
    free(results);
    RAISE_IF_NECESSARY(e);
    // NOTE(review): assumes RAISE_IF_NECESSARY raises for every non-NULL
    // error, making this return unreachable -- confirm against the macro.
    return Qnil;
  }

  VALUE array = rb_ary_new2(num_results);
  for(uint32_t i = 0; i < num_results; i++) {
    // doc ids are 64 bits wide; INT2NUM would narrow them to int first.
    rb_ary_store(array, i, ULL2NUM(results[i]));
  }

  free(results);
  return array;
}
|
#search(query, max_results = nil) ⇒ Object
Convenience method. Runs a query and returns up to max_results
matching doc ids. Handles the mechanics of setting up and tearing down the query.
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
# File 'lib/whistlepig.rb', line 38 def search query, max_results=nil setup_query query ret = [] num_per_call = max_results || 100 begin while true results = run_query query, num_per_call ret += results break if max_results || results.size < num_per_call end ensure teardown_query query end ret end |
#setup_query(query) ⇒ Object
Initializes query for use with run_query. If you do not call teardown_query on this query later, you will leak memory.
510 511 512 513 514 515 516 517 518 519 520 521 522 |
# File 'ext/whistlepig/whistlepig.c', line 510
/*
 * Index#setup_query(query): hands the wrapped query to wp_index_setup_query
 * and returns the receiver.
 *
 * Raises TypeError unless +v_query+ is exactly a Whistlepig::Query, and
 * whatever RAISE_IF_NECESSARY raises when setup reports an error.
 */
static VALUE index_setup_query(VALUE self, VALUE v_query) {
  wp_index* idx;
  wp_query* q;
  wp_error* e;

  if(CLASS_OF(v_query) != c_query) {
    // subclasses of Query are not accepted; rb_raise does not return
    rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object");
  }

  Data_Get_Struct(self, wp_index, idx);
  Data_Get_Struct(v_query, wp_query, q);

  e = wp_index_setup_query(idx, q);
  RAISE_IF_NECESSARY(e);

  return self;
}
|
#size ⇒ Object
Returns the number of entries in the index.
150 151 152 153 154 155 156 157 158 |
# File 'ext/whistlepig/whistlepig.c', line 150 static VALUE index_size(VALUE self) { wp_index* index; Data_Get_Struct(self, wp_index, index); uint64_t num_docs; wp_error* e = wp_index_num_docs(index, &num_docs); RAISE_IF_NECESSARY(e); return INT2NUM(num_docs); } |
#teardown_query(query) ⇒ Object
Releases any held state used by the query, if it has been first passed to setup_query. If you call run_query on this query after calling this function, terrible things will happen.
531 532 533 534 535 536 537 538 539 540 541 542 543 |
# File 'ext/whistlepig/whistlepig.c', line 531
/*
 * Index#teardown_query(query): hands the wrapped query to
 * wp_index_teardown_query and returns the receiver.
 *
 * Raises TypeError unless +v_query+ is exactly a Whistlepig::Query, and
 * whatever RAISE_IF_NECESSARY raises when teardown reports an error.
 */
static VALUE index_teardown_query(VALUE self, VALUE v_query) {
  wp_index* idx;
  wp_query* q;
  wp_error* e;

  if(CLASS_OF(v_query) != c_query) {
    // subclasses of Query are not accepted; rb_raise does not return
    rb_raise(rb_eTypeError, "query must be a Whistlepig::Query object");
  }

  Data_Get_Struct(self, wp_index, idx);
  Data_Get_Struct(v_query, wp_query, q);

  e = wp_index_teardown_query(idx, q);
  RAISE_IF_NECESSARY(e);

  return self;
}
|