Class: RSI::Indexer
Overview
Document index. Provides an interface for adding documents to an index and for querying it.
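A typical session creates an Indexer rooted at a directory, adds documents, flushes to storage, and queries with #find_all. A minimal sketch, assuming the library is loaded via require 'rsi' and using hypothetical doc: URIs:

require 'rsi'

indexer = RSI::Indexer.new( "/tmp/my_index" )             # root directory for index storage
indexer.add_document( "doc:one", "some text to index" )
indexer.add_document( "doc:two", "more text, more terms" )
indexer.flush()                                           # serialize dictionaries to disk

indexer.find_all( "text" )                                # => URIs of documents containing "text"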
Constant Summary
- META_FILE = "meta.yaml"
- DOCS_FILE = "docs.list"
Instance Attribute Summary
- #analyzer ⇒ Object
  Analyzer to use for document tokenization.
- #dicts ⇒ Object
  Dictionaries of terms, keyed by field name.
- #query_analyzer ⇒ Object
  Analyzer to use for query tokenization.
- #root ⇒ Object (readonly)
  Root directory of the index.
- #serializer ⇒ Object
  Serializer used to persist dictionaries to storage.
Instance Method Summary
- #add_document(doc_uri, content) ⇒ Object
  Add a document to the index.
- #delete_document(doc_uri) ⇒ Object
  Remove a document from the index (slow!).
- #find_all(terms_str) ⇒ Object
  Return a list of document URIs which contain all of the given search terms (AND query).
- #find_any(terms_str) ⇒ Object
  Return a list of document URIs which contain any of the given search terms (OR query).
- #flush ⇒ Object
  Stop adding documents to the index, and serialize to storage.
- #get_dict_for_field(field) ⇒ Object
  Return the dictionary for the given field.
- #initialize(root) ⇒ Indexer (constructor)
  A new instance of Indexer.
- #map_field_type(type) ⇒ Object
  Map a field type to a dictionary class.
- #OLD_find_all(terms_str) ⇒ Object
  Deprecated AND-query implementation.
- #open ⇒ Object
  Open the index, creating it on disk if necessary.
Methods included from Loggable
#logger
Constructor Details
#initialize(root) ⇒ Indexer
Returns a new instance of Indexer.
# File 'lib/rsi/index.rb', line 31

def initialize( root )
  @root = root
  @docs = {}
  @meta = { :next_docid => 0 }
  @serializer = RSI::NativeSerializer.new()
  @analyzer = RSI::DefaultTextAnalyzer.new()
  @query_analyzer = RSI::DefaultTextAnalyzer.new()
  @dicts = {}
  @opened = false
end
Instance Attribute Details
#analyzer ⇒ Object
Analyzer to use for document tokenization.

# File 'lib/rsi/index.rb', line 26

def analyzer
  @analyzer
end
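Since #open asks the analyzer for its field types when the dictionaries are created, a replacement analyzer must be assigned before the index is opened (that is, before the first #add_document or explicit #open call). A sketch, where MyStemmingAnalyzer is a hypothetical class implementing the same interface as RSI::DefaultTextAnalyzer:

indexer = RSI::Indexer.new( "/tmp/my_index" )
indexer.analyzer = MyStemmingAnalyzer.new()    # hypothetical; must respond to
                                               # get_field_types() and tokenize()
indexer.add_document( "doc:one", "content" )   # open() runs here, using the new analyzer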
#dicts ⇒ Object
Dictionaries of terms, keyed by field name.

# File 'lib/rsi/index.rb', line 26

def dicts
  @dicts
end
#query_analyzer ⇒ Object
Analyzer to use for query tokenization.

# File 'lib/rsi/index.rb', line 26

def query_analyzer
  @query_analyzer
end
#root ⇒ Object (readonly)
Root directory of the index.

# File 'lib/rsi/index.rb', line 24

def root
  @root
end
#serializer ⇒ Object
Serializer used to persist dictionaries to storage.

# File 'lib/rsi/index.rb', line 26

def serializer
  @serializer
end
Instance Method Details
#add_document(doc_uri, content) ⇒ Object
Add a document to the index.
# File 'lib/rsi/index.rb', line 83

def add_document( doc_uri, content )
  open() unless @opened
  logger.info("Adding document #{doc_uri}")
  if @docs.has_value?( doc_uri )
    raise IndexException, "Cannot do updates yet"
  else
    docid = next_docid()
    @docs[ docid ] = doc_uri
    pos = 0
    logger.debug("Tokenizing")
    @analyzer.tokenize( content ).each do |field, termlist|
      term_entries = {}  # termid => positions, reset per field so entries
                         # from one field are not re-added to the next
      termlist.each do |term|
        termid = @dicts[field].get_termid_for(term, true)
        raise IndexException, "No termid for '#{term}'" if termid == nil
        term_entries[termid] ||= []
        term_entries[termid] << pos
        pos += 1
      end
      logger.debug("Adding term entries to #{field}")
      term_entries.each do |termid, pos_list|
        @dicts[field].add_term_entries(docid, termid, pos_list)
      end
    end
  end
end
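A minimal call, assuming the content has already been read into a string; the URI is any identifier the caller chooses and is returned verbatim by queries:

content = File.read( "README" )
indexer.add_document( "file:README", content )   # raises IndexException if the
                                                 # URI is already indexed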
#delete_document(doc_uri) ⇒ Object
Remove a document from the index (slow!).
# File 'lib/rsi/index.rb', line 113

def delete_document( doc_uri )
  open() unless @opened
  raise "This is too hard for me, yet"
end
#find_all(terms_str) ⇒ Object
Return a list of document URIs which contain all of the given search terms (AND query). The terms will be tokenized by the current query analyzer.
# File 'lib/rsi/index.rb', line 145

def find_all( terms_str )
  q = @query_analyzer.tokenize_query( terms_str )
  logger.debug( "Query=#{q.to_s}" )
  docids = q.evaluate( self )
  docids.uniq!
  return docids.collect {|id| @docs[id]}
end
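For example, a two-term query returns only documents matching both terms, assuming the default text analyzer:

uris = indexer.find_all( "ruby index" )   # AND: both "ruby" and "index" must match
uris.each {|uri| puts uri }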
#find_any(terms_str) ⇒ Object
Return a list of document URIs which contain any of the given search terms (OR query). The terms will be tokenized by the current query analyzer.
# File 'lib/rsi/index.rb', line 132

def find_any( terms_str )
  open() unless @opened
  raise "unimplemented"
end
#flush ⇒ Object
Stop adding documents to the index, and serialize to storage.
# File 'lib/rsi/index.rb', line 119

def flush()
  open() unless @opened
  logger.info("Finishing")
  store_doclist()
  @dicts.each do |field, dict|
    dict.store()
  end
end
#get_dict_for_field(field) ⇒ Object
# File 'lib/rsi/index.rb', line 137

def get_dict_for_field( field )
  return @dicts[field]
end
#map_field_type(type) ⇒ Object
Maps a field type to the dictionary class used to index fields of that type.
# File 'lib/rsi/index.rb', line 73

def map_field_type( type )
  case type
  when RSI::FIELD_TYPE_TEXT
    return RSI::Dictionary
  when RSI::FIELD_TYPE_DATE
    raise "implement me! XXX"
  end
end
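In the current implementation only text fields are supported:

indexer.map_field_type( RSI::FIELD_TYPE_TEXT )   # => RSI::Dictionary (a class, not an instance)
indexer.map_field_type( RSI::FIELD_TYPE_DATE )   # raises "implement me! XXX"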
#OLD_find_all(terms_str) ⇒ Object
# File 'lib/rsi/index.rb', line 153

def OLD_find_all( terms_str )
  open() unless @opened
  # this querying logic is too fragile
  logger.info { "Query: #{terms_str}" }
  t_set = @query_analyzer.tokenize_query( terms_str )
  logger.debug { "Tokenized: #{t_set}" }
  # build map of docid => term-match-count
  finds = {}
  t_set.each do |field, term_list|
    term_list.each do |term|
      logger.debug { "field='#{field}', term='#{term}'" }
      # lookup termid in dict for field
      unless @dicts[field].has_term?( term )
        logger.info { "No term #{term} in dictionary #{field}" }
        next
      end
      termid = @dicts[field].get_termid_for( term )
      logger.debug { "termid=#{termid}" }
      # get list of entries for termid
      e_list = @dicts[field].get_entry_list( termid )
      # get list of docids
      e_list.each do |e|
        logger.debug { "  docid=#{e.docid}" }
        finds[ e.docid ] = finds[ e.docid ].to_i + 1
      end
    end
  end
  total_terms = 0
  t_set.each_value {|vl| total_terms += vl.size() }
  logger.debug { "Total terms: #{total_terms}" }
  # foreach docid in map: match if term-match-count == terms-count
  d_return = []
  finds.each do |docid, count|
    if count == total_terms
      uri = @docs[ docid ]
      d_return << uri
    end
  end
  return d_return
end
#open ⇒ Object
# File 'lib/rsi/index.rb', line 42

def open()
  Dir.mkdir( @root ) unless FileTest.exists?( @root )
  log_fh = File.open( File.join( @root, "index.log" ),
                      File::WRONLY|File::APPEND|File::CREAT )
  log_fh.sync = true
  logger.info( "Trying to reload index..." )
  begin
    reload()
  rescue
    logger.info( "Reload failed (#{$!}), creating new index" )
    # nothing to do
  end
  # Query the analyzer, getting the fields it tokenizes.
  # Initialize and open a dictionary for each field.
  logger.info( "Assigning dictionaries..." )
  @analyzer.get_field_types().each do |field, type|
    field_root = File.join( @root, field )
    klass = map_field_type( type )
    logger.debug( "Field: #{field} at #{field_root} is #{klass}" )
    @dicts[field] = klass.new( field_root )
    @dicts[field].serializer = @serializer
  end
  logger.info( "Opening dictionaries" )
  @dicts.each do |name, dict|
    logger.debug( "Dictionary: #{name}" )
    dict.open()
  end
  @opened = true
end