Class: RSI::Indexer

Inherits: Object
Includes: Loggable
Defined in: lib/rsi/index.rb

Overview

Document index. Interface for adding documents to an index and for querying it.
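A minimal usage sketch (the require path, index location, and document URIs here are illustrative assumptions, not part of the documented API):

require 'rsi'

index = RSI::Indexer.new( "/tmp/my_index" )           # root directory for index files
index.add_document( "doc:1", "the quick brown fox" )  # tokenize and index
index.add_document( "doc:2", "the lazy dog" )
index.flush()                                         # serialize index to storage
index.find_all( "quick fox" )                         # => ["doc:1"]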

Constant Summary

META_FILE = "meta.yaml"
DOCS_FILE = "docs.list"

Instance Attribute Summary

Instance Method Summary

Methods included from Loggable

#logger

Constructor Details

#initialize(root) ⇒ Indexer

Returns a new instance of Indexer.



# File 'lib/rsi/index.rb', line 31

def initialize( root )
  @root = root
  @docs = {}
  @meta = { :next_docid => 0 }
  @serializer = RSI::NativeSerializer.new()
  @analyzer = RSI::DefaultTextAnalyzer.new()
  @query_analyzer = RSI::DefaultTextAnalyzer.new()
  @dicts = {}
  @opened = false
end

Instance Attribute Details

#analyzer ⇒ Object

Analyzer to use for document tokenization.



# File 'lib/rsi/index.rb', line 26

def analyzer
  @analyzer
end

#dicts ⇒ Object

Dictionaries of terms, keyed by field name.



# File 'lib/rsi/index.rb', line 26

def dicts
  @dicts
end

#query_analyzer ⇒ Object

Analyzer to use for query tokenization.



# File 'lib/rsi/index.rb', line 26

def query_analyzer
  @query_analyzer
end

#root ⇒ Object (readonly)

Root directory under which the index is stored.



# File 'lib/rsi/index.rb', line 24

def root
  @root
end

#serializer ⇒ Object

Serializer used to read and write index data.



# File 'lib/rsi/index.rb', line 26

def serializer
  @serializer
end

Instance Method Details

#add_document(doc_uri, content) ⇒ Object

Add a document to the index.



# File 'lib/rsi/index.rb', line 83

def add_document( doc_uri, content )
  open() unless @opened
  logger.info("Adding document #{doc_uri}")
  if @docs.has_value?( doc_uri )
    raise IndexException, "Cannot do updates yet"
  else
    docid = next_docid()
    @docs[ docid ] = doc_uri
    pos = 0
    logger.debug("Tokenizing")
    @analyzer.tokenize( content ).each do |field, termlist|
      # Collect positions per termid, scoped to this field so entries
      # from one field are not re-added to the next field's dictionary.
      term_entries = {}
      termlist.each do |term|
        termid = @dicts[field].get_termid_for(term, true)
        raise IndexException, "no termid for term '#{term}'" if termid == nil
        term_entries[termid] ||= []
        term_entries[termid] << pos
        pos += 1
      end
      logger.debug("Adding term entries to #{field}")
      term_entries.each do |termid, pos_list|
        @dicts[field].add_term_entries(docid, termid, pos_list)
      end
    end
  end
end
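For illustration, the analyzer is expected to return a hash mapping each field name to its term list; the :text field below is an assumption, since the actual fields come from the analyzer's get_field_types():

index.add_document( "file:///home/me/notes.txt", "the quick brown fox" )
# @analyzer.tokenize(content) yields something shaped like
#   { :text => ["the", "quick", "brown", "fox"] }
# and each term's positions within the document are recorded in the
# dictionary for that field.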

#delete_document(doc_uri) ⇒ Object

Remove a document from the index (slow!). Not yet implemented; currently raises.



# File 'lib/rsi/index.rb', line 113

def delete_document( doc_uri )
  open() unless @opened
  raise "This is too hard for me, yet"
end

#find_all(terms_str) ⇒ Object

Return a list of document URIs which contain all of the given search terms (AND query). The terms will be tokenized by the current query analyzer.



# File 'lib/rsi/index.rb', line 145

def find_all( terms_str )
  q = @query_analyzer.tokenize_query( terms_str )
  logger.debug( "Query=#{q.to_s}" )
  docids = q.evaluate( self )
  docids.uniq!
  return docids.collect {|id| @docs[id]}
end
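For example (the return value is illustrative; results are the URIs originally passed to #add_document):

index.find_all( "brown fox" )   # => ["file:///home/me/notes.txt"]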

#find_any(terms_str) ⇒ Object

Return a list of document ids which contain any of the given search terms (OR query). The terms will be tokenized by the current Analyzer. Not yet implemented; currently raises.



# File 'lib/rsi/index.rb', line 132

def find_any( terms_str )
  open() unless @opened
  raise "unimplemented"
end

#flushObject

Stop adding documents to the index, and serialize to storage.



# File 'lib/rsi/index.rb', line 119

def flush()
  open() unless @opened
  logger.info("Finishing")
  store_doclist()
  @dicts.each do |field, dict|
    dict.store()
  end
end
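A typical indexing session ends with #flush, since documents are held in memory until it runs (a sketch):

index.add_document( "doc:1", "hello world" )
index.add_document( "doc:2", "goodbye world" )
index.flush()   # writes the document list and each field's dictionary under root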

#get_dict_for_field(field) ⇒ Object

Return the dictionary for the given field, or nil if the field is unknown.


# File 'lib/rsi/index.rb', line 137

def get_dict_for_field( field )
  return @dicts[field]
end

#map_field_type(type) ⇒ Object

Maps a field type to the dictionary class used to index fields of that type.



# File 'lib/rsi/index.rb', line 73

def map_field_type( type )
  case type
  when RSI::FIELD_TYPE_TEXT
    return RSI::Dictionary
  when RSI::FIELD_TYPE_DATE
    raise "implement me! XXX"
  end
end
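As used from #open, the returned class is instantiated once per field. A sketch (the "text" field name is an assumption; real field names come from the analyzer):

klass = index.map_field_type( RSI::FIELD_TYPE_TEXT )  # => RSI::Dictionary
dict  = klass.new( File.join( index.root, "text" ) )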

#OLD_find_all(terms_str) ⇒ Object

Older query implementation, superseded by #find_all.


# File 'lib/rsi/index.rb', line 153

def OLD_find_all( terms_str )
  open() unless @opened

  # this querying logic is too fragile
  logger.info { "Query: #{terms_str}" }
  t_set = @query_analyzer.tokenize_query( terms_str )
  logger.debug { "Tokenized: #{t_set}" }
  # build map of docid => term-match-count
  finds = {}
  t_set.each do |field, term_list|
    term_list.each do |term|
      logger.debug { "field='#{field}', term='#{term}'" }
      # lookup termid in dict for field
      unless @dicts[field].has_term?( term )
        logger.info { "No term #{term} in dictionary #{field}" }
        next
      end
      termid = @dicts[field].get_termid_for( term )
      logger.debug { "termid=#{termid}" }
      # get list of entries for termid
      e_list = @dicts[field].get_entry_list( termid )
      # get list of docids
      e_list.each do |e|
        logger.debug { "  docid=#{e.docid}" }
        finds[ e.docid ] = finds[ e.docid ].to_i + 1
      end
    end
  end
  total_terms = 0
  t_set.each_value {|vl| total_terms += vl.size() }
  logger.debug { "Total terms: #{total_terms}" }
  # foreach docid in map: match if term-match-count == terms-count
  d_return = []
  finds.each do |docid, count|
    if count == total_terms
      # return docid
      uri = @docs[ docid ]
      d_return << uri
    end
  end
  return d_return
end

#open ⇒ Object

Open the index: create the root directory if needed, reload existing index data if present, and initialize and open a dictionary for each field reported by the analyzer.


# File 'lib/rsi/index.rb', line 42

def open()
  Dir.mkdir( @root ) unless FileTest.exists?( @root )
  log_fh = File.open( File.join( @root, "index.log" ), 
                      File::WRONLY|File::APPEND|File::CREAT )
  log_fh.sync = true
  logger.info( "Trying to reload index..." )
  begin
    reload()
  rescue
    logger.info( "Reload failed (#{$!}), creating new index" )
    # nothing to do
  end
  # Query the analyzer, getting the fields it tokenizes.
  # Initialize and open a dictionary for each field.
  logger.info( "Assigning dictionaries..." )
  @analyzer.get_field_types().each do |field, type|
    field_root = File.join( @root, field )
    klass = map_field_type( type )
    logger.debug( "Field: #{field} at #{field_root} is #{klass}" )
    @dicts[field] = klass.new( field_root )
    @dicts[field].serializer = @serializer
  end
  logger.info( "Opening dictionaries" )
  @dicts.each do |name, dict|
    logger.debug( "Dictionary: #{name}" )
    dict.open()
  end
  @opened = true
end