Class: RSI::Indexer

Inherits: Object
Includes: Loggable
Defined in: lib/rsi/index.rb

Overview

Document index. Interface for adding documents to an index and for querying it.
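A minimal usage sketch (the require path, index location, and document URIs here are illustrative assumptions, not part of the documented API):

require 'rsi'

index = RSI::Indexer.new( "/tmp/my_index" )           # root directory for index files
index.add_document( "doc:1", "the quick brown fox" )  # tokenize and index
index.add_document( "doc:2", "the lazy dog" )
index.flush()                                         # serialize index to storage
index.find_all( "quick fox" )                         # => ["doc:1"]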

Constant Summary

META_FILE = "meta.yaml"
DOCS_FILE = "docs.list"

Instance Attribute Summary

Instance Method Summary

Methods included from Loggable

#logger

Constructor Details

#initialize(root) ⇒ Indexer

Returns a new instance of Indexer.



# File 'lib/rsi/index.rb', line 31

def initialize( root )
  @root = root
  @docs = {}
  @meta = { :next_docid => 0 }
  @serializer = RSI::NativeSerializer.new()
  @analyzer = RSI::DefaultTextAnalyzer.new()
  @query_analyzer = RSI::DefaultTextAnalyzer.new()
  @dicts = {}
  @opened = false
end

Instance Attribute Details

#analyzer ⇒ Object

Analyzer to use for document tokenization.



# File 'lib/rsi/index.rb', line 26

def analyzer
  @analyzer
end

#dicts ⇒ Object

Dictionaries of terms, keyed by field name.



# File 'lib/rsi/index.rb', line 26

def dicts
  @dicts
end

#query_analyzer ⇒ Object

Analyzer to use for query tokenization.



# File 'lib/rsi/index.rb', line 26

def query_analyzer
  @query_analyzer
end

#root ⇒ Object (readonly)

Root directory under which the index is stored.



# File 'lib/rsi/index.rb', line 24

def root
  @root
end

#serializer ⇒ Object

Serializer used to read and write index data.



# File 'lib/rsi/index.rb', line 26

def serializer
  @serializer
end

Instance Method Details

#add_document(doc_uri, content) ⇒ Object

Add a document to the index.



# File 'lib/rsi/index.rb', line 83

def add_document( doc_uri, content )
  open() unless @opened
  logger.info("Adding document #{doc_uri}")
  if @docs.has_value?( doc_uri )
    raise IndexException, "Cannot do updates yet"
  else
    docid = next_docid()
    @docs[ docid ] = doc_uri
    pos = 0
    logger.debug("Tokenizing")
    @analyzer.tokenize( content ).each do |field, termlist|
      # Collect positions per termid, scoped to this field so entries
      # from one field are not re-added to the next field's dictionary.
      term_entries = {}
      termlist.each do |term|
        termid = @dicts[field].get_termid_for(term, true)
        raise IndexException, "no termid for term '#{term}'" if termid == nil
        term_entries[termid] ||= []
        term_entries[termid] << pos
        pos += 1
      end
      logger.debug("Adding term entries to #{field}")
      term_entries.each do |termid, pos_list|
        @dicts[field].add_term_entries(docid, termid, pos_list)
      end
    end
  end
end
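For illustration, the analyzer is expected to return a hash mapping each field name to its term list; the :text field below is an assumption, since the actual fields come from the analyzer's get_field_types():

index.add_document( "file:///home/me/notes.txt", "the quick brown fox" )
# @analyzer.tokenize(content) yields something shaped like
#   { :text => ["the", "quick", "brown", "fox"] }
# and each term's positions within the document are recorded in the
# dictionary for that field.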

#delete_document(doc_uri) ⇒ Object

Remove a document from the index (slow!). Not yet implemented; currently raises.



# File 'lib/rsi/index.rb', line 113

def delete_document( doc_uri )
  open() unless @opened
  raise "This is too hard for me, yet"
end

#find_all(terms_str) ⇒ Object

Return a list of document URIs which contain all of the given search terms (AND query). The terms will be tokenized by the current query analyzer.



# File 'lib/rsi/index.rb', line 145

def find_all( terms_str )
  q = @query_analyzer.tokenize_query( terms_str )
  logger.debug( "Query=#{q.to_s}" )
  docids = q.evaluate( self )
  docids.uniq!
  return docids.collect {|id| @docs[id]}
end
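For example (the return value is illustrative; results are the URIs originally passed to #add_document):

index.find_all( "brown fox" )   # => ["file:///home/me/notes.txt"]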

#find_any(terms_str) ⇒ Object

Return a list of document ids which contain any of the given search terms (OR query). The terms will be tokenized by the current Analyzer. Not yet implemented; currently raises.



# File 'lib/rsi/index.rb', line 132

def find_any( terms_str )
  open() unless @opened
  raise "unimplemented"
end

#flushObject

Stop adding documents to the index, and serialize to storage.



# File 'lib/rsi/index.rb', line 119

def flush()
  open() unless @opened
  logger.info("Finishing")
  store_doclist()
  @dicts.each do |field, dict|
    dict.store()
  end
end
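A typical indexing session ends with #flush, since documents are held in memory until it runs (a sketch):

index.add_document( "doc:1", "hello world" )
index.add_document( "doc:2", "goodbye world" )
index.flush()   # writes the document list and each field's dictionary under root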

#get_dict_for_field(field) ⇒ Object

Return the dictionary for the given field, or nil if the field is unknown.


# File 'lib/rsi/index.rb', line 137

def get_dict_for_field( field )
  return @dicts[field]
end

#map_field_type(type) ⇒ Object

Maps a field type to the dictionary class used to index fields of that type.



# File 'lib/rsi/index.rb', line 73

def map_field_type( type )
  case type
  when RSI::FIELD_TYPE_TEXT
    return RSI::Dictionary
  when RSI::FIELD_TYPE_DATE
    raise "implement me! XXX"
  end
end
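As used from #open, the returned class is instantiated once per field. A sketch (the "text" field name is an assumption; real field names come from the analyzer):

klass = index.map_field_type( RSI::FIELD_TYPE_TEXT )  # => RSI::Dictionary
dict  = klass.new( File.join( index.root, "text" ) )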

#OLD_find_all(terms_str) ⇒ Object

Older query implementation, superseded by #find_all.


# File 'lib/rsi/index.rb', line 153

def OLD_find_all( terms_str )
  open() unless @opened

  # this querying logic is too fragile
  logger.info { "Query: #{terms_str}" }
  t_set = @query_analyzer.tokenize_query( terms_str )
  logger.debug { "Tokenized: #{t_set}" }
  # build map of docid => term-match-count
  finds = {}
  t_set.each do |field, term_list|
    term_list.each do |term|
      logger.debug { "field='#{field}', term='#{term}'" }
      # lookup termid in dict for field
      unless @dicts[field].has_term?( term )
        logger.info { "No term #{term} in dictionary #{field}" }
        next
      end
      termid = @dicts[field].get_termid_for( term )
      logger.debug { "termid=#{termid}" }
      # get list of entries for termid
      e_list = @dicts[field].get_entry_list( termid )
      # get list of docids
      e_list.each do |e|
        logger.debug { "  docid=#{e.docid}" }
        finds[ e.docid ] = finds[ e.docid ].to_i + 1
      end
    end
  end
  total_terms = 0
  t_set.each_value {|vl| total_terms += vl.size() }
  logger.debug { "Total terms: #{total_terms}" }
  # foreach docid in map: match if term-match-count == terms-count
  d_return = []
  finds.each do |docid, count|
    if count == total_terms
      # return docid
      uri = @docs[ docid ]
      d_return << uri
    end
  end
  return d_return
end

#open ⇒ Object

Open the index: create the root directory if needed, reload existing index data if present, and initialize and open a dictionary for each field reported by the analyzer.


# File 'lib/rsi/index.rb', line 42

def open()
  Dir.mkdir( @root ) unless FileTest.exists?( @root )
  log_fh = File.open( File.join( @root, "index.log" ), 
                      File::WRONLY|File::APPEND|File::CREAT )
  log_fh.sync = true
  logger.info( "Trying to reload index..." )
  begin
    reload()
  rescue
    logger.info( "Reload failed (#{$!}), creating new index" )
    # nothing to do
  end
  # Query the analyzer, getting the fields it tokenizes.
  # Initialize and open a dictionary for each field.
  logger.info( "Assigning dictionaries..." )
  @analyzer.get_field_types().each do |field, type|
    field_root = File.join( @root, field )
    klass = map_field_type( type )
    logger.debug( "Field: #{field} at #{field_root} is #{klass}" )
    @dicts[field] = klass.new( field_root )
    @dicts[field].serializer = @serializer
  end
  logger.info( "Opening dictionaries" )
  @dicts.each do |name, dict|
    logger.debug( "Dictionary: #{name}" )
    dict.open()
  end
  @opened = true
end