Class: RSI::DefaultTextAnalyzer

Inherits:
Object
  • Object
show all
Includes:
Loggable
Defined in:
lib/rsi/analysis.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Loggable

#logger

Constructor Details

#initialize ⇒ DefaultTextAnalyzer

Returns a new instance of DefaultTextAnalyzer.



19
20
21
# File 'lib/rsi/analysis.rb', line 19

# Builds a new analyzer. The stoplist starts out unset (nil).
def initialize
  @stoplist = nil
end

Instance Attribute Details

#stoplist ⇒ Object

Returns the value of attribute stoplist.



17
18
19
# File 'lib/rsi/analysis.rb', line 17

# Reader for the stoplist attribute.
#
# Returns whatever is currently stored in @stoplist. The constructor sets it
# to nil; tokenize_text looks terms up in it with has_key?.
def stoplist
  @stoplist
end

Instance Method Details

#get_field_types ⇒ Object

Returns a map of fields to field types, for each field returned by this analyzer’s tokenize() method. Field names should be safe to be used as file path components.



26
27
28
# File 'lib/rsi/analysis.rb', line 26

# Describes the fields produced by this analyzer.
#
# Returns a Hash with a single entry: the field name "text" mapped to
# RSI::FIELD_TYPE_TEXT.
def get_field_types
  { "text" => RSI::FIELD_TYPE_TEXT }
end

#tokenize(content) ⇒ Object

Given a chunk of text content, returns a list of indexable terms contained in that content. The content may not be a complete document. The terms returned may not be a unique set. The terms returned will all be set to field ‘text’.



35
36
37
# File 'lib/rsi/analysis.rb', line 35

# Tokenizes a chunk of content into per-field term lists.
#
# content - the text to analyze; it is handed to tokenize_text unchanged.
#
# Returns a Hash of the form { field => [terms...] }. This analyzer emits a
# single field, "text", holding the result of tokenize_text(content).
def tokenize(content)
  terms = tokenize_text(content)
  { "text" => terms }
end

#tokenize_query(query) ⇒ Object



39
40
41
42
43
44
45
# File 'lib/rsi/analysis.rb', line 39

# Turns a query string into a conjunctive query over the 'text' field.
#
# query - the query text; it is tokenized with tokenize_text.
#
# Returns an RSI::ANDQuery to which one RSI::TermQuery on field 'text' has
# been added for every term tokenize_text yields, in that order.
def tokenize_query(query)
  conjunction = RSI::ANDQuery.new
  terms = tokenize_text(query)
  terms.each do |term|
    conjunction.add_subquery(RSI::TermQuery.new('text', term))
  end
  conjunction
end

#tokenize_text(content) ⇒ Object



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/rsi/analysis.rb', line 47

# Breaks raw content into a list of normalized, de-duplicated index terms.
#
# content - the text to analyze. Anything responding to to_s is accepted;
#           nil is treated as empty text. The argument (and any string its
#           to_s returns) is never modified.
#
# Processing steps, in order:
#   1. fold "'s" and "n't" suffixes into the preceding word
#   2. turn every character outside a-z, A-Z, 0-9 into a space
#   3. split on whitespace and drop terms shorter than 3 characters
#   4. stem each term via String#stem (not defined in this file; a stemmer
#      extension must already be loaded), dropping nil results
#   5. upcase
#   6. drop terms for which @stoplist.has_key? is true
#   7. remove duplicates, keeping first-occurrence order
#
# Returns a new Array of Strings.
def tokenize_text( content )
  # Presumably populates @stoplist on first use -- defined elsewhere; confirm.
  initialize_stoplist()

  # Convert first and use non-mutating calls: to_s may hand back a string that
  # is frozen (nil.to_s on modern Ruby) or owned by the caller.
  text = content.to_s
  text = text.gsub( /\'s\b/, "s" )        # "cat's" -> "cats"
  text = text.gsub( /n\'t\b/, "nt" )      # "don't" -> "dont"
  text = text.tr( "^a-zA-Z0-9", " " )     # non-word chars become whitespace

  words = text.split.reject { |w| w.length < 3 }
  terms = words.map { |w| w.stem }.compact.map { |w| w.upcase }
  terms.reject { |t| @stoplist.has_key?( t ) }.uniq
end