Class: InvertedIndex::Cleaner

Inherits:
Object
  • Object
show all
Defined in:
lib/inverted_index/cleaner.rb

Class Method Summary collapse

Class Method Details

.clean(tokens, text = '') ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/inverted_index/cleaner.rb', line 3

# Turns raw tokens into normalized index terms.
#
# Steps, in order: lowercase, remove stopwords, strip non-word characters,
# append time expressions found in +text+, keep ASCII-only terms, stem.
#
# Relies on InvertedIndex::Stopwords.words and String#stem, neither of which
# is defined in this method (presumably a stopword list and a stemmer
# extension loaded elsewhere -- confirm).
#
# @param tokens [Array<String>] raw tokens; the array itself is not modified
# @param text [String, nil] source text scanned for times such as "10:30",
#   "10:30:15" or "10:30 p.m."; nil is treated as an empty string
# @return [Array<String>] lowercased, stemmed terms
def self.clean(tokens, text = '')
  # Lowercase up front so that stopword removal is case-insensitive.
  # `map` is required here: `each` returns the receiver and would throw the
  # downcased copies away.
  terms = tokens.map(&:downcase)

  # Remove stopwords
  # NOTE(review): assumes the stopword list is lowercase -- confirm.
  terms -= InvertedIndex::Stopwords.words

  # Strip non-word characters and drop tokens that end up empty
  terms = terms.map { |token| token.gsub(/\W/, '') }.reject(&:empty?)

  # TODO: Scan text for other special text (e.g. dates)
  # A date looks like /((january|february|march)\s\d,\s\d\d\d\d)/i
  # A time looks like
  # 00:00 # 00:00:00 # 00:00:00 a.m. # 00:00:00 p.m. # 00:00:00 pm
  # The pattern has capture groups, so scan yields one array per hit; the
  # first group is the whole time expression.
  time_hits = text.to_s.scan(/(\d\d:\d\d(:\d\d)?(\s(a|p)\.?m\.?)?)/i)
  terms.concat(time_hits.map { |groups| groups.first.downcase.strip })

  # Remove all non-ascii words
  terms = terms.select(&:ascii_only?)

  # Stem once per term, discarding terms whose stem is empty
  terms.map(&:stem).reject(&:empty?).map(&:downcase)
end