3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
|
# File 'lib/inverted_index/cleaner.rb', line 3
# Normalizes raw tokens into a list of index terms.
#
# Steps, in order:
# 1. downcase every token and drop stopwords;
# 2. strip non-word characters and discard tokens that become empty;
# 3. append time-of-day expressions (e.g. "10:30", "10:30:15 p.m.") found
#    in +text+, since step 2 would otherwise mangle them;
# 4. keep only ASCII-only terms;
# 5. stem each term and discard terms whose stem is empty.
#
# The input array is not modified.
#
# @param tokens [Array<String>] raw tokens to clean
# @param text [String, nil] original text scanned for time expressions;
#   nil is treated as an empty string
# @return [Array<String>] cleaned, stemmed, lowercase terms
def self.clean(tokens, text='')
  # Downcase BEFORE removing stopwords so that "The" is dropped like "the".
  # NOTE(review): assumes InvertedIndex::Stopwords.words is an array of
  # lowercase strings - confirm against its definition.
  terms = tokens.map(&:downcase) - InvertedIndex::Stopwords.words

  terms = terms.map { |token| token.gsub(/\W/, '') }.reject(&:empty?)

  # scan yields one array of capture groups per match; group 0 is the whole
  # time expression including an optional seconds part and am/pm suffix.
  text.to_s.scan(/(\d\d:\d\d(:\d\d)?(\s(a|p)\.?m\.?)?)/i) do |match|
    terms << match[0].downcase.strip
  end

  # String#stem is not core Ruby; presumably supplied by a stemmer library
  # loaded elsewhere in the project - TODO confirm.
  stems = terms.select(&:ascii_only?).map(&:stem)
  stems.reject(&:empty?).map(&:downcase)
end
|