Class: VSS::Tokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/vss/tokenizer.rb

Constant Summary collapse

# Lookup table of words the tokenizer discards: every single ASCII letter
# plus a handful of common English function words. Each listed word is a
# key whose value is +true+, so membership can be tested with Hash#key?.
STOP_WORDS = %w[
  a b c d e f g h i j k l m n o p q r s t u v w x y z
  an and are as at be by for from has he in is it its
  of on that the to was were will with upon without among
].each_with_object({}) { |word, lookup| lookup[word] = true }

Class Method Summary collapse

Class Method Details

.tokenize(string) ⇒ Object



11
12
13
14
15
# File 'lib/vss/tokenizer.rb', line 11

# Breaks free text into a list of unique, normalised terms.
#
# The input is coerced with +to_s+, every character other than ASCII
# letters, digits, hyphens, apostrophes and whitespace is removed, and the
# remainder is split on whitespace, lower-cased and stemmed with
# String#stem. Words found in STOP_WORDS are discarded.
#
# NOTE(review): String#stem is not defined in this block; presumably it is
# supplied by a stemming library loaded elsewhere -- confirm.
#
# @param string [#to_s] text to tokenize (+nil+ yields an empty list)
# @return [Array<String>] stemmed terms, de-duplicated, in order of first
#   appearance
def self.tokenize(string)
  stripped = string.to_s.gsub(/[^a-z0-9\-\s\']/i, "") # removes punctuation
  # A leading run of whitespace makes split emit one empty string; drop it.
  words = stripped.split(/\s+/).reject(&:empty?).map(&:downcase)

  # Drop stop words BEFORE stemming: a stemmer can rewrite a stop word into
  # a form that is no longer listed (a Porter-style stemmer turns "was" into
  # "wa"), which would let it slip past a post-stemming check.
  stems = words.reject { |word| STOP_WORDS.key?(word) }.map(&:stem)

  # Keep the post-stemming check too, so words whose stem is itself a stop
  # word are still removed.
  stems.reject { |stem| STOP_WORDS.key?(stem) }.uniq
end