Module: RetrievalLite::Tokenizer
- Defined in:
- lib/retrieval_lite/tokenizer.rb
Constant Summary collapse
- SPECIAL_SEPARATERS =
['[', ']', '\\', ';', '\'', ',', '.', '/', '!', '@', '#', '%', '&', '*', '(', ')', '_', '{', '}', ':', '"', '?', '=', '`', '~', '$', '^', '+', '|', '<', '>']
Class Method Summary collapse
-
.parse_content(content) ⇒ Hash<String, Integer>
A hash that maps each term in the content to its frequency.
Class Method Details
.parse_content(content) ⇒ Hash<String, Integer>
Returns a hash that maps each term in the content to its frequency.
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
# File 'lib/retrieval_lite/tokenizer.rb', line 6

# Counts how often each term occurs in +content+.
#
# The text is stripped, lower-cased, and split on the pattern returned by
# +separaters_regex+. A piece for which +has_hyphen?+ is truthy is counted
# exactly as it appears; every other piece is reduced to its a-z characters
# and counted only if something is left.
#
# NOTE(review): +separaters_regex+ and +has_hyphen?+ are defined outside this
# snippet; their exact behavior should be confirmed there.
#
# @param content [String] the raw text to tokenize
# @return [Hash<String, Integer>] term => occurrence count (missing keys default to 0)
def self.parse_content(content)
  frequencies = Hash.new(0) # unseen terms start at 0

  # Break the normalized text apart at every separator.
  pieces = content.strip.downcase.split(/#{separaters_regex}/)

  pieces.each do |piece|
    # Hyphenated pieces are kept verbatim.
    if has_hyphen?(piece)
      frequencies[piece] += 1
      next
    end

    # Drop any leftover symbols the separator split did not catch.
    word = piece.gsub(/[^a-z]/, '')

    # A piece made up solely of non-letters leaves nothing to count.
    frequencies[word] += 1 unless word.empty?
  end

  frequencies
end