Module: RetrievalLite::Tokenizer

Defined in:
lib/retrieval_lite/tokenizer.rb

Constant Summary collapse

SPECIAL_SEPARATERS =
['[', ']', '\\', ';', '\'', ',', '.', '/', '!', '@', '#', '%', '&', '*', '(', ')', '_', '{', '}', ':', '"', '?', '=', '`', '~', '$', '^', '+', '|', '<', '>']

Class Method Summary collapse

Class Method Details

.parse_content(content) ⇒ Hash<String, Integer>

Returns a hash that maps each term found in the content to the number of times it occurs.

Parameters:

  • content (String)

    the text of the document

Returns:

  • (Hash<String, Integer>)

    a hash that gives term frequency of content



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/retrieval_lite/tokenizer.rb', line 6

# Builds a term-frequency table for a piece of text.
#
# The text is stripped, downcased and split on the pattern produced by
# +separaters_regex+ (defined outside this method). Each resulting piece is
# then counted as follows:
#
# * if +has_hyphen?+ (also defined outside this method) accepts the piece,
#   it is counted exactly as it appears;
# * otherwise every character outside +a-z+ is dropped and the remainder is
#   counted, unless nothing remains.
#
# @param content [String] the text of the document
# @return [Hash<String, Integer>] term => occurrences; the hash is created
#   with a default value of 0, so unknown terms read as 0
def self.parse_content(content)
  normalized = content.strip.downcase
  pieces = normalized.split(/#{separaters_regex}/)

  pieces.each_with_object(Hash.new(0)) do |piece, counts|
    # Pieces accepted by has_hyphen? are kept verbatim.
    if has_hyphen?(piece)
      counts[piece] += 1
      next
    end

    # Drop whatever the separator split left behind that is not a-z.
    letters = piece.gsub(/[^a-z]/, '')

    # A piece made only of non-letters collapses to '' and is not a term.
    counts[letters] += 1 unless letters == ''
  end
end