Module: Clusterer::Tokenizer

Included in:
Document, DocumentArray
Defined in:
lib/clusterer/tokenizer.rb

Overview

The tokenizer algorithms take a block, to which the string tokens are passed.

Instance Method Summary

Instance Method Details

#simple_ngram_tokenizer(text, options = {}) ⇒ Object



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/clusterer/tokenizer.rb', line 46

# Splits +text+ into sentences, tokenizes each sentence into lower-cased
# words, and yields every accepted word plus every word n-gram (sizes 2 up to
# the configured maximum) to the given block.
#
# A word is accepted when it is longer than two characters and is not listed
# in STOP_WORDS (a constant defined outside this method). Rejected words are
# never yielded on their own and can never be the first or last word of an
# n-gram, but they may appear in the interior of one (e.g. "quick the fox").
#
# N-grams never straddle a sentence boundary: the partially built n-grams are
# discarded at the start of every sentence.
#
# @param text [String] the text to tokenize
# @param options [Hash] tokenizer options
# @option options [Integer] :ngram (3) the largest n-gram size to produce
# @option options [Boolean] :no_stem when truthy, words are not stemmed;
#   otherwise each word is replaced by +word.stem+ (String#stem is not
#   defined here; presumably supplied by a stemmer library loaded elsewhere)
# @yield [token] every accepted word and every completed n-gram, the latter
#   with its words joined by a single space
# @return [Array<String>] the sentence fragments that were iterated over
def simple_ngram_tokenizer(text, options = {})
  max_size = options[:ngram] || 3

  # Sentences end at '.', '?' or '!'.
  text.split(/[\.\?\!]/).each do |sentence|
    # partials[n] holds the size-n n-grams that are still being built.
    # It is rebuilt for every sentence so that an n-gram started in one
    # sentence is never completed by words of the next one. Slots 0 and 1
    # are unused; they only keep the index equal to the n-gram size.
    partials = (0..max_size).map { [] }

    sentence.gsub(/[^\w\s]/, "").split.each do |word|
      word.downcase!
      word = word.stem unless options[:no_stem]

      if word.size > 2 && !STOP_WORDS.include?(word)
        yield(word)
        2.upto(max_size) do |size|
          # Extend every pending n-gram; emit and drop the ones now complete.
          partials[size].delete_if do |gram|
            gram << word
            complete = gram.size == size
            yield(gram.join(" ")) if complete
            complete
          end
          # An accepted word can also start a new n-gram of this size.
          partials[size] << [word]
        end
      else
        # An n-gram may not begin or end with a rejected word: pending
        # n-grams that this word would complete are dropped, the others
        # take it as an interior word. No new n-gram is started here.
        2.upto(max_size) do |size|
          partials[size].delete_if do |gram|
            would_complete = gram.size == size - 1
            gram << word unless would_complete
            would_complete
          end
        end
      end
    end
  end
end

#simple_tokenizer(text, options = {}) ⇒ Object



38
39
40
41
42
43
44
# File 'lib/clusterer/tokenizer.rb', line 38

# Strips every character that is neither a word character nor whitespace
# from +text+, splits the result on whitespace, and yields each remaining
# word to the given block after lower-casing it and (unless disabled)
# stemming it.
#
# Words of two characters or fewer, and words listed in STOP_WORDS (a
# constant defined outside this method), are skipped.
#
# @param text [String] the text to tokenize
# @param options [Hash] tokenizer options
# @option options [Boolean] :no_stem when truthy, words are not stemmed;
#   otherwise each word is replaced by +word.stem+ (String#stem is not
#   defined here; presumably supplied by a stemmer library loaded elsewhere)
# @yield [token] each accepted word
# @return [Array<String>] the lower-cased words that were iterated over
def simple_tokenizer(text, options = {})
  words = text.gsub(/[^\w\s]/, "").split
  words.each do |raw|
    raw.downcase!
    token = options[:no_stem] ? raw : raw.stem
    next if token.size <= 2
    next if STOP_WORDS.include?(token)

    yield(token)
  end
end