Module: Clusterer::Tokenizer
- Included in:
- Document, DocumentArray
- Defined in:
- lib/clusterer/tokenizer.rb
Overview
The tokenizer methods each take a block, to which the string tokens are passed one at a time.
Instance Method Summary collapse
Instance Method Details
#simple_ngram_tokenizer(text, options = {}) ⇒ Object
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/clusterer/tokenizer.rb', line 46
#
# Yields every significant word of +text+, plus every word n-gram of
# length 2..ngram, to the given block. N-grams are yielded as a single
# string with the words joined by one space.
#
# A word is "significant" when it is longer than two characters and is
# not listed in STOP_WORDS (defined elsewhere in the library). Other
# words are never yielded on their own and may not begin or end an
# n-gram, although they may appear inside one.
#
# text    - the String to tokenize.
# options - Hash of options:
#           :ngram   - maximum n-gram length (default 3).
#           :no_stem - when truthy, words are not passed through
#                      String#stem (presumably supplied by a stemmer
#                      extension loaded elsewhere - confirm).
#
# Returns the Array of sentence strings that +text+ was split into.
def simple_ngram_tokenizer(text, options = {})
  ngram = options[:ngram] || 3

  # Split into sentences first: n-grams cannot straddle sentences.
  text.split(/[.?!]/).each do |sentence|
    # ngram_list[i] holds the partially built n-grams of target length i.
    # It is rebuilt for every sentence so that a partial n-gram left over
    # from the previous sentence is never completed by this one.
    ngram_list = (0..ngram).map { [] }

    sentence.gsub(/[^\w\s]/, "").split.each do |word|
      word.downcase!
      word = word.stem unless options[:no_stem]

      if word.size > 2 && !STOP_WORDS.include?(word)
        yield(word)
        2.upto(ngram) do |i|
          # Extend every partial n-gram; emit and drop the ones that
          # have just reached their target length.
          ngram_list[i].delete_if do |partial|
            partial << word
            next false unless partial.size == i

            yield(partial.join(" "))
            true
          end
          # The current word may also start a new n-gram of length i.
          ngram_list[i] << [word]
        end
      else
        # An n-gram cannot have a stop word at its beginning or end:
        # nothing new is started here, and a partial n-gram that this
        # word would complete is discarded instead.
        2.upto(ngram) do |i|
          ngram_list[i].delete_if do |partial|
            next true if partial.size == i - 1

            partial << word
            false
          end
        end
      end
    end
  end
end
#simple_tokenizer(text, options = {}) ⇒ Object
38 39 40 41 42 43 44 |
# File 'lib/clusterer/tokenizer.rb', line 38
#
# Yields each significant word of +text+ to the given block.
#
# Every character that is neither a word character nor whitespace is
# removed, the text is split on whitespace, and each word is downcased
# and (unless disabled) stemmed. A word is yielded only when it is longer
# than two characters and is not listed in STOP_WORDS (defined elsewhere
# in the library).
#
# text    - the String to tokenize.
# options - Hash of options:
#           :no_stem - when truthy, words are not passed through
#                      String#stem (presumably supplied by a stemmer
#                      extension loaded elsewhere - confirm).
#
# Returns the Array of downcased words that +text+ was split into.
def simple_tokenizer(text, options = {})
  text.gsub(/[^\w\s]/, "").split.each do |word|
    word.downcase!
    word = word.stem unless options[:no_stem]
    yield(word) if word.size > 2 && !STOP_WORDS.include?(word)
  end
end