Module: StuffClassifier::Tokenizer
- Included in:
- Base
- Defined in:
- lib/stuff-classifier/tokenizer.rb
Instance Attribute Summary collapse
-
#stemming ⇒ Object
writeonly
Sets the attribute stemming.
Instance Method Summary collapse
- #each_word(string) ⇒ Object
- #ignore_words ⇒ Object
- #ignore_words=(value) ⇒ Object
- #stemming? ⇒ Boolean
Instance Attribute Details
#stemming=(value) ⇒ Object (writeonly)
Sets the attribute stemming
4 5 6 |
# File 'lib/stuff-classifier/tokenizer.rb', line 4
#
# Stores the stemming flag in +@stemming+. The flag is read back through
# #stemming?, which #each_word consults to decide whether tokens are stemmed.
#
# @param value [Object] the new flag; stored as given, without coercion
# @return [Object] the assigned value
def stemming=(value)
  @stemming = value
end
Instance Method Details
#each_word(string) ⇒ Object
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
# File 'lib/stuff-classifier/tokenizer.rb', line 18
#
# Tokenizes +string+ into lowercase words made of ASCII letters only.
#
# Apostrophes and backticks are removed before tokenizing, every other
# non-letter character acts as a separator, and words listed in #ignore_words
# are dropped. When #stemming? is truthy each word is reduced with
# String#stem (not defined in this file; it must be provided elsewhere) and
# the stemmed form is checked against #ignore_words a second time.
#
# @param string [String] the text to tokenize
# @yieldparam word [String] each surviving token; when a block is given, the
#   block's return value is collected in place of the token
# @return [Array] the tokens (or block results) in order of appearance;
#   an empty Array when +string+ is blank
def each_word(string)
  text = string.strip
  # Blank input has no tokens. Return an empty Array rather than nil so the
  # result can always be chained (map, inject, ...) like the non-blank case.
  return [] if text.empty?

  words = []
  # Deleting quotes first keeps contractions in one piece ("don't" -> "dont").
  # Newlines are non-letters, so scanning the whole text yields the same
  # tokens, in the same order, as tokenizing it line by line.
  text.delete("'`").scan(/[a-zA-Z]+/) do |raw|
    word = raw.downcase
    next if ignore_words.member?(word)

    if stemming?
      # Stem the original-case token, then re-check: a stem may itself be an
      # ignored word even when the unstemmed form was not.
      word = raw.stem.downcase
      next if ignore_words.member?(word)
    end

    words << (block_given? ? yield(word) : word)
  end
  words
end
#ignore_words ⇒ Object
10 11 12 |
# File 'lib/stuff-classifier/tokenizer.rb', line 10
#
# Returns the collection of words that #each_word skips.
#
# @return [Object] the value assigned through #ignore_words= when it is
#   truthy, otherwise StuffClassifier::STOP_WORDS
def ignore_words
  return @ignore_words if @ignore_words

  StuffClassifier::STOP_WORDS
end
#ignore_words=(value) ⇒ Object
6 7 8 |
# File 'lib/stuff-classifier/tokenizer.rb', line 6
#
# Replaces the collection returned by #ignore_words. Assigning +nil+ or
# +false+ makes #ignore_words fall back to StuffClassifier::STOP_WORDS.
#
# @param value [Object] the words to skip; #each_word calls +member?+ on it
# @return [Object] the assigned value
def ignore_words=(value)
  @ignore_words = value
end
#stemming? ⇒ Boolean
14 15 16 |
# File 'lib/stuff-classifier/tokenizer.rb', line 14
#
# Reports the stemming flag set through #stemming=.
#
# @return [Object] +false+ when the flag has never been assigned; otherwise
#   the stored value exactly as assigned (it is not coerced to a Boolean)
def stemming?
  return false unless defined?(@stemming)

  @stemming
end