Class: Classifier::Base

Inherits:
Object
  • Object
show all
Defined in:
lib/classifier/base.rb

Direct Known Subclasses

Bayes, LSI

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Base

Returns a new instance of Base.



7
8
9
10
11
12
# File 'lib/classifier/base.rb', line 7

# Stores the classifier options, filling in defaults for any key the caller
# did not supply (:language => 'en', :encoding => 'UTF_8').
#
# options - Hash of settings. It is updated in place via reverse_merge!, so
#           the caller's hash gains the default keys, and the very same
#           object is kept in @options.
#
# NOTE(review): reverse_merge! is not core Ruby; presumably it is provided by
# ActiveSupport loaded elsewhere in the project -- confirm.
def initialize(options = {})
  # Keys already present in options win over these defaults.
  options.reverse_merge!(:language => 'en', :encoding => 'UTF_8')
  @options = options
end

Instance Method Details

#clean_word_hash(str) ⇒ Object

Return a word hash without extra punctuation or short symbols, just stemmed words



33
34
35
# File 'lib/classifier/base.rb', line 33

# Builds a word hash from the words of +str+ only.
#
# Every character that is neither a word character (\w) nor whitespace is
# deleted, the remainder is split on whitespace, and the resulting word list
# is handed to word_hash_for_words, whose result is returned unchanged.
#
# NOTE(review): in Ruby 1.9+ \w matches only [a-zA-Z0-9_], so accented or
# other non-ASCII letters are stripped here -- confirm that is intended for
# non-English languages.
def clean_word_hash(str)
  stripped = str.gsub(/[^\w\s]/, "")
  word_hash_for_words(stripped.split)
end

#prepare_category_name(val) ⇒ Object



14
15
16
# File 'lib/classifier/base.rb', line 14

# Converts a category identifier into the symbol form used as a key:
# the value is stringified, underscores become spaces, the first character
# is upcased and the rest downcased (String#capitalize), and the result is
# turned into a Symbol.
#
#   prepare_category_name("not_spam")  # => :"Not spam"
def prepare_category_name(val)
  label = val.to_s.tr("_", " ")
  label.capitalize.to_sym
end

#without_punctuation(str) ⇒ Object

Replaces common punctuation symbols with spaces (apostrophes and hyphens are deleted outright), returning a new string. E.g.,

without_punctuation("Hello (greeting's), with {braces} < >...?")
=> "Hello  greetings   with  braces         "


22
23
24
# File 'lib/classifier/base.rb', line 22

# Returns a copy of +str+ with punctuation neutralised in two passes:
#
# 1. each of , ? . ! ; : " @ # $ % ^ & * ( ) _ = + [ ] { } | < > / ` ~
#    is replaced by a single space;
# 2. apostrophes and hyphens are deleted with no replacement.
#
# The receiver is not modified.
def without_punctuation(str)
  # The backslash in the set escapes the following "|" for String#tr.
  spaced = str.tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " ")
  # Same effect as tr with an empty replacement: matching characters vanish.
  spaced.delete("'\-")
end

#word_hash(str) ⇒ Object

Return a Hash of strings => ints. Each word in the string is stemmed, interned, and indexes to its frequency in the document.



28
29
30
# File 'lib/classifier/base.rb', line 28

# Builds a word hash from both the words and the symbol runs found in +str+.
#
# Two token lists are produced and concatenated (words first):
# * words   - +str+ with every non-word, non-whitespace character deleted,
#             split on whitespace;
# * symbols - +str+ with every word character (\w) replaced by a space,
#             split on whitespace, leaving only runs of punctuation/symbols.
#
# The combined list is passed to word_hash_for_words and its result returned.
#
# NOTE(review): in Ruby 1.9+ \w is ASCII-only, so non-ASCII letters end up in
# the symbols list rather than the words list -- confirm that is intended.
def word_hash(str)
  words   = str.gsub(/[^\w\s]/, "").split
  symbols = str.gsub(/[\w]/, " ").split
  word_hash_for_words(words + symbols)
end