Class: StuffClassifier::TfIdf
- Inherits: Base
- Inheritance chain: Object → Base → StuffClassifier::TfIdf
- Defined in:
- lib/stuff-classifier/tf-idf.rb
Instance Attribute Summary
Attributes inherited from Base
#name
Attributes included from Tokenizer
#stemming
Instance Method Summary
collapse
Methods inherited from Base
#cat_count, #categories, #incr_cat, #incr_word, #initialize, open, #save_state, #total_count, #train, #word_count, #word_prob, #word_weighted_average
Methods included from Tokenizer
#each_word, #ignore_words, #ignore_words=, #stemming?
Instance Method Details
#cat_scores(text) ⇒ Object
18
19
20
21
22
23
24
25
|
# File 'lib/stuff-classifier/tf-idf.rb', line 18
# Scores +text+ against every category returned by #categories.
#
# Each category's score is whatever #text_prob returns for it.
#
# @param text the text passed through to #text_prob
# @return [Array<Array>] +[category, score]+ pairs ordered from the
#   highest score to the lowest
def cat_scores(text)
  scored = categories.each_with_object({}) do |category, table|
    table[category] = text_prob(text, category)
  end
  # Compare right-to-left so the largest score sorts first.
  scored.to_a.sort { |left, right| right.last <=> left.last }
end
|
#classify(text, default = nil) ⇒ Object
27
28
29
30
31
32
33
34
35
36
37
38
39
40
|
# File 'lib/stuff-classifier/tf-idf.rb', line 27
# Picks the category with the highest strictly positive score for +text+.
#
# Scores come from #cat_scores. When several categories share the top
# score, the one that appears first in that list is kept.
#
# @param text the text to classify
# @param default value returned when no category scores above zero
# @return the winning category, or +default+ if every score is <= 0
def classify(text, default = nil)
  winner, top_score = cat_scores(text).reduce([nil, 0.0]) do |(leader, high), (cat, prob)|
    # Strict comparison: a later tie never displaces the current leader.
    prob > high ? [cat, prob] : [leader, high]
  end
  top_score > 0 ? winner : default
end
|
#text_prob(text, cat) ⇒ Object
14
15
16
|
# File 'lib/stuff-classifier/tf-idf.rb', line 14
# Sums the #tf_idf weight of every word that #each_word yields for +text+.
#
# @param text the text to score
# @param cat the category the words are weighed against
# @return the total of the per-word weights (0 when there are no words)
def text_prob(text, cat)
  weights = each_word(text).map { |word| tf_idf(word, cat) }
  weights.reduce(0) { |total, weight| total + weight }
end
|
#tf_idf(word, cat) ⇒ Object
2
3
4
5
6
7
8
9
10
11
12
|
# File 'lib/stuff-classifier/tf-idf.rb', line 2
# Computes the TF-IDF weight of +word+ for category +cat+.
#
# * tf  = word_count(word, cat) / cat_count(cat)
# * idf = log10((number of categories + 2) / (entries under the word + 1))
#
# @param word the word being weighed
# @param cat the category the word is weighed against
# @return [Float] tf * idf, or 0.0 when the category's count is zero
def tf_idf(word, cat)
  cat_nr = cat_count(cat)
  # A zero count would make the float division below produce NaN or
  # Infinity, which cannot be summed or ordered meaningfully by callers.
  return 0.0 if cat_nr.zero?

  tf = word_count(word, cat).to_f / cat_nr

  # Number of entries stored under this word in @wcount, or none if the
  # word is unknown (presumably one entry per category -- confirm
  # against Base).
  categories_with_word = (@wcount[word] || []).length

  # The +2 / +1 smoothing keeps the ratio above 1, so idf stays positive.
  idf = Math.log((categories.length + 2) / (categories_with_word + 1.0), 10)

  tf * idf
end
|