Class: Dictionary::TF_IDF
- Inherits:
-
Object
- Object
- Dictionary::TF_IDF
- Defined in:
- lib/rbbt/bow/dictionary.rb
Instance Attribute Summary
-
#docs ⇒ Object
readonly
Returns the value of attribute docs.
-
#num_docs ⇒ Object
readonly
Returns the value of attribute num_docs.
-
#terms ⇒ Object
readonly
Returns the value of attribute terms.
-
#total_terms ⇒ Object
readonly
Returns the value of attribute total_terms.
Instance Method Summary
- #add(terms) ⇒ Object
- #best(options = {}) ⇒ Object
- #df ⇒ Object
- #idf ⇒ Object
-
#initialize(options = {}) ⇒ TF_IDF
constructor
A new instance of TF_IDF.
- #tf ⇒ Object
- #tf_idf ⇒ Object
- #weights(options = {}) ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ TF_IDF
Returns a new instance of TF_IDF.
17 18 19 20 21 22 23 24 25 26 |
# File 'lib/rbbt/bow/dictionary.rb', line 17
#
# Builds an empty dictionary: no terms, no documents, all counters at zero.
#
# @param options [Hash] settings merged over the defaults; only +:limit+ is read
# @option options [Integer, nil] :limit (500_000) value stored in @term_limit
# @return [TF_IDF] a new instance of TF_IDF
def initialize(options = {})
  defaults = { :limit => 500_000 }
  @term_limit = defaults.merge(options)[:limit]

  # Both tables default missing keys to 0 so they can be bumped with +=
  # without seeding each key first.
  @terms = Hash.new(0)
  @docs = Hash.new(0)

  @num_docs = 0
  @total_terms = 0
end
Instance Attribute Details
#docs ⇒ Object (readonly)
Returns the value of attribute docs.
15 16 17 |
# File 'lib/rbbt/bow/dictionary.rb', line 15
#
# Read-only accessor for the +docs+ attribute.
#
# @return [Object] the current value of @docs
def docs
  @docs
end
#num_docs ⇒ Object (readonly)
Returns the value of attribute num_docs.
15 16 17 |
# File 'lib/rbbt/bow/dictionary.rb', line 15
#
# Read-only accessor for the +num_docs+ attribute.
#
# @return [Object] the current value of @num_docs
def num_docs
  @num_docs
end
#terms ⇒ Object (readonly)
Returns the value of attribute terms.
15 16 17 |
# File 'lib/rbbt/bow/dictionary.rb', line 15
#
# Read-only accessor for the +terms+ attribute.
#
# @return [Object] the current value of @terms
def terms
  @terms
end
#total_terms ⇒ Object (readonly)
Returns the value of attribute total_terms.
15 16 17 |
# File 'lib/rbbt/bow/dictionary.rb', line 15
#
# Read-only accessor for the +total_terms+ attribute.
#
# @return [Object] the current value of @total_terms
def total_terms
  @total_terms
end
Instance Method Details
#add(terms) ⇒ Object
29 30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/rbbt/bow/dictionary.rb', line 29
#
# Records one document's term counts in the dictionary.
#
# For every (term, count) pair: @terms[term] grows by count, @total_terms
# grows by count, and @docs[term] grows by 1. @num_docs grows by 1 per call.
#
# Once @terms holds more entries than @term_limit, terms not already in
# @terms are ignored for this document. The check is skipped when
# @term_limit is nil or false. The caller's hash is never modified.
#
# @param terms [Hash] term => occurrence count for a single document
# @return [Integer] the updated number of documents
def add(terms)
  if @term_limit && @terms.length > @term_limit
    # Filter into a new hash (reject, not delete_if) so the caller's
    # argument is left untouched.
    terms = terms.reject { |term, _count| !@terms.include?(term) }
  end

  terms.each do |term, count|
    @terms[term] += count
    @total_terms += count
    @docs[term] += 1
  end

  @num_docs += 1
end
#best(options = {}) ⇒ Object
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
# File 'lib/rbbt/bow/dictionary.rb', line 76
#
# Scores the terms whose document frequency lies within [low, high] and
# returns them as a term => score hash.
#
# The score is (@terms[term] / @num_docs) * log(1 / df). With +:limit+ given,
# only the highest-scoring +limit+ terms are kept, in descending score order.
#
# NOTE(review): the term count is divided by the number of documents here,
# whereas #tf_idf divides by @total_terms - confirm this is intended.
#
# @param options [Hash] selection settings
# @option options [Numeric] :low (0) minimum document frequency (inclusive)
# @option options [Numeric] :high (1) maximum document frequency (inclusive)
# @option options [Integer, nil] :limit (nil) maximum number of terms returned
# @return [Hash] term => score
def best(options = {})
  defaults = { :low => 0, :high => 1 }
  high, low, limit = defaults.merge(options).values_at(:high, :low, :limit)

  num_docs = @num_docs.to_f

  scored = []
  df.each do |term, df_value|
    next unless df_value >= low && df_value <= high

    scored << [term, @terms[term].to_f / num_docs * Math.log(1.0 / df_value)]
  end

  scored = scored.sort { |a, b| b[1] <=> a[1] }.first(limit) if limit

  # Build from the pair list directly: no huge splat, and terms that are
  # themselves arrays are not flattened away.
  Hash[scored]
end
#df ⇒ Object
42 43 44 45 46 47 48 |
# File 'lib/rbbt/bow/dictionary.rb', line 42
#
# Document frequency of every term: the term's entry in @docs divided by
# @num_docs.
#
# @return [Hash] term => Float; terms not present read as 0
def df
  frequencies = Hash.new(0)

  @docs.each do |term, count|
    frequencies[term] = count.to_f / @num_docs
  end

  frequencies
end
#idf ⇒ Object
58 59 60 61 62 63 64 65 |
# File 'lib/rbbt/bow/dictionary.rb', line 58
#
# Inverse document frequency of every term: the natural log of
# (@num_docs / the term's entry in @docs).
#
# @return [Hash] term => Float; terms not present read as 0
def idf
  inverse = Hash.new(0)
  num_docs = @num_docs.to_f # float so the ratio is not integer-divided

  @docs.each do |term, count|
    inverse[term] = Math.log(num_docs / count)
  end

  inverse
end
#tf ⇒ Object
50 51 52 53 54 55 56 |
# File 'lib/rbbt/bow/dictionary.rb', line 50
#
# Term frequency of every term: the term's entry in @terms divided by
# @total_terms.
#
# @return [Hash] term => Float; terms not present read as 0
def tf
  frequencies = Hash.new(0)

  @terms.each do |term, count|
    frequencies[term] = count.to_f / @total_terms
  end

  frequencies
end
#tf_idf ⇒ Object
67 68 69 70 71 72 73 74 |
# File 'lib/rbbt/bow/dictionary.rb', line 67
#
# TF-IDF score of every term in @docs:
# (@terms[term] / @total_terms) * log(@num_docs / @docs[term]).
#
# @return [Hash] term => Float; terms not present read as 0
def tf_idf
  scores = Hash.new(0)
  num_docs = @num_docs.to_f # float so the ratio is not integer-divided

  @docs.each do |term, count|
    term_frequency = @terms[term].to_f / @total_terms
    scores[term] = term_frequency * Math.log(num_docs / count)
  end

  scores
end
#weights(options = {}) ⇒ Object
101 102 103 104 105 106 107 108 109 110 |
# File 'lib/rbbt/bow/dictionary.rb', line 101
#
# Weights for the terms selected by #best: each selected term maps to
# log(@num_docs / @docs[term]).
#
# @param options [Hash] forwarded unchanged to #best to choose the terms
# @return [Hash] term => Float weight
def weights(options = {})
  best_terms = best(options).keys
  num_docs = @num_docs.to_f # float so the ratio is not integer-divided

  term_weights = {}
  best_terms.each do |term|
    term_weights[term] = Math.log(num_docs / @docs[term])
  end

  term_weights
end