Class: TfIdfSimilarity::TfIdfModel
- Defined in:
- lib/tf-idf-similarity/tf_idf_model.rb,
lib/tf-idf-similarity/extras/tf_idf_model.rb
Instance Method Summary
-
#augmented_average_term_frequency(document, term) ⇒ Object
(also: #augmented_average_tf)
Chisholm ATFA.
-
#augmented_log_term_frequency(document, term) ⇒ Object
(also: #augmented_log_tf)
Chisholm LOGG.
-
#augmented_normalized_term_frequency(document, term) ⇒ Object
(also: #augmented_normalized_tf)
SMART a, Salton n, Chisholm ATF1.
-
#binary_term_frequency(document, term) ⇒ Object
(also: #binary_tf)
SMART b, Salton b, Chisholm BNRY.
-
#changed_coefficient_augmented_normalized_term_frequency(document, term) ⇒ Object
(also: #changed_coefficient_augmented_normalized_tf)
Chisholm ATFC.
-
#entropy(term) ⇒ Object
Chisholm ENPY.
-
#global_frequency_inverse_document_frequency(term) ⇒ Object
(also: #gfidf)
Chisholm IGFF.
-
#incremented_global_frequency_inverse_document_frequency(term) ⇒ Object
(also: #incremented_gfidf)
Chisholm IGFI.
-
#inverse_document_frequency(term) ⇒ Float
(also: #idf)
Returns the term’s inverse document frequency.
-
#log_global_frequency_inverse_document_frequency(term) ⇒ Object
(also: #log_gfidf)
Chisholm IGFL.
-
#log_term_frequency(document, term) ⇒ Object
(also: #log_tf)
SMART l, Chisholm LOGA.
-
#no_collection_frequency(term) ⇒ Object
SMART n, Salton x, Chisholm NONE.
-
#no_normalization(matrix) ⇒ Object
SMART n, Salton x, Chisholm NONE.
-
#normalized_log_term_frequency(document, term) ⇒ Object
(also: #normalized_log_tf)
SMART L, Chisholm LOGN.
-
#normalized_term_frequency(document, term, a = 0) ⇒ Object
(also: #normalized_tf)
-
#pivoted_unique_normalization(matrix) ⇒ Object
SMART u, Chisholm PUQN.
-
#plain_inverse_document_frequency(term, numerator = 0, denominator = 0) ⇒ Object
(also: #plain_idf)
SMART t, Salton f, Chisholm IDFB.
-
#probabilistic_inverse_document_frequency(term) ⇒ Object
(also: #probabilistic_idf)
SMART p, Salton p, Chisholm IDFP.
-
#square_root_global_frequency_inverse_document_frequency(term) ⇒ Object
(also: #square_root_gfidf)
Chisholm IGFS.
-
#square_root_term_frequency(document, term) ⇒ Object
(also: #square_root_tf)
Chisholm SQRT.
-
#term_frequency(document, term) ⇒ Float
(also: #tf)
Returns the term’s frequency in the document.
Constructor Details
This class inherits a constructor from TfIdfSimilarity::Model
Instance Method Details
#augmented_average_term_frequency(document, term) ⇒ Object Also known as: augmented_average_tf
Chisholm ATFA
127 128 129 130 131 132 133 134 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 127

# Chisholm ATFA.
#
# @param document [#term_count, #average_term_count] the document to score
# @param term the term whose count is looked up in the document
# @return 0 when the term's count is not positive, otherwise
#   0.9 + 0.1 * count / the document's average term count
def augmented_average_term_frequency(document, term)
  occurrences = document.term_count(term)
  return 0 unless occurrences > 0

  0.9 + 0.1 * occurrences / document.average_term_count
end
#augmented_log_term_frequency(document, term) ⇒ Object Also known as: augmented_log_tf
Chisholm LOGG
173 174 175 176 177 178 179 180 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 173

# Chisholm LOGG.
#
# @param document [#term_count] the document to score
# @param term the term whose count is looked up in the document
# @return 0 when the term's count is not positive, otherwise
#   0.2 + 0.8 * log(count + 1)
def augmented_log_term_frequency(document, term)
  occurrences = document.term_count(term)
  occurrences > 0 ? 0.2 + 0.8 * log(occurrences + 1) : 0
end
#augmented_normalized_term_frequency(document, term) ⇒ Object Also known as: augmented_normalized_tf
SMART a, Salton n, Chisholm ATF1
121 122 123 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 121

# SMART a, Salton n, Chisholm ATF1.
#
# @param document the document passed through to #normalized_term_frequency
# @param term the term passed through to #normalized_term_frequency
# @return 0.5 plus half of the normalized term frequency
def augmented_normalized_term_frequency(document, term)
  normalized = normalized_term_frequency(document, term)
  0.5 + 0.5 * normalized
end
#binary_term_frequency(document, term) ⇒ Object Also known as: binary_tf
SMART b, Salton b, Chisholm BNRY
103 104 105 106 107 108 109 110 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 103

# SMART b, Salton b, Chisholm BNRY.
#
# @param document [#term_count] the document to score
# @param term the term whose count is looked up in the document
# @return [Integer] 1 when the term's count is positive, otherwise 0
def binary_term_frequency(document, term)
  document.term_count(term) > 0 ? 1 : 0
end
#changed_coefficient_augmented_normalized_term_frequency(document, term) ⇒ Object Also known as: changed_coefficient_augmented_normalized_tf
Chisholm ATFC
138 139 140 141 142 143 144 145 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 138

# Chisholm ATFC.
#
# @param document [#term_count, #maximum_term_count] the document to score
# @param term the term whose count is looked up in the document
# @return 0 when the term's count is not positive, otherwise
#   0.2 + 0.8 * count / the document's maximum term count
def changed_coefficient_augmented_normalized_term_frequency(document, term)
  occurrences = document.term_count(term)
  return 0 unless occurrences > 0

  0.2 + 0.8 * occurrences / document.maximum_term_count
end
#entropy(term) ⇒ Object
Chisholm ENPY
62 63 64 65 66 67 68 69 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 62

# Chisholm ENPY.
#
# Computes 1 + sum(p * log(p) / log(N)) over the documents, where p is the
# document's count of the term divided by the model-wide count of the term
# and N is the number of documents.
#
# Documents that do not contain the term are skipped: their p is 0, and
# evaluating 0 * log(0) would yield NaN and poison the whole sum, whereas
# the term's contribution in that case is conventionally 0.
#
# NOTE(review): with a single document log(N) is presumably 0, so the
# division is still undefined there - confirm whether callers can hit it.
#
# @param term the term to weight
# @return [Numeric] the entropy weight; 1 when no document contains the term
def entropy(term)
  total = @model.term_count(term).to_f
  log_n = log(documents.size)

  weighted_sum = documents.reduce(0) do |sum, document|
    count = document.term_count(term)
    next sum unless count > 0

    quotient = count / total
    sum + quotient * log(quotient) / log_n
  end

  1 + weighted_sum
end
#global_frequency_inverse_document_frequency(term) ⇒ Object Also known as: gfidf
Chisholm IGFF
38 39 40 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 38

# Chisholm IGFF.
#
# @param term the term to weight
# @return the model-wide count of the term divided by the number of
#   documents the model reports for the term (as a Float divisor)
def global_frequency_inverse_document_frequency(term)
  total_occurrences = @model.term_count(term)
  containing_documents = @model.document_count(term)
  total_occurrences / containing_documents.to_f
end
#incremented_global_frequency_inverse_document_frequency(term) ⇒ Object Also known as: incremented_gfidf
Chisholm IGFI
50 51 52 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 50

# Chisholm IGFI.
#
# @param term the term to weight
# @return the global frequency inverse document frequency plus one
def incremented_global_frequency_inverse_document_frequency(term)
  ratio = global_frequency_inverse_document_frequency(term)
  ratio + 1
end
#inverse_document_frequency(term) ⇒ Float Also known as: idf
Returns the term’s inverse document frequency.
10 11 12 13 |
# File 'lib/tf-idf-similarity/tf_idf_model.rb', line 10

# Returns the term's inverse document frequency, computed as
# 1 + log(number of documents / (document count of the term + 1)).
#
# @param term the term to weight
# @return [Float] the inverse document frequency
def inverse_document_frequency(term)
  smoothed_df = @model.document_count(term) + 1.0
  1 + log(documents.size / smoothed_df)
end
#log_global_frequency_inverse_document_frequency(term) ⇒ Object Also known as: log_gfidf
Chisholm IGFL
44 45 46 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 44

# Chisholm IGFL.
#
# @param term the term to weight
# @return log of (the global frequency inverse document frequency plus one)
def log_global_frequency_inverse_document_frequency(term)
  ratio = global_frequency_inverse_document_frequency(term)
  log(ratio + 1)
end
#log_term_frequency(document, term) ⇒ Object Also known as: log_tf
SMART l, Chisholm LOGA
151 152 153 154 155 156 157 158 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 151

# SMART l, Chisholm LOGA.
#
# @param document [#term_count] the document to score
# @param term the term whose count is looked up in the document
# @return 0 when the term's count is not positive, otherwise 1 + log(count)
def log_term_frequency(document, term)
  occurrences = document.term_count(term)
  return 0 unless occurrences > 0

  1 + log(occurrences)
end
#no_collection_frequency(term) ⇒ Object
SMART n, Salton x, Chisholm NONE
18 19 20 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 18

# SMART n, Salton x, Chisholm NONE.
#
# Applies no collection-frequency weighting: the term is ignored.
#
# @param term unused
# @return [Float] always 1.0
def no_collection_frequency(term)
  1.0
end
#no_normalization(matrix) ⇒ Object
SMART n, Salton x, Chisholm NONE
78 79 80 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 78

# SMART n, Salton x, Chisholm NONE.
#
# Applies no normalization: the argument is handed back untouched.
#
# @param matrix the matrix to (not) normalize
# @return the same object that was passed in
def no_normalization(matrix)
  matrix
end
#normalized_log_term_frequency(document, term) ⇒ Object Also known as: normalized_log_tf
SMART L, Chisholm LOGN
162 163 164 165 166 167 168 169 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 162

# SMART L, Chisholm LOGN.
#
# @param document [#term_count, #average_term_count] the document to score
# @param term the term whose count is looked up in the document
# @return 0 when the term's count is not positive, otherwise
#   (1 + log(count)) / (1 + log(average term count))
def normalized_log_term_frequency(document, term)
  occurrences = document.term_count(term)
  return 0 unless occurrences > 0

  numerator = 1 + log(occurrences)
  denominator = 1 + log(document.average_term_count)
  numerator / denominator
end
#normalized_term_frequency(document, term, a = 0) ⇒ Object Also known as: normalized_tf
115 116 117 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 115

# Term count normalized by the document's maximum term count, blended with
# a constant: a + (1 - a) * count / maximum term count.
#
# The divisor is coerced to Float so that the default Integer +a+ combined
# with Integer counts cannot fall into truncating integer division (which
# would return 0 for every term except the most frequent one).
#
# @param document [#term_count, #maximum_term_count] the document to score
# @param term the term whose count is looked up in the document
# @param a [Numeric] the constant blended with the normalized count
# @return [Float] the normalized term frequency
def normalized_term_frequency(document, term, a = 0)
  a + (1 - a) * document.term_count(term) / document.maximum_term_count.to_f
end
#pivoted_unique_normalization(matrix) ⇒ Object
SMART u, Chisholm PUQN
85 86 87 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 85

# SMART u, Chisholm PUQN.
#
# Placeholder: this normalization has no implementation.
#
# @param matrix unused
# @raise [NotImplementedError] always
def pivoted_unique_normalization(matrix)
  raise(NotImplementedError)
end
#plain_inverse_document_frequency(term, numerator = 0, denominator = 0) ⇒ Object Also known as: plain_idf
SMART t, Salton f, Chisholm IDFB
25 26 27 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 25

# SMART t, Salton f, Chisholm IDFB.
#
# @param term the term to weight
# @param numerator [Numeric] amount added to the number of documents
# @param denominator [Numeric] amount added to the term's document count
# @return log((number of documents + numerator) /
#   (document count of the term + denominator))
def plain_inverse_document_frequency(term, numerator = 0, denominator = 0)
  adjusted_total = documents.size + numerator
  adjusted_df = @model.document_count(term).to_f + denominator
  log(adjusted_total / adjusted_df)
end
#probabilistic_inverse_document_frequency(term) ⇒ Object Also known as: probabilistic_idf
SMART p, Salton p, Chisholm IDFP
31 32 33 34 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 31

# SMART p, Salton p, Chisholm IDFP.
#
# @param term the term to weight
# @return log((number of documents - document count of the term) /
#   document count of the term)
def probabilistic_inverse_document_frequency(term)
  with_term = @model.document_count(term).to_f
  without_term = documents.size - with_term
  log(without_term / with_term)
end
#square_root_global_frequency_inverse_document_frequency(term) ⇒ Object Also known as: square_root_gfidf
Chisholm IGFS
56 57 58 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 56

# Chisholm IGFS.
#
# @param term the term to weight
# @return the square root of (the global frequency inverse document
#   frequency minus 0.9)
def square_root_global_frequency_inverse_document_frequency(term)
  ratio = global_frequency_inverse_document_frequency(term)
  sqrt(ratio - 0.9)
end
#square_root_term_frequency(document, term) ⇒ Object Also known as: square_root_tf
Chisholm SQRT
184 185 186 187 188 189 190 191 |
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 184

# Chisholm SQRT.
#
# @param document [#term_count] the document to score
# @param term the term whose count is looked up in the document
# @return 0 when the term's count is not positive, otherwise
#   sqrt(count - 0.5) + 1
def square_root_term_frequency(document, term)
  occurrences = document.term_count(term)
  occurrences > 0 ? sqrt(occurrences - 0.5) + 1 : 0
end
#term_frequency(document, term) ⇒ Float Also known as: tf
Returns the term’s frequency in the document.
21 22 23 24 |
# File 'lib/tf-idf-similarity/tf_idf_model.rb', line 21

# Returns the term's frequency in the document, computed as the square
# root of the document's count of the term.
#
# @param document [#term_count] the document to score
# @param term the term whose count is looked up in the document
# @return [Float] the term frequency
def term_frequency(document, term)
  sqrt(document.term_count(term))
end