Class: TfIdfSimilarity::TfIdfModel

Inherits:
Model
  • Object
show all
Defined in:
lib/tf-idf-similarity/tf_idf_model.rb,
lib/tf-idf-similarity/extras/tf_idf_model.rb

Instance Method Summary collapse

Constructor Details

This class inherits a constructor from TfIdfSimilarity::Model

Instance Method Details

#augmented_average_term_frequency(document, term) ⇒ Object Also known as: augmented_average_tf

Chisholm ATFA



127
128
129
130
131
132
133
134
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 127

# Chisholm ATFA term weight.
#
# Yields 0 when the term does not occur in the document; otherwise
# 0.9 + 0.1 * count / (the document's average term count).
#
# @param document [Document] a document
# @param term [String] a term
# @return [Numeric] the weight of the term in the document
def augmented_average_term_frequency(document, term)
  occurrences = document.term_count(term)
  return 0 unless occurrences > 0

  scaled = 0.1 * occurrences
  0.9 + scaled / document.average_term_count
end

#augmented_log_term_frequency(document, term) ⇒ Object Also known as: augmented_log_tf

Chisholm LOGG



173
174
175
176
177
178
179
180
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 173

# Chisholm LOGG term weight.
#
# Yields 0 when the term does not occur in the document; otherwise
# 0.2 + 0.8 * log(count + 1).
#
# @param document [Document] a document
# @param term [String] a term
# @return [Numeric] the weight of the term in the document
def augmented_log_term_frequency(document, term)
  occurrences = document.term_count(term)
  occurrences > 0 ? 0.2 + 0.8 * log(occurrences + 1) : 0
end

#augmented_normalized_term_frequency(document, term) ⇒ Object Also known as: augmented_normalized_tf

SMART a, Salton n, Chisholm ATF1



121
122
123
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 121

# SMART a, Salton n, Chisholm ATF1 term weight.
#
# Maps the normalized term frequency onto 0.5 + 0.5 * normalized value.
#
# @param document [Document] a document
# @param term [String] a term
# @return [Numeric] the weight of the term in the document
def augmented_normalized_term_frequency(document, term)
  normalized = normalized_term_frequency(document, term)
  0.5 + 0.5 * normalized
end

#binary_term_frequency(document, term) ⇒ Object Also known as: binary_tf

SMART b, Salton b, Chisholm BNRY



103
104
105
106
107
108
109
110
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 103

# SMART b, Salton b, Chisholm BNRY term weight.
#
# @param document [Document] a document
# @param term [String] a term
# @return [Integer] 1 if the term occurs in the document, 0 otherwise
def binary_term_frequency(document, term)
  document.term_count(term) > 0 ? 1 : 0
end

#changed_coefficient_augmented_normalized_term_frequency(document, term) ⇒ Object Also known as: changed_coefficient_augmented_normalized_tf

Chisholm ATFC



138
139
140
141
142
143
144
145
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 138

# Chisholm ATFC term weight.
#
# Yields 0 when the term does not occur in the document; otherwise
# 0.2 + 0.8 * count / (the document's maximum term count).
#
# @param document [Document] a document
# @param term [String] a term
# @return [Numeric] the weight of the term in the document
def changed_coefficient_augmented_normalized_term_frequency(document, term)
  occurrences = document.term_count(term)
  return 0 unless occurrences > 0

  weighted = 0.8 * occurrences
  0.2 + weighted / document.maximum_term_count
end

#entropy(term) ⇒ Object

Chisholm ENPY



62
63
64
65
66
67
68
69
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 62

# Chisholm ENPY: entropy-based collection weight of a term.
#
# Computes 1 + sum over documents of p * log(p) / log(N), where p is the
# document's share of the term's collection-wide count (as reported by
# @model.term_count) and N is documents.size.
#
# @param term [String] a term
# @return [Float] the term's entropy weight
def entropy(term)
  total = @model.term_count(term).to_f
  log_n = log(documents.size)

  weighted_sum = documents.reduce(0.0) do |sum, document|
    count = document.term_count(term)
    # A document that lacks the term contributes nothing (0 * log 0 is taken
    # as 0). Evaluating it literally would, with Math.log-style semantics,
    # give 0.0 * -Infinity = NaN and turn the whole result into NaN.
    next sum unless count > 0

    share = count / total
    sum + share * log(share) / log_n
  end

  1 + weighted_sum
end

#global_frequency_inverse_document_frequency(term) ⇒ Object Also known as: gfidf

Chisholm IGFF



38
39
40
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 38

# Chisholm IGFF collection weight.
#
# Divides the term's count reported by @model.term_count by the (float)
# number of documents reported by @model.document_count.
#
# @param term [String] a term
# @return [Float] the ratio of term count to document count
def global_frequency_inverse_document_frequency(term)
  global_count = @model.term_count(term)
  containing_documents = @model.document_count(term).to_f
  global_count / containing_documents
end

#incremented_global_frequency_inverse_document_frequency(term) ⇒ Object Also known as: incremented_gfidf

Chisholm IGFI



50
51
52
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 50

# Chisholm IGFI collection weight: the global-frequency IDF plus one.
#
# @param term [String] a term
# @return [Numeric] global_frequency_inverse_document_frequency(term) + 1
def incremented_global_frequency_inverse_document_frequency(term)
  gfidf = global_frequency_inverse_document_frequency(term)
  gfidf + 1
end

#inverse_document_frequency(term) ⇒ Float Also known as: idf

Return the term’s inverse document frequency.

Parameters:

  • term (String)

    a term

Returns:

  • (Float)

    the term’s inverse document frequency



10
11
12
13
# File 'lib/tf-idf-similarity/tf_idf_model.rb', line 10

# Return the term's inverse document frequency.
#
# Computed as 1 + log(N / (df + 1.0)), where N is documents.size and df is
# the value reported by @model.document_count for the term.
#
# @param term [String] a term
# @return [Float] the term's inverse document frequency
def inverse_document_frequency(term)
  smoothed_df = @model.document_count(term) + 1.0
  1 + log(documents.size / smoothed_df)
end

#log_global_frequency_inverse_document_frequency(term) ⇒ Object Also known as: log_gfidf

Chisholm IGFL



44
45
46
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 44

# Chisholm IGFL collection weight: log of (global-frequency IDF + 1).
#
# @param term [String] a term
# @return [Numeric] log(global_frequency_inverse_document_frequency(term) + 1)
def log_global_frequency_inverse_document_frequency(term)
  shifted = global_frequency_inverse_document_frequency(term) + 1
  log(shifted)
end

#log_term_frequency(document, term) ⇒ Object Also known as: log_tf

SMART l, Chisholm LOGA



151
152
153
154
155
156
157
158
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 151

# SMART l, Chisholm LOGA term weight.
#
# Yields 0 when the term does not occur in the document; otherwise
# 1 + log(count).
#
# @param document [Document] a document
# @param term [String] a term
# @return [Numeric] the weight of the term in the document
def log_term_frequency(document, term)
  occurrences = document.term_count(term)
  return 0 unless occurrences > 0

  1 + log(occurrences)
end

#no_collection_frequency(term) ⇒ Object

SMART n, Salton x, Chisholm NONE



18
19
20
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 18

# SMART n, Salton x, Chisholm NONE collection weight.
#
# Applies no collection-frequency weighting: the result is the constant 1.0
# and the argument is not consulted.
#
# @param term [String] a term (ignored)
# @return [Float] always 1.0
def no_collection_frequency(term)
  1.0
end

#no_normalization(matrix) ⇒ Object

SMART n, Salton x, Chisholm NONE



78
79
80
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 78

# SMART n, Salton x, Chisholm NONE normalization.
#
# Applies no normalization: the argument is handed back untouched (the same
# object, not a copy).
#
# @param matrix [Object] the matrix to leave as-is
# @return [Object] the same matrix that was passed in
def no_normalization(matrix)
  matrix
end

#normalized_log_term_frequency(document, term) ⇒ Object Also known as: normalized_log_tf

SMART L, Chisholm LOGN



162
163
164
165
166
167
168
169
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 162

# SMART L, Chisholm LOGN term weight.
#
# Yields 0 when the term does not occur in the document; otherwise
# (1 + log(count)) / (1 + log(the document's average term count)).
#
# @param document [Document] a document
# @param term [String] a term
# @return [Numeric] the weight of the term in the document
def normalized_log_term_frequency(document, term)
  occurrences = document.term_count(term)
  return 0 unless occurrences > 0

  term_part = 1 + log(occurrences)
  average_part = 1 + log(document.average_term_count)
  term_part / average_part
end

#normalized_term_frequency(document, term, a = 0) ⇒ Object Also known as: normalized_tf



115
116
117
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 115

# Term count normalized by the document's maximum term count, optionally
# shifted towards 1 by an offset.
#
# Computes a + (1 - a) * count / maximum_term_count.
#
# @param document [Document] a document
# @param term [String] a term
# @param a [Numeric] the offset; with the default of 0 the result is the plain
#   ratio count / maximum_term_count
# @return [Float] the normalized term frequency
def normalized_term_frequency(document, term, a = 0)
  # Coerce the divisor so that, with the integer default for `a`, integer
  # counts are not truncated to 0 or 1 by Integer#/ (the other weighting
  # methods in this file coerce their divisors the same way).
  a + (1 - a) * document.term_count(term) / document.maximum_term_count.to_f
end

#pivoted_unique_normalization(matrix) ⇒ Object

SMART u, Chisholm PUQN



85
86
87
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 85

# SMART u, Chisholm PUQN normalization.
#
# Placeholder only: this scheme is not implemented and every call raises.
# Note that Ruby's NotImplementedError descends from ScriptError, so a bare
# `rescue` (which catches StandardError) will not intercept it.
#
# @param matrix [Object] the matrix to normalize (unused)
# @raise [NotImplementedError] always
def pivoted_unique_normalization(matrix)
  raise NotImplementedError
end

#plain_inverse_document_frequency(term, numerator = 0, denominator = 0) ⇒ Object Also known as: plain_idf

SMART t, Salton f, Chisholm IDFB



25
26
27
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 25

# SMART t, Salton f, Chisholm IDFB collection weight.
#
# Computes log((N + numerator) / (df + denominator)), where N is
# documents.size and df is the (float) value of @model.document_count.
#
# @param term [String] a term
# @param numerator [Numeric] amount added to the collection size
# @param denominator [Numeric] amount added to the document count
# @return [Float] the plain inverse document frequency
def plain_inverse_document_frequency(term, numerator = 0, denominator = 0)
  adjusted_size = documents.size + numerator
  adjusted_df = @model.document_count(term).to_f + denominator
  log(adjusted_size / adjusted_df)
end

#probabilistic_inverse_document_frequency(term) ⇒ Object Also known as: probabilistic_idf

SMART p, Salton p, Chisholm IDFP



31
32
33
34
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 31

# SMART p, Salton p, Chisholm IDFP collection weight.
#
# Computes log((N - df) / df), where N is documents.size and df is the
# (float) value of @model.document_count for the term.
#
# @param term [String] a term
# @return [Float] the probabilistic inverse document frequency
def probabilistic_inverse_document_frequency(term)
  containing = @model.document_count(term).to_f
  lacking = documents.size - containing
  log(lacking / containing)
end

#square_root_global_frequency_inverse_document_frequency(term) ⇒ Object Also known as: square_root_gfidf

Chisholm IGFS



56
57
58
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 56

# Chisholm IGFS collection weight: square root of (global-frequency IDF - 0.9).
#
# @param term [String] a term
# @return [Numeric] sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
def square_root_global_frequency_inverse_document_frequency(term)
  radicand = global_frequency_inverse_document_frequency(term) - 0.9
  sqrt(radicand)
end

#square_root_term_frequency(document, term) ⇒ Object Also known as: square_root_tf

Chisholm SQRT



184
185
186
187
188
189
190
191
# File 'lib/tf-idf-similarity/extras/tf_idf_model.rb', line 184

# Chisholm SQRT term weight.
#
# Yields 0 when the term does not occur in the document; otherwise
# sqrt(count - 0.5) + 1.
#
# @param document [Document] a document
# @param term [String] a term
# @return [Numeric] the weight of the term in the document
def square_root_term_frequency(document, term)
  occurrences = document.term_count(term)
  occurrences > 0 ? sqrt(occurrences - 0.5) + 1 : 0
end

#term_frequency(document, term) ⇒ Float Also known as: tf

Returns the term’s frequency in the document.

Parameters:

  • document (Document)

    a document

  • term (String)

    a term

Returns:

  • (Float)

    the term’s frequency in the document



21
22
23
24
# File 'lib/tf-idf-similarity/tf_idf_model.rb', line 21

# Returns the term's frequency in the document, computed as the square root
# of the term's count in that document.
#
# @param document [Document] a document
# @param term [String] a term
# @return [Float] the term's frequency in the document
def term_frequency(document, term)
  sqrt(document.term_count(term))
end