Class: TfIdfSimilarity::TermCountModel

Inherits:
Object
  • Object
show all
Includes:
MatrixMethods
Defined in:
lib/tf-idf-similarity/term_count_model.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(documents, opts = {}) ⇒ TermCountModel

Returns a new instance of TermCountModel.

Parameters:

  • documents (Array<Document>)

    documents

  • opts (Hash) (defaults to: {})

    optional arguments

Options Hash (opts):

  • :library (Symbol)

    :gsl, :narray, :numo, :nmatrix or :matrix (default)



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/tf-idf-similarity/term_count_model.rb', line 16

# Builds the term-by-document count matrix for a corpus and records the
# average document size.
#
# @param [Array<Document>] documents the documents in the corpus
# @param [Hash] opts optional arguments
# @option opts [Symbol] :library the matrix backend; defaults to :matrix.
#   The counting methods of this class branch on :gsl, :narray, :numo and
#   :nmatrix, and treat anything else as the default.
def initialize(documents, opts = {})
  @documents = documents
  # Unique terms across all documents, in first-seen order.
  @terms = documents.map(&:terms).flatten.uniq
  @library = (opts[:library] || :matrix).to_sym

  # One row per term, one column per document.
  counts = terms.map do |term|
    documents.map { |document| document.term_count(term) }
  end
  @matrix = initialize_matrix(counts)

  @average_document_size =
    if documents.empty?
      0
    else
      sum / column_size.to_f
    end
end

Instance Attribute Details

#average_document_size ⇒ Object (readonly)

The average number of tokens in a document.



11
12
13
# File 'lib/tf-idf-similarity/term_count_model.rb', line 11

# The average number of tokens in a document.
#
# Plain reader for @average_document_size, which the constructor sets to 0
# for an empty corpus and to `sum / column_size.to_f` otherwise.
#
# @return the stored average document size
def average_document_size
  @average_document_size
end

#documents ⇒ Object (readonly)

The documents in the corpus.



7
8
9
# File 'lib/tf-idf-similarity/term_count_model.rb', line 7

# The documents in the corpus.
#
# Plain reader for @documents, i.e. the collection that was passed to the
# constructor, returned as-is.
#
# @return [Array<Document>] the documents in the corpus
def documents
  @documents
end

#terms ⇒ Object (readonly)

The set of terms in the corpus.



9
10
11
# File 'lib/tf-idf-similarity/term_count_model.rb', line 9

# The set of terms in the corpus.
#
# Plain reader for @terms, which the constructor builds as an Array of the
# unique terms collected from every document. A term's position in this
# Array is the index used to look up its matrix row.
#
# @return [Array] the unique terms in the corpus
def terms
  @terms
end

Instance Method Details

#document_count(term) ⇒ Integer

Returns the number of documents the term appears in.

Parameters:

  • term (String)

    a term

Returns:

  • (Integer)

    the number of documents the term appears in



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/tf-idf-similarity/term_count_model.rb', line 34

# Counts the documents that contain a term, i.e. the non-zero entries in the
# term's matrix row.
#
# @param [String] term a term
# @return [Integer] the number of documents the term appears in; 0 when the
#   term is not part of the corpus
def document_count(term)
  position = terms.index(term)
  return 0 if position.nil?

  counts = row(position)
  case @library
  when :gsl, :narray
    counts.where.size
  when :numo
    counts.ne(0).where.size
  when :nmatrix
    counts.each.count(&:nonzero?)
  else
    # Fall back to an Array when the row object cannot count its own entries.
    counts = counts.to_a unless counts.respond_to?(:count)
    counts.count(&:nonzero?)
  end
end

#term_count(term) ⇒ Integer

Returns the number of times the term appears in the corpus.

Parameters:

  • term (String)

    a term

Returns:

  • (Integer)

    the number of times the term appears in the corpus



58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/tf-idf-similarity/term_count_model.rb', line 58

# Adds up a term's occurrences across every document, i.e. the sum of the
# term's matrix row.
#
# @param [String] term a term
# @return [Integer] the number of times the term appears in the corpus; 0
#   when the term is not part of the corpus
def term_count(term)
  position = terms.index(term)
  return 0 if position.nil?

  counts = row(position)
  case @library
  when :gsl, :narray, :numo
    counts.sum
  when :nmatrix
    counts.each.reduce(0, :+) # NMatrix's `sum` method is slower
  else
    # Fall back to an Array when the row object cannot fold its own entries.
    counts = counts.to_a unless counts.respond_to?(:reduce)
    counts.reduce(0, :+)
  end
end