Class: SClust::Util::DocumentCollection

Inherits:
Object
  • Object
show all
Defined in:
lib/sclust/util/doccol.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ DocumentCollection

Returns a new instance of DocumentCollection.



45
46
47
48
49
50
# File 'lib/sclust/util/doccol.rb', line 45

# Set up an empty collection: a logger named after the class, an empty
# document list, and a term vector constructed with 0 (presumably the value
# reported for terms that have not been seen yet -- confirm in SparseVector).
def initialize
    @logger = Log4r::Logger.new(self.class.to_s)
    @logger.add('default')

    @doclist = []
    @terms   = SClust::Util::SparseVector.new(0)
end

Instance Attribute Details

#doclist ⇒ Object (readonly)

A list of documents



40
41
42
# File 'lib/sclust/util/doccol.rb', line 40

# Read-only accessor: returns the @doclist instance variable, the list of
# documents held by this collection.
def doclist
  @doclist
end

#logger ⇒ Object (readonly)

Log4r::Logger for this document collection.



43
44
45
# File 'lib/sclust/util/doccol.rb', line 43

# Read-only accessor: returns the @logger instance variable, the logger this
# collection writes its progress messages to.
def logger
  @logger
end

#terms ⇒ Object (readonly)

terms - a hash where the keys are the terms in the documents and the values stored are the number of documents containing the term.



37
38
39
# File 'lib/sclust/util/doccol.rb', line 37

# Read-only accessor: returns the @terms instance variable, the term vector
# keyed by term.
def terms
  @terms
end

Instance Method Details

#<<(d) ⇒ Object

Add a document to the collection and adjust the @terms attribute to store any new terms in the document. The document is also added to the @doclist attribute.



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/sclust/util/doccol.rb', line 54

# Append document +d+ to the collection.
#
# Every distinct term yielded by d.each_term bumps that term's entry in
# @terms by exactly one, however often the term occurs inside the document.
# A document that yields no terms at all is ignored: it is neither counted
# nor stored in @doclist.
#
# Returns self so that calls can be chained (collection << a << b).
def <<(d)
    # A hash keyed by term collapses repeated terms into a single entry.
    distinct = {}
    d.each_term { |term, _frequency| distinct[term] = 1 }

    unless distinct.empty?
        distinct.each_key { |term| @terms[term] += 1 }
        @doclist << d
    end

    self
end

#average_terms_per_document ⇒ Object

The sum of the terms divided by the documents. If the document only has 1-gram terms, then this number will always be less than the number of words per document. If, however, you enable 2-grams, 3-grams, etc. in a document, this value will not correlate perfectly with the word count.



75
76
77
# File 'lib/sclust/util/doccol.rb', line 75

# Sum of all values stored in @terms divided by the number of documents.
#
# The sum starts from 0.0, so the result is always a Float; with an empty
# document list the division is 0.0 / 0 and the result is NaN.
def average_terms_per_document
    value_total = @terms.reduce(0.0) { |sum, (_term, doc_count)| sum + doc_count }
    value_total / @doclist.size
end

#average_words_per_document ⇒ Object

Number of words that make up a document. Words are not unique like terms are. Two occurrences of the word “the” are a single term “the”. Get it? :) Great. One caveat is that a “term” is typically a 1-gram, that is 1 word is 1 term. It is possible for a term to be constructed of two or more words (a 2-gram, 3-gram, … n-gram) in which case this relationship will vary widely.



84
85
86
# File 'lib/sclust/util/doccol.rb', line 84

# Mean of doc.words.size over every document in the collection.
#
# The running total is a Float, so the result is a Float; with an empty
# document list the division is 0.0 / 0 and the result is NaN.
def average_words_per_document
    word_total = 0.0
    @doclist.each { |doc| word_total += doc.words.size }
    word_total / @doclist.size
end

#document_count ⇒ Object

Return the size of the document list.



89
90
91
# File 'lib/sclust/util/doccol.rb', line 89

# Number of documents currently held in the collection.
def document_count
    @doclist.length
end

#drop_terms(min_frequency = 0.10, max_frequency = 0.80) ⇒ Object



104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# File 'lib/sclust/util/doccol.rb', line 104

# Remove terms that occur in too few or too many documents.
#
# min_frequency - lower bound as a fraction of the document count; terms
#                 found in fewer than (document count * min_frequency)
#                 documents are removed.
# max_frequency - upper bound as a fraction of the document count; terms
#                 found in more than (document count * max_frequency)
#                 documents are removed.
#
# Each removed term is deleted from @terms and from doc.terms of every
# document in the collection. Returns the document list.
def drop_terms(min_frequency=0.10, max_frequency=0.80)

    min_docs = @doclist.length * min_frequency
    max_docs = @doclist.length * max_frequency

    @logger.info("Analyzing #{@terms.length} terms for removal.")
    @logger.info("Upper/lower boundary are #{max_frequency}/#{min_frequency}% document frequency or #{max_docs}/#{min_docs} documents.")

    # Phase 1: only read @terms. Deleting from a collection while it is being
    # iterated can skip entries, so the victims are collected first.
    doomed = []
    @terms.each do |term, frequency|
        doomed << [term, frequency] if frequency < min_docs || frequency > max_docs
    end

    # Phase 2: the iteration is finished, so it is now safe to delete.
    doomed.each do |term, frequency|
        @logger.info("Removing term #{term} occuring in #{frequency} documents out of #{@doclist.length}")
        @terms.delete(term)
    end

    @logger.info("Removed #{doomed.length} of #{@terms.length + doomed.length} terms. Updating #{@doclist.length} documents.")

    @doclist.each do |doc|
        doomed.each { |term, _frequency| doc.terms.delete(term) }
    end
end

#each_term(&c) ⇒ Object



138
139
140
# File 'lib/sclust/util/doccol.rb', line 138

# Yield every key of @terms, one at a time, to the caller's block.
# Returns whatever @terms.each_key returns.
def each_term(&c)
    @terms.each_key do |term|
        yield term
    end
end

#filter_df(min = 1, max = 0.20) ⇒ Object

Filter out terms whose document frequency is not inside the given range, then drop any documents that are left with no terms. If floats are passed, they are treated as fractions of the total number of documents in the collection (e.g. 0.20 for 20%). If integers are passed, they are treated as document counts. Terms whose frequency equals either bound are removed as well.



146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# File 'lib/sclust/util/doccol.rb', line 146

# Remove terms by document frequency, then remove emptied documents.
#
# min - lower bound. An Integer is used as a document count as-is; any other
#       number is multiplied by the number of documents.
# max - upper bound, interpreted the same way as min.
#
# A term is removed when its frequency is <= the lower bound or >= the upper
# bound. Removed terms are deleted from @terms and from every document (via
# doc.delete_term_if); documents whose terms end up empty are then deleted
# from the document list.
def filter_df(min=1, max=0.20)

    doc_total = @doclist.size
    mindocs = min.is_a?(Integer) ? min : min * doc_total
    maxdocs = max.is_a?(Integer) ? max : max * doc_total

    @logger.info("Building term to delete list for range #{mindocs} - #{maxdocs}.")

    doomed = []
    @terms.each do |term, freq|
        doomed << term if freq <= mindocs || freq >= maxdocs
    end

    @logger.info("Identified #{doomed.size} terms for removal.")

    # Two-phase delete: @terms is only modified after the scan above has
    # finished, and the lookup table built here drives the per-document pass.
    doomed_lookup = {}
    doomed.each do |term|
        @logger.debug { "Removing term #{term}."}
        @terms.delete(term)
        doomed_lookup[term] = 1
    end

    @logger.info("Updating documents.")

    # The counter only advances when the logger actually evaluates the block.
    i = 0
    @doclist.each do |doc|
        @logger.debug { "Processing document #{i += 1} / #{@doclist.size}" }

        doc.delete_term_if { |term| doomed_lookup.key?(term) }
    end

    @logger.info("Deleting documents that now have no terms left in them. #{@doclist.size} documents.")

    @doclist.delete_if { |doc| doc.terms.size == 0 }

    @logger.info("Document count now #{@doclist.size} documents.")

end

#inverse_document_frequency(term) ⇒ Object Also known as: idf



132
133
134
# File 'lib/sclust/util/doccol.rb', line 132

# Inverse document frequency of +term+: the natural logarithm of
# (number of documents / number of documents containing the term).
#
# term - the key to look up in @terms.
#
# Returns a Float. Raises ZeroDivisionError when the stored document
# frequency for the term is zero.
def inverse_document_frequency(term)
    document_frequency = @terms[term]

    # The ratio is computed in floating point below, which would silently
    # yield Infinity for a zero count; keep raising as Integer division did.
    raise ZeroDivisionError, "divided by 0" if document_frequency == 0

    # to_f prevents Integer division from truncating the ratio before the
    # logarithm is taken (10 documents / 3 must be 3.33..., not 3).
    Math.log(@doclist.length.to_f / document_frequency)
end

#term_count ⇒ Object

Return the size of the term vector



99
100
101
# File 'lib/sclust/util/doccol.rb', line 99

# Number of entries in the term vector, as reported by @terms.size.
def term_count
    @terms.size
end

#word_count ⇒ Object

Sum all words



94
95
96
# File 'lib/sclust/util/doccol.rb', line 94

# Total of doc.words.size over every document in the collection; 0 when the
# collection holds no documents.
def word_count
    total = 0
    @doclist.each { |doc| total += doc.words.size }
    total
end