Class: SClust::Util::Document

Inherits:
Object
Defined in:
lib/sclust/util/doc.rb

Overview

A typical document representation backed by a body of raw text that is also broken into a set of n-gram terms using a DocumentTokenizer and a DocumentTermFilter.
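A minimal usage sketch, assuming the require path 'sclust/util/doc' and made-up sample text (the default tokenizer and filter may stem or drop some of these words):

require 'sclust/util/doc'

# Build a document from raw text, keeping unigram and bigram terms.
doc = SClust::Util::Document.new("the quick brown fox jumps over the lazy dog",
                                 :ngrams => [1, 2])

doc.word_count          # number of tokens kept after filtering
doc.term_count("fox")   # raw count of a single term (0 if absent)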

Constant Summary

@@logger =
Log4r::Logger.new(self.class.to_s)

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(text, opts = {}) ⇒ Document

Takes an options hash of the form { :userData => ..., :ngrams => [1, 2, 3], :filter => Filter, :term_limit => 100 }.

Also accepts { :min_freq => fraction of the document's word count below which a term is removed from the document } and { :max_freq => fraction of the document's word count above which a term is removed from the document }; see the sketch following the source listing below.


# File 'lib/sclust/util/doc.rb', line 45

def initialize(text, opts={})
    
    @text     = text             # The raw document. Never changed.
    @userData = opts[:userData]  # Arbitrary user data supplied by the caller.
        
    opts[:ngrams]    ||= [ 1, 2, 3 ]
    opts[:filter]    ||= DocumentTermFilter.new()
    opts[:tokenizer] ||= DocumentTokenizer.new()
    
    @words = opts[:tokenizer].apply(text).map { |word| 
        opts[:filter].apply(word) }.delete_if { |x| x.nil? or x=~/^\s+$/ }
        
    @word_count = @words.size
    @terms = Hash.new(0)
    
    # Array of counts of grams built.
    builtGramCounts = []
    
    # Build a set of n-grams from our requested ngram range.
    opts[:ngrams].each do |n|
        
        builtGramCounts[n] = 0
        
        # For each word in our list...
        @words.length.times do |j| 
            
            if ( n + j <= @words.length )
                
                term = @words[j]
                
                # Pick number of iterations based on how close to the end of the array we are.
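                # Append the following words, separated by spaces, to build the n-gram term.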
                (( ( @words.length > n+j) ? n : @words.length-j)-1).times { |ngram| term += " #{@words[j+ngram+1]}" }
                
            end
        
            @terms[term] += 1.0 if term
            
            builtGramCounts[n] += 1
        
        end
    end
        
    
    if opts.key?(:min_freq) or opts.key?(:max_freq)
        minwords = @words.size * ( opts[:min_freq] || 0.0   )
        maxwords = @words.size * ( opts[:max_freq] || 1.0 )
        
        #@@logger.debug { "Keeping terms between #{minwords} and #{maxwords} out of a total of #{@words.size}" }

        @terms.delete_if do |term, freq|
            if ( freq < minwords or freq > maxwords ) 
                @words.delete_if { |x| term == x}
                true
            else 
                false
            end
        end
        
        @word_count = @words.size
    end
end
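As a sketch of the frequency bounds (the sample text and thresholds here are assumptions, not values from the library): with :min_freq => 0.1 on a 40-word document, any term occurring fewer than 4 times is dropped from both the term table and the word list.

# Hypothetical sample text; in practice this would be a real document body.
some_text = "one fish two fish red fish blue fish " * 5

# Keep only unigram terms whose count lies between 10% and 50% of the word total.
doc = SClust::Util::Document.new(some_text,
                                 :ngrams   => [1],
                                 :min_freq => 0.1,
                                 :max_freq => 0.5)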

Instance Attribute Details

#filter ⇒ Object (readonly)

Returns the value of attribute filter.



# File 'lib/sclust/util/doc.rb', line 40

def filter
  @filter
end

#terms ⇒ Object (readonly)

Returns the value of attribute terms.



# File 'lib/sclust/util/doc.rb', line 40

def terms
  @terms
end

#text ⇒ Object (readonly)

Returns the value of attribute text.



# File 'lib/sclust/util/doc.rb', line 40

def text
  @text
end

#userData ⇒ Object (readonly)

Returns the value of attribute userData.



# File 'lib/sclust/util/doc.rb', line 40

def userData
  @userData
end

#word_count ⇒ Object (readonly)

Returns the value of attribute word_count.



# File 'lib/sclust/util/doc.rb', line 40

def word_count
  @word_count
end

#words ⇒ Object (readonly)

Returns the value of attribute words.



# File 'lib/sclust/util/doc.rb', line 40

def words
  @words
end

Instance Method Details

#delete_term_if(&call) ⇒ Object

Deletes a term from the document when the given block returns true for it. Frequency information is never updated.



# File 'lib/sclust/util/doc.rb', line 108

def delete_term_if(&call)
    @terms.delete_if { |term, val| call.call(term) }
    @words.delete_if { |term|      call.call(term) }
end
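For example, a sketch that prunes very short terms (the three-character cutoff is an arbitrary assumption):

# Drop every term shorter than three characters; counts for the surviving terms are unchanged.
doc.delete_term_if { |term| term.length < 3 }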

#each_term(&call) ⇒ Object

Each term and its count are passed to the given block. Divide the count by the total number of words to get the term frequency.



# File 'lib/sclust/util/doc.rb', line 124

def each_term(&call) 
    terms.each{ |k,v| yield(k, v) }
end
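A sketch of deriving term frequencies by hand with each_term (doc is assumed to be a Document built as in the earlier examples):

doc.each_term do |term, count|
  tf = count / doc.word_count.to_f   # count divided by total words
  puts "#{term}\t#{tf}"
end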

#has_term?(term) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/sclust/util/doc.rb', line 128

def has_term?(term)
    @terms.has_key?(term)
end

#term_count(term) ⇒ Object



# File 'lib/sclust/util/doc.rb', line 113

def term_count(term)
    @terms[term]
end

#term_frequency(term) ⇒ Object Also known as: tf



# File 'lib/sclust/util/doc.rb', line 117

def term_frequency(term)
    @terms[term] / @words.size
end