Class: SClust::Util::Document
- Inherits:
-
Object
- Object
- SClust::Util::Document
- Defined in:
- lib/sclust/util/doc.rb
Overview
A typical document representation that is backed by a body of text but also breaks it up into a set of n-grams using a DocumentTokenizer and a DocumentTermFilter.
Constant Summary collapse
- @@logger =
Log4r::Logger.new(self.class.to_s)
Instance Attribute Summary collapse
-
#filter ⇒ Object
readonly
Returns the value of attribute filter.
-
#terms ⇒ Object
readonly
Returns the value of attribute terms.
-
#text ⇒ Object
readonly
Returns the value of attribute text.
-
#userDate ⇒ Object
readonly
Returns the value of attribute userDate.
-
#word_count ⇒ Object
readonly
Returns the value of attribute word_count.
-
#words ⇒ Object
readonly
Returns the value of attribute words.
Instance Method Summary collapse
-
#delete_term_if(&call) ⇒ Object
Frequency information is never updated.
-
#each_term(&call) ⇒ Object
Each term and the term count passed to the given block.
- #has_term?(term) ⇒ Boolean
-
#initialize(text, opts = {}) ⇒ Document
constructor
Takes { :userData, :ngrams => [1,2,3], :filter => Filter, :term_limit => 100 } also { :min_freq => [ minimum frequency below which a term is removed from the document. ] } also { :max_freq => [ maximum frequency above which a term is removed from the document. ] }.
- #term_count(term) ⇒ Object
- #term_frequency(term) ⇒ Object (also: #tf)
Constructor Details
#initialize(text, opts = {}) ⇒ Document
Takes { :userData, :ngrams => [1,2,3], :filter => Filter, :term_limit => 100 }
also { :min_freq => [ minimum frequency below which a term is removed from the document. ] }
also { :max_freq => [ maximum frequency above which a term is removed from the document. ] }
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
# File 'lib/sclust/util/doc.rb', line 45
# Builds a document from raw text: tokenizes it, filters the tokens into
# words, and counts every n-gram of the requested sizes in @terms.
#
# @param text [String] the raw document text; stored unchanged in @text.
# @param opts [Hash] optional settings (the hash itself is not modified):
#   :userData  - caller data stored on the document.
#   :ngrams    - n-gram sizes to build; defaults to [1, 2, 3].
#   :filter    - object responding to #apply(word); defaults to
#                DocumentTermFilter.new. A nil result drops the word.
#   :tokenizer - object responding to #apply(text); defaults to
#                DocumentTokenizer.new.
#   :min_freq  - terms counted fewer than min_freq * word count times are removed.
#   :max_freq  - terms counted more than max_freq * word count times are removed.
def initialize(text, opts = {})
  @text = text # The raw document. Never changed.

  # The option is spelled :userData while the reader is spelled userDate;
  # populate both instance variables so the value is reachable either way.
  @userData = @userDate = opts[:userData]

  # Resolve options into locals/ivars instead of writing defaults back into
  # the caller's hash.
  ngram_sizes = opts[:ngrams] || [1, 2, 3]
  @filter     = opts[:filter] || DocumentTermFilter.new
  tokenizer   = opts[:tokenizer] || DocumentTokenizer.new

  filtered = tokenizer.apply(text).map { |word| @filter.apply(word) }
  @words = filtered.delete_if { |word| word.nil? || word =~ /^\s+$/ }
  @word_count = @words.size

  # Term => count. Counts are Floats; unknown terms read as 0.
  @terms = Hash.new(0)

  ngram_sizes.each do |n|
    # Every run of n consecutive words is one term. Sizes below 1 are
    # counted as unigrams.
    @words.each_cons([n, 1].max) { |gram| @terms[gram.join(' ')] += 1.0 }
  end

  return unless opts.key?(:min_freq) || opts.key?(:max_freq)

  # Thresholds are fixed from the word count before anything is removed.
  min_count = @words.size * (opts[:min_freq] || 0.0)
  max_count = @words.size * (opts[:max_freq] || 1.0)

  removed = {}
  @terms.delete_if do |term, count|
    out_of_range = count < min_count || count > max_count
    removed[term] = true if out_of_range
    out_of_range
  end

  # Drop the words matching removed terms in a single pass, then keep the
  # published word count in step with the surviving words.
  @words.delete_if { |word| removed.key?(word) }
  @word_count = @words.size
end
Instance Attribute Details
#filter ⇒ Object (readonly)
Returns the value of attribute filter.
40 41 42 |
# File 'lib/sclust/util/doc.rb', line 40
# Read-only accessor for the filter attribute.
#
# @return [Object] the value held in @filter
def filter
  @filter
end
#terms ⇒ Object (readonly)
Returns the value of attribute terms.
40 41 42 |
# File 'lib/sclust/util/doc.rb', line 40
# Read-only accessor for the terms attribute.
#
# @return [Object] the value held in @terms
def terms
  @terms
end
#text ⇒ Object (readonly)
Returns the value of attribute text.
40 41 42 |
# File 'lib/sclust/util/doc.rb', line 40
# Read-only accessor for the text attribute.
#
# @return [Object] the value held in @text
def text
  @text
end
#userDate ⇒ Object (readonly)
Returns the value of attribute userDate.
40 41 42 |
# File 'lib/sclust/util/doc.rb', line 40
# Read-only accessor for the caller-supplied user data.
#
# The constructor stores the :userData option in @userData, so fall back to
# that variable whenever @userDate itself holds nothing.
#
# @return [Object, nil] @userDate when set, otherwise @userData
def userDate
  @userDate || @userData
end
#word_count ⇒ Object (readonly)
Returns the value of attribute word_count.
40 41 42 |
# File 'lib/sclust/util/doc.rb', line 40
# Read-only accessor for the word_count attribute.
#
# @return [Object] the value held in @word_count
def word_count
  @word_count
end
#words ⇒ Object (readonly)
Returns the value of attribute words.
40 41 42 |
# File 'lib/sclust/util/doc.rb', line 40
# Read-only accessor for the words attribute.
#
# @return [Object] the value held in @words
def words
  @words
end
Instance Method Details
#delete_term_if(&call) ⇒ Object
Frequency information is never updated.
108 109 110 111 |
# File 'lib/sclust/util/doc.rb', line 108
# Removes every term, and every word, for which the given block returns a
# truthy value. The block receives the term (or word) only, never the count.
# Surviving term counts are left as they were.
#
# @yieldparam term [Object] a key of @terms or an element of @words
# @return [Object] the result of @words.delete_if
def delete_term_if(&call)
  doomed = ->(candidate) { call.call(candidate) }

  @terms.delete_if { |term, _count| doomed.call(term) }
  @words.delete_if { |word| doomed.call(word) }
end
#each_term(&call) ⇒ Object
Each term and the term count passed to the given block. Divide the count by the total number of words to get the term frequency.
124 125 126 |
# File 'lib/sclust/util/doc.rb', line 124
# Yields each term together with its count, as provided by #terms.
# Without a block, returns an Enumerator over the same (term, count) pairs
# instead of raising.
#
# @yieldparam term [Object] a key of the terms collection
# @yieldparam count [Object] the value stored for that term
# @return [Object] the result of terms.each, or an Enumerator when no block is given
def each_term(&call)
  return enum_for(:each_term) unless block_given?

  terms.each { |term, count| yield(term, count) }
end
#has_term?(term) ⇒ Boolean
128 129 130 |
# File 'lib/sclust/util/doc.rb', line 128
# Reports whether the given term is a key of @terms.
#
# @param term [Object] the term to look up
# @return [Boolean] true when @terms contains the term as a key
def has_term?(term)
  @terms.key?(term)
end
#term_count(term) ⇒ Object
113 114 115 |
# File 'lib/sclust/util/doc.rb', line 113
# Looks the term up in @terms and returns whatever is stored for it
# (for an unknown term, whatever default @terms supplies).
#
# @param term [Object] the term to look up
# @return [Object] the value of @terms[term]
def term_count(term)
  @terms[term]
end
#term_frequency(term) ⇒ Object Also known as: tf
117 118 119 |
# File 'lib/sclust/util/doc.rb', line 117
# Frequency of a term: its stored count divided by the current number of words.
#
# Always returns a Float. A document with no words yields 0.0 instead of
# dividing by zero.
#
# @param term [Object] the term to look up in @terms
# @return [Float] count of the term divided by @words.size, or 0.0 when there are no words
def term_frequency(term)
  total = @words.size
  return 0.0 if total.zero?

  # fdiv keeps the result a Float even when the stored count is an Integer.
  @terms[term].fdiv(total)
end