Class: RDF::SAK::DocStats

Inherits:
Nokogiri::XML::SAX::Document
  • Object
show all
Defined in:
lib/rdf/sak/docstats.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ DocStats

Returns a new instance of DocStats.



158
159
160
161
162
163
164
165
# File 'lib/rdf/sak/docstats.rb', line 158

# Build an empty collector: text capture is off, the text buffer is
# empty, and every counter starts at zero.
def initialize
  @on    = false
  @text  = ''
  @stack = [] # XXX possibly unused (note carried over from the original)
  @wpb   = []

  # one zeroed tally per category, kept in declaration order
  categories = %i[chars words blocks sections images videos embeds
                  tables lists forms scripts sheets]
  @counts = categories.each_with_object({}) { |key, tally| tally[key] = 0 }
end

Instance Attribute Details

#blocks ⇒ Object (readonly)

Returns the value of attribute blocks.



111
112
113
# File 'lib/rdf/sak/docstats.rb', line 111

# Reader for the +@blocks+ instance variable.
#
# NOTE(review): the #initialize shown on this page never assigns
# +@blocks+; it only creates a +:blocks+ entry inside +@counts+. Unless
# some other method sets the variable, this reader returns nil — confirm
# whether callers should be using +counts[:blocks]+ instead.
#
# @return [Object, nil] whatever is currently stored in +@blocks+
def blocks
  @blocks
end

#chars ⇒ Object (readonly)

Returns the value of attribute chars.



111
112
113
# File 'lib/rdf/sak/docstats.rb', line 111

# Reader for the +@chars+ instance variable.
#
# NOTE(review): the #initialize shown on this page never assigns
# +@chars+; it only creates a +:chars+ entry inside +@counts+. Unless
# some other method sets the variable, this reader returns nil — confirm
# whether callers should be using +counts[:chars]+ instead.
#
# @return [Object, nil] whatever is currently stored in +@chars+
def chars
  @chars
end

#words ⇒ Object (readonly)

Returns the value of attribute words.



111
112
113
# File 'lib/rdf/sak/docstats.rb', line 111

# Reader for the +@words+ instance variable.
#
# NOTE(review): the #initialize shown on this page never assigns
# +@words+; it only creates a +:words+ entry inside +@counts+. Unless
# some other method sets the variable, this reader returns nil — confirm
# whether callers should be using +counts[:words]+ instead.
#
# @return [Object, nil] whatever is currently stored in +@words+
def words
  @words
end

Class Method Details

.scan(doc) ⇒ Object



178
179
180
# File 'lib/rdf/sak/docstats.rb', line 178

# Convenience constructor: create a fresh instance and immediately run
# the instance-level #scan over +doc+.
#
# @param doc [Object] the input, forwarded untouched to the instance #scan
# @return [Object] whatever the instance-level #scan returns
def self.scan(doc)
  collector = new
  collector.scan(doc)
end

Instance Method Details

#cdata_block(string) ⇒ Object



135
136
137
# File 'lib/rdf/sak/docstats.rb', line 135

# SAX callback for a CDATA section. The content is treated exactly like
# ordinary character data and handed straight to #characters.
#
# @param string [String] the CDATA content
# @return [Object] whatever #characters returns
def cdata_block(string)
  characters(string)
end

#characters(string) ⇒ Object



131
132
133
# File 'lib/rdf/sak/docstats.rb', line 131

# SAX callback for character data. While capture is enabled (+@on+) the
# incoming text is appended to the +@text+ buffer; otherwise it is
# dropped.
#
# @param string [String] the text reported by the parser
# @return [String, nil] the new buffer contents, or nil when capture is off
def characters(string)
  return unless @on

  # rebinding (not in-place mutation) keeps any earlier reference intact
  @text += string
end

#counts ⇒ Object



154
155
156
# File 'lib/rdf/sak/docstats.rb', line 154

# Snapshot of the tallies gathered so far.
#
# The internal hash is copied before freezing, so the result is
# read-only while the live counters stay mutable.
#
# @return [Hash] a frozen shallow copy of +@counts+
def counts
  snapshot = @counts.dup
  snapshot.freeze
end

#end_element_namespace(name, prefix = nil, uri = nil) ⇒ Object



120
121
122
123
124
125
126
127
128
129
# File 'lib/rdf/sak/docstats.rb', line 120

# SAX callback fired when an element closes. Elements outside the XHTML
# namespace are ignored.
#
# For an XHTML element this:
# 1. discards the buffered text (+clear_text+) when the tag is in +SKIP+,
#    otherwise hands the tag to +do_block+;
# 2. bumps every +@counts+ category whose set in +COUNTS+ contains the tag;
# 3. on +body+, takes one back off +:sections+ and switches capture off.
#
# @param name [String] local name of the element
# @param prefix [String, nil] namespace prefix (unused)
# @param uri [String, nil] namespace URI of the element
# @return [false, nil] false after closing +body+, nil otherwise
def end_element_namespace(name, prefix = nil, uri = nil)
  return unless uri == XHTMLNS

  tag = name.to_sym

  if SKIP.include?(tag)
    clear_text
  else
    do_block(name)
  end

  COUNTS.each do |category, members|
    @counts[category] += 1 if members.include?(tag)
  end

  return unless name == 'body'

  # presumably offsets a :sections increment made for body just above
  @counts[:sections] -= 1
  @on = false
end

#mean ⇒ Float

Returns mean of words per block.

Returns:

  • (Float)

    mean of words per block



140
141
142
# File 'lib/rdf/sak/docstats.rb', line 140

# Mean of the +@wpb+ samples (described on this page as words per block).
#
# Delegates to +@wpb.mean+. NOTE(review): +@wpb+ starts out as a plain
# Array in #initialize and core Array has no #mean, so this presumably
# relies on a statistics extension loaded elsewhere — confirm, and check
# what it yields for an empty collection.
#
# @return [Float] per the existing documentation
def mean
  @wpb.mean
end

#quartiles ⇒ Object

Returns:

  • (Array)

    the 0th, 25th, 50th, 75th and 100th percentiles of words per block



150
151
152
# File 'lib/rdf/sak/docstats.rb', line 150

# Five-point summary of the +@wpb+ samples: the 0th, 25th, 50th, 75th
# and 100th percentiles, in that order.
#
# Each value comes from +@wpb.percentile+. NOTE(review): +percentile+ is
# not a core Array method, so it is presumably supplied by a statistics
# extension loaded elsewhere — confirm.
#
# @return [Array] five percentile values, lowest cut-off first
def quartiles
  0.step(100, 25).map { |cutoff| @wpb.percentile(cutoff) }
end

#scan(doc) ⇒ Object



167
168
169
170
171
172
173
174
175
176
# File 'lib/rdf/sak/docstats.rb', line 167

# Feed a document through this collector.
#
# An already-parsed +Nokogiri::XML::Node+ is handed to +pretend_sax+.
# Anything else is given to a new +Nokogiri::XML::SAX::Parser+ that uses
# this object as its handler.
#
# @param doc [Nokogiri::XML::Node, Object] a parsed node, or input for the SAX parser
# @return [self] the collector, so calls can be chained
def scan(doc)
  case doc
  when Nokogiri::XML::Node
    pretend_sax(doc)
  else
    Nokogiri::XML::SAX::Parser.new(self).parse(doc)
  end

  self
end

#sd ⇒ Float

Returns standard deviation of words per block.

Returns:

  • (Float)

    standard deviation of words per block



145
146
147
# File 'lib/rdf/sak/docstats.rb', line 145

# Standard deviation of the +@wpb+ samples (described on this page as
# words per block).
#
# Delegates to +@wpb.standard_deviation+. NOTE(review): core Array has
# no #standard_deviation, so this presumably relies on a statistics
# extension loaded elsewhere — confirm, including whether it computes the
# population or the sample deviation.
#
# @return [Float] per the existing documentation
def sd
  @wpb.standard_deviation
end

#start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = []) ⇒ Object



113
114
115
116
117
118
# File 'lib/rdf/sak/docstats.rb', line 113

# SAX callback fired when an element opens. Only XHTML elements whose
# tag is not listed in +SKIP+ are acted on: text capture is switched on
# and the tag is passed to +do_block+.
#
# @param name [String] local name of the element
# @param attrs [Array] element attributes (unused)
# @param prefix [String, nil] namespace prefix (unused)
# @param uri [String, nil] namespace URI of the element
# @param ns [Array] namespace declarations (unused)
# @return [Object, nil] the result of +do_block+, or nil when the element is ignored
def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
  # namespace is tested first so +name+ is never touched for foreign elements
  return if uri != XHTMLNS
  return if SKIP.include?(name.to_sym)

  @on = true
  do_block(name)
end

#to_h ⇒ Object



182
183
184
# File 'lib/rdf/sak/docstats.rb', line 182

# Flatten every statistic into a single hash: the +:mean+, +:sd+ and
# +:quartiles+ summary entries first, followed by each entry of #counts.
# If a key appears in both, the value from #counts wins.
#
# @return [Hash] a new, unfrozen hash of summary statistics and tallies
def to_h
  summary = { mean: mean, sd: sd, quartiles: quartiles }
  summary.update(counts)
end

#to_rdf(uri: nil, subject: nil) ⇒ Object



186
187
# File 'lib/rdf/sak/docstats.rb', line 186

# Placeholder for RDF output: the body is empty, so both keyword
# arguments are accepted but ignored and the method returns nil.
#
# @param uri [Object, nil] currently unused
# @param subject [Object, nil] currently unused
# @return [nil]
def to_rdf uri: nil, subject: nil
end