Class: RDF::SAK::DocStats
- Inherits: Nokogiri::XML::SAX::Document
- Inheritance chain: Object → Nokogiri::XML::SAX::Document → RDF::SAK::DocStats
- Defined in: lib/rdf/sak/docstats.rb
Instance Attribute Summary collapse
-
#blocks ⇒ Object
readonly
Returns the value of attribute blocks.
-
#chars ⇒ Object
readonly
Returns the value of attribute chars.
-
#words ⇒ Object
readonly
Returns the value of attribute words.
Class Method Summary collapse
- .scan(doc) ⇒ Object
Instance Method Summary collapse
- #cdata_block(string) ⇒ Object
- #characters(string) ⇒ Object
- #counts ⇒ Object
- #end_element_namespace(name, prefix = nil, uri = nil) ⇒ Object
-
#initialize ⇒ DocStats
constructor
A new instance of DocStats.
-
#mean ⇒ Float
Mean of words per block.
- #quartiles ⇒ Object
- #scan(doc) ⇒ Object
-
#sd ⇒ Float
Standard deviation of words per block.
- #start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = []) ⇒ Object
- #to_h ⇒ Object
- #to_rdf(uri: nil, subject: nil) ⇒ Object
Constructor Details
#initialize ⇒ DocStats
Returns a new instance of DocStats.
158 159 160 161 162 163 164 165 |
# File 'lib/rdf/sak/docstats.rb', line 158
# Sets up an empty collector: text capture switched off, an empty text
# buffer, and every tally in @counts starting at zero.
#
# @return [DocStats] a new instance of DocStats
def initialize
  tallies = %i[chars words blocks sections images videos embeds
               tables lists forms scripts sheets]

  @on     = false # switched on by #start_element_namespace
  @text   = ''    # appended to by #characters while @on is true
  @stack  = []    # XXX the original author doubts this one is used
  @wpb    = []    # sample read by #mean, #sd and #quartiles
  @counts = {}
  tallies.each { |key| @counts[key] = 0 }
end
Instance Attribute Details
#blocks ⇒ Object (readonly)
Returns the value of attribute blocks.
111 112 113 |
# File 'lib/rdf/sak/docstats.rb', line 111
# Read-only accessor for the +blocks+ attribute.
#
# NOTE(review): the constructor does not assign @blocks and keeps a
# :blocks tally inside @counts instead, so this presumably returns nil
# unless some other code sets @blocks -- confirm.
#
# @return [Object] whatever is currently stored in @blocks
def blocks
  @blocks
end
#chars ⇒ Object (readonly)
Returns the value of attribute chars.
111 112 113 |
# File 'lib/rdf/sak/docstats.rb', line 111
# Read-only accessor for the +chars+ attribute.
#
# NOTE(review): the constructor does not assign @chars and keeps a
# :chars tally inside @counts instead, so this presumably returns nil
# unless some other code sets @chars -- confirm.
#
# @return [Object] whatever is currently stored in @chars
def chars
  @chars
end
#words ⇒ Object (readonly)
Returns the value of attribute words.
111 112 113 |
# File 'lib/rdf/sak/docstats.rb', line 111
# Read-only accessor for the +words+ attribute.
#
# NOTE(review): the constructor does not assign @words and keeps a
# :words tally inside @counts instead, so this presumably returns nil
# unless some other code sets @words -- confirm.
#
# @return [Object] whatever is currently stored in @words
def words
  @words
end
Class Method Details
.scan(doc) ⇒ Object
178 179 180 |
# File 'lib/rdf/sak/docstats.rb', line 178
# Convenience entry point: builds a fresh collector and runs its
# instance-level #scan over +doc+.
#
# @param doc [Object] forwarded untouched to the instance-level #scan
# @return [Object] whatever the instance-level #scan returns
def self.scan(doc)
  collector = new
  collector.scan(doc)
end
Instance Method Details
#cdata_block(string) ⇒ Object
135 136 137 |
# File 'lib/rdf/sak/docstats.rb', line 135
# Treats the content of a CDATA section exactly like ordinary character
# data by handing it straight to #characters.
#
# @param string [String] the CDATA content
# @return [Object] whatever #characters returns
def cdata_block(string)
  characters(string)
end
#characters(string) ⇒ Object
131 132 133 |
# File 'lib/rdf/sak/docstats.rb', line 131
# Appends +string+ to the pending text buffer, but only while capture
# is switched on (@on is truthy); otherwise the text is dropped.
#
# @param string [String] a run of character data
# @return [String, nil] the new buffer contents, or nil when capture
#   is off
def characters(string)
  return unless @on

  # Deliberately builds a new string rather than mutating the old one.
  @text += string
end
#counts ⇒ Object
154 155 156 |
# File 'lib/rdf/sak/docstats.rb', line 154
# Hands out a frozen shallow copy of the tallies, so callers cannot
# modify the collector's own @counts hash.
#
# @return [Hash] frozen duplicate of @counts
def counts
  snapshot = @counts.dup
  snapshot.freeze
end
#end_element_namespace(name, prefix = nil, uri = nil) ⇒ Object
120 121 122 123 124 125 126 127 128 129 |
# File 'lib/rdf/sak/docstats.rb', line 120
# End-of-element handler. Only elements whose namespace URI equals
# XHTMLNS are considered; anything else is ignored.
#
# For a matching element it:
# 1. calls #clear_text when the element is listed in SKIP, otherwise
#    #do_block with the element name;
# 2. bumps every tally in @counts whose COUNTS set contains the element;
# 3. on +body+, takes one back off :sections and switches capture off.
#
# @param name [String] local name of the element being closed
# @param prefix [String, nil] namespace prefix (unused)
# @param uri [String, nil] namespace URI of the element
# @return [false, nil] false after closing +body+, nil otherwise
def end_element_namespace(name, prefix = nil, uri = nil)
  return unless uri == XHTMLNS

  # Convert once; the symbol is needed for SKIP and for every COUNTS set.
  tag = name.to_sym

  if SKIP.include?(tag)
    clear_text
  else
    do_block(name)
  end

  COUNTS.each do |type, set|
    @counts[type] += 1 if set.include?(tag)
  end

  return unless name == 'body'

  # NOTE(review): presumably undoes a :sections increment that COUNTS
  # applies to body itself -- confirm against the COUNTS definition.
  @counts[:sections] -= 1
  @on = false
end
#mean ⇒ Float
Returns mean of words per block.
140 141 142 |
# File 'lib/rdf/sak/docstats.rb', line 140
# Mean of words per block, delegated to the @wpb sample.
#
# NOTE(review): @wpb starts out as a plain Array, and core Array has no
# #mean -- presumably a statistics extension loaded elsewhere supplies
# it; confirm.
#
# @return [Float] mean of words per block
def mean
  sample = @wpb
  sample.mean
end
#quartiles ⇒ Object
150 151 152 |
# File 'lib/rdf/sak/docstats.rb', line 150
# Five-point summary of the @wpb sample: its 0th, 25th, 50th, 75th and
# 100th percentiles, in that order.
#
# NOTE(review): relies on @wpb responding to #percentile, which core
# Array does not provide -- presumably supplied by a statistics
# extension loaded elsewhere; confirm.
#
# @return [Array] five values, one per percentile
def quartiles
  cut_points = [0, 25, 50, 75, 100]
  cut_points.map { |cut| @wpb.percentile(cut) }
end
#scan(doc) ⇒ Object
167 168 169 170 171 172 173 174 175 176 |
# File 'lib/rdf/sak/docstats.rb', line 167
# Feeds +doc+ through this collector.
#
# A Nokogiri::XML::Node is handed to #pretend_sax; any other input is
# given to a Nokogiri SAX parser that uses this object as its handler.
#
# @param doc [Nokogiri::XML::Node, Object] an existing node, or any
#   input accepted by Nokogiri::XML::SAX::Parser#parse
# @return [DocStats] self, so calls can be chained
def scan(doc)
  if doc.is_a?(Nokogiri::XML::Node)
    pretend_sax(doc)
  else
    Nokogiri::XML::SAX::Parser.new(self).parse(doc)
  end

  self
end
#sd ⇒ Float
Returns standard deviation of words per block.
145 146 147 |
# File 'lib/rdf/sak/docstats.rb', line 145
# Standard deviation of words per block, delegated to the @wpb sample.
#
# NOTE(review): core Array has no #standard_deviation -- presumably a
# statistics extension loaded elsewhere supplies it; confirm.
#
# @return [Float] standard deviation of words per block
def sd
  sample = @wpb
  sample.standard_deviation
end
#start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = []) ⇒ Object
113 114 115 116 117 118 |
# File 'lib/rdf/sak/docstats.rb', line 113
# Start-of-element handler. For an element in the XHTMLNS namespace
# that is not listed in SKIP, switches text capture on and passes the
# element name to #do_block. Every other element is ignored.
#
# @param name [String] local name of the element being opened
# @param attrs [Array] element attributes (unused)
# @param prefix [String, nil] namespace prefix (unused)
# @param uri [String, nil] namespace URI of the element
# @param ns [Array] namespace declarations (unused)
# @return [Object, nil] the result of #do_block, or nil when ignored
def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
  # Positive guards replace the original `unless a != b or c` double
  # negative; the namespace is still tested first, so SKIP is only
  # consulted for XHTML elements.
  return unless uri == XHTMLNS
  return if SKIP.include?(name.to_sym)

  @on = true
  do_block(name)
end
#to_h ⇒ Object
182 183 184 |
# File 'lib/rdf/sak/docstats.rb', line 182
# Flattens the collector into one hash: the three summary statistics
# first, followed by every tally returned by #counts.
#
# @return [Hash] :mean, :sd and :quartiles plus the #counts entries
def to_h
  summary = { mean: mean, sd: sd, quartiles: quartiles }
  summary.merge(counts)
end
#to_rdf(uri: nil, subject: nil) ⇒ Object
186 187 |
# File 'lib/rdf/sak/docstats.rb', line 186
# Placeholder for an RDF rendering of the collected statistics. The
# method has no implementation yet, so both keyword arguments are
# accepted and ignored.
#
# @param uri [Object, nil] accepted but currently unused
# @param subject [Object, nil] accepted but currently unused
# @return [nil] always, until an implementation is added
def to_rdf(uri: nil, subject: nil)
  nil
end