Class: Nokogiri::HTML::Document::EncodingReader
- Inherits:
-
Object
- Object
- Nokogiri::HTML::Document::EncodingReader
- Defined in:
- lib/nokogiri/html/document.rb
Overview
:nodoc:
Defined Under Namespace
Classes: JumpSAXHandler, SAXHandler
Instance Attribute Summary collapse
-
#encoding_found ⇒ Object
readonly
This method is used by the C extension so that Nokogiri::HTML::Document#read_io() does not leak memory when EncodingFound is raised.
Class Method Summary collapse
- .detect_encoding(chunk) ⇒ Object
- .detect_encoding_for_jruby_without_fix(chunk) ⇒ Object
- .is_jruby_without_fix? ⇒ Boolean
Instance Method Summary collapse
-
#initialize(io) ⇒ EncodingReader
constructor
A new instance of EncodingReader.
- #read(len) ⇒ Object
Constructor Details
#initialize(io) ⇒ EncodingReader
Returns a new instance of EncodingReader.
293 294 295 296 297 |
# File 'lib/nokogiri/html/document.rb', line 293
#
# Builds a reader around +io+. No data is read here; the first chunk is
# fetched (and inspected for an encoding) on the first call to #read.
#
# io:: the wrapped source. #read calls +io.read(len)+ on it.
def initialize(io)
  # Nothing has been buffered and no encoding has been detected yet.
  @io, @firstchunk, @encoding_found = io, nil, nil
end
Instance Attribute Details
#encoding_found ⇒ Object (readonly)
This method is used by the C extension so that Nokogiri::HTML::Document#read_io() does not leak memory when EncodingFound is raised.
302 303 304 |
# File 'lib/nokogiri/html/document.rb', line 302
#
# Returns the value currently held in @encoding_found: the EncodingFound
# exception that #read stored just before raising it, or +nil+ when no
# such exception is pending.
#
# This method is used by the C extension so that
# Nokogiri::HTML::Document#read_io() does not leak memory when
# EncodingFound is raised.
def encoding_found
  @encoding_found
end
Class Method Details
.detect_encoding(chunk) ⇒ Object
251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 |
# File 'lib/nokogiri/html/document.rb', line 251
#
# Tries to work out the encoding declared in +chunk+ (the leading part of
# a document, as a String).
#
# Strategy, in order:
# 1. On a JRuby for which is_jruby_without_fix? is true, hand the whole
#    job to detect_encoding_for_jruby_without_fix.
# 2. If +chunk+ begins with an XML declaration, parse only that
#    declaration with Nokogiri.XML and return its encoding.
# 3. On other rubies, push +chunk+ through an HTML SAX push parser and
#    return whatever encoding the SAXHandler recorded.
# 4. On JRuby, return the charset captured from a <meta ... charset=...>
#    match; failing that, run the SAX parser with a JumpSAXHandler inside
#    catch(:encoding_found) and return the value thrown to that tag, or
#    +nil+ if parsing finishes without a throw.
def self.detect_encoding(chunk)
  if Nokogiri.jruby? && EncodingReader.is_jruby_without_fix?
    return EncodingReader.detect_encoding_for_jruby_without_fix(chunk)
  end

  xml_decl = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/)
  return Nokogiri.XML(xml_decl[1]).encoding if xml_decl

  unless Nokogiri.jruby?
    handler = SAXHandler.new
    parser = Nokogiri::HTML::SAX::PushParser.new(handler)
    begin
      parser << chunk
    rescue StandardError
      # Best-effort detection: any StandardError raised while pushing the
      # chunk is discarded and the handler is consulted regardless.
    end
    return handler.encoding
  end

  meta = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)
  return meta[4] if meta

  catch(:encoding_found) do
    Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
    nil
  end
end
.detect_encoding_for_jruby_without_fix(chunk) ⇒ Object
277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 |
# File 'lib/nokogiri/html/document.rb', line 277
#
# Encoding detection used when is_jruby_without_fix? is true.
#
# Checks +chunk+ (a String) for a leading XML declaration, then for a
# <meta ... charset=...> match, and finally runs the HTML SAX parser with
# a JumpSAXHandler inside catch(:encoding_found). Note that the handler is
# given the tag as a String (+:encoding_found.to_s+) rather than a Symbol.
#
# Returns the detected encoding, or +nil+ when nothing is found or when
# Nokogiri::SyntaxError / RuntimeError is raised along the way.
def self.detect_encoding_for_jruby_without_fix(chunk)
  xml_decl = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/)
  return Nokogiri.XML(xml_decl[1]).encoding if xml_decl

  meta = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)
  return meta[4] if meta

  catch(:encoding_found) do
    Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found.to_s)).parse(chunk)
    nil
  end
rescue Nokogiri::SyntaxError, RuntimeError
  # Ignore parser errors that nokogiri may raise
  nil
end
.is_jruby_without_fix? ⇒ Boolean
273 274 275 |
# File 'lib/nokogiri/html/document.rb', line 273
#
# Reports whether the running JRuby is older than 1.6.5. detect_encoding
# uses the answer to decide whether to fall back to
# detect_encoding_for_jruby_without_fix.
#
# Only meaningful where JRUBY_VERSION is defined (i.e. on JRuby).
#
# Returns +true+ for versions below 1.6.5, +false+ otherwise.
def self.is_jruby_without_fix?
  # Compare major/minor/patch numerically, component by component.
  # Gluing the digits together and comparing with 165 misorders versions
  # with multi-digit or missing components ("1.5.10" -> 1510, "9" -> 9).
  # String#to_i keeps the leading digits of parts such as "0RC1".
  version = JRUBY_VERSION.split('.').first(3).map { |part| part.to_i }
  (version <=> [1, 6, 5]) < 0
end
Instance Method Details
#read(len) ⇒ Object
304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 |
# File 'lib/nokogiri/html/document.rb', line 304
#
# Reads up to +len+ bytes, sniffing the encoding on the very first call.
#
# On the first call the chunk read from the wrapped IO is kept in
# @firstchunk and passed to EncodingReader.detect_encoding. If that
# reports an encoding, an EncodingFound is stored in @encoding_found and
# raised; the buffered chunk is then served by the next call, so the
# caller can retry without losing data.
#
# len:: number of bytes wanted (required).
#
# Returns a String, or +nil+ when there is nothing left to return.
# Raises EncodingFound when an encoding is detected in the first chunk.
def read(len)
  # no support for a call without len

  unless @firstchunk
    @firstchunk = @io.read(len)
    return nil unless @firstchunk

    # This implementation expects that the first call from
    # htmlReadIO() is made with a length long enough (~1KB) to
    # achieve advanced encoding detection.
    detected = EncodingReader.detect_encoding(@firstchunk)
    if detected
      # The first chunk is stored for the next read in retry.
      @encoding_found = EncodingFound.new(detected)
      raise @encoding_found
    end
  end
  @encoding_found = nil

  # Serve buffered bytes first, then top up from the underlying IO.
  ret = @firstchunk.slice!(0, len)
  remaining = len - ret.length
  if remaining > 0
    rest = @io.read(remaining)
    ret << rest if rest
  end

  ret.empty? ? nil : ret
end