Class: Mechanize::Page
- Extended by:
- Forwardable, ElementMatcher
- Defined in:
- lib/mechanize/page.rb
Overview
This class encapsulates an HTML page. If Mechanize finds a content type of ‘text/html’, this class will be instantiated and returned.
Example:
require 'mechanize'
agent = Mechanize.new
agent.get('http://google.com/').class # => Mechanize::Page
Defined Under Namespace
Classes: Base, Frame, Image, Label, Link, MetaRefresh
Constant Summary collapse
- DEFAULT_RESPONSE =
{ 'content-type' => 'text/html', }.freeze
Constants included from Parser
Mechanize::Parser::SPECIAL_FILENAMES
Instance Attribute Summary collapse
-
#encodings ⇒ Object
readonly
Possible encodings for this page based on HTTP headers and meta elements.
-
#mech ⇒ Object
Returns the value of attribute mech.
Attributes inherited from File
Attributes included from Parser
Class Method Summary collapse
- .charset(content_type) ⇒ Object (also: charset_from_content_type)
-
.meta_charset(body) ⇒ Object
Retrieves all charsets from meta tags in body.
-
.meta_content_type(body) ⇒ Object
Retrieves the last content-type set by a meta tag in body.
- .response_header_charset(response) ⇒ Object
Instance Method Summary collapse
-
#base ⇒ Object
:method: bases_with(criteria).
-
#bases ⇒ Object
Return a list of all base tags.
-
#canonical_uri ⇒ Object
Return the canonical URI for the page if there is a link tag with rel="canonical" and an href attribute.
-
#content_type ⇒ Object
Get the content type.
- #detected_encoding ⇒ Object
- #encoding ⇒ Object
- #encoding=(encoding) ⇒ Object
-
#encoding_error?(parser = nil) ⇒ Boolean
Return whether parser result has errors related to encoding or not.
-
#form ⇒ Object
:method: forms_with(criteria).
-
#forms ⇒ Object
Return a list of all form tags.
-
#frame ⇒ Object
:method: frames_with(criteria).
-
#frames ⇒ Object
Return a list of all frame tags.
-
#iframe ⇒ Object
:method: iframes_with(criteria).
-
#iframes ⇒ Object
Return a list of all iframe tags.
-
#image ⇒ Object
:method: images_with(criteria).
- #image_urls ⇒ Object
-
#images ⇒ Object
Return a list of all img tags.
-
#initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil) ⇒ Page
constructor
A new instance of Page.
-
#labels ⇒ Object
Return a list of all label tags.
- #labels_hash ⇒ Object
-
#link ⇒ Object
:method: links_with(criteria).
-
#links ⇒ Object
Return a list of all links, built from the page's a and area tags.
- #meta_charset ⇒ Object
-
#meta_refresh ⇒ Object
Return a list of all meta refresh elements.
-
#parser ⇒ Object
(also: #root)
:method: at.
-
#pretty_print(q) ⇒ Object
:nodoc:.
- #reset ⇒ Object
- #response_header_charset ⇒ Object
- #title ⇒ Object
Methods included from ElementMatcher
Methods inherited from File
Methods included from Parser
#extract_filename, #fill_header, #find_free_name
Constructor Details
#initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil) ⇒ Page
Returns a new instance of Page.
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/mechanize/page.rb', line 27
def initialize(uri=nil, response=nil, body=nil, code=nil, mech=nil)
  response ||= DEFAULT_RESPONSE
  raise Mechanize::ContentTypeError, response['content-type'] unless
    response['content-type'] =~ %r{\A(?:text/html|application/xhtml\+xml)(?:$|\s*[\s;,])}i

  @meta_content_type = nil
  @encoding = nil
  @encodings = [nil]
  raise 'no' if mech and not Mechanize === mech
  @mech = mech

  reset

  @encodings << Mechanize::Util.detect_charset(body) if body

  @encodings.concat self.class.response_header_charset(response)

  if body
    # Force the encoding to be 8BIT so we can perform regular expressions.
    # We'll set it to the detected encoding later
    body.force_encoding 'ASCII-8BIT' if body.respond_to? :force_encoding

    @encodings.concat self.class.meta_charset body

    meta_content_type = self.class.meta_content_type body
    @meta_content_type = meta_content_type if meta_content_type
  end

  @encodings << mech.default_encoding if mech and mech.default_encoding

  super uri, response, body, code
end
Instance Attribute Details
#encodings ⇒ Object (readonly)
Possible encodings for this page based on HTTP headers and meta elements
25 26 27 |
# File 'lib/mechanize/page.rb', line 25 def encodings @encodings end |
#mech ⇒ Object
Returns the value of attribute mech.
20 21 22 |
# File 'lib/mechanize/page.rb', line 20 def mech @mech end |
Class Method Details
.charset(content_type) ⇒ Object Also known as: charset_from_content_type
408 409 410 411 412 |
# File 'lib/mechanize/page.rb', line 408 def charset content_type charset = content_type[/;(?:\s*,)?\s*charset\s*=\s*([^()<>@,;:\\\"\/\[\]?={}\s]+)/i, 1] return nil if charset == 'none' charset end |
.meta_charset(body) ⇒ Object
Retrieves all charsets from meta tags in body
430 431 432 433 434 435 436 437 438 439 440 441 442 443 |
# File 'lib/mechanize/page.rb', line 430
def self.meta_charset body
  # HACK use .map
  body.scan(/<meta .*?>/i).map do |meta|
    if meta =~ /charset\s*=\s*(["'])?\s*(.+)\s*\1/i then
      $2
    elsif meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i then
      meta =~ /content\s*=\s*(["'])?(.*?)\1/i

      m_charset = charset $2 if $2

      m_charset if m_charset
    end
  end.compact
end
.meta_content_type(body) ⇒ Object
Retrieves the last content-type set by a meta tag in body
448 449 450 451 452 453 454 455 456 457 458 |
# File 'lib/mechanize/page.rb', line 448
def self.meta_content_type body
  body.scan(/<meta .*?>/i).reverse.map do |meta|
    if meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i then
      meta =~ /content=(["'])?(.*?)\1/i

      return $2
    end
  end

  nil
end
.response_header_charset(response) ⇒ Object
417 418 419 420 421 422 423 424 425 |
# File 'lib/mechanize/page.rb', line 417 def self.response_header_charset response charsets = [] response.each do |header, value| next unless header == 'content-type' next unless value =~ /charset/i charsets << charset(value) end charsets end |
Instance Method Details
#base ⇒ Object
:method: bases_with(criteria)
Find all base tags matching criteria. Example:
page.bases_with(:href => /foo/).each do |base|
puts base.href
end
270 |
# File 'lib/mechanize/page.rb', line 270 elements_with :base |
#bases ⇒ Object
Return a list of all base tags
359 360 361 362 |
# File 'lib/mechanize/page.rb', line 359 def bases @bases ||= search('base').map { |node| Base.new(node, @mech, self) } end |
#canonical_uri ⇒ Object
Return the canonical URI for the page if there is a link tag with rel="canonical" and an href attribute.
179 180 181 182 183 184 185 186 187 |
# File 'lib/mechanize/page.rb', line 179 def canonical_uri link = at('link[@rel="canonical"][@href]') return unless link href = link['href'] URI href rescue URI::InvalidURIError URI Mechanize::Util.uri_escape href end |
#content_type ⇒ Object
Get the content type
190 191 192 |
# File 'lib/mechanize/page.rb', line 190
def content_type
  @meta_content_type || response['content-type']
end
#detected_encoding ⇒ Object
76 77 78 |
# File 'lib/mechanize/page.rb', line 76 def detected_encoding Mechanize::Util.detect_charset(body) end |
#encoding ⇒ Object
96 97 98 |
# File 'lib/mechanize/page.rb', line 96 def encoding parser.respond_to?(:encoding) ? parser.encoding : nil end |
#encoding=(encoding) ⇒ Object
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
# File 'lib/mechanize/page.rb', line 80 def encoding=(encoding) reset @encoding = encoding if @parser parser_encoding = @parser.encoding if parser_encoding && encoding && parser_encoding.casecmp(encoding) != 0 # lazy reinitialize the parser with the new encoding @parser = nil end end encoding end |
#encoding_error?(parser = nil) ⇒ Boolean
Return whether the parser result has errors related to encoding. A false result only means that the parser reported no encoding-related errors; it does not guarantee that the encoding is valid.
102 103 104 105 106 107 108 109 110 |
# File 'lib/mechanize/page.rb', line 102
def encoding_error?(parser=nil)
  parser = self.parser unless parser
  return false if parser.errors.empty?
  parser.errors.any? do |error|
    error.message =~ /(indicate\ encoding)|
                      (Invalid\ char)|
                      (input\ conversion\ failed)/x
  end
end
#form ⇒ Object
:method: forms_with(criteria)
Find all forms matching criteria. Example:
page.forms_with(:action => '/post/login.php').each do |f|
...
end
234 |
# File 'lib/mechanize/page.rb', line 234 elements_with :form |
#forms ⇒ Object
Return a list of all form tags
338 339 340 341 342 343 344 |
# File 'lib/mechanize/page.rb', line 338 def forms @forms ||= search('form').map do |html_form| form = Mechanize::Form.new(html_form, @mech, self) form.action ||= @uri.to_s form end end |
#frame ⇒ Object
:method: frames_with(criteria)
Find all frame tags matching criteria. Example:
page.frames_with(:src => /foo/).each do |frame|
p frame.src
end
288 |
# File 'lib/mechanize/page.rb', line 288 elements_with :frame |
#frames ⇒ Object
Return a list of all frame tags
366 367 368 369 |
# File 'lib/mechanize/page.rb', line 366 def frames @frames ||= search('frame').map { |node| Frame.new(node, @mech, self) } end |
#iframe ⇒ Object
:method: iframes_with(criteria)
Find all iframe tags matching criteria. Example:
page.iframes_with(:src => /foo/).each do |iframe|
p iframe.src
end
306 |
# File 'lib/mechanize/page.rb', line 306 elements_with :iframe |
#iframes ⇒ Object
Return a list of all iframe tags
373 374 375 376 |
# File 'lib/mechanize/page.rb', line 373 def iframes @iframes ||= search('iframe').map { |node| Frame.new(node, @mech, self) } end |
#image ⇒ Object
:method: images_with(criteria)
Find all images matching criteria. Example:
page.images_with(:src => /jpg\Z/).each do |img|
img.fetch.save
end
324 |
# File 'lib/mechanize/page.rb', line 324 elements_with :image |
#image_urls ⇒ Object
385 386 387 |
# File 'lib/mechanize/page.rb', line 385 def image_urls @image_urls ||= images.map(&:url).uniq end |
#images ⇒ Object
Return a list of all img tags
380 381 382 383 |
# File 'lib/mechanize/page.rb', line 380 def images @images ||= search('img').map { |node| Image.new(node, self) } end |
#labels ⇒ Object
Return a list of all label tags
391 392 393 394 |
# File 'lib/mechanize/page.rb', line 391 def labels @labels ||= search('label').map { |node| Label.new(node, self) } end |
#labels_hash ⇒ Object
396 397 398 399 400 401 402 403 404 405 |
# File 'lib/mechanize/page.rb', line 396 def labels_hash unless @labels_hash hash = {} labels.each do |label| hash[label.node['for']] = label if label.for end @labels_hash = hash end return @labels_hash end |
#link ⇒ Object
:method: links_with(criteria)
Find all links matching criteria. Example:
page.links_with(:href => /foo/).each do |link|
puts link.href
end
252 |
# File 'lib/mechanize/page.rb', line 252 elements_with :link |
#links ⇒ Object
Return a list of all links, built from the page's a and area tags
328 329 330 331 332 333 334 |
# File 'lib/mechanize/page.rb', line 328 def links @links ||= %w{ a area }.map do |tag| search(tag).map do |node| Link.new(node, @mech, self) end end.flatten end |
#meta_charset ⇒ Object
72 73 74 |
# File 'lib/mechanize/page.rb', line 72
def meta_charset
  self.class.meta_charset(body)
end
#meta_refresh ⇒ Object
Return a list of all meta refresh elements
349 350 351 352 353 354 355 |
# File 'lib/mechanize/page.rb', line 349
def meta_refresh
  query = @mech.follow_meta_refresh == :anywhere ? 'meta' : 'head > meta'

  @meta_refresh ||= search(query).map do |node|
    MetaRefresh.from_node node, self
  end.compact
end
#parser ⇒ Object Also known as: root
:method: at
Search through the page for path under namespace using Nokogiri’s #at. The path may be either a CSS or XPath expression.
See also Nokogiri::XML::Node#at
202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 |
# File 'lib/mechanize/page.rb', line 202 def parser return @parser if @parser return nil unless @body if @encoding then @parser = @mech.html_parser.parse html_body, nil, @encoding elsif mech.force_default_encoding then @parser = @mech.html_parser.parse html_body, nil, @mech.default_encoding else @encodings.reverse_each do |encoding| @parser = @mech.html_parser.parse html_body, nil, encoding break unless encoding_error? @parser end end @parser end |
#pretty_print(q) ⇒ Object
:nodoc:
133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
# File 'lib/mechanize/page.rb', line 133
def pretty_print(q) # :nodoc:
  q.object_group(self) {
    q.breakable
    q.group(1, '{url', '}') {q.breakable; q.pp uri }
    q.breakable
    q.group(1, '{meta_refresh', '}') {
      meta_refresh.each { |link| q.breakable; q.pp link }
    }
    q.breakable
    q.group(1, '{title', '}') { q.breakable; q.pp title }
    q.breakable
    q.group(1, '{iframes', '}') {
      iframes.each { |link| q.breakable; q.pp link }
    }
    q.breakable
    q.group(1, '{frames', '}') {
      frames.each { |link| q.breakable; q.pp link }
    }
    q.breakable
    q.group(1, '{links', '}') {
      links.each { |link| q.breakable; q.pp link }
    }
    q.breakable
    q.group(1, '{forms', '}') {
      forms.each { |form| q.breakable; q.pp form }
    }
  }
end
#reset ⇒ Object
164 165 166 167 168 169 170 171 172 173 174 175 |
# File 'lib/mechanize/page.rb', line 164
def reset
  @bases = nil
  @forms = nil
  @frames = nil
  @iframes = nil
  @links = nil
  @labels = nil
  @labels_hash = nil
  @meta_refresh = nil
  @parser = nil
  @title = nil
end
#response_header_charset ⇒ Object
68 69 70 |
# File 'lib/mechanize/page.rb', line 68 def response_header_charset self.class.response_header_charset(response) end |
#title ⇒ Object
60 61 62 63 64 65 66 |
# File 'lib/mechanize/page.rb', line 60 def title @title ||= if doc = parser title = doc.search('title').inner_text title.empty? ? nil : title end end |