Class: Mechanize::Page

Inherits:
File
  • Object
show all
Extended by:
Forwardable, ElementMatcher
Defined in:
lib/mechanize/page.rb

Overview

This class encapsulates an HTML page. If Mechanize finds a content type of ‘text/html’, this class will be instantiated and returned.

Example:

require 'mechanize'

agent = Mechanize.new
agent.get('http://google.com/').class # => Mechanize::Page

Defined Under Namespace

Classes: Base, Frame, Image, Label, Link, MetaRefresh

Constant Summary collapse

# Fallback response headers for a page built without a response; the hash
# is frozen so the shared constant cannot be modified.
DEFAULT_RESPONSE =
{
  'content-type' => 'text/html',
}.freeze

Constants included from Parser

Mechanize::Parser::SPECIAL_FILENAMES

Instance Attribute Summary collapse

Attributes inherited from File

#body, #filename

Attributes included from Parser

#code, #response, #uri

Class Method Summary collapse

Instance Method Summary collapse

Methods included from ElementMatcher

elements_with

Methods inherited from File

#save, #save!

Methods included from Parser

#extract_filename, #fill_header, #find_free_name

Constructor Details

#initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil) ⇒ Page

Returns a new instance of Page.



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/mechanize/page.rb', line 28

# Builds a page from +uri+, the +response+ headers, the raw +body+, the
# status +code+ and the owning +mech+ agent. Every argument is optional;
# a missing +response+ falls back to DEFAULT_RESPONSE.
#
# Candidate encodings are appended to @encodings in this order: nil, the
# charset detected from the body, charsets from the response headers,
# charsets from meta tags in the body, and finally the agent's default
# encoding (when one is set).
#
# Raises RuntimeError ('no') when +mech+ is given but is not a Mechanize.
def initialize(uri=nil, response=nil, body=nil, code=nil, mech=nil)
  response ||= DEFAULT_RESPONSE

  @meta_content_type = nil
  @encoding = nil
  @encodings = [nil]
  raise 'no' if mech and not Mechanize === mech
  @mech = mech

  # Start with every memoized value (including the parser) cleared.
  reset

  @encodings << Mechanize::Util.detect_charset(body) if body

  @encodings.concat self.class.response_header_charset(response)

  if body
    @encodings.concat self.class.meta_charset body

    # A content type declared by a meta tag is remembered separately so
    # that #content_type can prefer it over the response header.
    meta_content_type = self.class.meta_content_type body
    @meta_content_type = meta_content_type if meta_content_type
  end

  @encodings << mech.default_encoding if mech and mech.default_encoding

  super uri, response, body, code
end

Instance Attribute Details

#encodingsObject (readonly)

Possible encodings for this page based on HTTP headers and meta elements



26
27
28
# File 'lib/mechanize/page.rb', line 26

# Returns the list of candidate encodings stored in @encodings.
def encodings
  @encodings
end

#mechObject

Returns the value of attribute mech.



21
22
23
# File 'lib/mechanize/page.rb', line 21

# Returns the agent stored in @mech (nil when the page was built without one).
def mech
  @mech
end

Class Method Details

.charset(content_type) ⇒ Object Also known as: charset_from_content_type



576
577
578
579
580
# File 'lib/mechanize/page.rb', line 576

# Extracts the charset parameter from a Content-Type style string.
#
# content_type:: a header value such as "text/html; charset=UTF-8".
#
# Returns the charset token, or nil when no charset parameter is present
# or when it is the literal (lower-case) value 'none'.
def charset(content_type)
  token = content_type[/;(?:\s*,)?\s*charset\s*=\s*([^()<>@,;:\\\"\/\[\]?={}\s]+)/i, 1]
  token == 'none' ? nil : token
end

.meta_charset(body) ⇒ Object

Retrieves all charsets from meta tags in body



598
599
600
601
602
603
604
605
606
607
608
609
610
611
# File 'lib/mechanize/page.rb', line 598

# Retrieves all charsets declared by meta tags in +body+.
#
# Two forms are recognised for each <meta ...> tag:
# * a quoted charset attribute (<meta charset="utf-8">), and
# * an http-equiv="content-type" tag, whose content attribute is passed
#   to charset to pull out the charset parameter.
#
# body:: the HTML source to scan.
#
# Returns an array of charset names in document order; tags that declare
# no charset contribute nothing.
def self.meta_charset body
  body.scan(/<meta .*?>/i).map do |meta|
    # The value is matched lazily so it stops at the first closing quote;
    # a greedy match would swallow any quoted attributes that follow,
    # e.g. <meta charset="utf-8" data-x="y">.
    if meta =~ /charset\s*=\s*(["'])?\s*(.+?)\s*\1/i then
      $2
    elsif meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i then
      meta =~ /content\s*=\s*(["'])?(.*?)\1/i

      charset $2 if $2
    end
  end.compact
end

.meta_content_type(body) ⇒ Object

Retrieves the last content-type set by a meta tag in body



616
617
618
619
620
621
622
623
624
625
626
# File 'lib/mechanize/page.rb', line 616

# Retrieves the content type declared by the last http-equiv="content-type"
# meta tag in +body+.
#
# Returns the tag's content attribute value; nil when no such tag exists or
# when the last such tag has no matching content attribute.
def self.meta_content_type(body)
  body.scan(/<meta .*?>/i).reverse_each do |tag|
    next unless tag =~ /http-equiv\s*=\s*(["'])?content-type\1/i

    # The last matching tag in the document decides the result.
    return tag[/content=(["'])?(.*?)\1/i, 2]
  end

  nil
end

.response_header_charset(response) ⇒ Object



585
586
587
588
589
590
591
592
593
# File 'lib/mechanize/page.rb', line 585

# Collects the charsets announced by the 'content-type' headers of
# +response+.
#
# response:: an object whose #each yields header name / value pairs.
#
# Returns an array with one entry per 'content-type' header whose value
# mentions a charset (each entry is whatever charset returns for it).
def self.response_header_charset(response)
  found = []
  response.each do |name, content|
    found << charset(content) if name == 'content-type' && content =~ /charset/i
  end
  found
end

Instance Method Details

#baseObject

:method: bases_with

:call-seq: bases_with(criteria)

Find all base tags matching criteria. See forms_with for details of criteria, where for “form(s)” read “base tag(s)”.

Example:

page.bases_with(href: /foo/).each do |base|
  puts base.href
end


378
# File 'lib/mechanize/page.rb', line 378

# Declares the matcher methods for base tags through ElementMatcher's
# elements_with macro (documented on this page as #base / #bases_with).
elements_with :base

#basesObject

Return a list of all base tags



527
528
529
530
# File 'lib/mechanize/page.rb', line 527

# Returns every <base> element of the page wrapped in a Base object.
# The list is built on first use and cached in @bases.
def bases
  @bases ||= search('base').map do |element|
    Base.new(element, @mech, self)
  end
end

#canonical_uriObject

Return the canonical URI for the page if there is a link tag with rel=“canonical” and an href attribute.



179
180
181
182
183
184
185
186
187
# File 'lib/mechanize/page.rb', line 179

# Returns the page's canonical URI, taken from the href of the first
# <link rel="canonical" href="..."> element, or nil when there is none.
#
# When the href is not a valid URI it is escaped with
# Mechanize::Util.uri_escape and parsed again.
def canonical_uri
  link = at('link[@rel="canonical"][@href]')
  if link
    href = link['href']
    URI(href)
  end
rescue URI::InvalidURIError
  URI(Mechanize::Util.uri_escape(href))
end

#content_typeObject

Get the content type



190
191
192
# File 'lib/mechanize/page.rb', line 190

# Returns the page's content type: the one declared by a meta tag when
# present, otherwise the 'content-type' response header.
def content_type
  return @meta_content_type if @meta_content_type

  response['content-type']
end

#detected_encodingObject



71
72
73
# File 'lib/mechanize/page.rb', line 71

# Returns whatever Mechanize::Util.detect_charset reports for this page's
# body.
def detected_encoding
  Mechanize::Util.detect_charset(body)
end

#encodingObject



91
92
93
94
95
# File 'lib/mechanize/page.rb', line 91

# Returns the encoding reported by the parsed document, or nil when it
# cannot be obtained (any NoMethodError raised on the way, e.g. because
# there is no parser, is turned into nil).
def encoding
  begin
    parser.encoding
  rescue NoMethodError
    nil
  end
end

#encoding=(encoding) ⇒ Object



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/mechanize/page.rb', line 75

# Sets the encoding to use the next time this page is parsed.
#
# Every cached parse result is dropped through #reset, which also discards
# the current parser, so the document is lazily re-parsed with the new
# encoding on the next call to #parser.
#
# encoding:: the encoding name to parse with; a nil value leaves the
#            choice of encoding to #parser.
#
# Returns the +encoding+ that was passed in.
def encoding=(encoding)
  # reset already clears @parser together with the memoized element lists,
  # so comparing the old parser's encoding with the new one afterwards can
  # never find a parser to compare against.
  reset

  @encoding = encoding
end

#encoding_error?(parser = nil) ⇒ Boolean

Return whether the parser result has errors related to encoding. A false result only means that the parser reported no encoding errors; it does not guarantee that the encoding is valid.

Returns:

  • (Boolean)


99
100
101
102
103
104
105
106
107
108
# File 'lib/mechanize/page.rb', line 99

# Tells whether a parse result contains encoding-related errors.
#
# parser:: the parsed document to inspect; defaults to this page's parser.
#
# Returns true when any error message mentions an encoding problem, false
# otherwise (including when there are no errors at all).
def encoding_error?(parser=nil)
  parser ||= self.parser
  problems = parser.errors
  return false if problems.empty?

  problems.any? do |problem|
    # scrub replaces invalid bytes so the match itself cannot fail on them
    problem.message.scrub =~ /(indicate\ encoding)|
                            (Invalid\ bytes)|
                            (Invalid\ char)|
                            (input\ conversion\ failed)/x
  end
end

#formObject

:method: forms_with

:call-seq:

forms_with(name)
forms_with(name: name_matcher, id: id_matcher, class: class_matcher,
           search: search_expression, xpath: xpath_expression, css: css_expression,
           action: action_matcher, ...)

Find all forms matching criteria. If a string is given, it is taken as a name attribute value. If a hash is given, forms are narrowed by the key-value pairs as follows.

:id, :dom_id: selects forms with a #dom_id value that matches this value.

:class, :dom_class: selects forms with a #dom_class value that matches this value. Note that class attribute values are compared literally as string, so forms_with(class: “a”) does not match a form with class=“a b”. Use forms_with(css: “form.a”) instead.

:search: only selects forms matching this selector expression.

:xpath: only selects forms matching this XPath expression.

:css: only selects forms matching this CSS selector expression.

:action, :method, etc.: narrows forms by a given attribute value using the === operator.

Example:

page.forms_with(css: '#content table.login_box form', method: /\APOST\z/i, ).each do |f|
  ...
end


301
# File 'lib/mechanize/page.rb', line 301

# Declares the matcher methods for forms through ElementMatcher's
# elements_with macro (documented on this page as #form / #forms_with).
elements_with :form

#formsObject

Return a list of all form tags



506
507
508
509
510
511
512
# File 'lib/mechanize/page.rb', line 506

# Returns every <form> element of the page wrapped in a Mechanize::Form.
# A form without an action gets this page's URI (as a string) as its
# action. The list is built on first use and cached in @forms.
def forms
  @forms ||= search('form').map { |node|
    Mechanize::Form.new(node, @mech, self).tap do |form|
      form.action ||= @uri.to_s
    end
  }
end

#frameObject

:method: frames_with

:call-seq: frames_with(criteria)

Find all frame tags matching criteria. See forms_with for details of criteria, where for “form(s)” read “frame tag(s)”.

Example:

page.frames_with(src: /foo/).each do |frame|
  p frame.src
end


416
# File 'lib/mechanize/page.rb', line 416

# Declares the matcher methods for frame tags through ElementMatcher's
# elements_with macro (documented on this page as #frame / #frames_with).
elements_with :frame

#framesObject

Return a list of all frame tags



534
535
536
537
# File 'lib/mechanize/page.rb', line 534

# Returns every <frame> element of the page wrapped in a Frame object.
# The list is built on first use and cached in @frames.
def frames
  @frames ||= search('frame').map do |element|
    Frame.new(element, @mech, self)
  end
end

#iframeObject

:method: iframes_with

:call-seq: iframes_with(criteria)

Find all iframe tags matching criteria. See forms_with for details of criteria, where for “form(s)” read “iframe tag(s)”.

Example:

page.iframes_with(src: /foo/).each do |iframe|
  p iframe.src
end


454
# File 'lib/mechanize/page.rb', line 454

# Declares the matcher methods for iframe tags through ElementMatcher's
# elements_with macro (documented on this page as #iframe / #iframes_with).
elements_with :iframe

#iframesObject

Return a list of all iframe tags



541
542
543
544
# File 'lib/mechanize/page.rb', line 541

# Returns every <iframe> element of the page wrapped in a Frame object.
# The list is built on first use and cached in @iframes.
def iframes
  @iframes ||= search('iframe').map do |element|
    Frame.new(element, @mech, self)
  end
end

#imageObject

:method: images_with

:call-seq: images_with(criteria)

Find all images matching criteria. See forms_with for details of criteria, where for “form(s)” read “image(s)”.

Example:

page.images_with(src: /jpg\Z/).each do |img|
  img.fetch.save
end


492
# File 'lib/mechanize/page.rb', line 492

# Declares the matcher methods for images through ElementMatcher's
# elements_with macro (documented on this page as #image / #images_with).
elements_with :image

#image_urlsObject



553
554
555
# File 'lib/mechanize/page.rb', line 553

# Returns the distinct URLs of the page's images, in order of first
# appearance. The list is built on first use and cached in @image_urls.
def image_urls
  @image_urls ||= images.map { |image| image.url }.uniq
end

#imagesObject

Return a list of all img tags



548
549
550
551
# File 'lib/mechanize/page.rb', line 548

# Returns every <img> element of the page wrapped in an Image object.
# The list is built on first use and cached in @images.
def images
  @images ||= search('img').map do |element|
    Image.new(element, self)
  end
end

#labelsObject

Return a list of all label tags



559
560
561
562
# File 'lib/mechanize/page.rb', line 559

# Returns every <label> element of the page wrapped in a Label object.
# The list is built on first use and cached in @labels.
def labels
  @labels ||= search('label').map do |element|
    Label.new(element, self)
  end
end

#labels_hashObject



564
565
566
567
568
569
570
571
572
573
# File 'lib/mechanize/page.rb', line 564

# Returns a hash mapping each label's 'for' attribute value to its Label.
# Labels for which Label#for is falsy are left out; when several labels
# share a value the last one wins. The hash is cached in @labels_hash.
def labels_hash
  @labels_hash ||= labels.each_with_object({}) do |label, by_target|
    by_target[label.node['for']] = label if label.for
  end
end

:method: links_with

:call-seq:

links_with(criteria)

Find all links matching criteria. See forms_with for details of criteria, where for “form(s)” read “link(s)”.

Example:

page.links_with(href: /foo/).each do |link|
  puts link.href
end


340
# File 'lib/mechanize/page.rb', line 340

# Declares the matcher methods for links through ElementMatcher's
# elements_with macro (documented on this page via :method: links_with).
elements_with :link

Return a list of all link and area tags



496
497
498
499
500
501
502
# File 'lib/mechanize/page.rb', line 496

# Returns every <a> element followed by every <area> element of the page,
# each wrapped in a Link object. The list is built on first use and cached
# in @links.
def links
  @links ||= %w{ a area }.flat_map { |tag_name|
    search(tag_name).map { |element| Link.new(element, @mech, self) }
  }
end

#meta_charsetObject



67
68
69
# File 'lib/mechanize/page.rb', line 67

# Returns the result of the class-level meta_charset for this page's body.
def meta_charset
  self.class.meta_charset(body)
end

#meta_refreshObject

Return a list of all meta refresh elements



517
518
519
520
521
522
523
# File 'lib/mechanize/page.rb', line 517

# Returns the page's meta refresh elements as built by
# MetaRefresh.from_node, skipping nodes for which it returns nil.
#
# Meta tags are searched anywhere in the document when the agent's
# follow_meta_refresh setting is :anywhere, and only directly under
# <head> otherwise. The list is cached in @meta_refresh.
def meta_refresh
  selector =
    if @mech.follow_meta_refresh == :anywhere
      'meta'
    else
      'head > meta'
    end

  @meta_refresh ||= search(selector).map { |element|
    MetaRefresh.from_node(element, self)
  }.compact
end

#parserObject Also known as: root

:method: at_xpath

Shorthand for parser.at_xpath.

See also Nokogiri::XML::Node#at_xpath for details.



236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
# File 'lib/mechanize/page.rb', line 236

# Returns the parsed document for this page, building it on first use and
# caching it in @parser. Returns nil when the page has no body.
#
# The encoding handed to the agent's HTML parser is chosen as follows:
# 1. an explicitly assigned @encoding, when set;
# 2. the agent's default encoding, when force_default_encoding is set;
# 3. otherwise each candidate in @encodings, tried from last to first,
#    stopping at the first parse without encoding errors. If every
#    candidate reports errors, the last attempt is kept.
def parser
  return @parser if @parser
  return unless @body

  # The page URI (if any) is passed to the parser as a string.
  url = @uri && @uri.to_s

  if @encoding
    @parser = mech.html_parser.parse html_body, url, @encoding
  elsif mech.force_default_encoding
    @parser = mech.html_parser.parse html_body, url, @mech.default_encoding
  else
    @encodings.reverse_each do |encoding|
      @parser = mech.html_parser.parse html_body, url, encoding

      break unless encoding_error? @parser
    end
  end

  @parser
end

#pretty_print(q) ⇒ Object

:nodoc:



133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/mechanize/page.rb', line 133

# Pretty-prints the page into the printer +q+ as an object group with one
# sub-group each for the url, meta refreshes, title, iframes, frames,
# links and forms, in that order.
def pretty_print(q) # :nodoc:
  q.object_group(self) {
    q.breakable
    q.group(1, '{url', '}') {q.breakable; q.pp uri }
    q.breakable
    q.group(1, '{meta_refresh', '}') {
      meta_refresh.each { |link| q.breakable; q.pp link }
    }
    q.breakable
    q.group(1, '{title', '}') { q.breakable; q.pp title }
    q.breakable
    q.group(1, '{iframes', '}') {
      iframes.each { |link| q.breakable; q.pp link }
    }
    q.breakable
    q.group(1, '{frames', '}') {
      frames.each { |link| q.breakable; q.pp link }
    }
    q.breakable
    q.group(1, '{links', '}') {
      links.each { |link| q.breakable; q.pp link }
    }
    q.breakable
    q.group(1, '{forms', '}') {
      forms.each { |form| q.breakable; q.pp form }
    }
  }
end

#resetObject



164
165
166
167
168
169
170
171
172
173
174
175
# File 'lib/mechanize/page.rb', line 164

# Clears every value memoized from the parsed document, including the
# parser itself, so that each one is rebuilt lazily on next access.
#
# The image caches (@images and @image_urls) are cleared as well; they
# are memoized by #images and #image_urls and would otherwise keep
# pointing at nodes of a discarded parse.
def reset
  @bases = nil
  @forms = nil
  @frames = nil
  @iframes = nil
  @images = nil
  @image_urls = nil
  @links = nil
  @labels = nil
  @labels_hash = nil
  @meta_refresh = nil
  @parser = nil
  @title = nil
end

#response_header_charsetObject



63
64
65
# File 'lib/mechanize/page.rb', line 63

# Returns the result of the class-level response_header_charset for this
# page's response.
def response_header_charset
  self.class.response_header_charset(response)
end

#titleObject



55
56
57
58
59
60
61
# File 'lib/mechanize/page.rb', line 55

# Returns the text of the page's first <title> element (looked up under
# /html/head, /html, /head or the document root), or nil when the page has
# no parser or the title is empty. A non-nil title is cached in @title.
def title
  @title ||= begin
    document = parser
    if document
      text = document.xpath('string(((/html/head | /html | /head | /)/title)[1])').to_s
      text unless text.empty?
    end
  end
end