Class: Mechanize::Page

Inherits:
File
  • Object
show all
Extended by:
Forwardable, ElementMatcher
Defined in:
lib/mechanize/page.rb

Overview

This class encapsulates an HTML page. If Mechanize finds a content type of ‘text/html’, this class will be instantiated and returned.

Example:

require 'mechanize'

agent = Mechanize.new
agent.get('http://google.com/').class # => Mechanize::Page

Defined Under Namespace

Classes: Base, Frame, Image, Label, Link, MetaRefresh

Constant Summary collapse

DEFAULT_RESPONSE =
{
  'content-type' => 'text/html',
}.freeze

Constants included from Parser

Mechanize::Parser::SPECIAL_FILENAMES

Instance Attribute Summary collapse

Attributes inherited from File

#body, #filename

Attributes included from Parser

#code, #response, #uri

Class Method Summary collapse

Instance Method Summary collapse

Methods included from ElementMatcher

elements_with

Methods inherited from File

#save

Methods included from Parser

#extract_filename, #fill_header, #find_free_name

Constructor Details

#initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil) ⇒ Page

Returns a new instance of Page.



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/mechanize/page.rb', line 27

# Builds a page from an HTML or XHTML response.
#
# +uri+, +response+, +body+ and +code+ are forwarded unchanged to the
# superclass constructor; +mech+ is the optional owning Mechanize agent.
# When +response+ is nil, DEFAULT_RESPONSE is used.
#
# While constructing, the candidate encodings are collected into
# @encodings in this order: nil, the charset detected from the body, the
# charsets from the response headers, the charsets from meta tags, and
# finally the agent's default encoding (when set).
#
# Raises Mechanize::ContentTypeError when the response content-type is not
# text/html or application/xhtml+xml, and a RuntimeError when +mech+ is
# given but is not a Mechanize instance.
def initialize(uri=nil, response=nil, body=nil, code=nil, mech=nil)
  response ||= DEFAULT_RESPONSE
  raise Mechanize::ContentTypeError, response['content-type'] unless
    response['content-type'] =~ %r{\A(?:text/html|application/xhtml\+xml)(?:$|\s*[\s;,])}i

  @meta_content_type = nil
  @encoding = nil
  @encodings = [nil]
  if mech && !(Mechanize === mech)
    raise "mech must be a Mechanize instance, got #{mech.class}"
  end
  @mech = mech

  reset

  @encodings << Mechanize::Util.detect_charset(body) if body

  @encodings.concat self.class.response_header_charset(response)

  if body
    # Force the encoding to be 8BIT so we can perform regular expressions.
    # We'll set it to the detected encoding later
    body.force_encoding 'ASCII-8BIT' if body.respond_to? :force_encoding

    @encodings.concat self.class.meta_charset body

    # A content-type declared by a meta tag overrides the header value
    # reported by #content_type.
    meta_content_type = self.class.meta_content_type body
    @meta_content_type = meta_content_type if meta_content_type
  end

  @encodings << mech.default_encoding if mech && mech.default_encoding

  super uri, response, body, code
end

Instance Attribute Details

#encodingsObject (readonly)

Possible encodings for this page based on HTTP headers and meta elements



25
26
27
# File 'lib/mechanize/page.rb', line 25

# Reader for @encodings, the list of candidate encodings gathered for this
# page.
def encodings
  @encodings
end

#mechObject

Returns the value of attribute mech.



20
21
22
# File 'lib/mechanize/page.rb', line 20

# Reader for @mech, the agent object this page was constructed with (may be
# nil).
def mech
  @mech
end

Class Method Details

.charset(content_type) ⇒ Object Also known as: charset_from_content_type



408
409
410
411
412
# File 'lib/mechanize/page.rb', line 408

# Extracts the charset parameter from a content-type value.
#
# Returns the charset token, or nil when +content_type+ is nil, has no
# charset parameter, or declares the value 'none'.
def charset content_type
  # A missing content type simply has no charset.
  return nil unless content_type

  charset = content_type[/;(?:\s*,)?\s*charset\s*=\s*([^()<>@,;:\\\"\/\[\]?={}\s]+)/i, 1]
  return nil if charset == 'none'
  charset
end

.meta_charset(body) ⇒ Object

Retrieves all charsets from meta tags in body



430
431
432
433
434
435
436
437
438
439
440
441
442
443
# File 'lib/mechanize/page.rb', line 430

# Retrieves all charsets declared by meta tags in +body+.
#
# Each <meta ...> tag is inspected in document order. A quoted charset
# value is taken directly; otherwise, for an http-equiv content-type tag,
# the charset is extracted from its content attribute via +charset+.
# Returns an Array of charset strings (empty when none are found).
def self.meta_charset body
  # HACK use .map
  body.scan(/<meta .*?>/i).map do |meta|
    # The value is matched lazily so it stops at the first closing quote;
    # a greedy match would swallow any attributes that follow it.
    if meta =~ /charset\s*=\s*(["'])?\s*(.+?)\s*\1/i then
      $2
    elsif meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i then
      meta =~ /content\s*=\s*(["'])?(.*?)\1/i

      m_charset = charset $2 if $2

      m_charset if m_charset
    end
  end.compact
end

.meta_content_type(body) ⇒ Object

Retrieves the last content-type set by a meta tag in body



448
449
450
451
452
453
454
455
456
457
458
# File 'lib/mechanize/page.rb', line 448

# Retrieves the content-type declared by the last http-equiv content-type
# meta tag in +body+.
#
# Tags are examined from last to first. The content attribute of the first
# matching tag is returned, or nil when that tag has no quoted content
# attribute or when no such tag exists. Whitespace around the '=' of the
# content attribute is accepted, as in meta_charset.
def self.meta_content_type body
  body.scan(/<meta .*?>/i).reverse_each do |meta|
    next unless meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i

    return meta[/content\s*=\s*(["'])?(.*?)\1/i, 2]
  end

  nil
end

.response_header_charset(response) ⇒ Object



417
418
419
420
421
422
423
424
425
# File 'lib/mechanize/page.rb', line 417

# Collects the charsets announced by the content-type headers of
# +response+.
#
# +response+ must yield (header, value) pairs from #each. Only headers
# named exactly 'content-type' whose value mentions "charset" are passed
# to +charset+; the results are returned as an Array in iteration order.
def self.response_header_charset response
  found = []
  response.each do |name, content|
    found << charset(content) if name == 'content-type' && content =~ /charset/i
  end
  found
end

Instance Method Details

#baseObject

:method: bases_with(criteria)

Find all base tags matching criteria. Example:

page.bases_with(:href => /foo/).each do |base|
  puts base.href
end


270
# File 'lib/mechanize/page.rb', line 270

# Class-level call to ElementMatcher's elements_with for :base.
# NOTE(review): presumably generates the base / bases_with(criteria)
# finders described in the surrounding docs - confirm in ElementMatcher.
elements_with :base

#basesObject

Return a list of all base tags



359
360
361
362
# File 'lib/mechanize/page.rb', line 359

# Returns the page's base tags wrapped as Base objects.
# The list is built on first use and cached in @bases.
def bases
  return @bases if @bases

  @bases = search('base').map do |element|
    Base.new(element, @mech, self)
  end
end

#canonical_uriObject

Return the canonical URI for the page if there is a link tag with rel="canonical" and an href attribute.



179
180
181
182
183
184
185
186
187
# File 'lib/mechanize/page.rb', line 179

# Returns the URI from the page's <link rel="canonical" href="..."> tag,
# or nil when there is no such tag. If the href is rejected as an invalid
# URI, it is escaped with Mechanize::Util.uri_escape and parsed again.
def canonical_uri
  canonical = at('link[@rel="canonical"][@href]')
  return nil unless canonical

  href = canonical['href']
  URI(href)
rescue URI::InvalidURIError
  URI(Mechanize::Util.uri_escape(href))
end

#content_typeObject

Get the content type



190
191
192
# File 'lib/mechanize/page.rb', line 190

# Returns the page's content type: the value declared by a meta tag when
# one was recorded, otherwise the response's 'content-type' header.
def content_type
  return @meta_content_type if @meta_content_type

  response['content-type']
end

#detected_encodingObject



76
77
78
# File 'lib/mechanize/page.rb', line 76

# Passes the page body to Mechanize::Util.detect_charset and returns its
# result.
def detected_encoding
  Mechanize::Util.detect_charset(body)
end

#encodingObject



96
97
98
# File 'lib/mechanize/page.rb', line 96

# Returns the encoding reported by the parsed document, or nil when the
# parser result does not respond to #encoding (for example when it is nil).
def encoding
  return nil unless parser.respond_to?(:encoding)

  parser.encoding
end

#encoding=(encoding) ⇒ Object



80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# File 'lib/mechanize/page.rb', line 80

# Sets the encoding used the next time the page is parsed.
#
# All memoized state is dropped through #reset, which also clears
# @parser, so the document is lazily re-parsed with the new encoding on
# the next call to #parser. No further parser invalidation is needed here.
# Returns +encoding+.
def encoding=(encoding)
  reset

  @encoding = encoding
end

#encoding_error?(parser = nil) ⇒ Boolean

Return whether the parser result has errors related to encoding. A false result only means the parser reported no encoding errors; it does not guarantee that the encoding is valid.

Returns:

  • (Boolean)


102
103
104
105
106
107
108
109
110
# File 'lib/mechanize/page.rb', line 102

# Reports whether any error recorded by +parser+ (defaulting to this
# page's own parser) has a message that looks encoding-related.
# Returns false when the parser recorded no errors at all.
def encoding_error?(parser=nil)
  parser ||= self.parser
  problems = parser.errors
  return false if problems.empty?

  problems.any? do |error|
    error.message =~ /(indicate\ encoding)|
                      (Invalid\ char)|
                      (input\ conversion\ failed)/x
  end
end

#formObject

:method: forms_with(criteria)

Find all forms matching criteria. Example:

page.forms_with(:action => '/post/login.php').each do |f|
  ...
end


234
# File 'lib/mechanize/page.rb', line 234

# Class-level call to ElementMatcher's elements_with for :form.
# NOTE(review): presumably generates the form / forms_with(criteria)
# finders described in the surrounding docs - confirm in ElementMatcher.
elements_with :form

#formsObject

Return a list of all form tags



338
339
340
341
342
343
344
# File 'lib/mechanize/page.rb', line 338

# Returns the page's form tags wrapped as Mechanize::Form objects.
# A form without an action gets the page URI (as a string) as its action.
# The list is built on first use and cached in @forms.
def forms
  return @forms if @forms

  @forms = search('form').map do |element|
    Mechanize::Form.new(element, @mech, self).tap do |built|
      built.action ||= @uri.to_s
    end
  end
end

#frameObject

:method: frames_with(criteria)

Find all frame tags matching criteria. Example:

page.frames_with(:src => /foo/).each do |frame|
  p frame.src
end


288
# File 'lib/mechanize/page.rb', line 288

# Class-level call to ElementMatcher's elements_with for :frame.
# NOTE(review): presumably generates the frame / frames_with(criteria)
# finders described in the surrounding docs - confirm in ElementMatcher.
elements_with :frame

#framesObject

Return a list of all frame tags



366
367
368
369
# File 'lib/mechanize/page.rb', line 366

# Returns the page's frame tags wrapped as Frame objects.
# The list is built on first use and cached in @frames.
def frames
  return @frames if @frames

  @frames = search('frame').map do |element|
    Frame.new(element, @mech, self)
  end
end

#iframeObject

:method: iframes_with(criteria)

Find all iframe tags matching criteria. Example:

page.iframes_with(:src => /foo/).each do |iframe|
  p iframe.src
end


306
# File 'lib/mechanize/page.rb', line 306

# Class-level call to ElementMatcher's elements_with for :iframe.
# NOTE(review): presumably generates the iframe / iframes_with(criteria)
# finders described in the surrounding docs - confirm in ElementMatcher.
elements_with :iframe

#iframesObject

Return a list of all iframe tags



373
374
375
376
# File 'lib/mechanize/page.rb', line 373

# Returns the page's iframe tags; each one is wrapped in a Frame object.
# The list is built on first use and cached in @iframes.
def iframes
  return @iframes if @iframes

  @iframes = search('iframe').map do |element|
    Frame.new(element, @mech, self)
  end
end

#imageObject

:method: images_with(criteria)

Find all images matching criteria. Example:

page.images_with(:src => /jpg\Z/).each do |img|
  img.fetch.save
end


324
# File 'lib/mechanize/page.rb', line 324

# Class-level call to ElementMatcher's elements_with for :image.
# NOTE(review): presumably generates the image / images_with(criteria)
# finders described in the surrounding docs - confirm in ElementMatcher.
elements_with :image

#image_urlsObject



385
386
387
# File 'lib/mechanize/page.rb', line 385

# Returns the distinct URLs of the page's images, in first-seen order.
# The list is built on first use and cached in @image_urls.
def image_urls
  return @image_urls if @image_urls

  @image_urls = images.map { |image| image.url }.uniq
end

#imagesObject

Return a list of all img tags



380
381
382
383
# File 'lib/mechanize/page.rb', line 380

# Returns the page's img tags wrapped as Image objects.
# The list is built on first use and cached in @images.
def images
  return @images if @images

  @images = search('img').map do |element|
    Image.new(element, self)
  end
end

#labelsObject

Return a list of all label tags



391
392
393
394
# File 'lib/mechanize/page.rb', line 391

# Returns the page's label tags wrapped as Label objects.
# The list is built on first use and cached in @labels.
def labels
  return @labels if @labels

  @labels = search('label').map do |element|
    Label.new(element, self)
  end
end

#labels_hashObject



396
397
398
399
400
401
402
403
404
405
# File 'lib/mechanize/page.rb', line 396

# Returns a Hash mapping each label's 'for' attribute to its Label object.
# Labels whose #for is falsy are skipped; when several labels share a
# target, the later one wins. The hash is cached in @labels_hash.
def labels_hash
  return @labels_hash if @labels_hash

  by_target = {}
  labels.each { |label| by_target[label.node['for']] = label if label.for }
  @labels_hash = by_target
end

:method: links_with(criteria)

Find all links matching criteria. Example:

page.links_with(:href => /foo/).each do |link|
  puts link.href
end


252
# File 'lib/mechanize/page.rb', line 252

# Class-level call to ElementMatcher's elements_with for :link.
# NOTE(review): presumably generates the link / links_with(criteria)
# finders described in the surrounding docs - confirm in ElementMatcher.
elements_with :link

Return a list of all links (a and area tags)



328
329
330
331
332
333
334
# File 'lib/mechanize/page.rb', line 328

# Returns Link objects for every a tag followed by every area tag.
# The list is built on first use and cached in @links.
def links
  return @links if @links

  anchors = search('a').map { |element| Link.new(element, @mech, self) }
  areas = search('area').map { |element| Link.new(element, @mech, self) }
  @links = [anchors, areas].flatten
end

#meta_charsetObject



72
73
74
# File 'lib/mechanize/page.rb', line 72

# Instance-level convenience: applies the class method meta_charset to
# this page's body and returns its result.
def meta_charset
  self.class.meta_charset(body)
end

#meta_refreshObject

Return a list of all meta refresh elements



349
350
351
352
353
354
355
# File 'lib/mechanize/page.rb', line 349

# Returns the MetaRefresh objects built from the page's meta tags.
#
# When the agent's follow_meta_refresh setting is :anywhere every meta
# tag is considered; otherwise only those directly under head. Nodes for
# which MetaRefresh.from_node returns nil are dropped. The list is cached
# in @meta_refresh.
def meta_refresh
  selector =
    if @mech.follow_meta_refresh == :anywhere
      'meta'
    else
      'head > meta'
    end

  @meta_refresh ||= search(selector).map { |node| MetaRefresh.from_node(node, self) }.compact
end

#parserObject Also known as: root

:method: at

Search through the page for path under namespace using Nokogiri’s #at. The path may be either a CSS or XPath expression.

See also Nokogiri::XML::Node#at



202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# File 'lib/mechanize/page.rb', line 202

# Returns the parsed document for this page, parsing the body on first use.
#
# Returns nil when there is no body. An explicitly set @encoding is used
# when present; otherwise the agent's default encoding when
# force_default_encoding is on; otherwise the candidate encodings are
# tried from last to first until a parse shows no encoding error (the
# final attempt is kept even if it still has errors).
def parser
  return @parser if @parser
  return nil unless @body

  parse_as = lambda { |charset| @mech.html_parser.parse(html_body, nil, charset) }

  if @encoding
    @parser = parse_as.call(@encoding)
  elsif mech.force_default_encoding
    @parser = parse_as.call(@mech.default_encoding)
  else
    @encodings.reverse_each do |candidate|
      @parser = parse_as.call(candidate)
      break unless encoding_error?(@parser)
    end
  end

  @parser
end

#pretty_print(q) ⇒ Object

:nodoc:



133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/mechanize/page.rb', line 133

# Pretty-printer hook: emits the page as an object group containing, in
# order, the url, meta_refresh entries, title, iframes, frames, links and
# forms, each inside its own '{name ... }' group.
def pretty_print(q) # :nodoc:
  # Prints one value produced by +reader+ inside a labelled group.
  single = lambda do |opening, reader|
    q.breakable
    q.group(1, opening, '}') do
      q.breakable
      q.pp __send__(reader)
    end
  end

  # Prints every element of the collection produced by +reader+.
  listing = lambda do |opening, reader|
    q.breakable
    q.group(1, opening, '}') do
      __send__(reader).each do |entry|
        q.breakable
        q.pp entry
      end
    end
  end

  q.object_group(self) do
    single.call('{url', :uri)
    listing.call('{meta_refresh', :meta_refresh)
    single.call('{title', :title)
    listing.call('{iframes', :iframes)
    listing.call('{frames', :frames)
    listing.call('{links', :links)
    listing.call('{forms', :forms)
  end
end

#resetObject



164
165
166
167
168
169
170
171
172
173
174
175
# File 'lib/mechanize/page.rb', line 164

# Drops every memoized value derived from the parsed document so that it
# is rebuilt on next access. This includes the parser itself, and the
# image caches (@images, @image_urls), which would otherwise keep
# pointing at nodes of a previous parse after the encoding is changed.
# Returns nil.
def reset
  @bases = nil
  @forms = nil
  @frames = nil
  @iframes = nil
  @images = nil
  @image_urls = nil
  @links = nil
  @labels = nil
  @labels_hash = nil
  @meta_refresh = nil
  @parser = nil
  @title = nil
end

#response_header_charsetObject



68
69
70
# File 'lib/mechanize/page.rb', line 68

# Instance-level convenience: applies the class method
# response_header_charset to this page's response and returns its result.
def response_header_charset
  self.class.response_header_charset(response)
end

#titleObject



60
61
62
63
64
65
66
# File 'lib/mechanize/page.rb', line 60

# Returns the combined inner text of the document's title elements, or nil
# when there is no parsed document or the text is empty. A non-nil result
# is cached in @title.
def title
  return @title if @title

  @title =
    if (document = parser)
      text = document.search('title').inner_text
      text unless text.empty?
    end
end