Class: WWW::Mechanize

Inherits:
Object show all
Defined in:
lib/graybook/importer/page_scraper.rb

Overview

Patches Mechanize’s broken HTML unescaping (as found in Mechanize 0.6.11).

Instance Method Summary collapse

Instance Method Details

#to_absolute_uri(url, cur_page = current_page()) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/graybook/importer/page_scraper.rb', line 8

# Resolves +url+ to an absolute URI, using +cur_page+ as the base for
# relative references.
#
# A +url+ that is not already a URI is converted to a String, stripped of
# surrounding whitespace, percent-escaped (leaving existing %XX sequences
# alone), passed through Util.html_unescape and then parsed.
#
# @param url [URI, #to_s] the link target to resolve
# @param cur_page [#uri, nil] page whose URI is the base for relative links;
#   defaults to the current page
# @return [URI] +url+ itself when it is already an absolute URI, otherwise the
#   parsed and/or resolved URI
# @raise [RuntimeError] if +url+ is relative and there is no page, or no page
#   URI, to resolve it against
def to_absolute_uri(url, cur_page=current_page())
  unless url.is_a? URI
    url = url.to_s.strip

    # Split the string into the text between already-escaped %XX sequences
    # and the sequences themselves, so only the former gets escaped.
    escape_pattern = /%[0-9A-Fa-f]{2}/
    plain_parts   = url.split(escape_pattern)
    escaped_parts = url.scan(escape_pattern)

    # Mechanize here uses #zip to combine the two arrays, which ignores
    # excess elements of the second array (the one passed as an argument).
    # That means if the URL ends with more than one already escaped
    # character, only the first one is restored into the resulting URL.
    # Walking up to the longer of the two lengths keeps every element; the
    # missing side is nil, which becomes an empty string.
    pair_count = [plain_parts.size, escaped_parts.size].max
    rebuilt = Array.new(pair_count) { |i|
      "#{URI.escape(plain_parts[i] || '')}#{escaped_parts[i]}"
    }.join('').gsub(/%23/, '#') # every %23 is turned back into '#'

    url = URI.parse(Util.html_unescape(rebuilt))
  end

  # construct an absolute uri
  if url.relative?
    # Guard against a missing page as well as a page without a URI, so the
    # caller gets the explanatory error instead of a NoMethodError on nil.
    unless cur_page && cur_page.uri
      raise 'no history. please specify an absolute URL'
    end
    url = cur_page.uri + url
    # Strip initial "/.." bits from the path
    url.path.sub!(/^(\/\.\.)+(?=\/)/, '')
  end

  url
end