Module: Scraper::Reader

Defined in: lib/scraper/reader.rb

Defined Under Namespace

Classes: HTMLParseError, HTTPError, HTTPInvalidURLError, HTTPNoAccessError, HTTPNotFoundError, HTTPRedirectLimitError, HTTPTimeoutError, HTTPUnspecifiedError, Page, Parsed

Constant Summary collapse

REDIRECT_LIMIT =

DEFAULT_TIMEOUT =

PARSERS =

[:tidy, :html_parser]

TIDY_OPTIONS =

{
  :output_xhtml=>true,
  :show_errors=>0,
  :show_warnings=>false,
  :wrap=>0,
  :wrap_sections=>false,
  :force_output=>true,
  :quiet=>true,
  :tidy_mark=>false
}

Class Method Summary collapse

.parse_page(content, encoding = nil, options = nil, parser = :tidy) ⇒ Object

:call-seq: parse_page(html, encoding?, options?, parser) => html.
.read_page(url, options = nil) ⇒ Object

:call-seq: read_page(url, options?) => response.

Class Method Details

.parse_page(content, encoding = nil, options = nil, parser = :tidy) ⇒ `Object`

:call-seq:

parse_page(html, encoding?, options?, parser) => html

Parses an HTML page and returns the encoding and HTML element. Raises HTMLParseError exceptions if it cannot parse the HTML.

Options are passed to the parser. For example, when using Tidy you can pass Tidy cleanup options in the hash.

The last option specifies which parser to use (see PARSERS). By default Tidy is used.

# File 'lib/scraper/reader.rb', line 191

# :call-seq:
#   parse_page(html, encoding?, options?, parser) => html
#
# Parses an HTML page and returns a Parsed pair of the document's root
# element and the encoding that was used.
#
# * +content+  -- the HTML source.
# * +encoding+ -- fallback encoding; a charset declared in a content-type
#   <meta> tag inside +content+ takes precedence, and "utf8" is used when
#   neither is available.
# * +options+  -- parser options (only consulted for the :tidy parser). The
#   hash is not modified; TIDY_OPTIONS are always applied on top of it.
# * +parser+   -- :tidy (default, also used when nil) or :html_parser.
#
# Raises HTMLParseError if the parser is unknown or parsing fails for any
# reason. Process-level exceptions (Interrupt, SystemExit, NoMemoryError)
# are deliberately left to propagate.
def parse_page(content, encoding = nil, options = nil, parser = :tidy)
  # Get the document encoding from the meta header. The charset must be
  # non-empty, otherwise an empty "charset=" would defeat the default below.
  if (meta = content.match(/(<meta\s*([^>]*)http-equiv=['"]?content-type['"]?([^>]*))/i)) &&
     (charset = meta[0].match(/charset=([\w-]+)/i))
    encoding = charset[1]
  end
  encoding ||= "utf8"

  document =
    case parser || :tidy
    when :tidy
      # Make sure the Tidy path is set and always apply the default
      # options (these only control things like errors, output type).
      find_tidy
      # merge (not update) so the caller's hash is left untouched.
      tidy_options = (options || {}).merge(TIDY_OPTIONS)
      tidy_options[:input_encoding] = encoding.delete("-").downcase
      html = TidyFFI::Tidy.with_options(tidy_options).clean(content)
      HTML::Document.new(html).find(:tag=>"html")
    when :html_parser
      HTML::HTMLParser.parse(content).root
    else
      raise HTMLParseError, "No parser #{parser}"
    end
  Parsed[document, encoding]
rescue HTMLParseError
  # Already the documented error type; do not wrap it a second time.
  raise
rescue StandardError, ScriptError => error
  # ScriptError is included so a failure to load the parser library is
  # still reported as a parse error.
  raise HTMLParseError.new(error)
end

.read_page(url, options = nil) ⇒ `Object`

:call-seq:

read_page(url, options?) => response

Reads a Web page and returns its URL, content and cache control headers.

The request reads a Web page at the specified URL (a URI object, or a string that can be parsed into one; only http and https are supported). It accepts the following options:

:last_modified – Last modified header (from a previous request).
:etag – ETag header (from a previous request).
:redirect_limit – Number of redirects allowed (default is 3).
:user_agent – The User-Agent header to send.
:timeout – HTTP open connection/read timeouts (in seconds).
:ssl_verify_mode – SSL verification mode, defaults to OpenSSL::SSL::VERIFY_NONE

It returns a hash with the following information:

:url – The URL of the requested page (may change by permanent redirect)
:content – The content of the response (may be nil if cached)
:content_type – The HTML page Content-Type header
:last_modified – Last modified cache control header (may be nil)
:etag – ETag cache control header (may be nil)
:encoding – Document encoding for the page

If the page has not been modified from the last request, the content is nil.

Raises HTTPError if an error prevents it from reading the page.

Raises:

(HTTPRedirectLimitError)

# File 'lib/scraper/reader.rb', line 110

# :call-seq:
#   read_page(url, options?) => response
#
# Reads a Web page and returns its URL, content and cache control headers.
#
# +url+ is a URI object or a string that parses into one; only the http
# and https schemes are accepted.
#
# Options:
# * <tt>:last_modified</tt> -- Last-Modified value from a previous response;
#   sent as If-Modified-Since.
# * <tt>:etag</tt> -- ETag value from a previous response; sent as
#   If-None-Match.
# * <tt>:redirect_limit</tt> -- Number of requests allowed while following
#   redirects (defaults to REDIRECT_LIMIT).
# * <tt>:user_agent</tt> -- The User-Agent header to send.
# * <tt>:timeout</tt> -- Open/read timeout in seconds (defaults to
#   DEFAULT_TIMEOUT). <tt>:http_timeout</tt> is accepted as an alias.
# * <tt>:ssl_verify_mode</tt> -- SSL verification mode, defaults to
#   OpenSSL::SSL::VERIFY_NONE.
# * <tt>:source_url</tt> -- Set internally while following temporary
#   redirects so the originally requested URL is reported.
#
# Returns a Page of (url, content, encoding, last_modified, etag). On a
# 304 Not Modified response the content and encoding are nil and the
# cache headers are the ones passed in.
#
# Raises HTTPRedirectLimitError when the redirect limit is exhausted,
# HTTPInvalidURLError for unparsable or non-HTTP URLs, HTTPTimeoutError,
# HTTPNotFoundError, HTTPNoAccessError, and HTTPUnspecifiedError for any
# other failure.
def read_page(url, options = nil)
  options ||= {}
  redirect_limit = options[:redirect_limit] || REDIRECT_LIMIT
  # <= so that a negative limit cannot lead to unbounded redirects.
  raise HTTPRedirectLimitError if redirect_limit <= 0

  uri =
    if url.is_a?(URI)
      url
    else
      begin
        URI.parse(url)
      rescue StandardError => error
        raise HTTPInvalidURLError.new(error)
      end
    end
  raise HTTPInvalidURLError unless uri.scheme =~ /\Ahttps?\z/

  begin
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = (uri.scheme == "https")
    # NOTE(security): the VERIFY_NONE default disables certificate checks.
    # It is kept because it is the documented default; pass
    # :ssl_verify_mode => OpenSSL::SSL::VERIFY_PEER to verify peers.
    http.verify_mode = options[:ssl_verify_mode] || OpenSSL::SSL::VERIFY_NONE
    http.close_on_empty_response = true
    timeout = options[:timeout] || options[:http_timeout] || DEFAULT_TIMEOUT
    http.open_timeout = http.read_timeout = timeout
    # TODO: Specify which content types are accepted.
    # TODO: GZip support.
    headers = {}
    headers["User-Agent"] = options[:user_agent] if options[:user_agent]
    # Cache validators go in the conditional request headers; this is what
    # lets the server answer 304 Not Modified.
    headers["If-Modified-Since"] = options[:last_modified] if options[:last_modified]
    headers["If-None-Match"] = options[:etag] if options[:etag]
    # request_uri yields "/" for an empty path and appends the query string.
    response = http.request_get(uri.request_uri, headers)
    # TODO: Ignore content types that do not map to HTML.
  rescue Timeout::Error => error
    raise HTTPTimeoutError.new(error)
  rescue StandardError => error
    raise HTTPUnspecifiedError.new(error)
  end

  reported_url = options[:source_url] || uri
  case response
  when Net::HTTPSuccess
    # Capture the bare charset token, without quotes or a trailing ";".
    encoding = response["Content-Type"].to_s[/charset=["']?([^\s;"']+)/i, 1]
    Page[reported_url, response.body, encoding,
         response["Last-Modified"], response["ETag"]]
  when Net::HTTPNotModified
    Page[reported_url, nil, nil, options[:last_modified], options[:etag]]
  when Net::HTTPRedirection
    # An unusable Location becomes nil, which the recursive call reports
    # as HTTPInvalidURLError (after the redirect limit check).
    location = begin
      uri.merge(response["location"])
    rescue StandardError
      nil
    end
    # Carry every caller option (user agent, timeout, SSL mode, cache
    # validators) over to the follow-up request.
    next_options = options.merge(:redirect_limit=>redirect_limit - 1)
    if response.is_a?(Net::HTTPMovedPermanently)
      next_options.delete(:source_url)          # New URL takes effect
    else
      next_options[:source_url] = reported_url  # Old URL still in effect
    end
    read_page(location, next_options)
  when Net::HTTPNotFound
    raise HTTPNotFoundError
  when Net::HTTPUnauthorized, Net::HTTPForbidden
    raise HTTPNoAccessError
  when Net::HTTPRequestTimeOut
    raise HTTPTimeoutError
  else
    raise HTTPUnspecifiedError
  end
end