Module: Scraper::Reader

Defined in:
lib/scraper/reader.rb

Defined Under Namespace

Classes: HTMLParseError, HTTPError, HTTPInvalidURLError, HTTPNoAccessError, HTTPNotFoundError, HTTPRedirectLimitError, HTTPTimeoutError, HTTPUnspecifiedError, Page, Parsed

Constant Summary collapse

REDIRECT_LIMIT =
3
DEFAULT_TIMEOUT =
30
PARSERS =
[:tidy, :html_parser]
TIDY_OPTIONS =
{
  :output_xhtml=>true,
  :show_errors=>0,
  :show_warnings=>false,
  :wrap=>0,
  :wrap_sections=>false,
  :force_output=>true,
  :quiet=>true,
  :tidy_mark=>false
}

Class Method Summary collapse

Class Method Details

.parse_page(content, encoding = nil, options = nil, parser = :tidy) ⇒ Object

:call-seq:

parse_page(html, encoding?, options?, parser) => html

Parses an HTML page and returns the encoding and HTML element. Raises HTMLParseError exceptions if it cannot parse the HTML.

Options are passed to the parser. For example, when using Tidy you can pass Tidy cleanup options in the hash.

The last argument specifies which parser to use (see PARSERS). By default Tidy is used.



191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# File 'lib/scraper/reader.rb', line 191

# :call-seq:
#   parse_page(html, encoding?, options?, parser) => Parsed
#
# Parses an HTML page and returns <tt>Parsed[document, encoding]</tt>.
#
# * +content+  -- the HTML source as a string.
# * +encoding+ -- fallback encoding name; a charset declared in a
#   <tt>http-equiv="content-type"</tt> meta tag takes precedence, and
#   "utf8" is used when neither is available.
# * +options+  -- extra options handed to Tidy (only used by the :tidy
#   parser). The hash is not modified; TIDY_OPTIONS always win over it.
# * +parser+   -- :tidy (default) or :html_parser.
#
# Raises HTMLParseError if the parser is unknown or parsing fails.
def parse_page(content, encoding = nil, options = nil, parser = :tidy)
  # Get the document encoding from the meta header. The charset must be
  # non-empty (optionally quoted), otherwise the fallback below applies.
  if meta = content.match(/(<meta\s*([^>]*)http-equiv=['"]?content-type['"]?([^>]*))/i)
    if charset = meta[0].match(/charset=['"]?([\w-]+)/i)
      encoding = charset[1]
    end
  end
  encoding ||= "utf8"
  case (parser || :tidy)
  when :tidy
    # Make sure the Tidy path is set and always apply the default
    # options (these only control things like errors, output type).
    find_tidy
    # merge (not update) so the caller's hash is left untouched and a
    # frozen hash is accepted.
    tidy_options = (options || {}).merge(TIDY_OPTIONS)
    tidy_options[:input_encoding] = encoding.delete("-").downcase
    html = TidyFFI::Tidy.with_options(tidy_options).clean(content)
    document = HTML::Document.new(html).find(:tag=>"html")
  when :html_parser
    document = HTML::HTMLParser.parse(content).root
  else
    raise HTMLParseError, "No parser #{parser || "unspecified"}"
  end
  Parsed[document, encoding]
rescue HTMLParseError
  # Already the right type (e.g. unknown parser): do not wrap it again.
  raise
rescue StandardError, ScriptError=>error
  # ScriptError is included so a parser library that fails to load is
  # still reported as a parse failure; signals and SystemExit pass through.
  raise HTMLParseError.new(error)
end

.read_page(url, options = nil) ⇒ Object

:call-seq:

read_page(url, options?) => response

Reads a Web page and returns its URL, content and cache control headers.

The request reads a Web page at the specified URL (must be a URI object). It accepts the following options:

  • :last_modified – Last modified header (from a previous request).

  • :etag – ETag header (from a previous request).

  • :redirect_limit – Number of redirects allowed (default is 3).

  • :user_agent – The User-Agent header to send.

  • :http_timeout – HTTP open connection/read timeout (in seconds); defaults to DEFAULT_TIMEOUT (30).

  • :ssl_verify_mode – SSL verification mode, defaults to OpenSSL::SSL::VERIFY_NONE

It returns a hash with the following information:

  • :url – The URL of the requested page (may change by permanent redirect)

  • :content – The content of the response (may be nil if cached)

  • :content_type – The HTML page Content-Type header

  • :last_modified – Last modified cache control header (may be nil)

  • :etag – ETag cache control header (may be nil)

  • :encoding – Document encoding for the page

If the page has not been modified from the last request, the content is nil.

Raises HTTPError if an error prevents it from reading the page.



110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# File 'lib/scraper/reader.rb', line 110

# :call-seq:
#   read_page(url, options?) => page
#
# Reads a Web page and returns
# <tt>Page[url, content, encoding, last_modified, etag]</tt>.
#
# +url+ is a URI object or a string that URI.parse accepts; only http and
# https are allowed. Recognised options:
#
# * <tt>:last_modified</tt> -- sent as If-Modified-Since.
# * <tt>:etag</tt> -- sent as If-None-Match.
# * <tt>:redirect_limit</tt> -- redirects allowed (default REDIRECT_LIMIT).
# * <tt>:user_agent</tt> -- User-Agent header to send.
# * <tt>:http_timeout</tt> (or <tt>:timeout</tt>) -- open/read timeout in
#   seconds (default DEFAULT_TIMEOUT).
# * <tt>:ssl_verify_mode</tt> -- defaults to OpenSSL::SSL::VERIFY_NONE.
# * <tt>:source_url</tt> -- URL to report in the result; set internally
#   while following non-permanent redirects.
#
# On a 304 response the content and encoding are nil and the validators
# passed in are returned unchanged.
#
# Raises HTTPRedirectLimitError, HTTPInvalidURLError, HTTPTimeoutError,
# HTTPNotFoundError, HTTPNoAccessError or HTTPUnspecifiedError.
def read_page(url, options = nil)
  options ||= {}
  redirect_limit = options[:redirect_limit] || REDIRECT_LIMIT
  # <= so that a negative limit cannot slip past the guard.
  raise HTTPRedirectLimitError if redirect_limit <= 0
  if url.is_a?(URI)
    uri = url
  else
    begin
      uri = URI.parse(url)
    rescue StandardError=>error
      raise HTTPInvalidURLError.new(error)
    end
  end
  raise HTTPInvalidURLError unless uri.scheme =~ /\Ahttps?\z/
  begin
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = (uri.scheme == "https")
    # NOTE(review): VERIFY_NONE is the documented default, kept for
    # compatibility; it leaves the peer certificate unchecked, so callers
    # that care should pass :ssl_verify_mode explicitly.
    http.verify_mode = options[:ssl_verify_mode] || OpenSSL::SSL::VERIFY_NONE
    http.close_on_empty_response = true
    http.open_timeout = http.read_timeout =
      options[:http_timeout] || options[:timeout] || DEFAULT_TIMEOUT
    # Build the request path without touching uri; an empty path means "/".
    path = uri.path.to_s
    path = "/" if path.empty?
    path = "#{path}?#{uri.query}" if uri.query
    # TODO: Specify which content types are accepted.
    # TODO: GZip support.
    headers = {}
    headers["User-Agent"] = options[:user_agent] if options[:user_agent]
    # Validators from a previous response go into the conditional request
    # headers; Last-Modified/ETag themselves are response headers.
    headers["If-Modified-Since"] = options[:last_modified] if options[:last_modified]
    headers["If-None-Match"] = options[:etag] if options[:etag]
    response = http.request_get(path, headers)
    # TODO: Ignore content types that do not map to HTML.
  rescue Timeout::Error=>error
    raise HTTPTimeoutError.new(error)
  rescue StandardError=>error
    raise HTTPUnspecifiedError.new(error)
  end
  case response
  when Net::HTTPSuccess
    content_type = response["Content-Type"]
    # Capture the bare charset token, without quotes or a trailing ";".
    encoding = content_type && content_type[/charset=["']?([^\s;"']+)/i, 1]
    return Page[(options[:source_url] || uri), response.body, encoding,
                response["Last-Modified"], response["ETag"]]
  when Net::HTTPNotModified
    return Page[(options[:source_url] || uri), nil, nil,
                options[:last_modified], options[:etag]]
  when Net::HTTPRedirection
    target = begin
      uri.merge(response["location"])
    rescue StandardError
      nil # the recursive call reports an unusable Location as HTTPInvalidURLError
    end
    # Permanent (301): the new URL takes effect. Otherwise the URL first
    # requested is still the one reported to the caller.
    source_url = if response.is_a?(Net::HTTPMovedPermanently)
      nil
    else
      options[:source_url] || uri
    end
    # Carry every caller option (user agent, timeouts, SSL mode,
    # validators) across the redirect.
    return read_page(target, options.merge(:redirect_limit=>redirect_limit-1,
                                           :source_url=>source_url))
  when Net::HTTPNotFound
    raise HTTPNotFoundError
  when Net::HTTPUnauthorized, Net::HTTPForbidden
    raise HTTPNoAccessError
  when Net::HTTPRequestTimeOut
    raise HTTPTimeoutError
  else
    raise HTTPUnspecifiedError
  end
end