Class: RWGet::Fetch
- Inherits:
-
Object
- Object
- RWGet::Fetch
- Defined in:
- lib/rwget/fetch.rb
Constant Summary collapse
- DEFAULT_TIMEOUT =
30
- DEFAULT_REDIRECTS =
30
Instance Method Summary collapse
- #fetch(uri, user_agent) ⇒ Object
-
#initialize(options = {}) ⇒ Fetch
constructor
A new instance of Fetch.
Constructor Details
#initialize(options = {}) ⇒ Fetch
Returns a new instance of Fetch.
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
# File 'lib/rwget/fetch.rb', line 10

# Sets up an empty robots cache and a Curl::Easy handle configured from
# +options+.
#
# @param options [Hash] optional settings; any missing key uses the
#   fallback noted below
# @option options [Object] :connect_timeout connection timeout handed to
#   curl (falls back to DEFAULT_TIMEOUT)
# @option options [Object] :timeout overall transfer timeout handed to curl
#   (falls back to DEFAULT_TIMEOUT)
# @option options [Object] :max_redirect redirect limit handed to curl
#   (falls back to DEFAULT_REDIRECTS); note the key is singular
# @option options [Object] :http_proxy proxy URL; proxy settings are applied
#   only when this key is present
# @option options [Object] :proxy_user proxy user name; only read when
#   :http_proxy is set
# @option options [Object] :proxy_password proxy password; only read when
#   :proxy_user is set
def initialize(options = {})
  @robots = {}
  @curl = Curl::Easy.new
  @curl.connect_timeout = options[:connect_timeout] || DEFAULT_TIMEOUT
  @curl.timeout = options[:timeout] || DEFAULT_TIMEOUT
  @curl.max_redirects = options[:max_redirect] || DEFAULT_REDIRECTS
  @curl.follow_location = true

  if options[:http_proxy]
    @curl.proxy_url = options[:http_proxy]
    # Credentials are sent as a single "user:password" string.
    if options[:proxy_user]
      @curl.proxypwd = "#{options[:proxy_user]}:#{options[:proxy_password]}"
    end
  end

  # Echo the effective transfer timeout to stdout.
  puts "timeout: #{@curl.timeout}"
end
Instance Method Details
#fetch(uri, user_agent) ⇒ Object
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
# File 'lib/rwget/fetch.rb', line 26

# Retrieves +uri+ through the shared curl handle unless the robots checker
# for +user_agent+ disallows it.
#
# @param uri [#to_s] address to retrieve; passed as-is to the robots check
#   and converted with +to_s+ for curl
# @param user_agent [String] sent as the User-Agent header and used as the
#   key under which the Robots checker is cached
# @return [Array, nil] a two-element array of the last effective URL and a
#   reopened Tempfile holding the response body; +nil+ when the URI is
#   disallowed or when the transfer raises a StandardError
def fetch(uri, user_agent)
  @robots[user_agent] ||= Robots.new(user_agent)
  unless @robots[user_agent].allowed?(uri)
    puts "disallowed by robots.txt"
    return nil
  end

  @curl.headers["User-Agent"] = user_agent
  @curl.url = uri.to_s
  @curl.perform

  tmp = nil
  Tempfile.open("curl") do |file|
    # Store the body byte-for-byte (no newline translation on any platform).
    file.binmode
    file.print(@curl.body_str)
    tmp = file
  end
  # The block form closes the handle; reopen it so the caller can read.
  tmp.open
  [@curl.last_effective_url, tmp]
rescue StandardError => e
  # StandardError only: Interrupt, SystemExit and similar must propagate.
  STDERR.puts "#{uri} not retrieved: #{e.message}"
  nil
end