Class: RWGet::Fetch

Inherits:
Object
  • Object
show all
Defined in:
lib/rwget/fetch.rb

Constant Summary collapse

DEFAULT_TIMEOUT =
30
DEFAULT_REDIRECTS =
30

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Fetch

Returns a new instance of Fetch.



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# File 'lib/rwget/fetch.rb', line 10

# Builds the fetcher: an empty per-user-agent robots cache and a single
# Curl::Easy handle configured from +options+.
#
# @param options [Hash] optional settings read by this constructor:
#   :connect_timeout and :timeout (each falls back to DEFAULT_TIMEOUT),
#   :max_redirect (falls back to DEFAULT_REDIRECTS), :http_proxy, and
#   :proxy_user / :proxy_password (consulted only when :http_proxy is set)
def initialize(options = {})
  @robots = {}
  @curl = Curl::Easy.new

  @curl.tap do |easy|
    easy.connect_timeout = options[:connect_timeout] || DEFAULT_TIMEOUT
    easy.timeout = options[:timeout] || DEFAULT_TIMEOUT
    # NOTE: the option key is singular (:max_redirect), unlike the curl setter.
    easy.max_redirects = options[:max_redirect] || DEFAULT_REDIRECTS
    easy.follow_location = true

    proxy = options[:http_proxy]
    if proxy
      easy.proxy_url = proxy
      # Credentials are only applied when a proxy user was supplied.
      proxy_user = options[:proxy_user]
      easy.proxypwd = "#{proxy_user}:#{options[:proxy_password]}" if proxy_user
    end
  end

  puts "timeout: #{@curl.timeout}"
end

Instance Method Details

#fetch(uri, user_agent) ⇒ Object



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/rwget/fetch.rb', line 26

# Retrieves +uri+ with the shared curl handle, honouring robots.txt for the
# given user agent, and stores the response body in a temporary file.
#
# @param uri [#to_s] the location to retrieve; also passed as-is to the
#   robots check
# @param user_agent [String] value sent as the User-Agent header and used as
#   the key of the per-agent robots cache
# @return [Array, nil] a two-element array of the last effective URL reported
#   by curl and an open Tempfile positioned at the start of the body; nil when
#   the robots check disallows the URI or when retrieval fails with a
#   StandardError (the failure is reported on STDERR)
def fetch(uri, user_agent)
  @robots[user_agent] ||= Robots.new(user_agent)
  unless @robots[user_agent].allowed?(uri)
    puts "disallowed by robots.txt"
    return nil
  end

  @curl.headers["User-Agent"] = user_agent
  @curl.url = uri.to_s
  @curl.perform
  effective_url = @curl.last_effective_url

  tmp = Tempfile.new("curl")
  begin
    # `write` rather than `print`: `print` would append `$\` when it is set.
    tmp.write(@curl.body_str)
    tmp.flush
    tmp.rewind # hand the caller a file ready to be read from the beginning
  rescue StandardError
    tmp.close! # do not leave a half-written tempfile behind
    raise
  end

  [effective_url, tmp]
rescue StandardError => e
  # Only StandardError is rescued so that Interrupt/SystemExit still propagate.
  STDERR.puts "#{uri} not retrieved: #{e.message}"
  nil
end