Class: Polipus::HTTP

Inherits:
Object
  • Object
show all
Defined in:
lib/polipus/http.rb

Constant Summary collapse

REDIRECT_LIMIT =

Maximum number of redirects to follow on each get_response

5
# Exception classes that #fetch_pages rescues and reports as a single
# error Page instead of letting them propagate to the caller.
# Frozen so the list cannot be mutated by accident at runtime.
RESCUABLE_ERRORS = [
  EOFError,
  Errno::ECONNREFUSED,
  Errno::ECONNRESET,
  Errno::EHOSTUNREACH,
  Errno::EINVAL,
  Errno::EPIPE,
  Errno::ETIMEDOUT,
  Net::HTTPBadResponse,
  Net::HTTPHeaderSyntaxError,
  Net::ProtocolError,
  SocketError,
  Timeout::Error,
  Zlib::DataError,
  Zlib::GzipFile::Error
].freeze

Instance Method Summary collapse

Constructor Details

#initialize(opts = {}) ⇒ HTTP

Returns a new instance of HTTP.



28
29
30
31
32
# File 'lib/polipus/http.rb', line 28

# Build a new HTTP client.
#
# @param opts [Hash] client options; the accessors of this class read keys
#   such as :user_agent, :redirect_limit, :open_timeout, :read_timeout,
#   :accept_cookies, :cookie_jar and the :proxy_* settings from it.
#   The hash is stored as-is (not copied).
def initialize(opts = {})
  @opts = opts
  # Both registries start empty; they are filled elsewhere in the class.
  @connections_hits = {}
  @connections = {}
end

Instance Method Details

#accept_cookies? ⇒ Boolean

Does this HTTP client accept cookies from the server?

Returns:

  • (Boolean)


145
146
147
# File 'lib/polipus/http.rb', line 145

# Does this HTTP client accept cookies from the server?
#
# @return [Object, nil] the raw value stored under :accept_cookies in the
#   options; callers should treat it as truthy/falsy rather than a strict
#   true/false.
def accept_cookies?
  @opts[:accept_cookies]
end


149
150
151
152
# File 'lib/polipus/http.rb', line 149

# Cookie jar used by this client.
#
# Returns the object already stored under :cookie_jar in the options, or
# creates a ::HTTP::CookieJar on first access and stores it there so later
# calls get the same instance.
#
# @return [Object] the cookie jar held in the options
def cookie_jar
  # `||=` evaluates to the stored jar, so no second lookup is needed.
  @opts[:cookie_jar] ||= ::HTTP::CookieJar.new
end

#fetch_page(url, referer = nil, depth = nil, user_data = nil) ⇒ Object

Fetch a single Page from the response of an HTTP request to url. Just gets the final destination page.



38
39
40
# File 'lib/polipus/http.rb', line 38

# Fetch a single Page for +url+.
#
# Delegates to #fetch_pages with the same arguments and keeps only the last
# element of the returned list (the final destination when several pages
# were produced).
#
# @param url the address to request
# @param referer passed through to #fetch_pages
# @param depth passed through to #fetch_pages
# @param user_data passed through to #fetch_pages
# @return [Object] the last page returned by #fetch_pages
def fetch_page(url, referer = nil, depth = nil, user_data = nil)
  pages = fetch_pages(url, referer, depth, user_data)
  pages.last
end

#fetch_pages(url, referer = nil, depth = nil, user_data = nil) ⇒ Object

Create new Pages from the response of an HTTP request to url, including redirects



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/polipus/http.rb', line 46

# Request +url+ and build one Page for every response that #get yields
# (this includes the intermediate redirect responses).
#
# If one of RESCUABLE_ERRORS is raised, any pages collected so far are
# discarded and a single Page carrying the error is returned instead.
#
# @param url [String, URI] address to request; converted with Kernel#URI
# @param referer value recorded on each page and handed to #get
# @param depth value recorded on each page
# @param user_data attached to each page unless nil
# @return [Array<Page>] the pages built from the responses, or a one-element
#   array holding the error page
def fetch_pages(url, referer = nil, depth = nil, user_data = nil)
  url = URI(url)
  results = []
  get(url, referer) do |response, status, final_url, redirect_target, elapsed|
    handle_compression(response)
    fetched = Page.new(
      final_url,
      body: response.body,
      code: status,
      headers: response.to_hash,
      referer: referer,
      depth: depth,
      redirect_to: redirect_target,
      response_time: elapsed,
      fetched_at: Time.now.to_i
    )
    fetched.user_data = user_data unless user_data.nil?
    results.push(fetched)
  end
  results
rescue *RESCUABLE_ERRORS => error
  if verbose?
    puts error.inspect
    puts error.backtrace
  end
  # Report the failure as a page for the requested URL rather than raising.
  failed = Page.new(url, error: error, referer: referer, depth: depth)
  failed.user_data = user_data unless user_data.nil?
  [failed]
end

#open_timeout ⇒ Object

HTTP open timeout in seconds



139
140
141
# File 'lib/polipus/http.rb', line 139

# HTTP open timeout in seconds.
#
# @return [Object, nil] the value stored under :open_timeout in the options,
#   returned unchanged
def open_timeout
  @opts[:open_timeout]
end

#proxy_host ⇒ Object

The proxy address string



95
96
97
# File 'lib/polipus/http.rb', line 95

# The proxy address.
#
# The :proxy_host option may be a plain value or any object responding to
# #call; a callable is invoked with this client and its result is returned.
#
# @return [Object, nil] the configured or computed proxy host
def proxy_host
  host = @opts[:proxy_host]
  return host.call(self) if host.respond_to?(:call)

  host
end

#proxy_host_port ⇒ Object

Shorthand to get proxy info with a single call. It returns an array of ['addr', port, 'user', 'pass'].



125
126
127
# File 'lib/polipus/http.rb', line 125

# Shorthand to get all proxy info with a single call.
#
# The :proxy_host_port option may be a plain value or any object responding
# to #call; a callable is invoked with this client and its result is returned.
# NOTE(review): the existing docs describe the value as
# [addr, port, user, pass]; this method does not validate that shape.
#
# @return [Object, nil] the configured or computed proxy info
def proxy_host_port
  info = @opts[:proxy_host_port]
  info.respond_to?(:call) ? info.call(self) : info
end

#proxy_pass ⇒ Object

The proxy password



116
117
118
119
# File 'lib/polipus/http.rb', line 116

# The proxy password.
#
# The :proxy_pass option may be a plain value or any object responding to
# #call; a callable is invoked with this client and its result is returned.
#
# @return [Object, nil] the configured or computed proxy password
def proxy_pass
  # return proxy_host_port[3] unless @opts[:proxy_host_port].nil?
  password = @opts[:proxy_pass]
  if password.respond_to?(:call)
    password.call(self)
  else
    password
  end
end

#proxy_port ⇒ Object

The proxy port



102
103
104
# File 'lib/polipus/http.rb', line 102

# The proxy port.
#
# The :proxy_port option may be a plain value or any object responding to
# #call; a callable is invoked with this client and its result is returned.
#
# @return [Object, nil] the configured or computed proxy port
def proxy_port
  port = @opts[:proxy_port]
  if port.respond_to?(:call)
    port.call(self)
  else
    port
  end
end

#proxy_user ⇒ Object

The proxy username



109
110
111
# File 'lib/polipus/http.rb', line 109

# The proxy username.
#
# The :proxy_user option may be a plain value or any object responding to
# #call; a callable is invoked with this client and its result is returned.
#
# @return [Object, nil] the configured or computed proxy username
def proxy_user
  username = @opts[:proxy_user]
  return username.call(self) if username.respond_to?(:call)

  username
end

#read_timeout ⇒ Object

HTTP read timeout in seconds



132
133
134
# File 'lib/polipus/http.rb', line 132

# HTTP read timeout in seconds.
#
# @return [Object, nil] the value stored under :read_timeout in the options,
#   returned unchanged
def read_timeout
  @opts[:read_timeout]
end

#redirect_limit ⇒ Object

The maximum number of redirects to follow



76
77
78
# File 'lib/polipus/http.rb', line 76

# The maximum number of redirects to follow.
#
# @return [Object] the :redirect_limit option when it is truthy, otherwise
#   the class default REDIRECT_LIMIT
def redirect_limit
  @opts[:redirect_limit] || REDIRECT_LIMIT
end

#user_agent ⇒ Object

The user-agent string which will be sent with each request, or nil if no such option is set



84
85
86
87
88
89
90
# File 'lib/polipus/http.rb', line 84

# The user-agent string sent with each request, or nil if no such option
# is set.
#
# When the :user_agent option responds to #sample (e.g. an Array of
# candidates), one entry is drawn per call via #sample; otherwise the option
# value is returned unchanged.
#
# @return [Object, nil] the chosen or configured user agent
def user_agent
  agent = @opts[:user_agent]
  agent.respond_to?(:sample) ? agent.sample : agent
end