Class: RDaneel
Constant Summary collapse
- DEFAULT_OPTIONS =
{:head => {'user-agent' => 'RDaneel'}}
Instance Attribute Summary collapse
-
#error ⇒ Object
readonly
Returns the value of attribute error.
-
#http_client ⇒ Object
readonly
Returns the value of attribute http_client.
-
#redirects ⇒ Object
readonly
Returns the value of attribute redirects.
-
#uri ⇒ Object
Returns the value of attribute uri.
Class Method Summary collapse
- .robots_cache ⇒ Object
- .robots_cache=(c) ⇒ Object
Instance Method Summary collapse
- #get(opts = {}) ⇒ Object
-
#initialize(uri, options = {}) ⇒ RDaneel
constructor
A new instance of RDaneel.
- #robots_cache ⇒ Object
Constructor Details
#initialize(uri, options = {}) ⇒ RDaneel
Returns a new instance of RDaneel.
22 23 24 25 26 27 28 |
# File 'lib/rdaneel.rb', line 22 def initialize(uri, options = {}) @uri = uri.kind_of?(Addressable::URI) ? uri : Addressable::URI::parse(uri) @uri.path = "/" if @uri.path.nil? || @uri.path == "" @redirects = [] @verbose = options[:verbose] @hash = @uri.hash if @verbose end |
Instance Attribute Details
#error ⇒ Object (readonly)
Returns the value of attribute error.
20 21 22 |
# File 'lib/rdaneel.rb', line 20 def error @error end |
#http_client ⇒ Object (readonly)
Returns the value of attribute http_client.
20 21 22 |
# File 'lib/rdaneel.rb', line 20 def http_client @http_client end |
#redirects ⇒ Object (readonly)
Returns the value of attribute redirects.
20 21 22 |
# File 'lib/rdaneel.rb', line 20 def redirects @redirects end |
#uri ⇒ Object
Returns the value of attribute uri.
19 20 21 |
# File 'lib/rdaneel.rb', line 19 def uri @uri end |
Class Method Details
.robots_cache ⇒ Object
14 15 16 |
# File 'lib/rdaneel.rb', line 14 def robots_cache @robots_cache end |
.robots_cache=(c) ⇒ Object
10 11 12 |
# File 'lib/rdaneel.rb', line 10 def robots_cache=(c) @robots_cache = c end |
Instance Method Details
#get(opts = {}) ⇒ Object
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
# File 'lib/rdaneel.rb', line 30 def get(opts = {}) current_uri = @uri options = DEFAULT_OPTIONS.merge(opts) max_redirects = options.delete(:redirects).to_i useragent = options[:head]['user-agent'] _get = lambda {} _handle_uri_callback = lambda {|h| if success?(h) @uri = current_uri if current_uri != @uri @http_client = h verbose("Succeded fetching: #{current_uri}", h, :status, :response) succeed(self) elsif redirected?(h) if @redirects.size >= max_redirects @http_client = h @error = "Exceeded maximum number of redirects: #{max_redirects}" verbose(@error, h, :status, :response) fail(self) return end @redirects << current_uri.to_s current_uri = redirect_url(h, current_uri) begin verbose("Redirected to: #{current_uri.to_s} from: #{@redirects[-1]}", h, :status, :response) if @redirects.include?(current_uri.to_s) @http_client = h @error = "Infinite redirect detected for: #{current_uri.to_s}" verbose(@error, h, :status, :response) fail(self) return end _get.call rescue StandardError => se @http_client = h @error = "Error trying to follow a redirect #{current_uri.to_s}: #{h.response_header.location}" verbose(@error, h, :status, :response) fail(self) end else # other error @http_client = h @error = "Not success neither redirect" verbose(@error, h, :status, :response) fail(self) end } _get = lambda { robots_url = robots_txt_url(current_uri) if robots_cache && robots_file = robots_cache[robots_url.to_s] verbose("Found cached robots.txt:\n#{robots_cache[robots_url.to_s]} for: #{current_uri}") if robots_allowed?(robots_file, useragent, robots_url, current_uri) verbose("Robots identified by user agent: #{useragent} are allowed to access: #{current_uri}") begin h = EM::HttpRequest.new(current_uri).get(options) verbose("Started fetching: #{current_uri}",h,:request) h.callback(&_handle_uri_callback) h.errback { @http_client = h @error = error_message(h) verbose("#{@error} for: #{current_uri}",h,:status,:response) fail(self) } rescue StandardError => se @http_client = EM::HttpClient.new("") @error = 
"#{se.message}\n#{se.backtrace.inspect}" verbose("For: #{current_uri} something went wrong: #{@error}") fail(self) end else @http_client = EM::HttpClient.new("") @error = "Robots are not allowed" verbose("#{@error} to access: #{current_uri} when identified by user agent: #{useragent}") fail(self) end else robots_url = robots_txt_url(current_uri) robots = EM::HttpRequest.new(robots_url).get(:redirects => 2) # get the robots.txt following redirects verbose("Started fetching robots.txt from: #{robots_url} for: #{current_uri}",robots,:request) robots.callback { if success?(robots) robots_file = robots.response verbose("Found robots.txt at #{robots_url}:\n#{robots_file}", robots, :status, :response) else robots_file = '' verbose("Didn't find robots.txt at #{robots_url}", robots, :status, :response) end robots_cache[robots_txt_url(robots_url).to_s] = robots_file if robots_cache if robots_allowed?(robots_file, useragent, robots_url, current_uri) verbose("Robots identified by user agent: #{useragent} are allowed to access: #{current_uri}") begin h = EM::HttpRequest.new(current_uri).get(options) verbose("Started fetching: #{current_uri}",h,:request) h.callback(&_handle_uri_callback) h.errback { @http_client = h @error = error_message(h) verbose("#{@error} for: #{current_uri}", h, :status, :response) fail(self) } rescue StandardError => se @http_client = EM::HttpClient.new("") @error = "#{se.message}\n#{se.backtrace.inspect}" verbose("For: #{current_uri} something went wrong: #{@error}") fail(self) end else @http_client = EM::HttpClient.new("") @error = "Robots are not allowed" verbose("#{@error} to access: #{current_uri} when identified by user agent: #{useragent}") fail(self) end } robots.errback { verbose("Failed to fetch robots.txt: from: #{robots_url} for: #{current_uri}", robots, :status, :response) robots_cache[robots_url.to_s] = "" if robots_cache h = EM::HttpRequest.new(current_uri).get(options) verbose("Started fetching: #{current_uri}",h,:request) h.callback(&_handle_uri_callback) h.errback { 
@http_client = h @error = error_message(h) verbose("#{@error} for: #{current_uri}", h, :status, :response) fail(self) } } end } _get.call end |
#robots_cache ⇒ Object
162 163 164 |
# File 'lib/rdaneel.rb', line 162 def robots_cache self.class.robots_cache end |