Class: RDaneel

Inherits:
Object
  • Object
show all
Includes:
EM::Deferrable
Defined in:
lib/rdaneel.rb

Constant Summary collapse

DEFAULT_OPTIONS =
{:head => {'user-agent' => 'RDaneel'}}

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(uri, options = {}) ⇒ RDaneel

Returns a new instance of RDaneel.



22
23
24
25
26
27
28
# File 'lib/rdaneel.rb', line 22

# Builds a fetcher for +uri+.
#
# @param uri [Addressable::URI, Object] kept as-is when it already is an
#   Addressable::URI (the very same object is stored, so the path
#   normalization below is visible to the caller); anything else is handed
#   to Addressable::URI.parse
# @param options [Hash] only :verbose is read here
def initialize(uri, options = {})
  @uri =
    if uri.kind_of?(Addressable::URI)
      uri
    else
      Addressable::URI.parse(uri)
    end

  # A missing or empty path is normalized to the root path.
  current_path = @uri.path
  @uri.path = "/" if current_path.nil? || current_path == ""

  @verbose = options[:verbose]
  # @hash is only assigned when verbose mode is on.
  @hash = @uri.hash if @verbose
  @redirects = []
end

Instance Attribute Details

#error ⇒ Object (readonly)

Returns the value of attribute error.



20
21
22
# File 'lib/rdaneel.rb', line 20

# Reader for @error, which #get assigns (a String message) right before it
# signals a failure.
#
# @return [Object] the current value of @error
def error
  @error
end

#http_client ⇒ Object (readonly)

Returns the value of attribute http_client.



20
21
22
# File 'lib/rdaneel.rb', line 20

# Reader for @http_client, which #get assigns whenever it reports an outcome:
# the finished request on success, and either the failing request or a blank
# EM::HttpClient on failure.
#
# @return [Object] the current value of @http_client
def http_client
  @http_client
end

#redirects ⇒ Object (readonly)

Returns the value of attribute redirects.



20
21
22
# File 'lib/rdaneel.rb', line 20

# Reader for @redirects. The constructor sets it to an empty Array and #get
# appends the string form of every URI that answered with a redirect.
#
# @return [Object] the current value of @redirects
def redirects
  @redirects
end

#uri ⇒ Object

Returns the value of attribute uri.



19
20
21
# File 'lib/rdaneel.rb', line 19

# Reader for @uri. The constructor stores the target URI here, and #get
# replaces it with the URI that was finally fetched when a request succeeds
# after redirects.
#
# @return [Object] the current value of @uri
def uri
  @uri
end

Class Method Details

.robots_cache ⇒ Object



14
15
16
# File 'lib/rdaneel.rb', line 14

# Class-level reader for the robots.txt cache (listed under the class methods
# of RDaneel; the instance-level #robots_cache delegates here).
#
# @return [Object] the current value of @robots_cache
def robots_cache
  @robots_cache
end

.robots_cache=(c) ⇒ Object



10
11
12
# File 'lib/rdaneel.rb', line 10

# Class-level writer for the robots.txt cache (listed under the class methods
# of RDaneel).
#
# @param c [Object] the cache to use; #get reads it with +[]+ and writes it
#   with +[]=+ using the robots.txt URL string as key, and skips caching
#   when it is nil or false
def robots_cache=(c)
  @robots_cache = c
end

Instance Method Details

#get(opts = {}) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/rdaneel.rb', line 30

# Fetches the target URI, checking the host's robots.txt first and following
# redirects itself.
#
# The outcome is reported through succeed(self) / fail(self); before either
# call @http_client is set, and on failure @error holds the message. Every
# redirected URI is appended (as a String) to @redirects, and on success
# after redirects @uri is replaced by the URI that was finally fetched.
#
# robots.txt handling: a cached, truthy entry in robots_cache is used when
# present; otherwise robots.txt is requested (following up to 2 redirects).
# A non-successful or failed robots.txt request is treated as an empty file,
# and the result is stored in robots_cache when a cache is configured.
#
# @param opts [Hash] merged over DEFAULT_OPTIONS. :redirects (converted with
#   to_i) is the maximum number of redirects to follow and is removed before
#   the remaining options are passed to the page request. The merge is
#   shallow, so a caller-supplied :head replaces the default headers
#   (including 'user-agent') as a whole.
# @return [Object] not meaningful; results arrive via succeed / fail
def get(opts = {})
  current_uri = @uri
  options = DEFAULT_OPTIONS.merge(opts)
  max_redirects = options.delete(:redirects).to_i
  useragent = options[:head]['user-agent']

  # Declared up front so the redirect handler below can close over it.
  _get = nil

  # Records the failure state, logs it and fails the deferrable.
  _give_up = lambda { |client, message, *log_args|
    @http_client = client
    @error = message
    verbose(*log_args)
    fail(self)
  }

  # Handles a completed page request: success, redirect or anything else.
  _handle_uri_callback = lambda { |h|
    if success?(h)
      @uri = current_uri if current_uri != @uri
      @http_client = h
      verbose("Succeded fetching: #{current_uri}", h, :status, :response)
      succeed(self)
    elsif redirected?(h)
      if @redirects.size >= max_redirects
        message = "Exceeded maximum number of redirects: #{max_redirects}"
        _give_up.call(h, message, message, h, :status, :response)
      else
        @redirects << current_uri.to_s
        begin
          # Resolved inside the rescue so an unusable redirect target fails
          # the deferrable instead of raising out of the event-loop callback.
          current_uri = redirect_url(h, current_uri)
          verbose("Redirected to: #{current_uri.to_s} from: #{@redirects[-1]}", h, :status, :response)
          if @redirects.include?(current_uri.to_s)
            message = "Infinite redirect detected for: #{current_uri.to_s}"
            _give_up.call(h, message, message, h, :status, :response)
          else
            _get.call
          end
        rescue StandardError
          message = "Error trying to follow a redirect #{current_uri.to_s}: #{h.response_header.location}"
          _give_up.call(h, message, message, h, :status, :response)
        end
      end
    else
      # other error
      message = "Not success neither redirect"
      _give_up.call(h, message, message, h, :status, :response)
    end
  }

  # Starts the request for current_uri; shared by every robots.txt outcome so
  # that all of them get the same error handling.
  _fetch_page = lambda {
    begin
      h = EM::HttpRequest.new(current_uri).get(options)
      verbose("Started fetching: #{current_uri}", h, :request)
      h.callback(&_handle_uri_callback)
      h.errback {
        message = error_message(h)
        _give_up.call(h, message, "#{message} for: #{current_uri}", h, :status, :response)
      }
    rescue StandardError => se
      message = "#{se.message}\n#{se.backtrace.inspect}"
      _give_up.call(EM::HttpClient.new(""), message,
                    "For: #{current_uri} something went wrong: #{message}")
    end
  }

  # Fetches the page when robots.txt allows this user agent, fails otherwise.
  _fetch_if_allowed = lambda { |robots_file, robots_url|
    if robots_allowed?(robots_file, useragent, robots_url, current_uri)
      verbose("Robots identified by user agent: #{useragent} are allowed to access: #{current_uri}")
      _fetch_page.call
    else
      message = "Robots are not allowed"
      _give_up.call(EM::HttpClient.new(""), message,
                    "#{message} to access: #{current_uri} when identified by user agent: #{useragent}")
    end
  }

  # One robots.txt check plus page fetch for whatever current_uri is now.
  _get = lambda {
    robots_url = robots_txt_url(current_uri)
    if robots_cache && (robots_file = robots_cache[robots_url.to_s])
      verbose("Found cached robots.txt:\n#{robots_file} for: #{current_uri}")
      _fetch_if_allowed.call(robots_file, robots_url)
    else
      robots = EM::HttpRequest.new(robots_url).get(:redirects => 2) # get the robots.txt following redirects
      verbose("Started fetching robots.txt from: #{robots_url} for: #{current_uri}", robots, :request)
      robots.callback {
        if success?(robots)
          fetched_file = robots.response
          verbose("Found robots.txt at #{robots_url}:\n#{fetched_file}", robots, :status, :response)
        else
          fetched_file = ''
          verbose("Didn't find robots.txt at #{robots_url}", robots, :status, :response)
        end
        # Stored under the same key the lookup above reads.
        robots_cache[robots_url.to_s] = fetched_file if robots_cache
        _fetch_if_allowed.call(fetched_file, robots_url)
      }
      robots.errback {
        verbose("Failed to fetch robots.txt: from: #{robots_url} for: #{current_uri}", robots, :status, :response)
        robots_cache[robots_url.to_s] = "" if robots_cache
        _fetch_page.call
      }
    end
  }

  _get.call
end

#robots_cache ⇒ Object



162
163
164
# File 'lib/rdaneel.rb', line 162

# Instance-level shortcut to the cache held by the class, so instance code
# can call robots_cache without a receiver.
#
# @return [Object] whatever self.class.robots_cache returns
def robots_cache
  self.class.robots_cache
end