Class: Spidey::AbstractSpider

Inherits: Object
Defined in:
lib/spidey/abstract_spider.rb

Constant Summary

DEFAULT_REQUEST_INTERVAL = 3 # seconds


Constructor Details

#initialize(attrs = {}) ⇒ AbstractSpider

Accepts:

request_interval: number of seconds to wait between requests (default: 3)


# File 'lib/spidey/abstract_spider.rb', line 17

def initialize(attrs = {})
  @urls = []
  @handlers = {}
  @results = []
  self.class.start_urls.each { |url| handle url, *self.class.handlers[url] }
  @request_interval = attrs[:request_interval] || DEFAULT_REQUEST_INTERVAL
end
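
A minimal usage sketch (the subclass, URL, and handler name below are illustrative, not part of this class):

class EbaySpider < Spidey::AbstractSpider
  handle "http://www.ebay.com", :process_home

  def process_home(page, default_data = {})
    # parse the fetched page and queue further URLs or record results here
  end
end

spider = EbaySpider.new(request_interval: 1)  # wait 1 second between requests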

Instance Attribute Details

#errors ⇒ Object

Returns the value of attribute errors.

# File 'lib/spidey/abstract_spider.rb', line 6

def errors
  @errors
end

#handlers ⇒ Object

Returns the value of attribute handlers.

# File 'lib/spidey/abstract_spider.rb', line 6

def handlers
  @handlers
end

#request_interval ⇒ Object

Returns the value of attribute request_interval.

# File 'lib/spidey/abstract_spider.rb', line 6

def request_interval
  @request_interval
end

#results ⇒ Object

Returns the value of attribute results.

# File 'lib/spidey/abstract_spider.rb', line 6

def results
  @results
end

#urls ⇒ Object

Returns the value of attribute urls.

# File 'lib/spidey/abstract_spider.rb', line 6

def urls
  @urls
end
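
Taken together, these accessors expose the spider's working state. A sketch of plausible contents mid-crawl (values illustrative, following the EbaySpider example above):

spider.urls      # => ["http://www.ebay.com", ...]                  queued URLs
spider.handlers  # => { "http://www.ebay.com" => [:process_home, {}] }
spider.results   # => records accumulated by handlers
spider.errors    # => [{ url: ..., handler: ..., error: ... }, ...]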

Class Method Details

.handle(url, handler, default_data = {}) ⇒ Object



# File 'lib/spidey/abstract_spider.rb', line 10

def self.handle(url, handler, default_data = {})
  start_urls << url
  handlers[url] = [handler, default_data]
end
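
For example, a registration can include default data that is passed to the handler along with each fetched page (class and names here are illustrative):

class ProductSpider < Spidey::AbstractSpider
  handle "http://example.com/products", :process_index, category: "all"

  def process_index(page, default_data = {})
    # default_data is { category: "all" } for pages queued via this handler
  end
end

The handler is a symbol naming an instance method, which crawl invokes with the fetched page and the registered default data.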

Instance Method Details

#crawl(options = {}) ⇒ Object

Iterates through URLs queued for handling, including any that are added in the course of crawling. Accepts:

max_urls: maximum number of URLs to crawl before returning (optional)


# File 'lib/spidey/abstract_spider.rb', line 27

def crawl(options = {})
  @errors = []
  i = 0
  each_url do |url, handler, default_data|
    break if options[:max_urls] && i >= options[:max_urls]
    begin
      page = agent.get(url)
      Spidey.logger.info "Handling #{url.inspect}"
      send handler, page, default_data
    rescue => ex
      add_error url: url, handler: handler, error: ex
    end
    sleep request_interval if request_interval > 0
    i += 1
  end
end
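
An end-to-end sketch, reusing the illustrative EbaySpider defined above (limits chosen arbitrarily):

spider = EbaySpider.new(request_interval: 2)
spider.crawl(max_urls: 100)  # visit at most 100 queued URLs, then return
spider.results               # data recorded by handlers
spider.errors                # failures captured during the crawl

Because crawl rescues exceptions per URL and records them via add_error, a failing page does not abort the crawl; inspect #errors afterward.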