Class: Sledgehammer::CrawlWorker
- Inherits:
-
Object
- Object
- Sledgehammer::CrawlWorker
- Includes:
- Sidekiq::Worker
- Defined in:
- lib/sledgehammer/workers/crawl_worker.rb
Constant Summary collapse
# Case-insensitive pattern for e-mail-like tokens (local@domain.suffix). The
# negative lookahead forbids the part after the matched final dot from
# beginning with "jpg", "gif" or "png".
- MAIL_REGEX =
/[A-Z0-9._%+-]+@[A-Z0-9.-]+\.(?!jpg|gif|png)[A-Z0-9]+/i
# Case-sensitive pattern for an <a ...> tag; capture group 1 is the
# double-quoted href value, which must start with "http" or "/".
- URL_REGEX =
/<a\s+(?:[^>]*?\s+)?href="((?:http|\/)[^"]+)"/
# Defaults that #perform copies and then merges the caller's options over.
# #perform returns without crawling when :depth equals :depth_limit.
- DEFAULT_OPTIONS =
{ depth: 0, depth_limit: 1, queue: 'default' }
Instance Method Summary collapse
- #after_queue(urls) ⇒ Object
-
#before_queue(urls) ⇒ Object
Callbacks to override in the application.
- #on_complete(response) ⇒ Object
-
#on_queue(url) ⇒ Object
Prevents the URL from being added to the queue when this method returns false.
-
#perform(urls, opts = {}) ⇒ Object
There shouldn’t be any need to override the methods below.
- #queue(url) ⇒ Object
- #run_queue ⇒ Object
Instance Method Details
#after_queue(urls) ⇒ Object
21 22 23 |
# File 'lib/sledgehammer/workers/crawl_worker.rb', line 21

# Hook that #perform calls, with the same collection it received, once the
# request queue has been run. Intentionally empty here (a stub).
#
# @param urls [Object] the collection of URLs that was passed to #perform
# @return [nil]
def after_queue(urls); end
#before_queue(urls) ⇒ Object
Callbacks to override in the application
10 11 12 |
# File 'lib/sledgehammer/workers/crawl_worker.rb', line 10

# Hook that #perform calls, with the collection it received, just before any
# URL is queued. Intentionally empty here (a stub).
#
# @param urls [Object] the collection of URLs that was passed to #perform
# @return [nil]
def before_queue(urls); end
#on_complete(response) ⇒ Object
25 26 27 28 29 30 31 32 |
# File 'lib/sledgehammer/workers/crawl_worker.rb', line 25

# Completion callback for a request (#queue registers it on every request).
# Fetches the page record for the requested URL via find_or_create_page! and,
# unless that page already reports completed?, hands the response to
# parse_emails and parse_urls and then marks the page as completed.
#
# @param response [Object] must respond to +request.url+
#   (presumably a Typhoeus response -- confirm against the caller)
# @return [Object, nil] nil when the page was already completed, otherwise
#   whatever +update_attributes+ returns
def on_complete(response)
  page = find_or_create_page!(response.request.url)
  return if page.completed?

  parse_emails(response, page)
  parse_urls(response)
  page.update_attributes(completed: true)
end
#on_queue(url) ⇒ Object
Prevents the URL from being added to the queue when this method returns false
17 18 19 |
# File 'lib/sledgehammer/workers/crawl_worker.rb', line 17

# Filter hook consulted by #queue before a request is built: a falsy result
# keeps the URL out of the queue. This default accepts every URL.
#
# @param url [Object] the candidate URL
# @return [Boolean] always true in this default implementation
def on_queue(url)
  true
end
#perform(urls, opts = {}) ⇒ Object
There shouldn’t be any need to override the methods below
38 39 40 41 42 43 44 45 46 47 48 |
# File 'lib/sledgehammer/workers/crawl_worker.rb', line 38

# Worker entry point: queues a request for every given URL, runs the queue and
# fires the before/after hooks around it.
#
# The options are DEFAULT_OPTIONS overlaid with +opts+ and stored in @options
# as a HashWithIndifferentAccess, so string- and symbol-keyed options are
# treated alike.
#
# Nothing is crawled once the depth limit is reached. The limit also counts as
# reached when :depth has already overshot :depth_limit; an equality-only
# check would never fire for such a job.
#
# @param urls [#each] URLs to crawl
# @param opts [Hash] overrides for DEFAULT_OPTIONS (:depth, :depth_limit, :queue)
# @return [Object, nil] nil when the depth limit stops the crawl, otherwise the
#   return value of #after_queue
def perform(urls, opts = {})
  @options = HashWithIndifferentAccess.new(DEFAULT_OPTIONS)
  @options.merge!(opts)

  depth = @options[:depth]
  limit = @options[:depth_limit]
  # `==` keeps the original stop condition (including nil == nil); the second
  # clause additionally stops jobs whose depth is already beyond the limit.
  return if depth == limit || (depth && limit && depth > limit)

  before_queue(urls)
  urls.each { |site| queue(site) }
  run_queue
  after_queue(urls)
end
#queue(url) ⇒ Object
50 51 52 53 54 55 56 57 |
# File 'lib/sledgehammer/workers/crawl_worker.rb', line 50

# Adds one URL to the shared Typhoeus hydra, provided #on_queue accepts it and
# valid_url? approves it (checked in that order). The request is wired so
# that its completion calls #on_complete with the response.
#
# @param url [Object] the URL to request
# @return [Object, nil] nil when the URL is rejected, otherwise whatever
#   Typhoeus::Hydra#queue returns
def queue(url)
  return if !on_queue(url) || !valid_url?(url)

  req = Typhoeus::Request.new(url)
  req.on_complete do |resp|
    on_complete(resp)
  end
  Typhoeus::Hydra.hydra.queue(req)
end
#run_queue ⇒ Object
59 60 61 |
# File 'lib/sledgehammer/workers/crawl_worker.rb', line 59

# Runs the shared Typhoeus hydra, i.e. the same Typhoeus::Hydra.hydra object
# that #queue adds its requests to.
#
# @return [Object] whatever Typhoeus::Hydra#run returns
def run_queue
  hydra = Typhoeus::Hydra.hydra
  hydra.run
end