Class: Sledgehammer::CrawlWorker

Inherits:
Object
  • Object
show all
Includes:
Sidekiq::Worker
Defined in:
lib/sledgehammer/workers/crawl_worker.rb

Constant Summary collapse

# Case-insensitive pattern for e-mail-like tokens. The negative lookahead keeps
# the label that follows the last matched dot from starting with jpg, gif or png
# (presumably so image file names such as "logo@2x.png" are not reported - confirm).
MAIL_REGEX =
/[A-Z0-9._%+-]+@[A-Z0-9.-]+\.(?!jpg|gif|png)[A-Z0-9]+/i
# Captures (group 1) the double-quoted href value of an <a> tag when that value
# starts with "http" or "/". Matching is case-sensitive (no /i flag).
URL_REGEX =
/<a\s+(?:[^>]*?\s+)?href="((?:http|\/)[^"]+)"/
# Defaults that #perform copies into its options before merging the caller's overrides.
# #perform returns immediately when :depth equals :depth_limit.
# NOTE(review): :queue is not read by any code visible here - confirm its consumer.
DEFAULT_OPTIONS =
{ depth: 0, depth_limit: 1, queue: 'default' }

Instance Method Summary collapse

Instance Method Details

#after_queue(urls) ⇒ Object



21
22
23
# File 'lib/sledgehammer/workers/crawl_worker.rb', line 21

# Hook that #perform calls once the request queue has been run.
# Does nothing by default; override it in the application to react to a
# finished crawl pass.
#
# @param urls [Array<String>] the URLs handled by the current #perform call
# @return [nil]
def after_queue(urls)
  # Deliberately a no-op: the default worker has nothing to do here.
  nil
end

#before_queue(urls) ⇒ Object

Callbacks to override in the application



10
11
12
# File 'lib/sledgehammer/workers/crawl_worker.rb', line 10

# Hook that #perform calls before any URL is queued.
# Does nothing by default; override it in the application to prepare for a
# crawl pass.
#
# @param urls [Array<String>] the URLs about to be queued
# @return [nil]
def before_queue(urls)
  # Deliberately a no-op: the default worker has nothing to do here.
  nil
end

#on_complete(response) ⇒ Object



25
26
27
28
29
30
31
32
# File 'lib/sledgehammer/workers/crawl_worker.rb', line 25

# Completion callback registered on every request built by #queue.
#
# Looks up (or creates) the page record for the requested URL. A page that is
# already marked completed is left untouched; otherwise the response is handed
# to parse_emails and parse_urls (both defined elsewhere) and the page is then
# flagged as completed.
#
# @param response [Object] finished response; must respond to `request.url`
# @return [Object, nil] nil when the page was already completed, otherwise
#   whatever `update_attributes` returns
def on_complete(response)
  page = find_or_create_page!(response.request.url)
  return if page.completed?

  parse_emails(response, page)
  parse_urls(response)
  page.update_attributes(completed: true)
end

#on_queue(url) ⇒ Object

Stops the URL from being added to the queue if it returns a falsy value (false or nil).



17
18
19
# File 'lib/sledgehammer/workers/crawl_worker.rb', line 17

# Per-URL filter consulted by #queue before a request is built.
# A falsy return value keeps the URL out of the queue. The default accepts
# every URL; override it in the application to veto specific ones.
#
# @param url [String] candidate URL
# @return [Boolean] always true in this default implementation
def on_queue(url)
  # Accept everything unless the application overrides this hook.
  true
end

#perform(urls, opts = {}) ⇒ Object

There shouldn’t be any need to override the methods below



38
39
40
41
42
43
44
45
46
47
48
# File 'lib/sledgehammer/workers/crawl_worker.rb', line 38

# Worker entry point: queues every URL, runs the request queue, and fires the
# before_queue / after_queue hooks around that work.
#
# The effective options are DEFAULT_OPTIONS overlaid with +opts+ and stored in
# @options with indifferent (string or symbol) key access.
#
# @param urls [Array<String>, String] URL(s) to crawl; a single URL is wrapped
#   in an array, an Array is used as-is
# @param opts [Hash] overrides for DEFAULT_OPTIONS
# @return [Object, nil] nil when the depth limit has been reached, otherwise
#   the return value of after_queue
def perform(urls, opts = {})
  @options = HashWithIndifferentAccess.new(DEFAULT_OPTIONS)
  @options.merge!(opts)

  depth = @options[:depth]
  limit = @options[:depth_limit]
  # Stop when the limit is hit exactly (original behaviour, also for
  # non-numeric values) ...
  return if depth == limit
  # ... and also when a numeric depth has already overshot the limit, which
  # plain equality would never catch and would let the crawl run unbounded.
  return if depth.is_a?(Numeric) && limit.is_a?(Numeric) && depth > limit

  # Array() returns an Array argument unchanged and wraps a lone URL string.
  urls = Array(urls)

  before_queue(urls)
  urls.each { |site| queue(site) }
  run_queue
  after_queue(urls)
end

#queue(url) ⇒ Object



50
51
52
53
54
55
56
57
# File 'lib/sledgehammer/workers/crawl_worker.rb', line 50

# Adds a request for +url+ to the hydra returned by Typhoeus::Hydra.hydra.
#
# The URL is skipped when the on_queue hook returns a falsy value or when
# valid_url? (defined elsewhere) rejects it; on_queue is consulted first and
# valid_url? is only called if on_queue accepted the URL. The request's
# completion callback forwards the response to #on_complete.
#
# @param url [String] URL to fetch
# @return [Object, nil] nil when the URL was skipped, otherwise the return
#   value of the hydra's `queue` call
def queue(url)
  return unless on_queue(url)
  return unless valid_url?(url)

  crawl_request = Typhoeus::Request.new(url)
  crawl_request.on_complete do |response|
    on_complete(response)
  end

  Typhoeus::Hydra.hydra.queue(crawl_request)
end

#run_queue ⇒ Object



59
60
61
# File 'lib/sledgehammer/workers/crawl_worker.rb', line 59

# Runs the hydra returned by Typhoeus::Hydra.hydra, i.e. the same one that
# #queue adds requests to.
#
# @return [Object] whatever the hydra's `run` call returns
def run_queue
  hydra = Typhoeus::Hydra.hydra
  hydra.run
end