Module: Spiderman

Extended by:
ActiveSupport::Concern
Defined in:
lib/spiderman.rb,
lib/spiderman/runner.rb,
lib/spiderman/railtie.rb,
lib/spiderman/version.rb

Overview

Turn any class into a crawler by including this module.

Example:

 # Example crawler. Including Spiderman supplies the `crawl` / `process`
 # macros used below (their definitions are not shown on this page).
 class MySpider < ApplicationJob # Yup, you can define this in a job
   queue_as :crawler

   include Spiderman

   # Block run with the response for the given URL.
   # NOTE(review): presumably this registers a start URL that #crawl!
   # iterates over — confirm against the .crawl class method.
   crawl "https://example.com/" do |response|
     response.css('.selector a').each do |a|
       # Hand each link to the :listing handler. #process! enqueues via
       # perform_later when the instance is an ActiveJob::Base, otherwise it
       # calls #perform inline.
       process! a["href"], :listing
     end
   end

   # Handler for URLs processed with :listing.
   process :listing do |response|
     # NOTE(review): this passes the whole css('img') result rather than one
     # URL per image; #process! calls #to_s on it when enqueuing — confirm
     # this is intended.
     process! response.css('img'), :image
     save_the_thing response.css('.some_selector')
   end

   # Handler for URLs processed with :image.
   process :image do |response|
     # Do something with the image file
   end

   # Handler blocks are run with instance_exec (see #perform), so instance
   # methods such as this one are callable from inside them.
   def save_the_thing(thing)
     # logic here for saving the thing
   end
end

Defined Under Namespace

Classes: Railtie, Runner

Constant Summary collapse

VERSION =
"2.0.0"

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.add(clazz) ⇒ Object



118
119
120
# File 'lib/spiderman.rb', line 118

# Appends a crawler to the registry returned by +list+.
#
# No de-duplication is performed: adding the same object twice stores it twice.
#
# @param clazz [Object] the crawler to register (presumably a class that
#   includes Spiderman — confirm against the caller)
# @return [Array] the registry, with +clazz+ appended
def add(clazz)
  list << clazz
end

.find(name) ⇒ Object



114
115
116
# File 'lib/spiderman.rb', line 114

# Looks up a registered crawler by the demodulized, underscored form of its
# class name (e.g. "my_spider" for Crawlers::MySpider).
#
# @param name [String, Symbol] identifier to look for; Symbols are converted
#   with +to_s+ so that +find(:my_spider)+ and +find("my_spider")+ agree
# @return [Object, nil] the first matching entry of +list+, or nil when
#   nothing matches
def find(name)
  wanted = name.to_s
  list.detect do |crawler|
    # Anonymous classes report a nil name; skip them rather than raising.
    crawler.name&.demodulize&.underscore == wanted
  end
end

.list ⇒ Object



105
106
107
# File 'lib/spiderman.rb', line 105

# Registry of crawlers, created empty on first access and memoized in @list.
#
# @return [Array] the same array instance on every call
def list
  @list = [] unless @list
  @list
end

.run(crawler = nil) ⇒ Object



109
110
111
112
# File 'lib/spiderman.rb', line 109

# Runs one crawler, or every registered crawler when none is named, by
# sending +crawl!+ to each.
#
# @param crawler [String, nil] identifier passed to +find+; when nil (or
#   false) every entry of +list+ is run
# @return [Array] the crawlers that were run
# @raise [ArgumentError] if +crawler+ is given but +find+ returns nothing
def run(crawler = nil)
  crawlers =
    if crawler
      found = find(crawler)
      # Fail with a clear message instead of a NoMethodError on nil.
      raise ArgumentError, "Unknown crawler: #{crawler.inspect}" unless found

      [found]
    else
      list
    end

  crawlers.each(&:crawl!)
end

Instance Method Details

#crawl! ⇒ Object



79
80
81
82
83
# File 'lib/spiderman.rb', line 79

# Starts a crawl by handing every URL in +crawler.urls+ to +process!+, with
# no explicit handler name.
#
# @return [Object] whatever +crawler.urls.each+ returns (the URL collection
#   itself when it is an Array)
def crawl!
  crawler.urls.each { |url| process!(url) }
end

#name ⇒ Object



99
100
101
# File 'lib/spiderman.rb', line 99

# Name of this object's class with the +demodulize+ inflection applied
# (i.e. without its enclosing namespace).
#
# @return [String] the demodulized class name
def name
  qualified = self.class.name
  qualified.demodulize
end

#perform(url, with = nil) ⇒ Object



93
94
95
96
97
# File 'lib/spiderman.rb', line 93

# Fetches +url+ and runs the matching handler against the response.
#
# The handler is looked up before the request is issued, and is evaluated
# with +instance_exec+, so +self+ inside the handler block is this object.
#
# @param url [Object] the URL to request; also used as the handler key when
#   +with+ is not given
# @param with [Object, nil] handler key that takes precedence over +url+
# @return [Object] the handler block's return value
def perform(url, with = nil)
  callback = crawler.handler_for(with || url)
  page = crawler.request(url)
  instance_exec(page, &callback)
end

#process!(url, with = nil) ⇒ Object



85
86
87
88
89
90
91
# File 'lib/spiderman.rb', line 85

# Processes +url+, either in the background or inline.
#
# When ActiveJob is loaded and this object is an ActiveJob::Base, the work is
# enqueued through the class's +perform_later+ with the URL converted to a
# String; otherwise +perform+ is called directly with the arguments as given.
#
# @param url [Object] the URL to process
# @param with [Object, nil] optional handler key forwarded unchanged
# @return [Object] the result of +perform_later+ or of +perform+
def process!(url, with = nil)
  background = defined?(ActiveJob) && is_a?(ActiveJob::Base)
  return perform(url, with) unless background

  self.class.perform_later(url.to_s, with)
end