Class: Kabutops::Spider

Inherits:
Crawler show all
Defined in:
lib/kabutops/spider.rb

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Crawler

#<<, adapters, crawl!, #perform

Methods included from Extensions::CallbackSupport

#callbacks, #notify

Methods included from Extensions::Includable

#append_features, #included

Methods included from CrawlerExtensions::PStoreStorage

#storage

Methods included from Extensions::Logging

#logger

Class Method Details

.<<(resource) ⇒ Object



26
27
28
29
30
31
# File 'lib/kabutops/spider.rb', line 26

# Adds a resource to the crawler unless it has already been seen.
#
# A resource with any recorded status is ignored. Otherwise it is marked
# 'new' and handed to the parent implementation of +<<+.
#
# @param resource [Hash] resource description; its +:url+ identifies it
# @return [Object, nil] the parent's result, or nil when the resource
#   already has a status
def <<(resource)
  return unless resource_status(resource).nil?

  resource_status(resource, 'new')
  super
end

.crawl(collection = nil) ⇒ Object



17
18
19
# File 'lib/kabutops/spider.rb', line 17

# Starts a crawl, defaulting to the configured start URL.
#
# @param collection [Array<Hash>, nil] resources to crawl; when nil (or
#   false) a single resource built from +params.url+ is used
# @return [Object] whatever the parent +crawl+ returns
def crawl(collection = nil)
  collection ||= [{ url: params.url }]
  super(collection)
end

.debug_spider ⇒ Object



10
11
12
13
14
15
# File 'lib/kabutops/spider.rb', line 10

# Runs the spider once, in debug mode, against the configured URL.
#
# Calls +enable_debug+, builds a fresh instance and invokes +perform+ on
# it with a resource built from +params[:url]+.
#
# @return [Object] whatever +perform+ returns
def debug_spider
  enable_debug

  spider = new
  start_resource = { url: params[:url] }
  spider.perform(start_resource)
end

.follow(link) ⇒ Object



33
34
35
36
37
# File 'lib/kabutops/spider.rb', line 33

# Queues a link found on a page for crawling.
#
# The link is percent-escaped, resolved against +params.url+ and pushed
# through +<<+ as a +{ url: ... }+ resource.
#
# @param link [String] href value, absolute or relative to +params.url+
# @return [Object] whatever +<<+ returns
def follow(link)
  # URI.escape was removed in Ruby 3.0; the default parser provides the
  # same escaping without the removed module-level method.
  escaped = URI::DEFAULT_PARSER.escape(link)
  self << { url: URI.join(params.url, escaped).to_s }
end

.reset! ⇒ Object



21
22
23
24
# File 'lib/kabutops/spider.rb', line 21

# Resets the spider: runs the parent's reset, then deletes every key
# currently reported by +redis.keys+, one at a time.
#
# NOTE(review): this removes all keys the +redis+ handle can see --
# confirm the connection is dedicated or namespaced to this spider.
#
# @return [Array] the list of keys that were iterated
def reset!
  super

  stored_keys = redis.keys
  stored_keys.each do |key|
    redis.del(key)
  end
end

.resource_status(resource, status = nil) ⇒ Object



39
40
41
# File 'lib/kabutops/spider.rb', line 39

# Reads or writes the crawl status of a resource, keyed by its +:url+.
#
# Delegates to +url_status+ with the resource's URL.
#
# @param resource [Hash] resource description containing +:url+
# @param status [String, nil] status to store; nil to read instead
# @return [Object] whatever +url_status+ returns
def resource_status(resource, status = nil)
  url = resource[:url]
  url_status(url, status)
end

.url_status(url, status = nil) ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/kabutops/spider.rb', line 43

# Reads or writes the crawl status stored in redis for a URL.
#
# With a status, stores a JSON document containing the url and status
# under +redis_key(url)+. Without one, loads that document and returns
# its 'status' field.
#
# @param url [String] URL whose status is tracked
# @param status [String, nil] status to store; nil to read instead
# @return [Object, nil] the result of +redis.set+ when writing; the stored
#   status, or nil when nothing is stored, when reading
def url_status(url, status = nil)
  key = redis_key(url)

  if status
    payload = { url: url, status: status }
    return redis.set(key, JSON.dump(payload))
  end

  stored = redis.get(key)
  return nil unless stored

  JSON.parse(stored)['status']
end

Instance Method Details

#after_crawl(resource, page) ⇒ Object



85
86
87
88
89
90
91
92
# File 'lib/kabutops/spider.rb', line 85

# Inspects every anchor on a crawled page and follows the accepted ones.
#
# For each +a+ element with an href, the class-level +:follow_if+
# callbacks are notified; the link is followed when any result is truthy.
#
# @param resource [Hash] the resource that was crawled (not used here)
# @param page [#css] parsed page that answers +css('a')+
# @return [Object] the collection returned by +page.css('a')+
def after_crawl(resource, page)
  page.css('a').each do |anchor|
    href = anchor['href']
    next if href.nil?

    self.class.follow(href) if self.class.notify(:follow_if, href).any?
  end
end

#crawl(resource) ⇒ Object



78
79
80
81
82
83
# File 'lib/kabutops/spider.rb', line 78

# Crawls a resource, follows its links and marks it as done.
#
# The parent +crawl+ produces the page; +after_crawl+ then processes it
# and the resource's status is set to 'done' at class level.
#
# @param resource [Hash] resource description to crawl
# @return [Object] the page returned by the parent +crawl+
def crawl(resource)
  super.tap do |page|
    after_crawl(resource, page)
    self.class.resource_status(resource, 'done')
  end
end