Class: Kabutops::Spider
Class Method Summary
collapse
Instance Method Summary
collapse
Methods inherited from Crawler
#<<, adapters, crawl!, #perform
#callbacks, #notify
#append_features, #included
#storage
#logger
Class Method Details
.<<(resource) ⇒ Object
26
27
28
29
30
31
|
# File 'lib/kabutops/spider.rb', line 26
# Enqueues +resource+ only if no status has been recorded for it yet.
#
# A resource without a recorded status is marked 'new' and passed on to the
# inherited +<<+; a resource that already has a status is ignored.
#
# @param resource [Hash] resource whose +[:url]+ is read by resource_status
# @return [Object, nil] the inherited +<<+ result, or nil when skipped
def <<(resource)
  return unless resource_status(resource).nil?

  resource_status(resource, 'new')
  super
end
|
.crawl(collection = nil) ⇒ Object
17
18
19
|
# File 'lib/kabutops/spider.rb', line 17
# Starts a crawl, falling back to the configured start URL.
#
# @param collection [Object, nil] resources handed to the inherited +crawl+;
#   when nil (or false) a one-element list built from +params.url+ is used
# @return [Object] whatever the inherited +crawl+ returns
def crawl(collection = nil)
  seeds = collection || [{ url: params.url }]
  super(seeds)
end
|
.debug_spider ⇒ Object
10
11
12
13
14
15
|
# File 'lib/kabutops/spider.rb', line 10
# Enables debugging and runs one freshly created instance against the
# configured start URL.
#
# @return [Object] the result of +perform+ on the new instance
def debug_spider
  enable_debug
  spider = new
  spider.perform({ url: params[:url] })
end
|
.follow(link) ⇒ Object
33
34
35
36
37
|
# File 'lib/kabutops/spider.rb', line 33
# Queues +link+ for crawling after resolving it against the start URL.
#
# The link is percent-escaped and joined with +params.url+, so a relative
# link becomes absolute before it is pushed through +<<+.
#
# @param link [String] href value, absolute or relative
# @return [Object] whatever +<<+ returns
def follow(link)
  # URI.escape was removed in Ruby 3.0 and raises NoMethodError there; the
  # default parser's #escape performs the same RFC 2396 escaping.
  escaped = URI::DEFAULT_PARSER.escape(link)
  self << { url: URI.join(params.url, escaped).to_s }
end
|
.reset! ⇒ Object
21
22
23
24
|
# File 'lib/kabutops/spider.rb', line 21
# Runs the inherited reset, then deletes every key that +redis.keys+ reports,
# one at a time.
#
# NOTE(review): whether +redis+ is namespaced to this spider is not visible
# here - confirm before running against a shared database.
#
# @return [Object] the value of iterating +redis.keys+ with +each+
def reset!
  super
  stale_keys = redis.keys
  stale_keys.each do |key|
    redis.del(key)
  end
end
|
.resource_status(resource, status = nil) ⇒ Object
39
40
41
|
# File 'lib/kabutops/spider.rb', line 39
# Reads or writes the status of +resource+, identified by its +[:url]+.
#
# @param resource [Hash] resource whose +[:url]+ is looked up
# @param status [Object, nil] status to store; omit (or pass nil) to read
# @return [Object] whatever url_status returns for that URL
def resource_status(resource, status = nil)
  target_url = resource[:url]
  url_status(target_url, status)
end
|
.url_status(url, status = nil) ⇒ Object
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
|
# File 'lib/kabutops/spider.rb', line 43
# Reads or writes the status stored for +url+.
#
# With a truthy +status+, stores a JSON document holding the url and the
# status under +redis_key(url)+. Without one, fetches that key and returns
# the 'status' field of the stored JSON.
#
# @param url [String] URL whose status is stored or fetched
# @param status [Object, nil] status to store; omit (or pass nil) to read
# @return [Object, nil] the +redis.set+ result when writing; when reading,
#   the stored status, or nil if nothing is stored under the key
def url_status(url, status = nil)
  key = redis_key(url)
  return redis.set(key, JSON.dump({ url: url, status: status })) if status

  stored = redis.get(key)
  return nil unless stored

  JSON.parse(stored)['status']
end
|
Instance Method Details
#after_crawl(resource, page) ⇒ Object
85
86
87
88
89
90
91
92
|
# File 'lib/kabutops/spider.rb', line 85
# Walks every +a+ element of +page+ and follows the links that the
# +:follow_if+ callbacks approve.
#
# Anchors without an +href+ are skipped. For the rest, the href is passed to
# +notify(:follow_if, href)+ on the class; if any returned value is truthy the
# href is handed to the class-level +follow+.
#
# @param resource [Object] the crawled resource (not used in this method)
# @param page [#css] parsed page that answers +css('a')+
# @return [Object] the value of iterating +page.css('a')+ with +each+
def after_crawl(resource, page)
  page.css('a').each do |anchor|
    href = anchor['href']
    next if href.nil?
    next unless self.class.notify(:follow_if, href).any?

    self.class.follow(href)
  end
end
|
#crawl(resource) ⇒ Object
78
79
80
81
82
83
|
# File 'lib/kabutops/spider.rb', line 78
# Crawls +resource+ through the inherited implementation, processes the
# resulting page with after_crawl, then records the resource as 'done'.
#
# @param resource [Hash] resource being crawled
# @return [Object] the page returned by the inherited +crawl+
def crawl(resource)
  super.tap do |page|
    after_crawl(resource, page)
    # Marked done only after link extraction has finished.
    self.class.resource_status(resource, 'done')
  end
end
|