Class: GruCrawler
- Inherits: Object
- Defined in:
- lib/grucrawler.rb,
lib/grucrawler/queue.rb,
lib/grucrawler/version.rb
Defined Under Namespace
Classes: DoNotCrawlFurther, Queue
Constant Summary
collapse
- VERSION =
"0.0.5"
Instance Method Summary
collapse
Constructor Details
#initialize(rules) ⇒ GruCrawler
Returns a new instance of GruCrawler.
12
13
14
15
16
17
18
19
|
# File 'lib/grucrawler.rb', line 12
# Builds a crawler around the supplied +rules+ object: reads its option
# hash, constructs the per-class URL queue (default per-domain wait of
# 20 seconds) and gives the rules object a chance to initialize itself.
def initialize(rules)
  @crawler = rules
  @options = @crawler.options
  wait_seconds = @options[:domain_wait] || 20
  @queue = GruCrawler::Queue.new(@crawler.class.name,
                                 @options[:visit_urls_only_once],
                                 wait_seconds)
  @crawler.on_init(self)
end
|
Instance Method Details
#add_from_queue ⇒ Object
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
|
# File 'lib/grucrawler.rb', line 32
# Pops the next URL off the queue and hands it to Hydra as a fresh
# Typhoeus request whose completion callback is #on_response.
# Returns true when a request was queued, false when the queue is empty.
def add_from_queue
  url = @queue.next_url
  return false unless url

  req = Typhoeus::Request.new(url,
                              followlocation: @options[:follow_redirects],
                              accept_encoding: 'gzip')
  @queue.started(url)
  req.on_complete { |response| on_response(response) }
  @crawler.debug("#{Time.now} started URL #{url}")
  @hydra.queue(req)
  true
end
|
#add_url(url) ⇒ Object
28
29
30
|
# File 'lib/grucrawler.rb', line 28
# Appends +url+ to the crawl queue. Pure delegation; returns whatever
# the queue's push returns.
def add_url(url)
  @queue.push(url)
end
|
#crawl_more ⇒ Object
77
78
79
80
81
|
# File 'lib/grucrawler.rb', line 77
# Tops the request pipeline back up: keeps pulling URLs off the queue
# until the concurrency ceiling is hit or the queue runs dry
# (add_from_queue returning false).
def crawl_more
  until @queue.count >= @concurrency
    break unless add_from_queue
  end
end
|
#on_response(response) ⇒ Object
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
# File 'lib/grucrawler.rb', line 53
# Completion callback for every request: marks the URL finished, refills
# the pipeline, then — unless the body exceeds max_page_size (default
# ~1GB) — parses the page, notifies the rules object and queues the
# page's outgoing links. Errors raised by the rules callback are routed
# to log_error rather than aborting the crawl.
def on_response(response)
  @crawler.debug("#{Time.now} ended URL #{response.request.url}")
  @queue.finished(response.request.url)
  crawl_more

  size_limit = @options[:max_page_size] || 1000 * 1000 * 1000
  if response.body.length > size_limit
    @crawler.debug("URL response size too big: #{response.body.length} from #{response.request.url}")
    return
  end

  document = Nokogiri::HTML(response.body)
  begin
    @crawler.on_page_received(response, document)
  rescue
    @crawler.log_error(response, $!)
  end

  queue_links(response, document)
  crawl_more
end
|
#queue_links(response, nokogiri) ⇒ Object
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
# File 'lib/grucrawler.rb', line 83
# Walks every anchor tag in the parsed page, resolves each href against
# the response's effective URL, and queues those the rules object asks
# to follow. Anchors without an href, and hrefs URI refuses to join,
# are skipped silently.
def queue_links(response, nokogiri)
  nokogiri.css('a').each do |anchor|
    href = anchor['href']
    next unless href

    begin
      absolute = URI.join(response.effective_url, href).to_s
    rescue
      next
    end

    next unless @crawler.follow_link(absolute, response, nokogiri)

    @crawler.debug("#{Time.now} queued #{absolute}") if add_url(absolute)
  end
end
|
#reset ⇒ Object
49
50
51
|
# File 'lib/grucrawler.rb', line 49
# Wipes the queue's stored crawl state so the crawler can be run again
# from scratch. Delegates straight to the queue.
def reset
  @queue.reset
end
|
#run ⇒ Object
21
22
23
24
25
26
|
# File 'lib/grucrawler.rb', line 21
# Entry point for a crawl: builds the Hydra runner, fills it up to the
# configured concurrency (default 5) and blocks until every queued
# request has completed.
def run
  @hydra = Typhoeus::Hydra.new
  @concurrency = @options[:concurrency] || 5
  crawl_more
  @hydra.run
end
|