10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
|
# File 'lib/crawl_job.rb', line 10
# Handles one crawl request for a single URL.
#
# A URL that is already in the "crawled" set is simply dropped from the
# "queued" set. Otherwise the URL is marked as crawled, fetched (while the
# crawl limit allows), its not-yet-seen internal links are queued, the
# fetched content is handed to the processing queue, and `finished` is
# called once the crawl is judged complete.
#
# content_request - Hash of crawl options; it is passed through
#                   deep_symbolize_keys before use. Keys read here:
#                   :url, :crawl_id, :crawl_limit, :redis_options, :debug,
#                   and the optional :enqueue_counter_key,
#                   :enqueue_counter_namespace and :enqueue_counter_field.
#
# Returns the result of `finished` when the crawl is judged complete,
# otherwise nil.
def self.perform(content_request)
  content_request = self.deep_symbolize_keys(content_request)
  unless content_request.has_key?(:redis_options)
    content_request[:redis_options] = {}
  end

  crawl_namespace = "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}"
  @redis = NamespacedRedis.new(content_request[:redis_options], crawl_namespace)
  @stats = Stats.new(content_request)
  @debug = content_request[:debug]
  refresh_counters

  # Duplicate request: only tidy up the queue bookkeeping and stop.
  if @redis.sismember("crawled", content_request[:url])
    @redis.srem("queued", content_request[:url])
    decrement_queue_counter
    puts "Already crawled #{content_request[:url]}" if content_request[:debug]
    return
  end

  # Move the URL from "queued" to "crawled" before fetching it.
  @redis.srem("queued", content_request[:url])
  decrement_queue_counter
  @redis.sadd("crawled", content_request[:url])
  increment_crawl_counter

  return unless within_crawl_limits?(content_request[:crawl_limit])

  content = Cobweb.new(content_request).get(content_request[:url], content_request)
  @stats.update_statistics(content)
  set_base_url(@redis, content, content_request)

  if within_queue_limits?(content_request[:crawl_limit])
    # Filter order matters for the Redis traffic: every link is checked
    # against "crawled" first, the survivors against "queued", and only
    # then is the internal-link test applied.
    new_links = all_links_from_content(content).
      map(&:to_s).
      reject { |candidate| @redis.sismember("crawled", candidate) }.
      reject { |candidate| @redis.sismember("queued", candidate) }.
      select { |candidate| internal_link?(candidate) }

    new_links.each do |candidate|
      # Re-check per link: each enqueue may use up the remaining allowance.
      next unless within_queue_limits?(content_request[:crawl_limit])
      enqueue_content(content_request, candidate)
    end
  end

  send_to_processing_queue(content, content_request)

  if content_request.has_key?(:enqueue_counter_key)
    # Optional caller-supplied counter, kept in its own namespace and
    # bumped by one with a read followed by a write.
    counter_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
    counter_key = content_request[:enqueue_counter_key]
    counter_field = content_request[:enqueue_counter_field]
    previous_count = counter_redis.hget(counter_key, counter_field).to_i
    counter_redis.hset(counter_key, counter_field, previous_count + 1)
  end

  crawl_limit = content_request[:crawl_limit]
  if crawl_limit.nil? || crawl_limit == 0
    # No limit configured: the crawl ends when nothing is left queued.
    finished(content_request) if @redis.scard("queued") == 0
  elsif @queue_counter == 0 || @crawl_counter >= crawl_limit.to_i
    finished(content_request)
  end
end
|