Class: CrawlJob
- Inherits:
-
Object
- Object
- CrawlJob
- Defined in:
- lib/crawl_job.rb
Class Method Summary collapse
-
.perform(content_request) ⇒ Object
redis params used.
Class Method Details
.perform(content_request) ⇒ Object
redis params used
Redis keys used: crawl-counter, crawled, queued, queue-counter, total_pages, total_assets, and the statistics hash (fields: average_response_time, maximum_response_time, minimum_response_time, average_length, maximum_length, minimum_length, mime_counts, status_counts).
# File 'lib/crawl_job.rb', line 28

# Resque entry point for a single crawl request.
#
# Fetches the requested URL, records response-time/length/mime/status
# statistics in the crawl's namespaced Redis store, enqueues newly
# discovered internal links as further CrawlJobs, hands the fetched
# content to the configured processing queue, and enqueues the
# crawl-finished job once the crawl is complete (queue drained or crawl
# limit reached).
#
# content_request - Hash describing the crawl; keys are symbolized on
#   entry. Reads :crawl_id, :url, :redis_options, :crawl_limit,
#   :base_url, :processing_queue, :crawl_finished_queue, :source_id
#   and :debug; writes :parent onto cloned child requests.
def self.perform(content_request)
  # change all hash keys to symbols
  content_request.deep_symbolize_keys

  # All crawl state lives in Redis, namespaced per crawl id so that
  # concurrent crawls do not clash with each other.
  redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{content_request[:crawl_id]}")
  @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)

  # check we haven't crawled this url before
  crawl_counter = redis.get("crawl-counter").to_i
  queue_counter = redis.get("queue-counter").to_i
  unless redis.sismember "crawled", content_request[:url]
    # increment counter and check we haven't hit our crawl limit
    redis.incr "crawl-counter"
    crawl_counter += 1
    if crawl_counter <= content_request[:crawl_limit].to_i
      content = Cobweb.new(content_request).get(content_request[:url])

      ## update statistics
      #
      # FIX(review): the running averages previously computed
      # "sum / crawl_counter + 1", which by operator precedence divides
      # first and then adds one. The divisor is now correctly
      # (crawl_counter + 1).
      if redis.hexists "statistics", "average_response_time"
        previous_average = redis.hget("statistics", "average_response_time").to_f
        redis.hset("statistics", "average_response_time", ((previous_average * crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1))
      else
        redis.hset("statistics", "average_response_time", content[:response_time].to_f)
      end
      redis.hset "statistics", "maximum_response_time", content[:response_time].to_f if redis.hget("statistics", "maximum_response_time").nil? || content[:response_time].to_f > redis.hget("statistics", "maximum_response_time").to_f
      redis.hset "statistics", "minimum_response_time", content[:response_time].to_f if redis.hget("statistics", "minimum_response_time").nil? || content[:response_time].to_f < redis.hget("statistics", "minimum_response_time").to_f
      if redis.hexists "statistics", "average_length"
        previous_average = redis.hget("statistics", "average_length").to_i
        redis.hset("statistics", "average_length", ((previous_average * crawl_counter) + content[:length].to_i) / (crawl_counter + 1))
      else
        redis.hset("statistics", "average_length", content[:length].to_i)
      end
      redis.hset "statistics", "maximum_length", content[:length].to_i if redis.hget("statistics", "maximum_length").nil? || content[:length].to_i > redis.hget("statistics", "maximum_length").to_i
      redis.hset "statistics", "minimum_length", content[:length].to_i if redis.hget("statistics", "minimum_length").nil? || content[:length].to_i < redis.hget("statistics", "minimum_length").to_i

      # Pages vs assets: anything served as (x)html counts as a page.
      if content[:mime_type].include?("text/html") || content[:mime_type].include?("application/xhtml+xml")
        redis.incr "total_pages"
      else
        redis.incr "total_assets"
      end

      # Per-mime-type counts are kept as a JSON-encoded hash inside the
      # statistics hash.
      if redis.hexists "statistics", "mime_counts"
        mime_counts = JSON.parse(redis.hget("statistics", "mime_counts"))
        if mime_counts.has_key? content[:mime_type]
          mime_counts[content[:mime_type]] += 1
        else
          mime_counts[content[:mime_type]] = 1
        end
      else
        mime_counts = {content[:mime_type] => 1}
      end
      redis.hset "statistics", "mime_counts", mime_counts.to_json

      # Per-HTTP-status counts, also JSON-encoded.
      if redis.hexists "statistics", "status_counts"
        status_counts = JSON.parse(redis.hget("statistics", "status_counts"))
        if status_counts.has_key? content[:status_code].to_i
          status_counts[content[:status_code].to_i] += 1
        else
          status_counts[content[:status_code].to_i] = 1
        end
      else
        status_counts = {content[:status_code].to_i => 1}
      end
      redis.hset "statistics", "status_counts", status_counts.to_json

      # Mark this url as crawled, then queue any internal links that are
      # neither crawled nor already queued, respecting the crawl limit.
      redis.srem "queued", content_request[:url]
      redis.sadd "crawled", content_request[:url]
      set_base_url redis, content, content_request[:base_url]
      content[:links].keys.map{|key| content[:links][key]}.flatten.each do |link|
        unless redis.sismember "crawled", link
          puts "Checking if #{link} matches #{redis.get("base_url")} as internal?" if content_request[:debug]
          if link.to_s.match(Regexp.new("^#{redis.get("base_url")}"))
            puts "Matched as #{link} as internal" if content_request[:debug]
            unless redis.sismember("crawled", link) || redis.sismember("queued", link)
              if queue_counter <= content_request[:crawl_limit].to_i
                new_request = content_request.clone
                new_request[:url] = link
                new_request[:parent] = content_request[:url]
                Resque.enqueue(CrawlJob, new_request)
                redis.sadd "queued", link
                redis.incr "queue-counter"
                queue_counter += 1
              end
            end
          end
        end
      end

      # enqueue to processing queue
      Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
      puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
      puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter}" if content_request[:debug]
    else
      puts "Crawl Limit Exceeded by #{crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
    end
  else
    puts "Already crawled #{content_request[:url]}" if content_request[:debug]
  end

  # detect finished state
  if queue_counter == crawl_counter || content_request[:crawl_limit].to_i <= crawl_counter
    puts "queue_counter: #{queue_counter}"
    puts "crawl_counter: #{crawl_counter}"
    puts "crawl_limit: #{content_request[:crawl_limit]}"
    # finished
    puts "FINISHED"
    stats = redis.hgetall "statistics"
    stats[:total_pages] = redis.get "total_pages"
    stats[:total_assets] = redis.get "total_assets"
    # FIX(review): the counters are stored under hyphenated keys
    # ("crawl-counter"/"queue-counter"); the underscore variants that
    # were read here previously always returned nil.
    stats[:crawl_counter] = redis.get "crawl-counter"
    stats[:queue_counter] = redis.get "queue-counter"
    stats[:crawled] = redis.smembers "crawled"
    Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:source_id => content_request[:source_id]}))
    ap stats
  end
end