Class: CrawlJob

Inherits: Object
Defined in: lib/crawl_job.rb

Class Method Summary

  • .perform(content_request) ⇒ Object

Class Method Details

.perform(content_request) ⇒ Object

Performs a single step of the crawl: fetches the url, updates the crawl statistics held in redis, queues any previously unseen internal links, and hands the retrieved content off to the processing queue. Once the crawl limit is reached or the queue is exhausted, it enqueues the crawl finished queue with the collected statistics.

Redis keys used (namespaced under cobweb-<crawl_id>):

  • crawl-counter, queue-counter: counters of crawled and queued urls
  • crawled, queued: sets of urls already crawled and currently queued
  • statistics: hash of crawl statistics (average/minimum/maximum response_time and length, mime_counts, status_counts)
  • total_pages, total_assets: counts of html pages versus other assets
  • base_url: base url used to decide whether a link is internal
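
perform is intended to be run by a Resque worker rather than called directly. A minimal sketch of enqueuing a crawl, assuming hypothetical queue class names and illustrative values; the hash keys are the ones perform reads below:

  require 'resque'

  content_request = {
    :url                  => "http://example.com/",
    :base_url             => "http://example.com/",
    :crawl_id             => "abc123",                # namespaces this crawl's redis keys
    :redis_options        => { :host => "localhost" },
    :crawl_limit          => 100,
    :processing_queue     => "ContentProcessJob",     # class enqueued with each item's content
    :crawl_finished_queue => "CrawlFinishedJob",      # class enqueued when the crawl completes
    :source_id            => 1,
    :debug                => true
  }

  Resque.enqueue(CrawlJob, content_request)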



# File 'lib/crawl_job.rb', line 28

def self.perform(content_request)
  # change all hash keys to symbols (resque round-trips the request through
  # JSON, so keys arrive as strings; deep_symbolize_keys returns a new hash,
  # so reassign rather than discarding the result)
  content_request = content_request.deep_symbolize_keys
  # namespace every key under this crawl so concurrent crawls don't collide
  redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{content_request[:crawl_id]}")
  @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)

  # check we haven't crawled this url before
  crawl_counter = redis.get("crawl-counter").to_i
  queue_counter = redis.get("queue-counter").to_i
  unless redis.sismember "crawled", content_request[:url]
    
    # increment counter and check we haven't hit our crawl limit
    redis.incr "crawl-counter"
    crawl_counter += 1
    if crawl_counter <= content_request[:crawl_limit].to_i
      content = Cobweb.new(content_request).get(content_request[:url])

      ## update statistics
      # running averages: crawl_counter already includes the current item, so
      # the stored average covers (crawl_counter - 1) items
      if redis.hexists "statistics", "average_response_time"
        previous_average = redis.hget("statistics", "average_response_time").to_f
        redis.hset("statistics", "average_response_time", ((previous_average * (crawl_counter - 1)) + content[:response_time].to_f) / crawl_counter)
      else
        redis.hset("statistics", "average_response_time", content[:response_time].to_f)
      end
      redis.hset "statistics", "maximum_response_time", content[:response_time].to_f if redis.hget("statistics", "maximum_response_time").nil? or content[:response_time].to_f > redis.hget("statistics", "maximum_response_time").to_f
      redis.hset "statistics", "minimum_response_time", content[:response_time].to_f if redis.hget("statistics", "minimum_response_time").nil? or content[:response_time].to_f < redis.hget("statistics", "minimum_response_time").to_f
      if redis.hexists "statistics", "average_length"
        previous_average = redis.hget("statistics", "average_length").to_i
        redis.hset("statistics", "average_length", ((previous_average * (crawl_counter - 1)) + content[:length].to_i) / crawl_counter)
      else
        redis.hset("statistics", "average_length", content[:length].to_i)
      end
      redis.hset "statistics", "maximum_length", content[:length].to_i if redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > redis.hget("statistics", "maximum_length").to_i
      redis.hset "statistics", "minimum_length", content[:length].to_i if redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < redis.hget("statistics", "minimum_length").to_i

      if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
        redis.incr "total_pages"
      else
        redis.incr "total_assets"
      end

      # track counts per mime type, stored in the statistics hash as json
      mime_counts = {}
      mime_counts = JSON.parse(redis.hget("statistics", "mime_counts")) if redis.hexists "statistics", "mime_counts"
      mime_counts[content[:mime_type]] = mime_counts.fetch(content[:mime_type], 0) + 1
      redis.hset "statistics", "mime_counts", mime_counts.to_json

      # track counts per status code; json object keys are always strings, so
      # store the status code as a string (an integer key would never match
      # again after a round trip through JSON.parse)
      status_counts = {}
      status_counts = JSON.parse(redis.hget("statistics", "status_counts")) if redis.hexists "statistics", "status_counts"
      status_code = content[:status_code].to_s
      status_counts[status_code] = status_counts.fetch(status_code, 0) + 1
      redis.hset "statistics", "status_counts", status_counts.to_json

      # move the url from the queued set to the crawled set and record the base url
      redis.srem "queued", content_request[:url]
      redis.sadd "crawled", content_request[:url]
      set_base_url redis, content, content_request[:base_url]
      content[:links].values.flatten.each do |link|
        unless redis.sismember "crawled", link
          puts "Checking if #{link} matches #{redis.get("base_url")} as internal?" if content_request[:debug]
          if link.to_s.match(Regexp.new("^#{redis.get("base_url")}"))
            puts "Matched as #{link} as internal" if content_request[:debug]
            unless redis.sismember("crawled", link) or redis.sismember("queued", link)   
              if queue_counter <= content_request[:crawl_limit].to_i
                new_request = content_request.clone
                new_request[:url] = link
                new_request[:parent] = content_request[:url]
                Resque.enqueue(CrawlJob, new_request)
                redis.sadd "queued", link
                redis.incr "queue-counter"
                queue_counter += 1
              end
            end
          end
        end
      end

      # enqueue to processing queue
      Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
      puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
      puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter}" if content_request[:debug]


    else
      puts "Crawl Limit Exceeded by #{crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
    end
  else
    puts "Already crawled #{content_request[:url]}" if content_request[:debug]
  end

  # detect finished state

  if queue_counter == crawl_counter or content_request[:crawl_limit].to_i <= crawl_counter 
   
    puts "queue_counter: #{queue_counter}"
    puts "crawl_counter: #{crawl_counter}"
    puts "crawl_limit: #{content_request[:crawl_limit]}"

    # finished
    puts "FINISHED"
    stats = redis.hgetall "statistics"
    # note the hyphenated key names, matching the counters incremented above
    stats[:total_pages] = redis.get "total_pages"
    stats[:total_assets] = redis.get "total_assets"
    stats[:crawl_counter] = redis.get "crawl-counter"
    stats[:queue_counter] = redis.get "queue-counter"
    stats[:crawled] = redis.smembers "crawled"
    
    Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:source_id => content_request[:source_id]}))      
    
    ap stats
  end
end
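
The processing_queue and crawl_finished_queue values name classes that perform resolves with const_get and enqueues as ordinary Resque jobs, so each one just needs a queue name and a self.perform. A minimal sketch of two hypothetical consumers (the class names match the enqueue example above and are not part of the library); note that resque round-trips arguments through JSON, so the hashes arrive with string keys:

  # hypothetical consumer for each crawled item: receives the content hash
  # (mime_type, length, links, ...) merged with :source_id and :crawl_id
  class ContentProcessJob
    @queue = :content_process_job

    def self.perform(content)
      puts "Processing #{content['mime_type']} content for source #{content['source_id']}"
    end
  end

  # hypothetical consumer for the end of the crawl: receives the statistics
  # hash assembled above, merged with :source_id
  class CrawlFinishedJob
    @queue = :crawl_finished_job

    def self.perform(stats)
      puts "Crawl finished: #{stats['total_pages']} pages, #{stats['total_assets']} assets"
    end
  end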