Class: CobwebCrawler

Inherits:
Object
  • Object
show all
Defined in:
lib/cobweb_crawler.rb

Overview

CobwebCrawler is a standalone crawler, it includes a built in statistics monitor using Sinatra.

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ CobwebCrawler

See README for more information on options available



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/cobweb_crawler.rb', line 10

# Builds a crawler from the supplied options hash.
#
# See README for more information on options available.
#
# Side effects: seeds the Redis sets "internal_urls" and "queued" from
# @options[:internal_urls] / @options[:seed_urls], and starts the built-in
# Sinatra statistics server when @options[:web_statistics] is truthy.
def initialize(options={})
  @options = options
  
  @statistic = {}
  
  # Default to a local Redis unless the caller supplied connection options.
  # (||= also repairs an explicitly nil value, which would break RedisConnection.)
  @options[:redis_options] ||= {:host => "127.0.0.1"}
  if @options.has_key? :crawl_id
    @crawl_id = @options[:crawl_id]
  else
    # Derive a unique id for this crawl from the current timestamp.
    # (.inspect already returns a String; the old .to_s was redundant.)
    @crawl_id = Digest::MD5.hexdigest(DateTime.now.inspect)
    @options[:crawl_id] = @crawl_id
  end
  
  # Namespace every key under this crawl id so concurrent crawls don't collide.
  @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => RedisConnection.new(@options[:redis_options]))
  @options[:internal_urls] ||= []
  # each, not map: these loops are executed purely for their Redis side effects.
  @options[:internal_urls].each{|url| @redis.sadd("internal_urls", url)}
  @options[:seed_urls] ||= []
  @options[:seed_urls].each{|link| @redis.sadd "queued", link }

  @options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
  
  @debug = @options[:debug]
  
  @stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
  # Optionally expose live crawl statistics via the bundled Sinatra server.
  if @options[:web_statistics]
    Server.start(@options)
  end
  
  @cobweb = Cobweb.new(@options)
end

Instance Method Details

#crawl(base_url, crawl_options = {}, &block) ⇒ Object

Initiates a crawl starting at the base_url and applying the options supplied. Can also take a block that is executed and passed the content hash and statistics hash.



42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/cobweb_crawler.rb', line 42

# Initiates a crawl starting at base_url and applying the options supplied.
# Spawns up to @options[:thread_count] worker threads and blocks until the
# queue drains. Yields each page's content hash and the statistics hash to
# the optional block. Returns the Stats object for the crawl.
def crawl(base_url, crawl_options = {}, &block)
  @options[:base_url] = base_url unless @options.has_key? :base_url
  @options[:thread_count] = 1 unless @options.has_key? :thread_count
  
  # If no internal urls were configured, treat the start url as internal.
  # BUG FIX: previously the array append ran first, so the empty? guard on
  # the redis sadd was always false and base_url was never registered as an
  # internal url in Redis. Both actions now share one guard.
  if @options[:internal_urls].empty?
    @options[:internal_urls] << base_url
    @redis.sadd("internal_urls", base_url)
  end
  
  @crawl_options = crawl_options
  
  # Queue the start url unless it has already been crawled or queued.
  @redis.sadd("queued", base_url) unless base_url.nil? || @redis.sismember("crawled", base_url) || @redis.sismember("queued", base_url)
  @crawl_counter = @redis.scard("crawled").to_i
  @queue_counter = @redis.scard("queued").to_i

  @threads = []
  begin
    @stats.start_crawl(@options)
    
    @threads << Thread.new do
      Thread.abort_on_exception = true
      spawn_thread(&block)
    end

    # Give the first worker a head start before topping up the pool.
    sleep 5
    while running_thread_count > 0
      if @queue_counter > 0
        # Spawn replacements up to the configured thread count.
        (@options[:thread_count]-running_thread_count).times.each do
          @threads << Thread.new do
            Thread.abort_on_exception = true
            spawn_thread(&block)
          end
        end
      end
      sleep 1
    end
    
  ensure
    # Always record the end of the crawl, even if a worker raised.
    @stats.end_crawl(@options)
  end
  @stats
end

#running_thread_countObject



153
154
155
# File 'lib/cobweb_crawler.rb', line 153

# Number of worker threads in @threads that are still alive — i.e. whose
# status is "run" (executing) or "sleep" (blocked/sleeping). Dead threads
# report false/nil and are excluded.
def running_thread_count
  live_states = ["run", "sleep"]
  @threads.count { |thread| live_states.include?(thread.status) }
end

#spawn_thread(&block) ⇒ Object



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# File 'lib/cobweb_crawler.rb', line 83

# Worker loop executed by each crawler thread: pops urls off the Redis
# "queued" set, fetches each via Cobweb, enqueues newly discovered internal
# links and updates crawl statistics. Exits when the queue is empty or
# @options[:crawl_limit] (if non-zero) has been reached. Yields each page's
# content hash and the statistics hash to the supplied block, if given.
def spawn_thread(&block)
    # Keep working while urls remain and we are under any configured crawl limit.
    while @queue_counter>0 && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > @crawl_counter)
      url = @redis.spop "queued"
    # A nil pop means the queue drained under us; force the loop to terminate.
    @queue_counter = 0 if url.nil?

    @options[:url] = url
    unless @redis.sismember("crawled", url.to_s)
      begin
        @stats.update_status("Requesting #{url}...")
        content = @cobweb.get(url) unless url.nil?
        if content.nil?
          # Fetch failed (or url was nil): decrement locally rather than re-read Redis.
          @queue_counter = @queue_counter - 1 #@redis.scard("queued").to_i
        else
          @stats.update_status("Processing #{url}...")

          # Mark the url crawled before link extraction so other threads skip it.
          @redis.sadd "crawled", url.to_s
          @redis.incr "crawl-counter" 
        
          document_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]).uniq

          # select the link if its internal (eliminate external before expensive lookups in queued and crawled)
          cobweb_links = CobwebLinks.new(@options)

          internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
          
          # reject the link if we've crawled it or queued it
          internal_links.reject!{|link| @redis.sismember("crawled", link)}
          internal_links.reject!{|link| @redis.sismember("queued", link)}
          internal_links.reject!{|link| link.nil? || link.empty?}
        
          internal_links.each do |link|
            puts "Added #{link.to_s} to queue" if @debug
            @redis.sadd "queued", link unless link.nil?
            # Record parent -> children navigation in the "navigation" hash.
            # NOTE(review): hget returns a String (or nil), so `children << link`
            # concatenates onto the stored string rather than appending to an
            # array — confirm the intended storage format of "navigation".
            children = @redis.hget("navigation", url)
            children = [] if children.nil?
            children << link
            @redis.hset "navigation", url, children
            @queue_counter += 1
          end

          if @options[:store_inbound_links]
            # Index which pages link to each target, keyed by MD5 of the parsed target uri.
            document_links.each do |target_link|
              target_uri = UriHelper.parse(target_link)
              @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(target_uri.to_s)}", UriHelper.parse(url).to_s)
            end
          end
          
          # Refresh both counters from Redis so all worker threads converge
          # on a consistent view of progress.
          @crawl_counter = @redis.scard("crawled").to_i
          @queue_counter = @redis.scard("queued").to_i
        
          @stats.update_statistics(content, @crawl_counter, @queue_counter)
          @stats.update_status("Completed #{url}.")
          yield content, @stats.get_statistics if block_given?
        end
      rescue => e
        # Best-effort crawl: log the failure and continue with the next url.
        puts "Error loading #{url}: #{e}"
        #puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
        #ap e
        #ap e.backtrace
      ensure
        # Counters are re-synced even on error so the while condition stays accurate.
        @crawl_counter = @redis.scard("crawled").to_i
        @queue_counter = @redis.scard("queued").to_i
      end
    else
      puts "Already crawled #{@options[:url]}" if @debug
    end
  end
  Thread.exit
end