Class: CobwebCrawler

Inherits: Object
Defined in:
lib/cobweb_crawler.rb

Overview

CobwebCrawler is a standalone crawler; it includes a built-in statistics monitor using Sinatra.
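
For example, a minimal usage sketch (the URL and option values are illustrative, and a Redis server on 127.0.0.1 is assumed):

require 'cobweb'

crawler = CobwebCrawler.new(:crawl_linked_external => false)

# the block is yielded the content hash and the statistics hash for each page
statistics = crawler.crawl("http://example.com") do |content, stats|
  puts "Fetched #{content[:body].to_s.length} bytes"
end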

Instance Method Summary

Constructor Details

#initialize(options = {}) ⇒ CobwebCrawler

See the README for more information on the available options.



# File 'lib/cobweb_crawler.rb', line 9

def initialize(options={})
  @options = options

  @statistic = {}

  # default to a local Redis instance unless told otherwise
  @options[:redis_options] = {:host => "127.0.0.1"} unless @options.has_key? :redis_options

  # reuse a supplied crawl id, or generate a unique one for this run
  if @options.has_key? :crawl_id
    @crawl_id = @options[:crawl_id]
  else
    @crawl_id = Digest::MD5.hexdigest(DateTime.now.inspect.to_s)
    @options[:crawl_id] = @crawl_id
  end

  # namespace all keys for this crawl so concurrent crawls don't collide
  @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => RedisConnection.new(@options[:redis_options]))

  # seed the internal-url patterns and any pre-queued urls into Redis
  @options[:internal_urls] = [] if @options[:internal_urls].nil?
  @options[:internal_urls].each{|url| @redis.sadd("internal_urls", url)}
  @options[:seed_urls] = [] if @options[:seed_urls].nil?
  @options[:seed_urls].each{|link| @redis.sadd "queued", link }

  @options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
  @options[:treat_https_as_http] = true unless @options.has_key? :treat_https_as_http
  @debug = @options[:debug]

  @stats = Stats.new(@options.merge(:crawl_id => @crawl_id))

  # optionally start the Sinatra-based statistics server
  if @options[:web_statistics]
    require "server"
    Server.start(@options)
  end

  @cobweb = Cobweb.new(@options)
end
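
The following sketch illustrates the options read by the constructor above; the values shown are illustrative, and the commented defaults are the ones set in the code:

crawler = CobwebCrawler.new(
  :redis_options         => { :host => "127.0.0.1" },        # default if omitted
  :crawl_id              => "my-crawl",                      # defaults to an MD5 of DateTime.now
  :internal_urls         => ["http://example.com/"],         # seeded into the "internal_urls" Redis set
  :seed_urls             => ["http://example.com/sitemap"],  # pre-queued before the crawl starts
  :crawl_linked_external => false,                           # default
  :treat_https_as_http   => true,                            # default
  :web_statistics        => false,                           # true starts the Sinatra statistics server
  :debug                 => true                             # log queue additions and skipped urls
)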

Instance Method Details

#crawl(base_url, crawl_options = {}, &block) ⇒ Object

Initiates a crawl starting at base_url, applying the supplied options. Can also take a block, which is executed for each crawled page and passed the content hash and the statistics hash.



# File 'lib/cobweb_crawler.rb', line 43

def crawl(base_url, crawl_options = {}, &block)
  @options[:base_url] = base_url unless @options.has_key? :base_url
  @options[:thread_count] = 1 unless @options.has_key? :thread_count

  # if no internal-url patterns were supplied, treat the base url as internal
  if @options[:internal_urls].empty?
    @options[:internal_urls] << base_url
    @redis.sadd("internal_urls", base_url)
  end

  @crawl_options = crawl_options

  # queue the base url unless it has already been crawled or queued
  @redis.sadd("queued", base_url) unless base_url.nil? || @redis.sismember("crawled", base_url) || @redis.sismember("queued", base_url)
  @crawl_counter = @redis.scard("crawled").to_i
  @queue_counter = @redis.scard("queued").to_i

  @threads = []
  begin
    @stats.start_crawl(@options)

    # start the first worker thread
    @threads << Thread.new do
      Thread.abort_on_exception = true
      spawn_thread(&block)
    end

    # top up the worker pool to :thread_count while urls remain queued
    sleep 5
    while running_thread_count > 0
      if @queue_counter > 0
        (@options[:thread_count]-running_thread_count).times.each do
          @threads << Thread.new do
            Thread.abort_on_exception = true
            spawn_thread(&block)
          end
        end
      end
      sleep 1
    end

  ensure
    @stats.end_crawl(@options)
  end
  @stats
end
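
Note that in the source shown here, :thread_count and :crawl_limit are read from the constructor options rather than from crawl_options, so a bounded multi-threaded crawl looks like this (a sketch; the limit and count are illustrative):

crawler = CobwebCrawler.new(:thread_count => 4, :crawl_limit => 100)
stats = crawler.crawl("http://example.com")   # returns the Stats object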

#running_thread_count ⇒ Object



# File 'lib/cobweb_crawler.rb', line 162

def running_thread_count
  # count only live threads: Thread#status is "run" or "sleep" while alive
  @threads.map{|t| t.status}.select{|status| status == "run" || status == "sleep"}.count
end
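
This relies on Thread#status, which returns "run" or "sleep" for live threads and false or nil once a thread has finished, so only live workers are counted. A quick illustration:

t = Thread.new { sleep 1 }
t.status   # => "run" or "sleep" while the thread is alive
t.join
t.status   # => false once it has finished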

#spawn_thread(&block) ⇒ Object



# File 'lib/cobweb_crawler.rb', line 84

def spawn_thread(&block)
  # work until the queue is drained or the crawl limit is reached
  # (a crawl_limit of 0 or nil means unlimited)
  while @queue_counter > 0 && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > @crawl_counter)
    url = @redis.spop "queued"
    @queue_counter = 0 if url.nil?

    @options[:url] = url
    unless @redis.sismember("crawled", url.to_s)
      begin
        @stats.update_status("Requesting #{url}...")
        content = @cobweb.get(url) unless url.nil?
        if content.nil?
          @queue_counter = @queue_counter - 1
        else
          @stats.update_status("Processing #{url}...")

          @redis.sadd "crawled", url.to_s
          @redis.incr "crawl-counter"

          document_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]).uniq

          # select the link if it's internal (eliminate external links before
          # the more expensive lookups in the queued and crawled sets)
          cobweb_links = CobwebLinks.new(@options)
          internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}

          # if the site serves the same content for http and https, normalize to http
          if @options[:treat_https_as_http]
            internal_links.map!{|link| link.gsub(/^https/, "http")}
          end

          # reject any link we've already crawled or queued, and any blanks
          internal_links.reject!{|link| @redis.sismember("crawled", link)}
          internal_links.reject!{|link| @redis.sismember("queued", link)}
          internal_links.reject!{|link| link.nil? || link.empty?}

          internal_links.each do |link|
            puts "Added #{link.to_s} to queue" if @debug
            @redis.sadd "queued", link unless link.nil?
            # record the parent -> child relationship in the navigation map
            children = @redis.hget("navigation", url)
            children = [] if children.nil?
            children << link
            @redis.hset "navigation", url, children
            @queue_counter += 1
          end

          # optionally index which pages link to each target url
          if @options[:store_inbound_links]
            document_links.each do |target_link|
              target_uri = UriHelper.parse(target_link).normalize
              @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(target_uri.to_s)}", UriHelper.parse(url).to_s)
            end
          end

          @crawl_counter = @redis.scard("crawled").to_i
          @queue_counter = @redis.scard("queued").to_i

          @stats.update_statistics(content, @crawl_counter, @queue_counter)
          @stats.update_status("Completed #{url}.")
          yield content, @stats.get_statistics if block_given?
        end
      rescue => e
        puts "Error loading #{url}: #{e}"
      ensure
        @crawl_counter = @redis.scard("crawled").to_i
        @queue_counter = @redis.scard("queued").to_i
      end
    else
      puts "Already crawled #{@options[:url]}" if @debug
    end
  end
  Thread.exit
end
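
The queue discipline above rests on two Redis sets: "queued" holds URLs awaiting a fetch and "crawled" holds completed ones, with SPOP handing each worker a unique URL. A condensed sketch of the same pattern outside the crawler (the URL is illustrative):

require 'redis'

redis = Redis.new(:host => "127.0.0.1")
redis.sadd "queued", "http://example.com/"      # enqueue the base url

while (url = redis.spop("queued"))              # atomically claim a url
  next if redis.sismember("crawled", url)       # skip anything already done
  redis.sadd "crawled", url                     # mark it complete
  # ... fetch the page, then sadd newly discovered internal links to "queued"
end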