Class: CobwebCrawler
- Inherits: Object
- Defined in: lib/cobweb_crawler.rb
Overview
CobwebCrawler is a standalone crawler. It includes a built-in statistics monitor using Sinatra.
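For example, a minimal sketch of a standalone crawl (assumes the cobweb gem is installed and a Redis server is reachable on 127.0.0.1, the default; the URL is illustrative):

  require 'cobweb'

  # Crawl state is kept in Redis, so a reachable server is required.
  crawler = CobwebCrawler.new(:thread_count => 2)
  stats = crawler.crawl("http://example.com")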
Instance Method Summary
- #crawl(base_url, crawl_options = {}, &block) ⇒ Object
  Initiates a crawl starting at the base_url and applying the options supplied.
- #initialize(options = {}) ⇒ CobwebCrawler (constructor)
  See the README for more information on the options available.
- #running_thread_count ⇒ Object
- #spawn_thread(&block) ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ CobwebCrawler
See the README for more information on the options available.
# File 'lib/cobweb_crawler.rb', line 9

def initialize(options = {})
  @options = options
  @statistic = {}

  @options[:redis_options] = {:host => "127.0.0.1"} unless @options.has_key? :redis_options

  if @options.has_key? :crawl_id
    @crawl_id = @options[:crawl_id]
  else
    @crawl_id = Digest::MD5.hexdigest(DateTime.now.inspect.to_s)
    @options[:crawl_id] = @crawl_id
  end

  @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => RedisConnection.new(@options[:redis_options]))

  @options[:internal_urls] = [] if @options[:internal_urls].nil?
  @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
  @options[:seed_urls] = [] if @options[:seed_urls].nil?
  @options[:seed_urls].map{|link| @redis.sadd "queued", link }
  @options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
  @options[:treat_https_as_http] = true unless @options.has_key? :treat_https_as_http

  @debug = @options[:debug]

  @stats = Stats.new(@options.merge(:crawl_id => @crawl_id))

  # start the Sinatra statistics server if requested
  if @options[:web_statistics]
    require "server"
    Server.start(@options)
  end

  @cobweb = Cobweb.new(@options)
end
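As a sketch, these are the option keys the constructor itself inspects; the values shown are the defaults it applies (the :internal_urls pattern is hypothetical), and any other keys pass through to Cobweb:

  crawler = CobwebCrawler.new(
    :redis_options         => {:host => "127.0.0.1"},   # crawl state lives in Redis
    :internal_urls         => ["http://example.com/*"], # patterns treated as in-scope (hypothetical)
    :seed_urls             => [],                       # URLs queued before the crawl starts
    :crawl_linked_external => false,                    # also fetch directly linked external pages
    :treat_https_as_http   => true,                     # normalize https links to http
    :web_statistics        => false,                    # start the Sinatra statistics server
    :debug                 => false
  )
  # :crawl_id defaults to an MD5 of the current time when not supplied.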
Instance Method Details
#crawl(base_url, crawl_options = {}, &block) ⇒ Object
Initiates a crawl starting at the base_url and applying the options supplied. Can also take a block that is executed and passed the content hash and statistics hash.
# File 'lib/cobweb_crawler.rb', line 43

def crawl(base_url, crawl_options = {}, &block)
  @options[:base_url] = base_url unless @options.has_key? :base_url
  @options[:thread_count] = 1 unless @options.has_key? :thread_count

  @options[:internal_urls] << base_url if @options[:internal_urls].empty?
  @redis.sadd("internal_urls", base_url) if @options[:internal_urls].empty?

  @crawl_options = crawl_options

  @redis.sadd("queued", base_url) unless base_url.nil? || @redis.sismember("crawled", base_url) || @redis.sismember("queued", base_url)
  @crawl_counter = @redis.scard("crawled").to_i
  @queue_counter = @redis.scard("queued").to_i

  @threads = []

  begin
    @stats.start_crawl(@options)

    # start the first worker, then top the pool back up to :thread_count
    # while there is still work queued
    @threads << Thread.new do
      Thread.abort_on_exception = true
      spawn_thread(&block)
    end
    sleep 5
    while running_thread_count > 0
      if @queue_counter > 0
        (@options[:thread_count] - running_thread_count).times.each do
          @threads << Thread.new do
            Thread.abort_on_exception = true
            spawn_thread(&block)
          end
        end
      end
      sleep 1
    end
  ensure
    @stats.end_crawl(@options)
  end
  @stats
end
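As a sketch of the block form (only :body is shown from the content hash, since that is the key the crawler itself reads; any other keys come from Cobweb#get):

  crawler = CobwebCrawler.new(:crawl_limit => 10)
  stats = crawler.crawl("http://example.com") do |content, statistics|
    # content[:body] holds the response body; statistics is the current
    # hash from Stats#get_statistics for the page just processed.
    puts "Fetched #{content[:body].to_s.bytesize} bytes"
  end
  # The Stats object is returned once all worker threads have finished.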
#running_thread_count ⇒ Object
# File 'lib/cobweb_crawler.rb', line 162

def running_thread_count
  @threads.map{|t| t.status}.select{|status| status == "run" || status == "sleep"}.count
end
#spawn_thread(&block) ⇒ Object
# File 'lib/cobweb_crawler.rb', line 84

def spawn_thread(&block)
  while @queue_counter > 0 && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > @crawl_counter)
    url = @redis.spop "queued"
    @queue_counter = 0 if url.nil?
    @options[:url] = url
    unless @redis.sismember("crawled", url.to_s)
      begin
        @stats.update_status("Requesting #{url}...")
        content = @cobweb.get(url) unless url.nil?
        if content.nil?
          @queue_counter = @queue_counter - 1
        else
          @stats.update_status("Processing #{url}...")
          @redis.sadd "crawled", url.to_s
          @redis.incr "crawl-counter"

          document_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]).uniq

          # select the link if it's internal (eliminate external links before
          # the expensive lookups in queued and crawled)
          cobweb_links = CobwebLinks.new(@options)
          internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}

          # if the site serves the same content for http and https then normalize to http
          if @options[:treat_https_as_http]
            internal_links.map!{|link| link.gsub(/^https/, "http")}
          end

          # reject the link if we've already crawled or queued it
          internal_links.reject!{|link| @redis.sismember("crawled", link)}
          internal_links.reject!{|link| @redis.sismember("queued", link)}
          internal_links.reject!{|link| link.nil? || link.empty?}

          internal_links.each do |link|
            puts "Added #{link.to_s} to queue" if @debug
            @redis.sadd "queued", link unless link.nil?
            children = @redis.hget("navigation", url)
            children = [] if children.nil?
            children << link
            @redis.hset "navigation", url, children
            @queue_counter += 1
          end

          if @options[:store_inbound_links]
            document_links.each do |target_link|
              target_uri = UriHelper.parse(target_link).normalize
              @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(target_uri.to_s)}", UriHelper.parse(url).to_s)
            end
          end

          @crawl_counter = @redis.scard("crawled").to_i
          @queue_counter = @redis.scard("queued").to_i
          @stats.update_statistics(content, @crawl_counter, @queue_counter)
          @stats.update_status("Completed #{url}.")
          yield content, @stats.get_statistics if block_given?
        end
      rescue => e
        puts "Error loading #{url}: #{e}"
      ensure
        @crawl_counter = @redis.scard("crawled").to_i
        @queue_counter = @redis.scard("queued").to_i
      end
    else
      puts "Already crawled #{@options[:url]}" if @debug
    end
  end
  Thread.exit
end
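The worker's de-duplication hinges on the two Redis sets, "queued" and "crawled": a link is queued only if it appears in neither set, and SPOP hands each queued URL to exactly one thread. A stripped-down sketch of that pattern using the redis-rb client directly (the enqueue helper and URLs are hypothetical, not part of Cobweb):

  require 'redis'

  redis = Redis.new(:host => "127.0.0.1")

  # Hypothetical helper: queue a link only if it has never been seen.
  def enqueue(redis, link)
    return if redis.sismember("crawled", link) || redis.sismember("queued", link)
    redis.sadd("queued", link)
  end

  enqueue(redis, "http://example.com/")

  # SPOP removes and returns an arbitrary member atomically, so several
  # worker threads can share the queue without receiving the same URL twice.
  while (url = redis.spop("queued"))
    redis.sadd("crawled", url)
    # ... fetch url, parse it, and enqueue each discovered link ...
  end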