Class: GruCrawler::Queue
- Inherits:
- Object
- Object
- GruCrawler::Queue
- Defined in:
- lib/grucrawler/queue.rb
Constant Summary collapse
- VISITED_ALREADY_KEY =
'visited_already'
- DOMAIN_VISITS_KEY =
'domain_visits'
- QUEUE_KEY =
'queue'
Instance Method Summary collapse
- #can_visit_now(url) ⇒ Object
- #count ⇒ Object
- #domain(url) ⇒ Object
TODO: PublicSuffix.
- #finished(url) ⇒ Object
- #initialize(namespace, visit_once, domain_wait) ⇒ Queue
constructor
A new instance of Queue.
- #last_visit_to_domain(url) ⇒ Object
- #next_url ⇒ Object
- #push(url) ⇒ Object
- #random_url_from_queue ⇒ Object
- #remove_url_from_queue(url) ⇒ Object
- #reset ⇒ Object
- #set_last_visit_to_domain(url) ⇒ Object
- #set_visited_already(url) ⇒ Object
- #started(url) ⇒ Object
- #visited_already(url) ⇒ Object
Constructor Details
#initialize(namespace, visit_once, domain_wait) ⇒ Queue
Returns a new instance of Queue.
10 11 12 13 14 15 16 17 18 |
# File 'lib/grucrawler/queue.rb', line 10
#
# Sets up a queue backed by a fresh Redis connection. Every Redis key used
# by the queue is prefixed with "<namespace>:".
#
# @param namespace [String] prefix placed in front of every Redis key
# @param visit_once [Boolean] when truthy, finished URLs are remembered and
#   reported as already visited afterwards
# @param domain_wait [Numeric] seconds that must elapse between two visits
#   to the same domain
def initialize(namespace, visit_once, domain_wait)
  # Storage.
  @redis = Redis.new
  @rns = namespace + ':'

  # Crawl policy.
  @visit_once = visit_once
  @domain_wait = domain_wait

  # In-process bookkeeping.
  @tmp_block = {}
  @concurrent_requests = 0
  @domains_throttle = Hash.new(0.0)
end
Instance Method Details
#can_visit_now(url) ⇒ Object
45 46 47 48 49 50 51 52 |
# File 'lib/grucrawler/queue.rb', line 45
#
# Tells whether +url+ may be fetched right now.
#
# A URL is refused while it is flagged in the temporary block table, or
# while no more than +@domain_wait+ seconds have passed since the last
# recorded visit to its domain.
#
# @param url [String]
# @return [Boolean]
def can_visit_now(url)
  if @tmp_block[url]
    false
  else
    previous_visit = last_visit_to_domain(url)
    (Time.now.to_f - previous_visit) > @domain_wait
  end
end
#count ⇒ Object
67 68 69 |
# File 'lib/grucrawler/queue.rb', line 67
#
# Current value of the in-flight request counter.
#
# @return [Integer]
def count
  @concurrent_requests
end
#domain(url) ⇒ Object
TODO: PublicSuffix
106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
# File 'lib/grucrawler/queue.rb', line 106
#
# Derives the key used for per-domain bookkeeping from a URL: the last two
# dot-separated labels of the lower-cased host, with a leading "www."
# removed (e.g. "http://www.blog.example.com/" => "example.com").
#
# TODO: PublicSuffix
#
# @param url [String, nil] URL to inspect
# @return [String, nil] the domain, or nil when the URL cannot be parsed,
#   has no host, or the host has no dot-separated suffix
def domain(url)
  host = URI.parse(url.to_s).host
  return nil if host.nil?

  host = host.downcase
  host = host[4..-1] if host.start_with?('www.')

  # String#[] with a regexp yields the matched text (or nil), so callers get
  # a plain String rather than a MatchData. Hyphens are allowed so that
  # hosts such as "my-site.com" are not truncated to "site.com".
  host[/[\w-]+\.[\w-]+\z/]
rescue URI::InvalidURIError
  nil
end
#finished(url) ⇒ Object
60 61 62 63 64 65 |
# File 'lib/grucrawler/queue.rb', line 60
#
# Bookkeeping for a request that has completed: lifts the temporary block
# on +url+, records it as visited, removes it from the queue (only when
# +url+ is truthy) and decrements the in-flight counter.
#
# @param url [String, nil]
# @return [Integer] the in-flight counter after the decrement
def finished(url)
  @tmp_block.delete(url)
  set_visited_already(url)

  if url
    remove_url_from_queue(url)
  end

  @concurrent_requests = @concurrent_requests - 1
end
#last_visit_to_domain(url) ⇒ Object
76 77 78 |
# File 'lib/grucrawler/queue.rb', line 76
#
# Looks up the stored timestamp of the last visit to the domain of +url+
# in the domain-visits hash.
#
# @param url [String]
# @return [Float] the stored value converted with #to_f (0.0 when nothing
#   is stored, since nil.to_f is 0.0)
def last_visit_to_domain(url)
  visits_key = @rns + DOMAIN_VISITS_KEY
  stored = @redis.hget(visits_key, domain(url))
  stored.to_f
end
#next_url ⇒ Object
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
# File 'lib/grucrawler/queue.rb', line 26
#
# Picks a URL to crawl next.
#
# Draws random members from the queue (at most 100 attempts) until one is
# found that has not been visited already and whose domain may be visited
# now. The chosen URL is flagged in the temporary block table so it is not
# handed out again while in flight.
#
# @return [String, nil] the chosen URL, or nil when the queue yielded
#   nothing or no eligible URL was found within the attempt limit
def next_url
  chosen = nil

  100.times do
    candidate = random_url_from_queue
    # A nil candidate means the queue had nothing to offer; retrying is pointless.
    break if candidate.nil?
    next if visited_already(candidate) || !can_visit_now(candidate)

    chosen = candidate
    break
  end

  # Only real URLs are blocked; a nil key must never enter the block table.
  @tmp_block[chosen] = true if chosen
  chosen
end
#push(url) ⇒ Object
89 90 91 |
# File 'lib/grucrawler/queue.rb', line 89
#
# Adds +url+ to the queue set.
#
# @param url [String]
# @return [Boolean] true when the URL was newly added, false when it was
#   already queued
def push(url)
  reply = @redis.sadd(@rns + QUEUE_KEY, url)
  # Depending on the redis-rb version, a single-member SADD replies with a
  # Boolean or with the Integer count of added members. Accept both; a plain
  # truthiness check would be wrong because 0 is truthy in Ruby.
  [true, 1].include?(reply)
end
#random_url_from_queue ⇒ Object
85 86 87 |
# File 'lib/grucrawler/queue.rb', line 85
#
# Asks Redis for a random member of the queue set without removing it.
#
# @return [Object] whatever SRANDMEMBER replies for the queue key
def random_url_from_queue
  queue_key = @rns + QUEUE_KEY
  @redis.srandmember(queue_key)
end
#remove_url_from_queue(url) ⇒ Object
81 82 83 |
# File 'lib/grucrawler/queue.rb', line 81
#
# Removes +url+ from the queue set.
#
# @param url [String]
# @return [Object] the SREM reply, passed through unchanged
def remove_url_from_queue(url)
  queue_key = @rns + QUEUE_KEY
  @redis.srem(queue_key, url)
end
#reset ⇒ Object
20 21 22 23 24 |
# File 'lib/grucrawler/queue.rb', line 20
#
# Deletes the three namespaced Redis keys used by the queue: domain visits,
# the queue itself and the visited-already set, in that order, one DEL call
# per key.
#
# @return [Object] the reply of the last DEL call
def reset
  suffixes = [DOMAIN_VISITS_KEY, QUEUE_KEY, VISITED_ALREADY_KEY]
  replies = suffixes.map { |suffix| @redis.del(@rns + suffix) }
  replies.last
end
#set_last_visit_to_domain(url) ⇒ Object
71 72 73 74 |
# File 'lib/grucrawler/queue.rb', line 71
#
# Stores the current wall-clock time (Float seconds) as the last visit to
# the domain of +url+ in the domain-visits hash.
#
# @param url [String]
# @return [Object] the HSET reply, passed through unchanged
def set_last_visit_to_domain(url)
  now = Time.now.to_f
  visits_key = @rns + DOMAIN_VISITS_KEY
  @redis.hset(visits_key, domain(url), now)
end
#set_visited_already(url) ⇒ Object
99 100 101 102 |
# File 'lib/grucrawler/queue.rb', line 99
#
# Records +url+ in the visited-already set. Does nothing (and returns nil)
# unless the queue was created with a truthy +visit_once+.
#
# @param url [String]
# @return [Object, nil] the SADD reply, or nil when visit-once is off
def set_visited_already(url)
  @redis.sadd(@rns + VISITED_ALREADY_KEY, url) if @visit_once
end
#started(url) ⇒ Object
54 55 56 57 58 |
# File 'lib/grucrawler/queue.rb', line 54
#
# Bookkeeping for a request that is about to start: stamps the domain of
# +url+ with the current time and increments the in-flight counter.
#
# @param url [String]
# @return [Integer] the in-flight counter after the increment
def started(url)
  set_last_visit_to_domain(url)
  @concurrent_requests = @concurrent_requests + 1
end
#visited_already(url) ⇒ Object
94 95 96 97 |
# File 'lib/grucrawler/queue.rb', line 94
#
# Tells whether +url+ is in the visited-already set. Always false when the
# queue was created with a falsy +visit_once+.
#
# @param url [String]
# @return [Object] false when visit-once is off, otherwise the SISMEMBER
#   reply passed through unchanged
def visited_already(url)
  if @visit_once
    @redis.sismember(@rns + VISITED_ALREADY_KEY, url)
  else
    false
  end
end