Class: GruCrawler::Queue

Inherits:
Object
  • Object
show all
Defined in:
lib/grucrawler/queue.rb

Constant Summary collapse

VISITED_ALREADY_KEY =
'visited_already'
DOMAIN_VISITS_KEY =
'domain_visits'
QUEUE_KEY =
'queue'

Instance Method Summary collapse

Constructor Details

#initialize(namespace, visit_once, domain_wait) ⇒ Queue

Returns a new instance of Queue.



10
11
12
13
14
15
16
17
18
# File 'lib/grucrawler/queue.rb', line 10

# Sets up a queue whose Redis keys are all prefixed with "<namespace>:".
#
# @param namespace [String] prefix prepended (with a trailing colon) to every Redis key
# @param visit_once [Boolean] when truthy, finished URLs are recorded and never handed out again
# @param domain_wait [Numeric] seconds that must elapse between two visits to the same domain
def initialize(namespace, visit_once, domain_wait)
  # Crawl policy supplied by the caller.
  @visit_once = visit_once
  @domain_wait = domain_wait

  # Redis connection (created with the client's defaults) and key prefix.
  @redis = Redis.new
  @rns = namespace + ':'

  # Process-local bookkeeping; none of this is written to Redis.
  @concurrent_requests = 0
  @tmp_block = {}
  # NOTE(review): not read by any of the methods shown alongside this one — confirm it is still needed.
  @domains_throttle = Hash.new(0.0)
end

Instance Method Details

#can_visit_now(url) ⇒ Object



45
46
47
48
49
50
51
52
# File 'lib/grucrawler/queue.rb', line 45

# Tells whether +url+ may be fetched right now.
#
# A URL is refused while it is marked in @tmp_block (set by #next_url,
# cleared by #finished) or while fewer than @domain_wait seconds have passed
# since the last recorded visit to its domain.
#
# @param url [String] candidate URL
# @return [Boolean]
def can_visit_now(url)
  if @tmp_block[url]
    false
  else
    elapsed = Time.now.to_f - last_visit_to_domain(url)
    elapsed > @domain_wait
  end
end

#count ⇒ Object



67
68
69
# File 'lib/grucrawler/queue.rb', line 67

# Current value of the in-flight request counter, which #started increments
# and #finished decrements. This is NOT the number of URLs waiting in the
# Redis queue.
#
# @return [Integer]
def count
  @concurrent_requests
end

#domain(url) ⇒ Object

TODO: Use PublicSuffix to determine the registrable domain instead of the current regex heuristic.



106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/grucrawler/queue.rb', line 106

# Reduces +url+ to the last two labels of its host name, which is the key
# used for per-domain throttling.
#
# The host is lower-cased and a leading "www." is dropped before matching.
# Hyphens are accepted inside labels so that "my-site.com" is not collapsed
# to "site.com".
#
# TODO: PublicSuffix — multi-part suffixes are still handled naively
# ("example.co.uk" yields "co.uk").
#
# @param url [String] absolute URL
# @return [String, nil] e.g. "example.com"; nil when the URL cannot be
#   parsed, has no host, or the host has no dot-separated last two labels
def domain(url)
  begin
    uri = URI.parse(url)
  rescue URI::InvalidURIError
    return nil
  end

  host = uri.host
  return nil if host.nil?

  host = host.downcase
  host = host[4..-1] if host.start_with?('www.')

  # String#[] with a Regexp returns the matched text itself (or nil), so
  # callers get a plain String rather than a MatchData object.
  host[/[\w-]+\.[\w-]+\z/]
end

#finished(url) ⇒ Object



60
61
62
63
64
65
# File 'lib/grucrawler/queue.rb', line 60

# Marks a request as completed.
#
# Clears the temporary block set by #next_url, records the URL as visited
# (only effective when visit-once mode is on), removes it from the Redis
# queue and decrements the in-flight counter.
#
# Both Redis writes are skipped for a nil +url+, so a nil is never stored in
# the visited set; the counter is decremented regardless.
#
# @param url [String, nil] URL whose request has finished
# @return [Integer] the in-flight counter after decrementing
def finished(url)
  @tmp_block.delete(url)

  if url
    set_visited_already(url)
    remove_url_from_queue(url)
  end

  @concurrent_requests -= 1
end

#last_visit_to_domain(url) ⇒ Object



76
77
78
# File 'lib/grucrawler/queue.rb', line 76

# Looks up the stored timestamp of the most recent visit to the domain of
# +url+ in the Redis hash of domain visits.
#
# @param url [String]
# @return [Float] the stored value converted with #to_f; 0.0 when nothing is
#   stored for the domain
def last_visit_to_domain(url)
  visits_key = @rns + DOMAIN_VISITS_KEY
  stored = @redis.hget(visits_key, domain(url))
  stored.to_f
end

#next_url ⇒ Object



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/grucrawler/queue.rb', line 26

# Picks a URL that may be crawled right now.
#
# Draws up to 100 random members from the Redis queue and returns the first
# one that has not been visited already and whose domain is not being
# throttled. The returned URL is marked in @tmp_block so it is not handed out
# again until #finished is called for it.
#
# An empty queue ends the search immediately: nil is never passed on to the
# Redis-backed checks and never stored as a key in @tmp_block.
#
# @return [String, nil] a crawlable URL, or nil when none was found
def next_url
  chosen = nil

  100.times do
    url = random_url_from_queue
    break if url.nil? # queue is empty; further draws would return nil too

    next if visited_already(url) || !can_visit_now(url)

    chosen = url
    break
  end

  @tmp_block[chosen] = true if chosen
  chosen
end

#push(url) ⇒ Object



89
90
91
# File 'lib/grucrawler/queue.rb', line 89

# Adds +url+ to the Redis set that backs the queue.
#
# The Redis client's reply for a single-member SADD differs by version:
# some redis-rb releases return a Boolean, others the Integer count of added
# members. Both shapes are accepted here; comparing against 1 alone would
# always report false on Boolean-returning clients.
#
# @param url [String] URL to enqueue
# @return [Boolean] true when the URL was newly added, false when it was
#   already in the queue
def push(url)
  reply = @redis.sadd(@rns + QUEUE_KEY, url)
  # Explicit comparisons on purpose: 0 is truthy in Ruby.
  reply == true || reply == 1
end

#random_url_from_queue ⇒ Object



85
86
87
# File 'lib/grucrawler/queue.rb', line 85

# Asks Redis for one random member of the queue set (SRANDMEMBER); the member
# is not removed from the set.
#
# @return [Object] whatever the Redis client returns for the call
def random_url_from_queue
  queue_key = @rns + QUEUE_KEY
  @redis.srandmember(queue_key)
end

#remove_url_from_queue(url) ⇒ Object



81
82
83
# File 'lib/grucrawler/queue.rb', line 81

# Removes +url+ from the Redis set that backs the queue (SREM).
#
# @param url [String] URL to drop from the queue
# @return [Object] whatever the Redis client returns for the call
def remove_url_from_queue(url)
  queue_key = @rns + QUEUE_KEY
  @redis.srem(queue_key, url)
end

#reset ⇒ Object



20
21
22
23
24
# File 'lib/grucrawler/queue.rb', line 20

# Deletes this queue's three Redis keys under the current namespace: the
# per-domain visit times, the queue itself and the visited-already set, in
# that order. Instance variables held in memory are left untouched.
#
# @return [Object] the Redis client's reply to the last deletion
def reset
  suffixes = [DOMAIN_VISITS_KEY, QUEUE_KEY, VISITED_ALREADY_KEY]
  replies = suffixes.map { |suffix| @redis.del(@rns + suffix) }
  replies.last
end

#set_last_visit_to_domain(url) ⇒ Object



71
72
73
74
# File 'lib/grucrawler/queue.rb', line 71

# Stores the current wall-clock time (seconds, as a Float) as the latest
# visit to the domain of +url+ in the Redis hash of domain visits.
#
# @param url [String]
# @return [Object] whatever the Redis client returns for the call
def set_last_visit_to_domain(url)
  visits_key = @rns + DOMAIN_VISITS_KEY
  @redis.hset(visits_key, domain(url), Time.now.to_f)
end

#set_visited_already(url) ⇒ Object



99
100
101
102
# File 'lib/grucrawler/queue.rb', line 99

# Records +url+ in the Redis set of visited URLs. Does nothing unless
# visit-once mode (@visit_once) is enabled.
#
# @param url [String]
# @return [Object, nil] the Redis client's reply, or nil when visit-once mode
#   is off
def set_visited_already(url)
  @redis.sadd(@rns + VISITED_ALREADY_KEY, url) if @visit_once
end

#started(url) ⇒ Object



54
55
56
57
58
# File 'lib/grucrawler/queue.rb', line 54

# Marks a request as started: stamps the current time as the last visit to
# the URL's domain and increments the in-flight counter.
#
# @param url [String] URL whose request is starting
# @return [Integer] the in-flight counter after incrementing
def started(url)
  set_last_visit_to_domain(url)
  @concurrent_requests = @concurrent_requests + 1
end

#visited_already(url) ⇒ Object



94
95
96
97
# File 'lib/grucrawler/queue.rb', line 94

# Tells whether +url+ is in the Redis set of visited URLs. Always false when
# visit-once mode (@visit_once) is off, in which case Redis is not queried.
#
# @param url [String]
# @return [Object] false when visit-once mode is off, otherwise the Redis
#   client's reply to SISMEMBER
def visited_already(url)
  @visit_once ? @redis.sismember(@rns + VISITED_ALREADY_KEY, url) : false
end