Class: CoolCrawler::CrawlerPool

Inherits:
Object
  • Object
show all
Defined in:
lib/cool_crawler.rb

Overview

Manages the queue of paths waiting to be crawled and issues the asynchronous HTTP requests that fetch them.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(start, max_connections, delay, max_pages = 50) ⇒ CrawlerPool

Returns a new instance of CrawlerPool.



16
17
18
19
20
21
22
23
24
25
# File 'lib/cool_crawler.rb', line 16

# Prepares a crawl rooted at +start+ and seeds the work queue with its path.
#
# @param start [String] URL the crawl begins from (anything Kernel#URI accepts)
# @param max_connections [Integer] most paths taken from the queue per batch
# @param delay [Numeric] value handed to sleep after each batch
# @param max_pages [Integer] total number of paths the crawl may dequeue
def initialize(start, max_connections, delay, max_pages=50)
  @uri = URI(start)
  @site = "#{uri.scheme}://#{uri.host}"
  @max_connections, @delay, @max_pages = max_connections, delay, max_pages
  @visited_pages = 0

  # The starting path is both queued for fetching and recorded as seen once.
  first_path = uri.path
  queue << first_path
  visited[first_path] = 1
end

Instance Attribute Details

#callback ⇒ Object (readonly)

Returns the value of attribute callback.



27
28
29
# File 'lib/cool_crawler.rb', line 27

# Returns the object stored by #set_callback, which #after invokes with
# (page, links, body). The constructor does not assign it.
def callback
  @callback
end

#delay ⇒ Object (readonly)

Returns the value of attribute delay.



27
28
29
# File 'lib/cool_crawler.rb', line 27

# Returns the +delay+ constructor argument; #run passes it to sleep after
# each batch of requests.
def delay
  @delay
end

#max_connections ⇒ Object (readonly)

Returns the value of attribute max_connections.



27
28
29
# File 'lib/cool_crawler.rb', line 27

# Returns the +max_connections+ constructor argument; #send_crawlers uses it
# as the upper bound on paths dequeued per batch.
def max_connections
  @max_connections
end

#site ⇒ Object (readonly)

Returns the value of attribute site.



27
28
29
# File 'lib/cool_crawler.rb', line 27

# Returns the "scheme://host" string the constructor builds from the start URL.
def site
  @site
end

#uri ⇒ Object (readonly)

Returns the value of attribute uri.



27
28
29
# File 'lib/cool_crawler.rb', line 27

# Returns the URI the constructor parsed from its +start+ argument.
def uri
  @uri
end

Instance Method Details

#add_to_visited(path) ⇒ Object



102
103
104
105
106
107
108
# File 'lib/cool_crawler.rb', line 102

# Bumps the counter kept for +path+ in the visited table, starting at 1 the
# first time the path is seen.
#
# @param path [String] path whose counter is incremented
# @return [Integer] the counter value after the increment
def add_to_visited(path)
  visited[path] = visited.fetch(path, 0) + 1
end

#after(page, links, body) ⇒ Object



40
41
42
# File 'lib/cool_crawler.rb', line 40

# Hands a processed page to the registered callback, if there is one.
#
# @param page [String] path of the page that was fetched
# @param links [Array<String>] paths gathered from that page
# @param body [String] HTML of the page as passed in by the caller
# @return [Object, nil] whatever the callback returns, or nil when no callback is set
def after(page, links, body)
  callback&.call(page, links, body)
end

#enqueue(path) ⇒ Object



114
115
116
117
118
# File 'lib/cool_crawler.rb', line 114

# Adds +path+ to the work queue unless the visited table already has an
# entry for it.
#
# @param path [String] candidate path
# @return [Queue, nil] the queue when the path was added, nil when it was skipped
def enqueue(path)
  return if visited.key?(path)

  queue << path
end


75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/cool_crawler.rb', line 75

# Collects the path of every <a> link in +body+ that stays on the crawled host.
#
# A link qualifies when its href has no host (relative) or its host equals
# uri.host, and the parsed href exposes a path. Qualifying hrefs are resolved
# against +page+ and only the path component is kept; duplicates are not removed.
# Hrefs that cannot be parsed or joined are skipped rather than raised.
#
# @param body [String] HTML to scan
# @param page [URI, String] absolute URL of the page the HTML came from
# @return [Array<String>] resolved paths, in document order
def gather_links_uri(body, page)
  links = []
  doc = Nokogiri::HTML(body)
  doc.xpath("//a").each do |a|
    href = a["href"]
    next if href.nil?

    # Drop the fragment. split returns [] for "" and "#", so fall back to an
    # empty reference (which resolves to the page itself) instead of calling
    # sub on nil.
    target = href.strip.split('#').first.to_s.sub(/\\|(\s+$)/, "")

    begin
      # Parsing sits inside the rescue so a malformed href cannot abort the page.
      uri_a = URI(target)
      links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
    rescue URI::Error
      # Unparseable or unjoinable href: ignore this link and keep going.
    end
  end
  links
end

#queue ⇒ Object



90
91
92
# File 'lib/cool_crawler.rb', line 90

# Returns the queue of paths waiting to be fetched, creating it on first use.
#
# @return [Queue] the same instance on every call
def queue
  return @queue if @queue

  @queue = Queue.new
end

#run ⇒ Object



33
34
35
36
37
38
# File 'lib/cool_crawler.rb', line 33

# Drives the crawl: keeps sending batches while paths remain queued and the
# page budget (@max_pages) has not been used up, sleeping for +delay+ after
# each batch.
#
# @return [nil]
def run
  while !queue.empty? && @visited_pages < @max_pages
    send_crawlers
    sleep(delay)
  end
end

#send_crawlers ⇒ Object



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/cool_crawler.rb', line 44

# Takes one batch of paths off the queue and processes each of them inside an
# Async block.
#
# A batch holds at most max_connections paths and is cut short once
# @visited_pages reaches @max_pages; every dequeued path is counted toward
# @visited_pages before it is fetched. For each page the response is parsed,
# img/style/script elements are removed, links are gathered, #after is called,
# and every gathered link is enqueued and counted.
#
# NOTE(review): nothing here rescues errors raised while fetching or parsing a
# page — confirm how a failing page task affects the rest of the batch.
def send_crawlers
  pages = []
  until queue.empty? || pages.size >= max_connections || @visited_pages >= @max_pages
    pages << queue.pop
    @visited_pages += 1
  end
  Async do
    internet = Async::HTTP::Internet.new
    barrier = Async::Barrier.new

    pages.each do |page|
      # One barrier task per page in the batch.
      barrier.async do
        response = internet.get URI.join(@site, page).to_s
        body = Nokogiri::HTML(response.read)
        # Strip img, style and script elements; the stripped document is what
        # both link gathering and the callback receive.
        body.search('//img').remove
        body.search('//style').remove
        body.search('//script').remove
        # The page's absolute URL is passed so relative hrefs can be resolved.
        links = gather_links_uri(body.to_s, URI.join(uri, page))
        after(page, links, body.to_s)
        links.each do |link|
          # Order matters: enqueue skips paths already present in visited, and
          # add_to_visited is what records them.
          enqueue(link)
          add_to_visited(link)
        end
      end
    end
    # NOTE(review): presumably blocks until every page task has finished —
    # confirm against the Async::Barrier documentation.
    barrier.wait
  ensure
    # internet is still nil if Async::HTTP::Internet.new raised, hence `&.`.
    internet&.close
  end
end

#set_callback(proc) ⇒ Object



29
30
31
# File 'lib/cool_crawler.rb', line 29

# Registers the handler that #after invokes for every processed page.
#
# The handler may be given as a callable argument or as a block; when both are
# supplied the argument wins. Passing nil without a block clears the handler.
#
# @param proc [#call, nil] object responding to call(page, links, body)
# @yield [page, links, body] used as the handler when +proc+ is nil
# @return [#call, nil] the handler that was stored
def set_callback(proc = nil, &block)
  @callback = proc || block
end

#sorted_visited ⇒ Object



110
111
112
# File 'lib/cool_crawler.rb', line 110

# Lists the visited table as [path, count] pairs ordered by ascending count.
#
# @return [Array<Array(String, Integer)>] pairs sorted by their count
def sorted_visited
  visited.sort_by(&:last)
end

#sum_pages ⇒ Object



120
121
122
123
124
125
126
# File 'lib/cool_crawler.rb', line 120

# Adds up the counters of every path in the visited table.
#
# @return [Integer] total of all counts; 0 when the table is empty
def sum_pages
  visited.values.sum
end

#visited ⇒ Object



94
95
96
# File 'lib/cool_crawler.rb', line 94

# Returns the table mapping each seen path to its counter, creating it on
# first use.
#
# @return [Hash] the same instance on every call
def visited
  @visited || (@visited = {})
end

#visited?(path) ⇒ Boolean

Returns:

  • (Boolean)


98
99
100
# File 'lib/cool_crawler.rb', line 98

# Tells whether +path+ already has an entry in the visited table.
#
# @param path [String] path to look up
# @return [Boolean]
def visited?(path)
  visited.key?(path)
end