Class: CoolCrawler::CrawlerPool
- Inherits: Object
- Defined in: lib/cool_crawler.rb
Overview
This is the class that handles the queue and async requests
Instance Attribute Summary collapse
-
#callback ⇒ Object
readonly
Returns the value of attribute callback.
-
#delay ⇒ Object
readonly
Returns the value of attribute delay.
-
#max_connections ⇒ Object
readonly
Returns the value of attribute max_connections.
-
#site ⇒ Object
readonly
Returns the value of attribute site.
-
#uri ⇒ Object
readonly
Returns the value of attribute uri.
Instance Method Summary collapse
- #add_to_visited(path) ⇒ Object
- #after(page, links, body) ⇒ Object
- #enqueue(path) ⇒ Object
- #gather_links_uri(body, page) ⇒ Object
-
#initialize(start, max_connections, delay, max_pages = 50) ⇒ CrawlerPool
constructor
A new instance of CrawlerPool.
- #queue ⇒ Object
- #run ⇒ Object
- #send_crawlers ⇒ Object
- #set_callback(proc) ⇒ Object
- #sorted_visited ⇒ Object
- #sum_pages ⇒ Object
- #visited ⇒ Object
- #visited?(path) ⇒ Boolean
Constructor Details
#initialize(start, max_connections, delay, max_pages = 50) ⇒ CrawlerPool
Returns a new instance of CrawlerPool.
# Builds a crawler pool rooted at +start+.
#
# start           - URL string the crawl begins from
# max_connections - maximum number of concurrent requests per batch
# delay           - seconds to pause between batches
# max_pages       - hard cap on pages fetched (default 50)
def initialize(start, max_connections, delay, max_pages = 50)
  @uri = URI(start)
  @site = "#{uri.scheme}://#{uri.host}"
  @max_connections = max_connections
  @delay = delay
  @max_pages = max_pages
  @visited_pages = 0
  # Seed the crawl with the start page itself.
  visited[uri.path] = 1
  queue << uri.path
end
Instance Attribute Details
#callback ⇒ Object (readonly)
Returns the value of attribute callback.
27 28 29 |
# File 'lib/cool_crawler.rb', line 27 def callback @callback end |
#delay ⇒ Object (readonly)
Returns the value of attribute delay.
27 28 29 |
# File 'lib/cool_crawler.rb', line 27 def delay @delay end |
#max_connections ⇒ Object (readonly)
Returns the value of attribute max_connections.
27 28 29 |
# File 'lib/cool_crawler.rb', line 27 def max_connections @max_connections end |
#site ⇒ Object (readonly)
Returns the value of attribute site.
27 28 29 |
# File 'lib/cool_crawler.rb', line 27 def site @site end |
#uri ⇒ Object (readonly)
Returns the value of attribute uri.
27 28 29 |
# File 'lib/cool_crawler.rb', line 27 def uri @uri end |
Instance Method Details
#add_to_visited(path) ⇒ Object
# Records one more visit for +path+, starting the count at 1 on the
# first sighting.
def add_to_visited(path)
  visited[path] = visited.fetch(path, 0) + 1
end
#after(page, links, body) ⇒ Object
# Fires the user callback (if one was registered) with the crawled
# page path, the links found on it, and the cleaned HTML body.
def after(page, links, body)
  return if callback.nil?

  callback.call(page, links, body)
end
#enqueue(path) ⇒ Object
# Schedules +path+ for crawling unless it has already been visited.
def enqueue(path)
  queue << path unless visited.include?(path)
end
#gather_links_uri(body, page) ⇒ Object
# Extracts same-host link paths from an HTML document.
#
# body - raw HTML string to scan for anchor tags
# page - URI (or URI string) the body was fetched from; used to
#        resolve relative hrefs
#
# Returns an Array of path Strings. Anchors on other hosts,
# fragment-only anchors, and unparseable hrefs are silently skipped.
def gather_links_uri(body, page)
  links = []
  doc = Nokogiri::HTML(body)
  doc.xpath("//a").each do |a|
    href = a["href"]
    next if href.nil?

    begin
      # Drop the fragment, stray backslashes and trailing whitespace
      # before parsing. `split('#')` returns [] for a bare "#", so
      # guard with to_s instead of crashing on nil (bug fix), and the
      # URI() parse now sits inside the rescue so one malformed href
      # no longer aborts the whole scan (bug fix).
      cleaned = href.strip.split('#')[0].to_s.sub(/\\|(\s+$)/, "")
      next if cleaned.empty?

      uri_a = URI(cleaned)
      # Keep only links on the crawled host (or host-relative ones).
      links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
    rescue StandardError
      # Malformed href — skip this anchor rather than abort the crawl.
    end
  end
  links
end
#queue ⇒ Object
# Lazily-initialized FIFO of paths waiting to be crawled.
def queue
  @queue ||= Queue.new
end
#run ⇒ Object
# Drains the queue in batches until it is empty or the page cap is
# reached, pausing +delay+ seconds between batches.
def run
  loop do
    break if queue.empty? || @visited_pages >= @max_pages

    send_crawlers
    sleep(delay)
  end
end
#send_crawlers ⇒ Object
# Pops up to +max_connections+ paths off the queue (respecting the
# page cap) and fetches them concurrently, feeding newly discovered
# links back into the queue.
#
# Fix: the receiver of Async::Barrier.new / .async / .wait was missing
# (` = Async::Barrier.new`), which is invalid Ruby; restored as the
# local `barrier` so tasks are spawned and awaited on the barrier.
def send_crawlers
  batch = []
  until queue.empty? || batch.size >= max_connections || @visited_pages >= @max_pages
    batch << queue.pop
    @visited_pages += 1
  end
  Async do
    internet = Async::HTTP::Internet.new
    barrier = Async::Barrier.new
    batch.each do |page|
      barrier.async do
        response = internet.get URI.join(@site, page).to_s
        body = Nokogiri::HTML(response.read)
        # Strip non-content nodes before handing the page to the callback.
        body.search('//img').remove
        body.search('//style').remove
        body.search('//script').remove
        links = gather_links_uri(body.to_s, URI.join(uri, page))
        after(page, links, body.to_s)
        links.each do |link|
          enqueue(link)
          add_to_visited(link)
        end
      end
    end
    barrier.wait
  ensure
    # Close the HTTP client even if a task raised.
    internet&.close
  end
end
#set_callback(proc) ⇒ Object
# Registers a proc invoked as (page, links, body) after each fetch.
# (Local param renamed: `proc` shadows Kernel#proc.)
def set_callback(handler)
  @callback = handler
end
#sorted_visited ⇒ Object
# Visit-count pairs ordered from least- to most-visited path.
def sorted_visited
  visited.sort_by { |_path, count| count }
end
#sum_pages ⇒ Object
# Total number of page visits recorded (sum of all per-path counts).
# Replaces a manual accumulator loop with idiomatic Enumerable#sum.
def sum_pages
  visited.values.sum
end
#visited ⇒ Object
# Lazily-initialized map of path => visit count.
def visited
  @visited ||= {}
end
#visited?(path) ⇒ Boolean
# True when +path+ has already been recorded in +visited+.
def visited?(path)
  visited.key?(path)
end