Class: Spider

Inherits:
Object
  • Object
show all
Includes:
UrlUtils
Defined in:
lib/spider.rb

Overview

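Young Spider: a small web crawler that can crawl outward from a list of URLs (#crawl_web) or stay within a single domain (#crawl_domain).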

Instance Method Summary collapse

Methods included from UrlUtils

#create_abs_url_from_ctx, #create_absolute_url_from_base, #get_domain, #make_absolute, #relative?, #remove_extra_paths, #urls_on_same_domain?

Constructor Details

#initialize ⇒ Spider

Returns a new instance of Spider.



11
12
13
# File 'lib/spider.rb', line 11

# Creates a spider with no recorded visits.
def initialize
  # Hash consulted by the crawl methods: they look pages up in it by URL and
  # compare its size against their page limit.
  @already_visited = {}
end

Instance Method Details

#crawl_domain(url, page_limit = 100) ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/spider.rb', line 36

# Recursively crawls pages reachable from +url+ that are on the same domain.
#
# Every page that is opened and parsed successfully is recorded in
# @already_visited; links that are already recorded are not followed again,
# which is what keeps cyclic links from recursing forever.
#
# @param url [Object] URL of the page to crawl (presumably a String —
#   confirm against open_url)
# @param page_limit [Integer] stop crawling once this many pages have been
#   recorded as visited
# @return [Object, nil] nil when the crawl of this page is skipped (limit
#   reached, page could not be opened or parsed); otherwise the list of URLs
#   found on the page
def crawl_domain(url, page_limit = 100)
  # >= rather than == so the crawl still stops if the visited set was already
  # past the limit when this call started.
  return if @already_visited.size >= page_limit

  url_object = open_url(url)
  return if url_object.nil?

  parsed_doc = parse_url(url_object)
  return if parsed_doc.nil?

  # Record the visit (the original used `==` here, which recorded nothing).
  @already_visited[url] = true

  find_urls_on_page(parsed_doc, url).each do |page_url|
    next unless urls_on_same_domain?(url, page_url)
    next if @already_visited.key?(page_url)

    # Forward the caller's limit; otherwise nested calls fall back to the
    # default of 100.
    crawl_domain(page_url, page_limit)
  end
end

#crawl_web(urls, depth = 2, page_limit = 100) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/spider.rb', line 15

# Crawls outward from +urls+ level by level: each pass visits the current
# list of URLs and collects the links found on those pages as the list for
# the next pass.
#
# Every page that is opened and parsed successfully is recorded in
# @already_visited under the URL returned by upate_url_if_redirected.
#
# @param urls [Array] URLs to start from
# @param depth [Integer] number of passes (levels) to crawl
# @param page_limit [Integer] stop the whole crawl once this many pages have
#   been recorded as visited
# @return [Integer, nil] nil when the crawl stops early because the page
#   limit was reached; otherwise +depth+
def crawl_web(urls, depth = 2, page_limit = 100)
  depth.times do
    next_urls = []
    urls.each do |url|
      url_object = open_url(url)
      next if url_object.nil?

      # NOTE(review): the helper name looks like a typo for
      # "update_url_if_redirected"; it is kept as-is because its definition
      # is outside this method.
      current_url = upate_url_if_redirected(url_object)
      parsed_doc = parse_url(url_object)
      next if parsed_doc.nil?

      # Record the visit (the original used `==` here, which recorded
      # nothing, so the limit below could never be reached).
      @already_visited[current_url] = true
      return if @already_visited.size >= page_limit

      # Array#| appends only links not already queued for the next level.
      next_urls |= find_urls_on_page(parsed_doc, current_url) - @already_visited.keys
    end
    # A link can be queued before its page is visited later in this same
    # level; drop those so the next level does not fetch them again.
    urls = next_urls.reject { |candidate| @already_visited.key?(candidate) }
  end
end