Class: Spider
Overview
Young Spider
Instance Method Summary collapse
- #crawl_domain(url, page_limit = 100) ⇒ Object
- #crawl_web(urls, depth = 2, page_limit = 100) ⇒ Object
- #initialize ⇒ Spider (constructor): A new instance of Spider.
Methods included from UrlUtils
#create_abs_url_from_ctx, #create_absolute_url_from_base, #get_domain, #make_absolute, #relative?, #remove_extra_paths, #urls_on_same_domain?
Constructor Details
#initialize ⇒ Spider
Returns a new instance of Spider.
11 12 13 |
# File 'lib/spider.rb', line 11
#
# Sets up a new spider whose visited-page registry starts out empty.
# The registry is a Hash stored in @already_visited; the crawl methods
# look URLs up in it by key.
def initialize
  @already_visited = {}
end
Instance Method Details
#crawl_domain(url, page_limit = 100) ⇒ Object
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/spider.rb', line 36
#
# Recursively crawls +url+ and every not-yet-visited link found on it that
# urls_on_same_domain? reports as sharing the domain of the page it was
# found on. Each successfully opened and parsed page is recorded in
# @already_visited; crawling stops once that registry holds +page_limit+
# entries.
#
# @param url [String] page to start from (presumably an absolute URL; the
#   fetching helpers are defined elsewhere)
# @param page_limit [Integer] maximum number of pages to record
# @return [Object] nil on an early exit, otherwise the list of URLs found on
#   the page; callers should rely on the recorded side effects instead
def crawl_domain(url, page_limit = 100)
  # >= rather than == so a pre-populated registry or a tiny limit still stops.
  return if @already_visited.size >= page_limit

  url_object = open_url(url)
  return if url_object.nil?

  parsed_doc = parse_url(url_object)
  return if parsed_doc.nil?

  # Assignment, not comparison: without it nothing is ever recorded and the
  # recursion below can never terminate on cyclic links.
  @already_visited[url] = true

  find_urls_on_page(parsed_doc, url).each do |page_url|
    next unless urls_on_same_domain?(url, page_url)
    # Re-checked on every iteration because deeper recursion may have
    # visited this link in the meantime.
    next if @already_visited.key?(page_url)

    # Pass the caller's limit along; otherwise nested calls use the default.
    crawl_domain(page_url, page_limit)
  end
end
#crawl_web(urls, depth = 2, page_limit = 100) ⇒ Object
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/spider.rb', line 15
#
# Crawls level by level: every URL in +urls+ is opened and parsed, recorded
# in @already_visited, and the links found on it (minus those already
# recorded) become the URL list for the next level. Runs for +depth+ levels
# or until @already_visited holds +page_limit+ entries, whichever is first.
#
# @param urls [Array<String>] seed URLs for the first level
# @param depth [Integer] number of levels to crawl
# @param page_limit [Integer] maximum number of pages to record
# @return [Object] nil when the page limit ends the crawl, otherwise +depth+
#   (the result of Integer#times); callers should rely on side effects
def crawl_web(urls, depth = 2, page_limit = 100)
  depth.times do
    next_urls = []

    urls.each do |url|
      url_object = open_url(url)
      next if url_object.nil?

      # NOTE(review): helper name is spelled "upate" in the original call;
      # its definition is not visible here, so the spelling is kept.
      resolved_url = upate_url_if_redirected(url_object)
      parsed_doc = parse_url(url_object)
      next if parsed_doc.nil?

      # Assignment, not comparison: without it the registry stays empty, so
      # the page limit never triggers and the filter below removes nothing.
      @already_visited[resolved_url] = true
      # >= rather than == so a pre-populated registry still stops the crawl.
      return if @already_visited.size >= page_limit

      # Array union appends the new links and drops duplicates in one step.
      next_urls |= find_urls_on_page(parsed_doc, resolved_url) - @already_visited.keys
    end

    urls = next_urls
  end
end