Class: GeoSpider::Site

Inherits:
Object
  • Object
show all
Defined in:
lib/geo-spider/site.rb

Constant Summary collapse

DEFAULT_REGEXP =

By default, match every URL.

/.+/

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ Site

Returns a new instance of Site.



9
10
11
# File 'lib/geo-spider/site.rb', line 9

# Builds a site rooted at the given address.
#
# @param url [String] address of the site; it is handed to URI.parse and
#   the parsed result is kept in @url
def initialize(url)
  parsed = URI.parse(url)
  @url = parsed
end

Instance Attribute Details

#url ⇒ Object (readonly)

Returns the value of attribute url.



5
6
7
# File 'lib/geo-spider/site.rb', line 5

# Reader for the @url instance variable, which #initialize sets to the
# result of URI.parse on the address it was given.
def url
  @url
end

Instance Method Details

#each_page(options = {}, &block) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/geo-spider/site.rb', line 13

# Crawls the site breadth-first, starting from +url+, and yields every
# fetched page whose URL matches the filter.
#
# Each URL is attempted at most once per crawl, even when a page links to it
# several times or when fetching it fails. Pages that do not match the filter
# are still fetched so that their internal links can be followed.
#
# URLs whose fetch raises Timeout::Error, OpenURI::HTTPError or
# InvalidElement are skipped. Note that the same errors raised from inside
# the caller's block are swallowed too, and that page's links are then not
# followed.
#
# @param options [Hash] forwarded to Page.new together with :site => self.
#   The :regexp entry is consumed here and not forwarded. The caller's hash
#   is left untouched.
# @option options [Regexp] :regexp only pages whose URL matches are yielded
#   (defaults to DEFAULT_REGEXP)
# @yield [page] each successfully built Page whose URL matches the filter
# @return [nil] when a block is given
# @return [Enumerator] over the matching pages when no block is given
def each_page(options = {}, &block)
  return enum_for(:each_page, options) unless block_given?

  # merge returns a fresh hash, so deleting :regexp below never touches the
  # hash the caller passed in.
  page_options = options.merge(:site => self)
  regexp = page_options.delete(:regexp) || DEFAULT_REGEXP

  start = self.url.to_s
  queue = [start]
  # Every URL that has ever been put on the queue. Recording at enqueue time
  # (rather than after a successful fetch) prevents duplicate links and
  # failing URLs from being queued again.
  enqueued = { start => true }

  until queue.empty?
    current = queue.shift
    begin
      page = Page.new(current, page_options)
      yield page if current =~ regexp

      page.internal_links.each do |link|
        next if enqueued.key?(link)

        enqueued[link] = true
        queue << link
      end
    rescue Timeout::Error, OpenURI::HTTPError, InvalidElement
      next
    end
  end
end

#pages(options = {}) ⇒ Object



36
37
38
39
40
41
42
43
44
# File 'lib/geo-spider/site.rb', line 36

# Runs a full crawl via #each_page and gathers every yielded page.
#
# @param options [Hash] passed straight through to #each_page
# @return [Array] the yielded pages, in the order #each_page produced them
def pages(options = {})
  [].tap do |collected|
    each_page(options) { |page| collected << page }
  end
end