Class: GeoSpider::Site
- Inherits:
-
Object
- Object
- GeoSpider::Site
- Defined in:
- lib/geo-spider/site.rb
Constant Summary collapse
- DEFAULT_REGEXP =
By default, match every URL.
/.+/
Instance Attribute Summary collapse
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Instance Method Summary collapse
- #each_page(options = {}, &block) ⇒ Object
-
#initialize(url) ⇒ Site
constructor
A new instance of Site.
- #pages(options = {}) ⇒ Object
Constructor Details
#initialize(url) ⇒ Site
Returns a new instance of Site.
9 10 11 |
# File 'lib/geo-spider/site.rb', line 9
#
# Builds a Site rooted at +url+.
#
# @param url [String, URI::Generic] the site's root URL; a String is parsed
#   with URI.parse, while an already-built URI object is stored as given
# @raise [URI::InvalidURIError] if a String +url+ cannot be parsed as a URI
def initialize(url)
  # URI.parse rejects URI objects, so pass those through untouched.
  @url = url.is_a?(URI::Generic) ? url : URI.parse(url)
end
Instance Attribute Details
#url ⇒ Object (readonly)
Returns the value of attribute url.
5 6 7 |
# File 'lib/geo-spider/site.rb', line 5
#
# Read-only accessor for the site's root URL.
#
# @return [Object] the value stored in @url by the constructor
def url
  @url
end
Instance Method Details
#each_page(options = {}, &block) ⇒ Object
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/geo-spider/site.rb', line 13
#
# Crawls the site breadth-first, starting from #url, and yields every fetched
# page whose URL matches the filter. Pages that do not match are still fetched
# so that their internal links can be followed.
#
# Each URL is fetched at most once: a URL is remembered as soon as it is
# queued, so neither a link repeated on one page nor a URL whose fetch failed
# is ever queued again.
#
# @param options [Hash] options handed to every Page, with :site => self
#   merged in; the caller's hash is left untouched
# @option options [Regexp] :regexp (DEFAULT_REGEXP) only pages whose URL
#   matches this pattern are yielded; it is not passed on to Page
# @yield [page] each fetched Page whose URL matches the pattern
# @return [Enumerator] when called without a block
def each_page(options = {}, &block)
  return enum_for(:each_page, options) unless block_given?

  # merge returns a fresh hash, so deleting :regexp cannot leak to the caller.
  page_options = options.merge(:site => self)
  pattern = page_options.delete(:regexp) || DEFAULT_REGEXP

  start = url.to_s
  queue = [start]
  known = { start => true } # every URL that has ever been queued

  until queue.empty?
    current = queue.shift
    begin
      page = Page.new(current, page_options)
      yield page if current =~ pattern

      # Only queue internal links that have never been queued before.
      page.internal_links.each do |link|
        next if known[link]

        known[link] = true
        queue << link
      end
    rescue Timeout::Error, OpenURI::HTTPError, InvalidElement
      # Best effort: give up on this URL and carry on with the rest of the queue.
      next
    end
  end
end
#pages(options = {}) ⇒ Object
36 37 38 39 40 41 42 43 44 |
# File 'lib/geo-spider/site.rb', line 36
#
# Collects every page that #each_page yields into an array.
#
# @param options [Hash] forwarded unchanged to #each_page
# @return [Array] the yielded pages, in the order #each_page produced them
def pages(options = {})
  collected = []
  each_page(options) { |page| collected << page }
  collected
end