Class: GeoSpider::Page
- Inherits:
-
Object
- Object
- GeoSpider::Page
- Defined in:
- lib/geo-spider/page.rb
Constant Summary collapse
- DEFAULT_CONTENT_CSS_SELECTOR =
Find locations within the entire body by default
"body"
- DEFAULT_TITLE_CSS_SELECTOR =
Use the title in the head by default
"title"
- DO_NOT_SPIDER_REGEXP =
Do not spider URLs ending in these file extensions by default
/(mp3|m4a|mov|jpg|png|gif|zip|pdf)$/i
Instance Attribute Summary collapse
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Instance Method Summary collapse
-
#initialize(url, options = {}) ⇒ Page
constructor
Create a new page based on the URL.
-
#internal_links ⇒ Object
Returns a unique array of internal URLs present in the page as strings, normalized to remove anchors.
-
#links ⇒ Object
Returns a unique array of URLs present in the page as strings, normalized to remove anchors.
-
#locations ⇒ Object
Returns an array of Location objects based on the locations found in the page.
- #title ⇒ Object
Constructor Details
#initialize(url, options = {}) ⇒ Page
Create a new page based on the URL.
16 17 18 19 20 21 22 23 |
# File 'lib/geo-spider/page.rb', line 16
#
# Create a new page based on the URL.
#
# @param url [Object] stored as-is in @url and exposed through #url
# @param options [Hash] optional settings, looked up by symbol key:
#   :site, :content_css_selector, :title_css_selector, :do_not_spider_regexp.
#   Missing (or nil/false) selector and regexp entries fall back to the
#   corresponding class constants.
def initialize(url, options = {})
  @url = url
  # May be nil; #internal_links refuses to run in that case.
  @site = options[:site]
  @content_css_selector = options[:content_css_selector] || DEFAULT_CONTENT_CSS_SELECTOR
  @title_css_selector = options[:title_css_selector] || DEFAULT_TITLE_CSS_SELECTOR
  @do_not_spider_regexp = options[:do_not_spider_regexp] || DO_NOT_SPIDER_REGEXP
  # NOTE(review): presumably called here so the document is loaded eagerly and
  # any failure surfaces at construction time — confirm against hpricot_doc.
  hpricot_doc
end
Instance Attribute Details
#url ⇒ Object (readonly)
Returns the value of attribute url.
8 9 10 |
# File 'lib/geo-spider/page.rb', line 8
#
# Returns the value of attribute url (read-only).
#
# @return [Object] whatever was stored in @url
def url
  @url
end
Instance Method Details
#internal_links ⇒ Object
Returns a unique array of internal URLs present in the page as strings, normalized to remove anchors. Needs the page to know what site it is part of, or it cannot decide what is an internal link.
47 48 49 50 |
# File 'lib/geo-spider/page.rb', line 47
#
# Returns the subset of #links for which internal_url? is true.
#
# @return [Array] the internal links, in the order #links returned them
# @raise [RuntimeError] if @site is nil, since without a site there is no
#   way to decide what counts as internal
def internal_links
  raise("Cannot discover internal links without knowing what site this page is part of.") if @site.nil?

  links.select { |l| internal_url?(l) }
end
#links ⇒ Object
Returns a unique array of URLs present in the page as strings, normalized to remove anchors.
39 40 41 42 43 |
# File 'lib/geo-spider/page.rb', line 39
#
# Returns a unique array of URLs present in the page, normalized via
# normalize_url, with rejected URLs filtered out.
#
# @return [Array] one entry per distinct normalized href
def links
  # Only anchors that actually carry an href attribute are considered.
  hrefs = hpricot_doc.search("a[@href]").map do |a|
    normalize_url(a.attributes["href"])
  end

  # De-duplicate first so rejected_url? runs once per distinct URL.
  hrefs.uniq.reject { |b| rejected_url?(b) }
end
#locations ⇒ Object
Returns an array of Location objects based on the locations found in the page.
31 32 33 34 35 |
# File 'lib/geo-spider/page.rb', line 31
#
# Returns the locations found in the page's content element.
#
# The element matched by @content_css_selector is handed to
# Extractors::Master, and whatever its #locations returns is passed through.
#
# @return [Object] the result of Extractors::Master#locations
#   (documented as an array of Location objects)
def locations
  body_element = hpricot_doc.at(@content_css_selector)
  master_extractor = Extractors::Master.new(body_element)
  master_extractor.locations
end
#title ⇒ Object
25 26 27 |
# File 'lib/geo-spider/page.rb', line 25
#
# Returns the text of the first element matching @title_css_selector.
#
# @return [String, nil] the element's inner text, or nil when the document
#   has no matching element (instead of raising NoMethodError on nil)
def title
  element = hpricot_doc.at(@title_css_selector)
  element && element.inner_text
end