Class: GeoSpider::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/geo-spider/page.rb

Constant Summary collapse

DEFAULT_CONTENT_CSS_SELECTOR =

Find locations within the entire body by default

"body"
DEFAULT_TITLE_CSS_SELECTOR =

Use the title in the head by default

"title"
DO_NOT_SPIDER_REGEXP =

do not spider these extensions by default

/(mp3|m4a|mov|jpg|png|gif|zip|pdf)$/i

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, options = {}) ⇒ Page

Create a new page based on the URL.



16
17
18
19
20
21
22
23
# File 'lib/geo-spider/page.rb', line 16

# Create a new page based on the URL.
#
# url     - the address this page represents; stored as-is and exposed via #url.
# options - optional settings:
#           :site                 - the site this page belongs to (needed by #internal_links)
#           :content_css_selector - selector used by #locations; defaults to DEFAULT_CONTENT_CSS_SELECTOR
#           :title_css_selector   - selector used by #title; defaults to DEFAULT_TITLE_CSS_SELECTOR
#           :do_not_spider_regexp - stored pattern; defaults to DO_NOT_SPIDER_REGEXP
def initialize(url, options = {})
  # Resolve caller overrides first; a nil (or false) option falls back to the class default.
  content_selector = options[:content_css_selector] || DEFAULT_CONTENT_CSS_SELECTOR
  title_selector = options[:title_css_selector] || DEFAULT_TITLE_CSS_SELECTOR
  skip_pattern = options[:do_not_spider_regexp] || DO_NOT_SPIDER_REGEXP

  @url = url
  @site = options[:site]
  @content_css_selector = content_selector
  @title_css_selector = title_selector
  @do_not_spider_regexp = skip_pattern

  # Called for its effect at construction time; its definition is not visible here
  # (presumably it loads and parses the document eagerly -- confirm).
  hpricot_doc
end

Instance Attribute Details

#url ⇒ Object (readonly)

Returns the value of attribute url.



8
9
10
# File 'lib/geo-spider/page.rb', line 8

# Reader for the URL this page was constructed with (the value stored in @url).
def url
  @url
end

Instance Method Details

Returns a unique array of internal URLs present in the page as strings, normalized to remove anchors. The page needs to know what site it is part of; otherwise it cannot decide what is an internal link.



47
48
49
50
# File 'lib/geo-spider/page.rb', line 47

# Returns the subset of #links for which internal_url? is true.
#
# Raises RuntimeError when the page was built without a :site, because
# there is then nothing to compare a link against.
def internal_links
  if @site.nil?
    raise("Cannot discover internal links without knowing what site this page is part of.")
  end

  links.select { |candidate| internal_url?(candidate) }
end

Returns a unique array of URLs present in the page as strings, normalized to remove anchors.



39
40
41
42
43
# File 'lib/geo-spider/page.rb', line 39

# Returns the URLs found in the page's anchors.
#
# Every <a> carrying an href is collected, each href is passed through
# normalize_url, duplicates are dropped, and finally any URL for which
# rejected_url? is true is filtered out.
def links
  anchors = hpricot_doc.search("a[@href]")
  normalized = anchors.map { |anchor| normalize_url(anchor.attributes["href"]) }
  # De-duplicate before filtering so rejected_url? runs once per distinct URL.
  normalized.uniq.reject { |candidate| rejected_url?(candidate) }
end

#locations ⇒ Object

Returns an array of Location objects based on the locations found in the page.



31
32
33
34
35
# File 'lib/geo-spider/page.rb', line 31

# Returns whatever Extractors::Master reports as the locations for this page.
#
# The extractor is handed the first element matching the configured content
# selector, so only that part of the document is considered.
def locations
  Extractors::Master.new(hpricot_doc.at(@content_css_selector)).locations
end

#title ⇒ Object



25
26
27
# File 'lib/geo-spider/page.rb', line 25

# Returns the text of the first element matching the configured title
# selector, or nil when the page has no such element.
def title
  title_element = hpricot_doc.at(@title_css_selector)
  # No match yields nil; calling inner_text on it would raise NoMethodError,
  # so a page without a title reports nil instead of blowing up.
  title_element && title_element.inner_text
end