Class: Scrape::Site
- Inherits: Object
- Class hierarchy: Object → Scrape::Site
- Defined in:
- lib/scrape/site.rb
Instance Attribute Summary collapse
-
#matches ⇒ Object
readonly
Returns the value of attribute matches.
-
#options ⇒ Object
readonly
Returns the value of attribute options.
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Instance Method Summary collapse
- #accept?(url) ⇒ Boolean
- #add_match(matcher, &proc) ⇒ Object
-
#initialize(url, options = {}) ⇒ Site
constructor
A new instance of Site.
- #normalize(url, base_url = self.url) ⇒ Object
- #open(url) ⇒ Object
- #parse(url) ⇒ Object
- #robots_txt ⇒ Object
- #to_s ⇒ Object
Constructor Details
#initialize(url, options = {}) ⇒ Site
Returns a new instance of Site.
7 8 9 10 11 12 13 |
# File 'lib/scrape/site.rb', line 7

# Builds a Site rooted at +url+.
#
# The query string and fragment are stripped so the URL can serve as a
# stable base for #normalize and #accept?.
#
# @param url [String, Addressable::URI] root URL of the site
# @param options [Hash] site options; :ignore_robots_txt defaults to true
#   (NOTE(review): the extracted text dropped the parameter name — restored
#   as +options+ to match the #options reader)
def initialize url, options = {}
  @url = Addressable::URI.parse url
  @url.query = nil
  @url.fragment = nil
  @options = {:ignore_robots_txt => true}.merge options
  @matches = []
end
Instance Attribute Details
#matches ⇒ Object (readonly)
Returns the value of attribute matches.
5 6 7 |
# File 'lib/scrape/site.rb', line 5

# Match rules registered via #add_match (read-only).
#
# @return [Array] the registered Scrape::Match objects
def matches
  @matches
end
#options ⇒ Object (readonly)
Returns the value of attribute options.
5 6 7 |
# File 'lib/scrape/site.rb', line 5

# Options hash supplied at construction (read-only).
# The extracted text dropped the method name (`def @options end`);
# restored as +options+ to match the documented attribute.
#
# @return [Hash]
def options
  @options
end
#url ⇒ Object (readonly)
Returns the value of attribute url.
5 6 7 |
# File 'lib/scrape/site.rb', line 5

# Root URL of the site, as parsed at construction (read-only).
#
# @return [Object] the stored URL value
def url
  @url
end
Instance Method Details
#accept?(url) ⇒ Boolean
39 40 41 42 |
# File 'lib/scrape/site.rb', line 39

# True when +url+ belongs to this site and is not blocked by robots.txt.
#
# The extracted text read `url.starts_with(to_s)`, which is not a method
# on Ruby's String; the stdlib predicate is String#start_with?.
#
# @param url [String] absolute or site-relative URL
# @return [Boolean]
def accept? url
  url = normalize url
  # disallowed? presumably consults #robots_txt — defined elsewhere in
  # lib/scrape/site.rb (not visible in this excerpt).
  url.start_with?(to_s) && !disallowed?(url)
end
#add_match(matcher, &proc) ⇒ Object
15 16 17 18 19 |
# File 'lib/scrape/site.rb', line 15

# Registers a match rule; pages whose URL satisfies +matcher+ will have
# +proc+ invoked when parsed.
#
# @param matcher [Object] rule passed to Scrape::Match.new
# @return [Scrape::Match] the newly registered match
def add_match matcher, &proc
  Scrape::Match.new(matcher, &proc).tap { |rule| @matches << rule }
end
#normalize(url, base_url = self.url) ⇒ Object
44 45 46 |
# File 'lib/scrape/site.rb', line 44

# Resolves +url+ relative to +base_url+ and returns the absolute URL
# as a String.
#
# @param url [String] possibly relative URL
# @param base_url [Object] base to resolve against (defaults to site root)
# @return [String]
def normalize url, base_url = self.url
  joined = Addressable::URI.join(base_url, url)
  joined.to_s
end
#open(url) ⇒ Object
21 22 23 24 25 |
# File 'lib/scrape/site.rb', line 21

# Fetches +url+ through Scrape.open, forwarding the site cookie when one
# was configured. The extracted text read `headers[:cookie] = if [:cookie]`;
# the receiver `options` was stripped on both sides — restored here.
#
# @param url [String] URL to fetch
# @return [Object] whatever Scrape.open returns
def open url
  headers = Hash.new
  headers[:cookie] = options[:cookie] if options[:cookie]
  Scrape.open url, headers
end
#parse(url) ⇒ Object
27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/scrape/site.rb', line 27

# Fetches +url+, invokes every registered match rule against the parsed
# document, and returns the page's outbound links that this site accepts.
#
# @param url [String] page URL (may be site-relative)
# @return [Array<String>, nil] accepted absolute link URLs, or nil when
#   the page could not be loaded
def parse url
  url = normalize url
  doc = Nokogiri::HTML open(url)
  @matches.each{|match| match.invoke doc, url if match =~ url }
  # Block param renamed from `url` to `link` — the original shadowed the
  # method-local `url` inside the select.
  doc.css("a[href]").map{|node| normalize node['href'], url }.select{|link| accept? link }
rescue Scrape::HTTPError => e
  # Extracted text read `e.` with the method name stripped — restored as
  # the standard Exception#message.
  Scrape.logger.info "Error loading #{url}: #{e.message}"
  nil
end
#robots_txt ⇒ Object
48 49 50 |
# File 'lib/scrape/site.rb', line 48

# Lazily loads and memoizes the site's robots.txt.
#
# Bug fix: `@robots_txt = X unless defined? @robots_txt` evaluates to nil
# whenever the guard is true, so every call after the first returned nil
# instead of the cached value. Return the ivar explicitly.
#
# @return [Object] the loaded robots.txt representation
def robots_txt
  @robots_txt = Scrape::RobotsTxt.load url unless defined? @robots_txt
  @robots_txt
end
#to_s ⇒ Object
52 53 54 |
# File 'lib/scrape/site.rb', line 52 def to_s url.to_s end |