Class: Scrape::Site

Inherits: Object
Defined in:
lib/scrape/site.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, options = {}) ⇒ Site

Returns a new instance of Site.



7
8
9
10
11
12
13
# File 'lib/scrape/site.rb', line 7

# Build a site rooted at +url+. The query string and fragment are
# stripped so the root is a clean base for resolving and comparing
# links. Options are merged over {:ignore_robots_txt => true}.
def initialize url, options = {}
  root = Addressable::URI.parse url
  root.query = root.fragment = nil
  @url = root
  @options = {:ignore_robots_txt => true}.merge options
  @matches = []
end

Instance Attribute Details

#matches ⇒ Object (readonly)

Returns the value of attribute matches.



5
6
7
# File 'lib/scrape/site.rb', line 5

# Matchers registered on this site via #add_match (read-only).
def matches; @matches; end

#options ⇒ Object (readonly)

Returns the value of attribute options.



5
6
7
# File 'lib/scrape/site.rb', line 5

# Options hash given at construction, merged over defaults (read-only).
def options; @options; end

#url ⇒ Object (readonly)

Returns the value of attribute url.



5
6
7
# File 'lib/scrape/site.rb', line 5

# The site's root URL (query/fragment stripped), read-only.
def url; @url; end

Instance Method Details

#accept?(url) ⇒ Boolean

Returns:

  • (Boolean)


39
40
41
42
# File 'lib/scrape/site.rb', line 39

# Whether +url+ belongs to this site: after normalization it must sit
# under the site's root URL and not be disallowed by robots.txt.
#
# @param url [String] absolute or relative URL
# @return [Boolean]
def accept? url
  url = normalize url
  # String#start_with? — the original called +starts_with+, which is
  # not a Ruby (or ActiveSupport) method and raised NoMethodError.
  url.start_with?(to_s) && !disallowed?(url)
end

#add_match(matcher, &proc) ⇒ Object



15
16
17
18
19
# File 'lib/scrape/site.rb', line 15

# Register a matcher with a handler block to run on matching pages.
# Returns the newly created Scrape::Match.
def add_match matcher, &proc
  Scrape::Match.new(matcher, &proc).tap {|m| @matches << m }
end

#normalize(url, base_url = self.url) ⇒ Object



44
45
46
# File 'lib/scrape/site.rb', line 44

# Resolve +url+ against +base_url+ (the site root by default) and
# return the absolute URL as a String.
def normalize url, base_url = self.url
  absolute = Addressable::URI.join(base_url, url)
  absolute.to_s
end

#open(url) ⇒ Object



21
22
23
24
25
# File 'lib/scrape/site.rb', line 21

# Fetch +url+ through Scrape.open, sending a cookie header when the
# :cookie option is present.
# NOTE(review): the guard tests options[:cookie] but the header value
# comes from a +cookie+ method not shown on this page — confirm it is
# defined (privately) elsewhere in the class.
def open url
  headers = options[:cookie] ? {:cookie => cookie} : {}
  Scrape.open url, headers
end

#parse(url) ⇒ Object



27
28
29
30
31
32
33
34
35
36
37
# File 'lib/scrape/site.rb', line 27

# Fetch and parse the page at +url+, invoke every registered matcher
# whose pattern matches the URL, and return the page's on-site links.
#
# @param url [String] absolute or relative URL (resolved against #url)
# @return [Array<String>, nil] accepted absolute link URLs, or nil when
#   the page fails to load
def parse url
  url = normalize url
  doc = Nokogiri::HTML open(url)

  @matches.each{|match| match.invoke doc, url if match =~ url }

  # Resolve each hyperlink against this page's URL and keep only the
  # ones this site accepts. The block parameter is named +link+ so it
  # no longer shadows the outer +url+ local (Ruby warns on shadowing).
  doc.css("a[href]").map{|node| normalize node['href'], url }.select{|link| accept? link }
rescue Scrape::HTTPError => e
  Scrape.logger.info "Error loading #{url}: #{e.message}"
  nil
end

#robots_txt ⇒ Object



48
49
50
# File 'lib/scrape/site.rb', line 48

# The site's robots.txt rules, loaded lazily and memoized.
#
# @return [Scrape::RobotsTxt]
def robots_txt
  # defined? (rather than ||=) caches even a nil load result. Returning
  # @robots_txt explicitly fixes the original bug where every call
  # after the first returned nil ("x unless true" evaluates to nil).
  @robots_txt = Scrape::RobotsTxt.load url unless defined? @robots_txt
  @robots_txt
end

#to_s ⇒ Object



52
53
54
# File 'lib/scrape/site.rb', line 52

# The site rendered as a string: its root URL.
def to_s
  @url.to_s
end