Class: Scrape::Site

Inherits: Object
Defined in:
lib/scrape/site.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, options = {}) ⇒ Site

Returns a new instance of Site.



7
8
9
10
11
12
13
# File 'lib/scrape/site.rb', line 7

# Build a site rooted at +url+. The query string and fragment are
# stripped so the root is a clean base for resolving and comparing
# links. Options are merged over {:ignore_robots_txt => true}.
def initialize url, options = {}
  root = Addressable::URI.parse url
  root.query = root.fragment = nil
  @url = root
  @options = {:ignore_robots_txt => true}.merge options
  @matches = []
end

Instance Attribute Details

#matches ⇒ Object (readonly)

Returns the value of attribute matches.



5
6
7
# File 'lib/scrape/site.rb', line 5

# Matchers registered on this site via #add_match (read-only).
def matches; @matches; end

#options ⇒ Object (readonly)

Returns the value of attribute options.



5
6
7
# File 'lib/scrape/site.rb', line 5

# Options hash given at construction, merged over defaults (read-only).
def options; @options; end

#url ⇒ Object (readonly)

Returns the value of attribute url.



5
6
7
# File 'lib/scrape/site.rb', line 5

# The site's root URL (query/fragment stripped), read-only.
def url; @url; end

Instance Method Details

#accept?(url) ⇒ Boolean

Returns:

  • (Boolean)


39
40
41
42
# File 'lib/scrape/site.rb', line 39

# Whether +url+ belongs to this site: after normalization it must sit
# under the site's root URL and not be disallowed by robots.txt.
#
# @param url [String] absolute or relative URL
# @return [Boolean]
def accept? url
  url = normalize url
  # String#start_with? — the original called +starts_with+, which is
  # not a Ruby (or ActiveSupport) method and raised NoMethodError.
  url.start_with?(to_s) && !disallowed?(url)
end

#add_match(matcher, &proc) ⇒ Object



15
16
17
18
19
# File 'lib/scrape/site.rb', line 15

# Register a matcher with a handler block to run on matching pages.
# Returns the newly created Scrape::Match.
def add_match matcher, &proc
  Scrape::Match.new(matcher, &proc).tap {|m| @matches << m }
end

#normalize(url, base_url = self.url) ⇒ Object



44
45
46
# File 'lib/scrape/site.rb', line 44

# Resolve +url+ against +base_url+ (the site root by default) and
# return the absolute URL as a String.
def normalize url, base_url = self.url
  absolute = Addressable::URI.join(base_url, url)
  absolute.to_s
end

#open(url) ⇒ Object



21
22
23
24
25
# File 'lib/scrape/site.rb', line 21

# Fetch +url+ through Scrape.open, sending a cookie header when the
# :cookie option is present.
# NOTE(review): the guard tests options[:cookie] but the header value
# comes from a +cookie+ method not shown on this page — confirm it is
# defined (privately) elsewhere in the class.
def open url
  headers = options[:cookie] ? {:cookie => cookie} : {}
  Scrape.open url, headers
end

#parse(url) ⇒ Object



27
28
29
30
31
32
33
34
35
36
37
# File 'lib/scrape/site.rb', line 27

# Fetch and parse the page at +url+, invoke every registered matcher
# whose pattern matches the URL, and return the page's on-site links.
#
# @param url [String] absolute or relative URL (resolved against #url)
# @return [Array<String>, nil] accepted absolute link URLs, or nil when
#   the page fails to load
def parse url
  url = normalize url
  doc = Nokogiri::HTML open(url)

  @matches.each{|match| match.invoke doc, url if match =~ url }

  # Resolve each hyperlink against this page's URL and keep only the
  # ones this site accepts. The block parameter is named +link+ so it
  # no longer shadows the outer +url+ local (Ruby warns on shadowing).
  doc.css("a[href]").map{|node| normalize node['href'], url }.select{|link| accept? link }
rescue Scrape::HTTPError => e
  Scrape.logger.info "Error loading #{url}: #{e.message}"
  nil
end

#robots_txt ⇒ Object



48
49
50
# File 'lib/scrape/site.rb', line 48

# The site's robots.txt rules, loaded lazily and memoized.
#
# @return [Scrape::RobotsTxt]
def robots_txt
  # defined? (rather than ||=) caches even a nil load result. Returning
  # @robots_txt explicitly fixes the original bug where every call
  # after the first returned nil ("x unless true" evaluates to nil).
  @robots_txt = Scrape::RobotsTxt.load url unless defined? @robots_txt
  @robots_txt
end

#to_s ⇒ Object



52
53
54
# File 'lib/scrape/site.rb', line 52

# The site rendered as a string: its root URL.
def to_s
  @url.to_s
end