Class: Scrapes::Crawler

Inherits:
Object
  • Object
show all
Defined in:
lib/scrapes/crawler.rb

Overview

Fetches URIs over HTTP on behalf of a session, caching the results of GET requests.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(session) ⇒ Crawler

Create a new crawler for the given session



43
44
45
46
47
48
49
# File 'lib/scrapes/crawler.rb', line 43

# Build a crawler bound to +session+.
#
# The crawler starts with no logger, verbosity 0, a delay of 0.5
# (handed to +sleep+ before each uncached request) and a fresh Cache.
def initialize(session)
  @session = session
  @cache   = Cache.new
  @delay   = 0.5
  @verbose = 0
  @log     = nil
end

Instance Attribute Details

#cache ⇒ Object

The cache object that this crawler is using



35
36
37
# File 'lib/scrapes/crawler.rb', line 35

# Reader for the cache object that this crawler is using.
def cache; @cache; end

#log ⇒ Object

The optional log object that this crawler is using



39
40
41
# File 'lib/scrapes/crawler.rb', line 39

# Reader for the optional log object that this crawler is using
# (nil when no logger has been set).
def log; @log; end

Instance Method Details

#fetch(uri, post = {}, headers = {}) ⇒ Object

Fetch a URI, using HTTP GET unless you supply post.



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/scrapes/crawler.rb', line 53

# Fetch a URI, using HTTP GET unless you supply +post+.
#
# The URI is first resolved through the session. GET requests are looked up
# in the cache before any network access, and their bodies are stored in the
# cache afterwards; POST requests bypass the cache entirely. Redirect
# responses are followed with a plain GET.
#
# @param uri the URI to fetch; it is passed to @session.absolute_uri
# @param post [Hash] form data; when non-empty the request is sent as a POST
# @param headers [Hash] extra request headers; they are applied after the
#   session's Cookie header, so a caller-supplied 'Cookie' replaces it
# @param redirect_limit [Integer] how many requests this call may still make
#   while following redirects (the initial request counts as one)
# @return [Object] on a cache hit, whatever @cache.check returned; otherwise
#   the response object of the final request
# @raise [ArgumentError] when +redirect_limit+ is exhausted, so a redirect
#   loop terminates instead of recursing forever
def fetch(uri, post = {}, headers = {}, redirect_limit = 10)
  raise ArgumentError, 'too many HTTP redirects' if redirect_limit <= 0

  @session.refresh
  uri = URI.parse(@session.absolute_uri(uri))

  # Only GET requests consult the cache.
  cached = post.empty? ? @cache.check(uri) : nil
  @log.info("#{cached ? 'C ' : 'N '}#{uri}") if @log

  # FIXME: a cache hit returns the cached value while a miss returns the
  # response object, so callers see two different kinds of result.
  return cached if cached

  sleep(@delay) if @delay != 0

  # Build the request target: the path (at least "/") plus any query string.
  path = uri.path.dup
  path << '/' if path.empty?
  path << "?#{uri.query}" if uri.query

  if post.empty?
    req = Net::HTTP::Get.new(path)
  else
    req = Net::HTTP::Post.new(path)
    req.set_form_data(post)
  end

  req['Cookie'] = @session.cookies.to_header
  headers.each { |name, value| req[name] = value }

  http = Net::HTTP.new(uri.host, uri.port)
  # Without this an https URI would be requested in plaintext.
  http.use_ssl = true if uri.scheme == 'https'
  res = http.start { |conn| conn.request(req) }

  if @verbose >= 2
    STDERR.puts "-----------------------------------------------"
    STDERR.puts res.class
    res.each_header { |name, value| STDERR.puts "#{name}: #{value}" }
  end

  # FIXME, what to do about more than one cookie
  @session.cookies.from_header(res['set-cookie']) if res.key?('set-cookie')

  if res.is_a?(Net::HTTPRedirection)
    # NOTE(review): the raw Location value is resolved a second time inside
    # the recursive call, after the base URI has been replaced - confirm this
    # is correct for relative Location values.
    @session.base_uris[-1] = @session.absolute_uri(res['location'])
    res = fetch(res['location'], {}, headers, redirect_limit - 1)
  end

  @cache.update(uri, res.body) if post.empty?
  res
end