Class: Rider::Crawler

Inherits: Object
Defined in: lib/rider/crawler.rb
Constant Summary

SKIPPABLE_EXCEPTIONS =
  [Errno::ETIMEDOUT, WWW::Mechanize::ResponseCodeError, Errno::EHOSTUNREACH, SocketError,
   Errno::ECONNREFUSED, Timeout::Error, Net::HTTPBadResponse, Hpricot::ParseError]

Exceptions that #next_document rescues and skips instead of propagating.
Instance Method Summary

- #add_follow_urls(urls) ⇒ Object
- #each_document ⇒ Object
  Crawls documents and passes their URL, response headers, and data to the supplied block.
- #follow_url?(url) ⇒ Boolean
- #get(url) ⇒ Object
  Gets the document at the specified url.
- #get_file(uri) ⇒ Object
- #get_http(uri) ⇒ Object
- #initialize(mask, queue) ⇒ Crawler (constructor)
  Creates a new Crawler with the specified mask (a Regexp) and queue (a Rider::Queue instance).
- #match_mask?(url) ⇒ Boolean
  Returns true if url passes the mask.
- #next_document ⇒ Object
  Returns the next retrievable document from the next valid URL in the queue.
- #next_url ⇒ Object
  Retrieves the next URL in the queue that matches the mask.
- #saw_url(url) ⇒ Object
- #seen_url?(url) ⇒ Boolean
- #valid_url?(url) ⇒ Boolean
Constructor Details

#initialize(mask, queue) ⇒ Crawler

Creates a new Crawler with the specified mask (a Regexp) and queue (a Rider::Queue instance).

# File 'lib/rider/crawler.rb', line 6

def initialize(mask, queue)
  @mask = mask
  @queue = queue
  @seen_urls = []
  @www = WWW::Mechanize.new do |a|
    a.log = Logger.new("tmp/www.log")
    a.pluggable_parser.default = Hpricot
  end
end
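A minimal construction sketch. The Rider::Queue constructor call is an assumption for illustration, since its signature is not documented in this section:

queue   = Rider::Queue.new  # hypothetical constructor call
crawler = Rider::Crawler.new(/^http:\/\/example\.com\//, queue)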
Instance Method Details
#add_follow_urls(urls) ⇒ Object

# File 'lib/rider/crawler.rb', line 29

def add_follow_urls(urls)
  urls.each { |url| @queue.push(url) if follow_url?(url) }
end
#each_document ⇒ Object
Crawls documents and passes their URL, response headers, and data to the supplied block.
# File 'lib/rider/crawler.rb', line 22

def each_document
  while doc_data = next_document()
    follow_urls = yield(doc_data) || []
    add_follow_urls(follow_urls)
  end
end
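A crawl-loop sketch: the block receives the [uri, metadata, contents] Array built by #get (Ruby splats it across the block parameters), and any Array of URLs the block returns is fed back through #add_follow_urls; returning nil follows nothing:

crawler.each_document do |uri, metadata, contents|
  puts "crawled #{uri}"
  []  # URLs to crawl next; nil or [] enqueues nothing
end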
#follow_url?(url) ⇒ Boolean
# File 'lib/rider/crawler.rb', line 33

def follow_url?(url)
  match_mask?(url) and !seen_url?(url)
end
#get(url) ⇒ Object
Gets the document at the specified url. Returns an Array [uri, metadata, contents].
# File 'lib/rider/crawler.rb', line 58

def get(url)
  uri = URI.parse(url)
  Timeout::timeout(8, Timeout::Error) do
    case uri.scheme
    when 'http'
      get_http(uri)
    when 'file'
      get_file(uri)
    else
      raise(ArgumentError, "don't know how to get #{url}")
    end
  end
end
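A usage sketch of the two supported schemes (URLs are placeholders); note that every fetch runs inside an eight-second Timeout:

uri, metadata, contents = crawler.get('http://example.com/')    # via Mechanize
uri, metadata, contents = crawler.get('file:///tmp/page.html')  # from disk; metadata is {}
crawler.get('ftp://example.com/file')                           # raises ArgumentError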
#get_file(uri) ⇒ Object
# File 'lib/rider/crawler.rb', line 72

def get_file(uri)
  # uri arrives as a URI object from #get; stringify before stripping the scheme
  filename = uri.to_s.gsub(/^file:\/\//, '')
  [uri, {}, File.read(filename)]
end
#get_http(uri) ⇒ Object
# File 'lib/rider/crawler.rb', line 77

def get_http(uri)
  page = @www.get(uri)
  metadata = page.response
  [uri, metadata, page]
end
#match_mask?(url) ⇒ Boolean
Returns true if url passes the mask.
# File 'lib/rider/crawler.rb', line 17

def match_mask?(url)
  @mask.match(url) != nil
end
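For example, given a crawler built with the (hypothetical) mask %r{^http://example\.com/}:

crawler.match_mask?('http://example.com/about')  # => true
crawler.match_mask?('http://other.org/')         # => false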
#next_document ⇒ Object
Returns the next retrievable document from the next valid URL in the queue.
# File 'lib/rider/crawler.rb', line 40

def next_document
  begin
    url = next_url()
    return nil if url.nil?
    doc_data = get(url)
    saw_url(url)
    return doc_data
  rescue Exception => ex
    if SKIPPABLE_EXCEPTIONS.include?(ex.class)
      Rider.log.debug("EXCEPTION: #{ex.inspect}, skipping...")
      retry # go on to the next document
    else
      raise ex
    end
  end
end
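A sketch of the skip behaviour, assuming the first queued URL raises SocketError (a member of SKIPPABLE_EXCEPTIONS) and the second is readable:

queue.push('http://unreachable.invalid/')        # fetch raises SocketError: logged, skipped
queue.push('file:///tmp/ok.html')
uri, metadata, contents = crawler.next_document  # the readable document
crawler.next_document                            # => nil once the queue is empty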
#next_url ⇒ Object
Retrieves the next URL in the queue that matches the mask.
# File 'lib/rider/crawler.rb', line 84

def next_url
  while url = @queue.shift
    return url if valid_url?(url)
  end
end
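Queued URLs that fail #valid_url? are shifted off and silently dropped; the method returns nil once the queue empties. For example:

queue.push('http://other.org/')     # fails the mask: dropped
queue.push('http://example.com/a')
crawler.next_url                    # => "http://example.com/a"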
#saw_url(url) ⇒ Object
# File 'lib/rider/crawler.rb', line 98

def saw_url(url)
  @seen_urls << url
end
#seen_url?(url) ⇒ Boolean
# File 'lib/rider/crawler.rb', line 94

def seen_url?(url)
  @seen_urls.include?(url)
end
#valid_url?(url) ⇒ Boolean
# File 'lib/rider/crawler.rb', line 90

def valid_url?(url)
  !seen_url?(url) && match_mask?(url)
end
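Together, #saw_url, #seen_url? and #valid_url? give the crawler its URL de-duplication:

url = 'http://example.com/a'
crawler.valid_url?(url)  # => true: matches the mask, not yet seen
crawler.saw_url(url)
crawler.seen_url?(url)   # => true
crawler.valid_url?(url)  # => false: already seen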