Class: Rider::Crawler

Inherits: Object
Defined in: lib/rider/crawler.rb
Constant Summary

SKIPPABLE_EXCEPTIONS =
  [Errno::ETIMEDOUT, WWW::Mechanize::ResponseCodeError, Errno::EHOSTUNREACH, SocketError,
   Errno::ECONNREFUSED, Timeout::Error, Net::HTTPBadResponse, Hpricot::ParseError]

Exceptions that #next_document rescues and skips instead of propagating.
Instance Method Summary

- #add_follow_urls(urls) ⇒ Object
- #each_document ⇒ Object
  Crawls documents and passes their URL, response headers, and data to the supplied block.
- #follow_url?(url) ⇒ Boolean
- #get(url) ⇒ Object
  Gets the document at the specified url.
- #get_file(uri) ⇒ Object
- #get_http(uri) ⇒ Object
- #initialize(mask, queue) ⇒ Crawler (constructor)
  Creates a new Crawler with the specified mask (a Regexp) and queue (a Rider::Queue instance).
- #match_mask?(url) ⇒ Boolean
  Returns true if url passes the mask.
- #next_document ⇒ Object
  Returns the next retrievable document from the next valid URL in the queue.
- #next_url ⇒ Object
  Retrieves the next URL in the queue that matches the mask.
- #saw_url(url) ⇒ Object
- #seen_url?(url) ⇒ Boolean
- #valid_url?(url) ⇒ Boolean
Constructor Details

#initialize(mask, queue) ⇒ Crawler

Creates a new Crawler with the specified mask (a Regexp) and queue (a Rider::Queue instance).

# File 'lib/rider/crawler.rb', line 6

def initialize(mask, queue)
  @mask = mask
  @queue = queue
  @seen_urls = []
  @www = WWW::Mechanize.new do |a|
    a.log = Logger.new("tmp/www.log")
    a.pluggable_parser.default = Hpricot
  end
end
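A minimal construction sketch. The Rider::Queue constructor call is an assumption for illustration, since its signature is not documented in this section:

queue   = Rider::Queue.new  # hypothetical constructor call
crawler = Rider::Crawler.new(/^http:\/\/example\.com\//, queue)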
Instance Method Details
#add_follow_urls(urls) ⇒ Object

# File 'lib/rider/crawler.rb', line 29

def add_follow_urls(urls)
  urls.each { |url| @queue.push(url) if follow_url?(url) }
end
#each_document ⇒ Object
Crawls documents and passes their URL, response headers, and data to the supplied block.
# File 'lib/rider/crawler.rb', line 22

def each_document
  while doc_data = next_document()
    follow_urls = yield(doc_data) || []
    add_follow_urls(follow_urls)
  end
end
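A crawl-loop sketch: the block receives the [uri, metadata, contents] Array built by #get (Ruby splats it across the block parameters), and any Array of URLs the block returns is fed back through #add_follow_urls; returning nil follows nothing:

crawler.each_document do |uri, metadata, contents|
  puts "crawled #{uri}"
  []  # URLs to crawl next; nil or [] enqueues nothing
end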
#follow_url?(url) ⇒ Boolean
# File 'lib/rider/crawler.rb', line 33

def follow_url?(url)
  match_mask?(url) and !seen_url?(url)
end
#get(url) ⇒ Object
Gets the document at the specified url. Returns an Array [uri, metadata, contents].
# File 'lib/rider/crawler.rb', line 58

def get(url)
  uri = URI.parse(url)
  Timeout::timeout(8, Timeout::Error) do
    case uri.scheme
    when 'http'
      get_http(uri)
    when 'file'
      get_file(uri)
    else
      raise(ArgumentError, "don't know how to get #{url}")
    end
  end
end
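A usage sketch of the two supported schemes (URLs are placeholders); note that every fetch runs inside an eight-second Timeout:

uri, metadata, contents = crawler.get('http://example.com/')    # via Mechanize
uri, metadata, contents = crawler.get('file:///tmp/page.html')  # from disk; metadata is {}
crawler.get('ftp://example.com/file')                           # raises ArgumentError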
#get_file(uri) ⇒ Object
# File 'lib/rider/crawler.rb', line 72

def get_file(uri)
  # uri arrives as a URI object from #get; stringify before stripping the scheme
  filename = uri.to_s.gsub(/^file:\/\//, '')
  [uri, {}, File.read(filename)]
end
#get_http(uri) ⇒ Object
# File 'lib/rider/crawler.rb', line 77

def get_http(uri)
  page = @www.get(uri)
  metadata = page.response
  [uri, metadata, page]
end
#match_mask?(url) ⇒ Boolean
Returns true if url passes the mask.
# File 'lib/rider/crawler.rb', line 17

def match_mask?(url)
  @mask.match(url) != nil
end
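For example, given a crawler built with the (hypothetical) mask %r{^http://example\.com/}:

crawler.match_mask?('http://example.com/about')  # => true
crawler.match_mask?('http://other.org/')         # => false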
#next_document ⇒ Object
Returns the next retrievable document from the next valid URL in the queue.
# File 'lib/rider/crawler.rb', line 40

def next_document
  begin
    url = next_url()
    return nil if url.nil?
    doc_data = get(url)
    saw_url(url)
    return doc_data
  rescue Exception => ex
    if SKIPPABLE_EXCEPTIONS.include?(ex.class)
      Rider.log.debug("EXCEPTION: #{ex.inspect}, skipping...")
      retry # go on to the next document
    else
      raise ex
    end
  end
end
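A sketch of the skip behaviour, assuming the first queued URL raises SocketError (a member of SKIPPABLE_EXCEPTIONS) and the second is readable:

queue.push('http://unreachable.invalid/')        # fetch raises SocketError: logged, skipped
queue.push('file:///tmp/ok.html')
uri, metadata, contents = crawler.next_document  # the readable document
crawler.next_document                            # => nil once the queue is empty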
#next_url ⇒ Object
Retrieves the next URL in the queue that matches the mask.
# File 'lib/rider/crawler.rb', line 84

def next_url
  while url = @queue.shift
    return url if valid_url?(url)
  end
end
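Queued URLs that fail #valid_url? are shifted off and silently dropped; the method returns nil once the queue empties. For example:

queue.push('http://other.org/')     # fails the mask: dropped
queue.push('http://example.com/a')
crawler.next_url                    # => "http://example.com/a"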
#saw_url(url) ⇒ Object
# File 'lib/rider/crawler.rb', line 98

def saw_url(url)
  @seen_urls << url
end
#seen_url?(url) ⇒ Boolean
# File 'lib/rider/crawler.rb', line 94

def seen_url?(url)
  @seen_urls.include?(url)
end
#valid_url?(url) ⇒ Boolean
# File 'lib/rider/crawler.rb', line 90

def valid_url?(url)
  !seen_url?(url) && match_mask?(url)
end
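Together, #saw_url, #seen_url? and #valid_url? give the crawler its URL de-duplication:

url = 'http://example.com/a'
crawler.valid_url?(url)  # => true: matches the mask, not yet seen
crawler.saw_url(url)
crawler.seen_url?(url)   # => true
crawler.valid_url?(url)  # => false: already seen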