Class: Rider::Crawler

Inherits:
Object
Defined in:
lib/rider/crawler.rb

Constant Summary

SKIPPABLE_EXCEPTIONS =
  [Errno::ETIMEDOUT, WWW::Mechanize::ResponseCodeError, Errno::EHOSTUNREACH,
   SocketError, Errno::ECONNREFUSED, Timeout::Error, Net::HTTPBadResponse,
   Hpricot::ParseError]

Instance Method Summary

Constructor Details

#initialize(mask, queue) ⇒ Crawler

Creates a new Crawler, with the specified mask (a Regexp) and queue (a Rider::Queue instance).



# File 'lib/rider/crawler.rb', line 6

def initialize(mask, queue)
  @mask = mask
  @queue = queue
  @seen_urls = []
  @www = WWW::Mechanize.new do |a|
    a.log = Logger.new("tmp/www.log")
    a.pluggable_parser.default = Hpricot
  end
end
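
For example, a crawler restricted to a single site might be set up like this (a sketch; the mask is illustrative, and the Rider::Queue constructor arguments are assumed rather than documented here):

queue = Rider::Queue.new('crawl')                              # hypothetical queue setup
crawler = Rider::Crawler.new(%r{^http://example\.com/}, queue)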

Instance Method Details

#add_follow_urls(urls) ⇒ Object

Pushes each URL in urls onto the queue, skipping any that fail follow_url?.


# File 'lib/rider/crawler.rb', line 29

def add_follow_urls(urls)
  urls.each { |url| @queue.push(url) if follow_url?(url) }
end

#each_document ⇒ Object

Crawls documents, passing each one's URL, response headers, and data to the supplied block. The block's return value (an Array of URLs, or nil) is pushed onto the queue via add_follow_urls.



# File 'lib/rider/crawler.rb', line 22

def each_document
  while doc_data = next_document()
    follow_urls = yield(doc_data) || []
    add_follow_urls(follow_urls)
  end
end
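
A typical crawl loop destructures the yielded [uri, metadata, contents] Array in the block parameters and returns the URLs to enqueue next (a sketch; extract_links is a hypothetical helper, not part of this class):

crawler.each_document do |uri, meta, contents|
  Rider.log.debug("fetched #{uri}")
  extract_links(contents)   # return an Array of URLs to follow; nil is treated as []
end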

#follow_url?(url) ⇒ Boolean

Returns true if url matches the mask and has not been seen yet.
Returns:

  • (Boolean)


# File 'lib/rider/crawler.rb', line 33

def follow_url?(url)
  match_mask?(url) && !seen_url?(url)
end

#get(url) ⇒ Object

Gets the document at the specified url, which must use the http or file scheme. Returns an Array: [uri, metadata, contents]. Raises ArgumentError for any other scheme, and Timeout::Error if the fetch takes longer than 8 seconds.



# File 'lib/rider/crawler.rb', line 58

def get(url)
  uri = URI.parse(url)
  Timeout::timeout(8, Timeout::Error) do
    case uri.scheme
    when 'http'
      get_http(uri)
    when 'file'
      get_file(uri)
    else
      raise(ArgumentError, "don't know how to get #{url}")
    end
  end
end
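
Both schemes return the same shape, so callers can destructure the result uniformly (a sketch; the URL is illustrative):

uri, meta, contents = crawler.get('http://example.com/index.html')
# uri      - the parsed URI
# meta     - the HTTP response headers (an empty Hash for file:// URLs)
# contents - the fetched document (parsed by Hpricot for HTTP responses)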

#get_file(uri) ⇒ Object

Reads the document at the given file:// uri. Returns [uri, {}, contents].


# File 'lib/rider/crawler.rb', line 72

def get_file(uri)
  # uri arrives as a URI object (see #get), so convert it to a String
  # before stripping the scheme prefix
  filename = uri.to_s.gsub(/^file:\/\//, '')
  [uri, {}, File.read(filename)]
end

#get_http(uri) ⇒ Object

Fetches uri over HTTP via Mechanize. Returns [uri, response headers, page].


# File 'lib/rider/crawler.rb', line 77

def get_http(uri)
  page = @www.get(uri)
  meta = page.response
  [uri, meta, page]
end

#match_mask?(url) ⇒ Boolean

Returns true if url matches the mask Regexp.

Returns:

  • (Boolean)


# File 'lib/rider/crawler.rb', line 17

def match_mask?(url)
  @mask.match(url) != nil
end
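
For instance, given a mask of %r{^http://example\.com/} (illustrative):

crawler.match_mask?('http://example.com/about')   # => true
crawler.match_mask?('http://other.example.org/')  # => false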

#next_document ⇒ Object

Returns the data for the next retrievable document from the next valid URL in the queue, or nil when the queue is exhausted.



# File 'lib/rider/crawler.rb', line 40

def next_document
  begin
    url = next_url()
    return nil if url.nil?
    doc_data = get(url)
    saw_url(url)
    return doc_data
  rescue Exception => ex
    if SKIPPABLE_EXCEPTIONS.include?(ex.class)
      Rider.log.debug("EXCEPTION: #{ex.inspect}, skipping...")
      retry # go on to the next document
    else
      raise ex
    end
  end
end
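
Because retry restarts the begin block, a skippable failure simply advances the crawl to the next queued URL; only exceptions outside SKIPPABLE_EXCEPTIONS propagate to the caller. A sketch of the observable behavior (URLs are illustrative):

queue.push('http://unreachable.invalid/')   # get raises e.g. SocketError
queue.push('http://example.com/')
crawler.next_document   # logs and skips the first URL, returns the second document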

#next_url ⇒ Object

Retrieves the next URL in the queue that is valid (matches the mask and has not been seen); returns nil when the queue is empty.



# File 'lib/rider/crawler.rb', line 84

def next_url
  while url = @queue.shift
    return url if valid_url?(url)
  end
end

#saw_url(url) ⇒ Object

Records url as having been crawled.


# File 'lib/rider/crawler.rb', line 98

def saw_url(url)
  @seen_urls << url
end

#seen_url?(url) ⇒ Boolean

Returns true if url has already been crawled.
Returns:

  • (Boolean)


# File 'lib/rider/crawler.rb', line 94

def seen_url?(url)
  @seen_urls.include?(url)
end

#valid_url?(url) ⇒ Boolean

Returns true if url has not been seen and matches the mask.
Returns:

  • (Boolean)


# File 'lib/rider/crawler.rb', line 90

def valid_url?(url)
  !seen_url?(url) && match_mask?(url)
end
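
Together with saw_url and seen_url?, this gives simple in-memory deduplication (a sketch, assuming the URL matches the mask):

crawler.valid_url?('http://example.com/a')  # => true: matches mask, not yet seen
crawler.saw_url('http://example.com/a')
crawler.valid_url?('http://example.com/a')  # => false: already seen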