Class: Translink::Crawler

Inherits:
Object
  • Object
show all
Defined in:
lib/translink/crawler.rb

Constant Summary collapse

MAX_RETRY_COUNT =

Maximum number of times to attempt an HTTP request.

2
SLEEP_DURATION =

Base amount of time to sleep in seconds before retrying.

5

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ Crawler

Returns a new instance of Crawler.



9
10
11
12
# File 'lib/translink/crawler.rb', line 9

# Builds a crawler for the given timetable URL.
#
# The URL string is parsed once up front and kept in @url; progress and
# error messages default to standard output via @out.
#
# @param url [String] address to parse with URI.parse
# @raise [URI::InvalidURIError] when URI.parse rejects the string
def initialize(url)
  @url = URI.parse(url)
  @out = $stdout
end

Instance Attribute Details

#out ⇒ Object

Returns the value of attribute out.



6
7
8
# File 'lib/translink/crawler.rb', line 6

# Reader for the +out+ attribute.
#
# @return [Object] whatever is currently stored in @out
def out; @out; end

#url ⇒ Object (readonly)

Returns the value of attribute url.



7
8
9
# File 'lib/translink/crawler.rb', line 7

# Reader for the read-only +url+ attribute.
#
# @return [Object] whatever is currently stored in @url
def url; @url; end

Instance Method Details

#crawl(date, from_route_url = nil, step = nil) ⇒ Object



14
15
16
17
18
19
# File 'lib/translink/crawler.rb', line 14

# Crawls every route page reachable from the timetable for +date+.
#
# A Page::Timetable is built from the crawler's URL, asked for the timetable
# page of the given date, and each route page it yields is handed to
# #crawl_route_page.
#
# @param date [Object] passed unchanged to Page::Timetable#timetable_page
# @param from_route_url [Object, nil] passed unchanged to #route_pages
# @param step [Object, nil] passed unchanged to #route_pages
# @return [Object] the result of iterating the route pages with +each+
def crawl(date, from_route_url = nil, step = nil)
  landing_page = Page::Timetable.new(url.to_s)
  dated_page = landing_page.timetable_page(date)
  dated_page.route_pages(from_route_url, step).each { |page| crawl_route_page(page) }
end

#crawl_route_page(route_page, retry_count = 0) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/translink/crawler.rb', line 21

# Stores the route described by +route_page+ and crawls each of its trip pages.
#
# Mechanize::ResponseCodeError and Page::UnexpectedParserError are retried by
# calling this method again with an incremented +retry_count+. At most
# MAX_RETRY_COUNT retries are made (MAX_RETRY_COUNT + 1 attempts in total);
# after that the route page is skipped and a message is written to +out+.
# Any other StandardError skips the page immediately and also prints the
# backtrace. No exception of those kinds propagates to the caller.
#
# @param route_page [Object] must respond to +trip_pages+ and +url+
# @param retry_count [Integer] number of retries already performed; callers
#   normally leave this at its default
# @return [Object] the result of iterating the trip pages on success, or the
#   result of +out.puts+ when the page is skipped
def crawl_route_page(route_page, retry_count = 0)
  route_model = Model::Route.find_or_add_route_from_route_page(route_page)
  route_page.trip_pages.each do |trip_page|
    crawl_trip_page(route_model, trip_page)
  end
rescue Mechanize::ResponseCodeError, Page::UnexpectedParserError => exception
  # Strictly less-than: retry_count starts at 0, so `<=` would allow one
  # retry more than MAX_RETRY_COUNT.
  if retry_count < MAX_RETRY_COUNT
    # Linear back-off: the first retry is immediate (0 * SLEEP_DURATION).
    sleep(SLEEP_DURATION * retry_count)
    crawl_route_page(route_page, retry_count + 1)
  else
    out.puts "Skipping route page (#{route_page.url}) because of #{exception}"
  end
rescue => exception
  out.puts "Skipping route page (#{route_page.url}) because of #{exception}"
  out.puts exception.backtrace
end

#crawl_trip_page(route_model, trip_page, retry_count = 0) ⇒ Object



38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/translink/crawler.rb', line 38

# Stores the trip described by +trip_page+ under +route_model+, then stores
# the trip's stop times.
#
# Mechanize::ResponseCodeError and Page::UnexpectedParserError are retried by
# calling this method again with an incremented +retry_count+. At most
# MAX_RETRY_COUNT retries are made (MAX_RETRY_COUNT + 1 attempts in total);
# after that the trip page is skipped and a message is written to +out+.
# Any other StandardError skips the page immediately and also prints the
# backtrace. No exception of those kinds propagates to the caller.
#
# NOTE(review): a retry re-runs add_trip_from_trip_page even if only the
# stop-time step failed; whether that can create a duplicate trip depends on
# the model, which is not visible here.
#
# @param route_model [Object] must respond to +add_trip_from_trip_page+
# @param trip_page [Object] must respond to +stop_times+ and +url+
# @param retry_count [Integer] number of retries already performed; callers
#   normally leave this at its default
# @return [Object] the result of +add_stop_times_from_stop_time_pages+ on
#   success, or the result of +out.puts+ when the page is skipped
def crawl_trip_page(route_model, trip_page, retry_count = 0)
  trip_model = route_model.add_trip_from_trip_page(trip_page)
  trip_model.add_stop_times_from_stop_time_pages(trip_page.stop_times)
rescue Mechanize::ResponseCodeError, Page::UnexpectedParserError => exception
  # Strictly less-than: retry_count starts at 0, so `<=` would allow one
  # retry more than MAX_RETRY_COUNT.
  if retry_count < MAX_RETRY_COUNT
    # Linear back-off: the first retry is immediate (0 * SLEEP_DURATION).
    sleep(SLEEP_DURATION * retry_count)
    crawl_trip_page(route_model, trip_page, retry_count + 1)
  else
    out.puts "Skipping trip page (#{trip_page.url}) because of #{exception}"
  end
rescue => exception
  out.puts "Skipping trip page (#{trip_page.url}) because of #{exception}"
  out.puts exception.backtrace
end