Class: Translink::Crawler
- Inherits:
-
Object
- Object
- Translink::Crawler
- Defined in:
- lib/translink/crawler.rb
Constant Summary collapse
- MAX_RETRY_COUNT =
Maximum number of times to attempt an HTTP request.
2
- SLEEP_DURATION =
Base amount of time to sleep in seconds before retrying.
5
Instance Attribute Summary collapse
-
#out ⇒ Object
Returns the value of attribute out.
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Instance Method Summary collapse
- #crawl(date, from_route_url = nil, step = nil) ⇒ Object
- #crawl_route_page(route_page, retry_count = 0) ⇒ Object
- #crawl_trip_page(route_model, trip_page, retry_count = 0) ⇒ Object
-
#initialize(url) ⇒ Crawler
constructor
A new instance of Crawler.
Constructor Details
#initialize(url) ⇒ Crawler
Returns a new instance of Crawler.
# File 'lib/translink/crawler.rb', line 9
# Builds a crawler rooted at the given URL. Output defaults to STDOUT
# and can be redirected via the +out+ accessor.
#
# url - String URL of the timetable index to crawl.
def initialize url
  @url = URI.parse url
  @out = $stdout
end
Instance Attribute Details
#out ⇒ Object
Returns the value of attribute out.
# File 'lib/translink/crawler.rb', line 6
# IO-like sink used for progress and error messages (defaults to $stdout).
def out
  @out
end
#url ⇒ Object (readonly)
Returns the value of attribute url.
# File 'lib/translink/crawler.rb', line 7
# Parsed URI the crawl starts from (read-only).
def url
  @url
end
Instance Method Details
#crawl(date, from_route_url = nil, step = nil) ⇒ Object
# File 'lib/translink/crawler.rb', line 14
# Crawls every route page in the timetable for the given date, optionally
# resuming from +from_route_url+ and striding by +step+.
#
# date           - Date to fetch the timetable for.
# from_route_url - Optional String URL of the route to resume from.
# step           - Optional Integer stride over the route list.
#
# NOTE(review): the scraped listing lost two identifiers on the second line
# (`... = Page::Timetable.new(url.to_s). date ...`); the local variable and
# the method taking +date+ are reconstructed below — confirm against the
# original lib/translink/crawler.rb before relying on this.
def crawl date, from_route_url = nil, step = nil
  timetable_page = Page::Timetable.new(url.to_s).timetable date
  timetable_page.route_pages(from_route_url, step).each do |route_page|
    crawl_route_page route_page
  end
end
#crawl_route_page(route_page, retry_count = 0) ⇒ Object
# File 'lib/translink/crawler.rb', line 21
# Persists the route for +route_page+ and crawls each of its trip pages.
# HTTP/parser failures are retried with a linear backoff; the first retry
# sleeps 0 seconds because retry_count starts at zero. Once the retry
# budget is exhausted (or on any other StandardError) the page is skipped
# with a message on +out+.
#
# route_page  - Page object for a single route.
# retry_count - Integer attempt counter used by the recursive retries.
def crawl_route_page route_page, retry_count = 0
  route_model = Model::Route.find_or_add_route_from_route_page route_page
  route_page.trip_pages.each do |trip_page|
    crawl_trip_page route_model, trip_page
  end
rescue Mechanize::ResponseCodeError, Page::UnexpectedParserError => exception
  if retry_count <= MAX_RETRY_COUNT
    sleep SLEEP_DURATION * retry_count
    crawl_route_page route_page, retry_count + 1
  else
    out.puts "Skipping route page (#{route_page.url}) because of #{exception}"
  end
rescue => exception
  out.puts "Skipping route page (#{route_page.url}) because of #{exception}"
  out.puts exception.backtrace
end
#crawl_trip_page(route_model, trip_page, retry_count = 0) ⇒ Object
# File 'lib/translink/crawler.rb', line 38
# Adds the trip from +trip_page+ to +route_model+ along with its stop
# times. Retries transient HTTP/parser errors with a linear backoff
# (zero sleep on the first retry, since retry_count starts at zero);
# after the retry budget, or on any other StandardError, the trip page
# is skipped with a message on +out+.
#
# route_model - Persisted route the trip belongs to.
# trip_page   - Page object for a single trip.
# retry_count - Integer attempt counter used by the recursive retries.
def crawl_trip_page route_model, trip_page, retry_count = 0
  trip_model = route_model.add_trip_from_trip_page trip_page
  trip_model.add_stop_times_from_stop_time_pages trip_page.stop_times
rescue Mechanize::ResponseCodeError, Page::UnexpectedParserError => exception
  if retry_count <= MAX_RETRY_COUNT
    sleep SLEEP_DURATION * retry_count
    crawl_trip_page route_model, trip_page, retry_count + 1
  else
    out.puts "Skipping trip page (#{trip_page.url}) because of #{exception}"
  end
rescue => exception
  out.puts "Skipping trip page (#{trip_page.url}) because of #{exception}"
  out.puts exception.backtrace
end