Class: Translink::Crawler
- Inherits:
-
Object
- Object
- Translink::Crawler
- Defined in:
- lib/translink/crawler.rb
Constant Summary collapse
- MAX_RETRY_COUNT =
Maximum number of times to attempt an HTTP request.
2
- SLEEP_DURATION =
Base amount of time to sleep in seconds before retrying.
5
Instance Attribute Summary collapse
-
#out ⇒ Object
Returns the value of attribute out.
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Instance Method Summary collapse
- #crawl(date, from_route_url = nil, step = nil) ⇒ Object
- #crawl_route_page(route_page, retry_count = 0) ⇒ Object
- #crawl_trip_page(route_model, trip_page, retry_count = 0) ⇒ Object
-
#initialize(url) ⇒ Crawler
constructor
A new instance of Crawler.
Constructor Details
#initialize(url) ⇒ Crawler
Returns a new instance of Crawler.
# File 'lib/translink/crawler.rb', line 9
# Builds a crawler rooted at the given URL. Output defaults to STDOUT
# and can be redirected via the +out+ accessor.
#
# url - String URL of the timetable index to crawl.
def initialize url
  @url = URI.parse url
  @out = $stdout
end
Instance Attribute Details
#out ⇒ Object
Returns the value of attribute out.
# File 'lib/translink/crawler.rb', line 6
# IO-like sink used for progress and error messages (defaults to $stdout).
def out
  @out
end
#url ⇒ Object (readonly)
Returns the value of attribute url.
# File 'lib/translink/crawler.rb', line 7
# Parsed URI the crawl starts from (read-only).
def url
  @url
end
Instance Method Details
#crawl(date, from_route_url = nil, step = nil) ⇒ Object
# File 'lib/translink/crawler.rb', line 14
# Crawls every route page in the timetable for the given date, optionally
# resuming from +from_route_url+ and striding by +step+.
#
# date           - Date to fetch the timetable for.
# from_route_url - Optional String URL of the route to resume from.
# step           - Optional Integer stride over the route list.
#
# NOTE(review): the scraped listing lost two identifiers on the second line
# (`... = Page::Timetable.new(url.to_s). date ...`); the local variable and
# the method taking +date+ are reconstructed below — confirm against the
# original lib/translink/crawler.rb before relying on this.
def crawl date, from_route_url = nil, step = nil
  timetable_page = Page::Timetable.new(url.to_s).timetable date
  timetable_page.route_pages(from_route_url, step).each do |route_page|
    crawl_route_page route_page
  end
end
#crawl_route_page(route_page, retry_count = 0) ⇒ Object
# File 'lib/translink/crawler.rb', line 21
# Persists the route for +route_page+ and crawls each of its trip pages.
# HTTP/parser failures are retried with a linear backoff; the first retry
# sleeps 0 seconds because retry_count starts at zero. Once the retry
# budget is exhausted (or on any other StandardError) the page is skipped
# with a message on +out+.
#
# route_page  - Page object for a single route.
# retry_count - Integer attempt counter used by the recursive retries.
def crawl_route_page route_page, retry_count = 0
  route_model = Model::Route.find_or_add_route_from_route_page route_page
  route_page.trip_pages.each do |trip_page|
    crawl_trip_page route_model, trip_page
  end
rescue Mechanize::ResponseCodeError, Page::UnexpectedParserError => exception
  if retry_count <= MAX_RETRY_COUNT
    sleep SLEEP_DURATION * retry_count
    crawl_route_page route_page, retry_count + 1
  else
    out.puts "Skipping route page (#{route_page.url}) because of #{exception}"
  end
rescue => exception
  out.puts "Skipping route page (#{route_page.url}) because of #{exception}"
  out.puts exception.backtrace
end
#crawl_trip_page(route_model, trip_page, retry_count = 0) ⇒ Object
# File 'lib/translink/crawler.rb', line 38
# Adds the trip from +trip_page+ to +route_model+ along with its stop
# times. Retries transient HTTP/parser errors with a linear backoff
# (zero sleep on the first retry, since retry_count starts at zero);
# after the retry budget, or on any other StandardError, the trip page
# is skipped with a message on +out+.
#
# route_model - Persisted route the trip belongs to.
# trip_page   - Page object for a single trip.
# retry_count - Integer attempt counter used by the recursive retries.
def crawl_trip_page route_model, trip_page, retry_count = 0
  trip_model = route_model.add_trip_from_trip_page trip_page
  trip_model.add_stop_times_from_stop_time_pages trip_page.stop_times
rescue Mechanize::ResponseCodeError, Page::UnexpectedParserError => exception
  if retry_count <= MAX_RETRY_COUNT
    sleep SLEEP_DURATION * retry_count
    crawl_trip_page route_model, trip_page, retry_count + 1
  else
    out.puts "Skipping trip page (#{trip_page.url}) because of #{exception}"
  end
rescue => exception
  out.puts "Skipping trip page (#{trip_page.url}) because of #{exception}"
  out.puts exception.backtrace
end