Module: Scraper

Defined in:
lib/scraper.rb

Constant Summary collapse

POOL_LIST_URLS =

For faster testing, use a single URL: POOL_LIST_URLS = ["https://web.toronto.ca/data/parks/prd/facilities/indoor-pools/index.html"]. The full list follows:

[ "https://web.toronto.ca/data/parks/prd/facilities/indoor-pools/index.html","https://web.toronto.ca/data/parks/prd/facilities/outdoor-pools/index.html" ]

Class Method Summary collapse

Class Method Details

.build_pool_schedule_array_from_html(doc) ⇒ Object



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/scraper.rb', line 69

# Builds a { date => [swim times] } hash from a pool's detail-page HTML.
#
# doc - Nokogiri document for a pool detail page containing one or more
#       "#dropin_Swimming_<i>" weekly schedule tables.
#
# Scrapes the first two weekly tables (eventually more; the site exposes
# up to 7), pairing each week's header dates with its "Lane Swim" row.
# Days with no swim times are dropped from the result.
def build_pool_schedule_array_from_html(doc)
  weeks = {}

  (0..1).each do |week_index|
    week = doc.at_css("#dropin_Swimming_#{week_index}")
    next if week.nil?

    # The first table row holds the column headers (this week's dates).
    week_dates = week.at_css('tr').children.map(&:text)
    next if week_dates.nil?

    # Locate the "Lane Swim" row within the schedule table body.
    lane_swim_row_index = week.at_css("tbody").css('tr').find_index { |el| el.text =~ /Lane Swim/ }
    next if lane_swim_row_index.nil?

    week_lane_swim_times = swim_time_finder(week, lane_swim_row_index)
    weeks.merge!(week_dates.zip(week_lane_swim_times).to_h)
  end

  # Remove days with no swim times.
  weeks.delete_if { |_, time| time == [" "] || time == [] }
end

.display_mode(display_mode) ⇒ Object



11
12
13
# File 'lib/scraper.rb', line 11

# Stores the output verbosity setting ("verbose" enables full progress
# messages; anything else prints dots). Returns the stored value.
def display_mode(mode)
  @display_mode = mode
end

.gather_pool_addresses(pools) ⇒ Object



88
89
90
91
92
93
94
95
96
97
# File 'lib/scraper.rb', line 88

# Collects the address cell from every row of a pool listing table.
#
# pools - Nokogiri node for the listing table body.
#
# The address is always the second column, but the column count differs
# between the indoor and outdoor tables, so it is derived from the table
# itself. Returns an Array of address Strings.
def gather_pool_addresses(pools)
  columns_per_row = pools.css('td').length / pools.css('tr').length

  # Keep only cells at column index 1 within each row (the address column).
  pools.css('td')
       .select.with_index { |_, cell_index| cell_index % columns_per_row == 1 }
       .map(&:text)
end

.gather_pool_coordinates(address) ⇒ Object



99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/scraper.rb', line 99

# Geocodes one street address (scoped to Toronto) via the Geocoder gem.
# Prints the address in verbose mode, otherwise a progress dot.
# Returns a Hash with :latitude and :longitude keys.
def gather_pool_coordinates(address)
  progress = @display_mode == "verbose" ? "Geocoding: #{address}\n" : "."
  print progress

  coords = Geocoder.coordinates("#{address}, Toronto")

  # To avoid triggering google API limit of 50 queries per second
  sleep(0.02)
  { latitude: coords[0], longitude: coords[1] }
end

.gather_pool_infoObject



22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/scraper.rb', line 22

# Scrapes the indoor and outdoor pool listing pages for every pool's
# name, detail-page link, and street address, geocodes each address,
# writes the combined records to pool_urls.json, and returns them.
#
# Raises RuntimeError when the scraped name/link/address lists differ in
# length (i.e. the website schema has likely changed).
def gather_pool_info
  pool_names, pool_addresses, pool_links = [], [], []

  POOL_LIST_URLS.each do |url|
    # URI.open (open-uri) rather than Kernel#open: the latter no longer
    # opens URLs as of Ruby 3.0 and is a command-injection risk besides.
    doc = Nokogiri::HTML(URI.open(url))
    pools = doc.at_css("#pfrBody > div.pfrListing > table > tbody")
    # Pool name links exclude empty anchors and the "maps" links.
    pool_names += pools.css('a').map { |link| link.children.text unless ( link.children.text == "" || link['href'].match(/maps/) ) }.compact
    # Only keep hrefs that point at a facility detail page.
    pool_links += pools.css('a').map { |link| link['href'] if link['href'].match(/parks\/prd\/facilities\/complex/) }.compact
    pool_addresses += gather_pool_addresses(pools)
  end

  array_length_equality = pool_names.length == pool_links.length && pool_links.length == pool_addresses.length
  raise "Pool information lengths are unequal, the website schema has likely changed" unless array_length_equality

  # Geotag pools
  puts "\n--- Scraping pool coordinates ---"
  pool_coordinates = pool_addresses.map { |address| gather_pool_coordinates(address) }

  # Zip the parallel arrays into one hash per pool.
  @pool_urls = pool_names.each_index.map do |index|
    {
      name: pool_names[index],
      url: pool_links[index],
      address: pool_addresses[index],
      coordinates: pool_coordinates[index]
    }
  end

  # Persist for later runs (gather_pool_swim_times reads this file back).
  File.open("pool_urls.json", "w") do |f|
    f.write(@pool_urls.to_json)
  end
  @pool_urls
end

.gather_pool_program_cost_statusObject



142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# File 'lib/scraper.rb', line 142

# Marks each pool in pools_data.json as free_swim (true/false) by checking
# whether its facility URL appears on the city's "centres where programs
# are free" page. Rewrites pools_data.json and returns the updated pools.
def gather_pool_program_cost_status
  @pools = JSON.parse(File.read('pools_data.json'), symbolize_names: true)

  page = "https://www.toronto.ca/explore-enjoy/recreation/free-lower-cost-recreation-options/"
  # URI.open (open-uri): Kernel#open stopped accepting URLs in Ruby 3.0.
  doc = Nokogiri::HTML(URI.open(page))
  free_facility_article = doc.at_css("#collapse-centres-where-programs-are-free")
  links = free_facility_article.css('a')
  all_hrefs = links.map { |link| link.attribute('href').to_s }.uniq.sort.delete_if { |href| href.empty? }

  # Facility URLs are compared by their "/parks/prd/facilities/complex/<id>"
  # stem. NOTE: this must be a regexp literal -- in the original
  # double-quoted string, "\w" silently degraded to a literal "w" and the
  # filter only worked because "w*" can match nothing.
  facility_stem = %r{/parks/prd/facilities/complex/\d*}
  free_facility_urls_regexed = all_hrefs.keep_if { |href| href.match(facility_stem) }
                                        .map { |url| url.match(facility_stem).to_s }

  @pools.each do |pool|
    pool_url_stem = pool[:url].match(facility_stem).to_s
    pool[:free_swim] = free_facility_urls_regexed.include?(pool_url_stem)
  end

  File.open("pools_data.json", "w") do |f|
    f.write(@pools.to_json)
    puts "Writing program cost status to pools_data.json complete"
  end

  @pools
end

.gather_pool_swim_timesObject

Parses the weekly lane-swim schedule data for every pool.



114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# File 'lib/scraper.rb', line 114

# Fetches each pool's detail page and attaches its lane-swim schedule as
# pool[:times]. Reads pool_urls.json when pools are not already loaded in
# memory; writes the enriched records to pools_data.json and returns them.
#
# Exits the process with a hint when pool_urls.json is missing or invalid.
def gather_pool_swim_times
  begin
    @pool_urls ||= JSON.parse(File.read('pool_urls.json'), symbolize_names: true)
  rescue Errno::ENOENT, JSON::ParserError
    # Narrowed from a bare rescue: only the "file missing/unreadable"
    # cases this message describes should trigger the early exit.
    puts "Couldn't open pool_info, run scrape -f or run in path with pool_urls.json file"
    exit
  end

  puts "\n--- Scraping pool swim times ---"
  @pool_urls.each do |pool|
    if @display_mode == "verbose"
      puts "Scraping: #{pool[:name]}"
    else
      print "."
    end
    url = "https://www.toronto.ca" + pool[:url]
    # URI.open (open-uri): Kernel#open no longer opens URLs in Ruby 3.0+.
    doc = Nokogiri::HTML(URI.open(url))
    pool[:times] = build_pool_schedule_array_from_html(doc)
  end

  File.open("pools_data.json", "w") do |f|
    f.write(@pool_urls.to_json)
    puts "\nWriting pools_data.json complete"
  end

  @pool_urls
end

.swim_time_finder(week, lane_swim_row_index) ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
# File 'lib/scraper.rb', line 57

# Extracts the swim-time cells from the "Lane Swim" row of one weekly
# schedule table.
#
# week                - Nokogiri node for a week's schedule table.
# lane_swim_row_index - index of the "Lane Swim" <tr> within the tbody.
#
# Returns an Array with one entry per child node of that row; each entry
# is an Array of the cell's text fragments (one String per text node, or
# the cell's combined text when it holds exactly one text node).
def swim_time_finder(week, lane_swim_row_index)
  week.at_css("tbody").css('tr')[lane_swim_row_index].children
  .map do |el|
    # Direct text-node children of this cell. NOTE(review): presumably
    # multiple swim times appear as separate text nodes split by <br>
    # tags -- confirm against the live markup.
    nodes = el.children.find_all(&:text?)
    if nodes.length == 1
      # A single text node: take the cell's full concatenated text instead.
      nodes = [el.children.text]
    else
      nodes.map!(&:text)
    end
  end
end