Class: ListScraper::LocationScraper
- Inherits:
-
Object
- Object
- ListScraper::LocationScraper
- Defined in:
- lib/storeListScraper/location_scraper.rb
Constant Summary collapse
- @@base =
'https://storefound.org'
Instance Attribute Summary collapse
-
#city_pages ⇒ Object
Returns the value of attribute city_pages.
-
#link ⇒ Object
Returns the value of attribute link.
-
#loc_pages ⇒ Object
Returns the value of attribute loc_pages.
-
#state_pages ⇒ Object
Returns the value of attribute state_pages.
Class Method Summary collapse
Instance Method Summary collapse
-
#clean_out ⇒ Object
When running a store, the first table of links will only ever contain states, cities, or locations.
- #create_stores ⇒ Object
-
#initialize(link) ⇒ LocationScraper
constructor
A new instance of LocationScraper.
- #linked_page_scrape(array, type) ⇒ Object
- #page_scrape(page, type = 'all') ⇒ Object
Constructor Details
#initialize(link) ⇒ LocationScraper
Returns a new instance of LocationScraper.
6 7 8 9 10 11 |
# File 'lib/storeListScraper/location_scraper.rb', line 6
#
# Sets up a scraper for +link+ with empty state, city and location page lists.
#
# @param link [Object] stored unchanged in @link
def initialize(link)
  @link = link
  # Each list gets its own array so they can be filled independently.
  @state_pages, @city_pages = [], []
  @loc_pages = []
end
Instance Attribute Details
#city_pages ⇒ Object
Returns the value of attribute city_pages.
3 4 5 |
# File 'lib/storeListScraper/location_scraper.rb', line 3
#
# Reader for @city_pages, the array page_scrape appends hrefs to when they
# split into four "/"-separated parts.
#
# @return [Object] the current value of @city_pages
def city_pages
  @city_pages
end
#link ⇒ Object
Returns the value of attribute link.
3 4 5 |
# File 'lib/storeListScraper/location_scraper.rb', line 3
#
# Reader for @link, the value handed to the constructor.
#
# @return [Object] the current value of @link
def link
  @link
end
#loc_pages ⇒ Object
Returns the value of attribute loc_pages.
3 4 5 |
# File 'lib/storeListScraper/location_scraper.rb', line 3
#
# Reader for @loc_pages, the array page_scrape appends hrefs to when they
# split into five "/"-separated parts; create_stores iterates over it.
#
# @return [Object] the current value of @loc_pages
def loc_pages
  @loc_pages
end
#state_pages ⇒ Object
Returns the value of attribute state_pages.
3 4 5 |
# File 'lib/storeListScraper/location_scraper.rb', line 3
#
# Reader for @state_pages, the array page_scrape appends hrefs to when they
# split into three "/"-separated parts (only in the default 'all' mode).
#
# @return [Object] the current value of @state_pages
def state_pages
  @state_pages
end
Class Method Details
.base ⇒ Object
13 14 15 |
# File 'lib/storeListScraper/location_scraper.rb', line 13
#
# Class-level reader for @@base, the URL prefix that the scraping methods
# prepend to every relative page path.
#
# @return [Object] the current value of @@base
def self.base
  @@base
end
Instance Method Details
#clean_out ⇒ Object
When running a store, the first table of links will only ever contain states, cities, or locations. On state pages, cities will be picked up at the bottom and need to be removed. clean_out determines the next steps and de-duplicates the arrays. clean_out can only be used after the first pass.
55 56 57 58 59 60 61 62 63 64 65 66 67 |
# File 'lib/storeListScraper/location_scraper.rb', line 55
#
# De-duplicates the collected page lists and runs the follow-up scrapes.
#
# * If any state pages were collected, the city and location lists are
#   discarded, every state page is scraped, and then every city page found
#   on those state pages is scraped.
# * Otherwise, if any city pages were collected, every city page is scraped.
# * Otherwise nothing further is scraped.
#
# page_scrape appends hrefs without checking whether they are already
# present, so the lists are de-duplicated again after each scrape: cities
# before they are visited (avoids fetching the same page twice) and
# locations at the end (avoids building the same store twice).
#
# @return [nil]
def clean_out
  [@state_pages, @city_pages, @loc_pages].each(&:uniq!)

  if !@state_pages.empty?
    # State links are available: drop whatever else the first pass picked up
    # and rebuild the city/location lists from the state pages themselves.
    @city_pages.clear
    @loc_pages.clear
    linked_page_scrape(@state_pages, 'State')
    # uniq! returns nil when nothing changed, so it is called for effect only.
    @city_pages.uniq!
    linked_page_scrape(@city_pages, 'City')
  elsif !@city_pages.empty?
    linked_page_scrape(@city_pages, 'City')
  end

  @loc_pages.uniq!
  nil
end
#create_stores ⇒ Object
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
# File 'lib/storeListScraper/location_scraper.rb', line 80
#
# Fetches every path in @loc_pages (prefixed with @@base), reads the
# "li span" elements of the page and builds a ListScraper::Store from the
# first four of them (address, city, state, zip, in that order).
#
# Store ids (+idnum+) start at 1 and only advance when a store was built, so
# they stay contiguous. Progress is tracked separately and advances for
# every location, so the display reaches 100% even when some pages fail.
# A page that cannot be fetched or parsed is skipped with a warning on
# stderr instead of aborting the run.
def create_stores
  total = @loc_pages.length
  next_id = 1

  @loc_pages.each.with_index(1) do |loc, visited|
    begin
      # Block form closes the downloaded stream as soon as it has been parsed.
      page = URI.open("#{@@base}#{loc}") { |io| Nokogiri::HTML5(io) }
      fields = page.css('li span')
      info = {
        idnum: next_id,
        address: fields[0].text,
        city: fields[1].text,
        state: fields[2].text,
        zip: fields[3].text
      }
      ListScraper::Store.new(info)
      next_id += 1
    rescue StandardError => e
      # Best effort: one bad page must not stop the remaining locations.
      warn "Skipping #{loc}: #{e.class}: #{e.message}"
    end

    print "#{((visited.to_f / total) * 100).round(2)}% | Progress: #{visited}/#{total}\r"
  end
end
#linked_page_scrape(array, type) ⇒ Object
69 70 71 72 73 74 75 76 77 78 |
# File 'lib/storeListScraper/location_scraper.rb', line 69
#
# Scrapes every page path in +array+ (prefixed with @@base) via page_scrape,
# printing a progress line that is rewritten in place after each page and
# finished with a newline once the whole list has been processed.
#
# @param array [Array] page paths to scrape
# @param type [Object] passed to page_scrape as a string and shown in the
#   progress line
def linked_page_scrape(array, type)
  total = array.length

  progress = lambda do |done|
    # Guard the empty list: 0.0 / 0 is NaN and would be printed as "NaN%".
    percent = total.zero? ? 0.0 : ((done.to_f / total) * 100).round(2)
    "#{percent}% | #{type} Progress: #{done}/#{total}\r"
  end

  array.each.with_index(1) do |page, done|
    page_scrape("#{@@base}#{page}", type.to_s)
    print progress.call(done)
  end

  puts progress.call(total)
end
#page_scrape(page, type = 'all') ⇒ Object
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
# File 'lib/storeListScraper/location_scraper.rb', line 17
#
# Downloads +page+, walks every link inside ".main-block" and files each
# href into a page list according to how many parts it has when split on "/":
#
#   type 'all'   : 3 parts -> @state_pages, 4 -> @city_pages, 5 -> @loc_pages
#   type 'State' : 4 parts -> @city_pages,  5 -> @loc_pages
#   type 'City'  : 5 parts -> @loc_pages
#
# Hrefs with any other part count, and any other +type+, are ignored.
# Anchors without an href are skipped individually, so one such anchor no
# longer discards the rest of the page. A page that cannot be fetched or
# parsed is reported on stderr and otherwise ignored (best effort).
#
# @param page [String] URL to fetch
# @param type [String] 'all', 'State' or 'City'
def page_scrape(page, type = 'all')
  # Part count of an href => list it belongs to, for the requested type.
  buckets = {
    'all' => { 3 => @state_pages, 4 => @city_pages, 5 => @loc_pages },
    'State' => { 4 => @city_pages, 5 => @loc_pages },
    'City' => { 5 => @loc_pages }
  }.fetch(type, {})

  # Block form closes the downloaded stream as soon as it has been parsed.
  doc = URI.open(page) { |io| Nokogiri::HTML5(io) }

  doc.css('.main-block a').each do |lk|
    href = lk['href']
    next if href.nil?

    bucket = buckets[href.split('/').length]
    bucket << href if bucket
  end
rescue StandardError => e
  warn "page_scrape: could not scrape #{page}: #{e.class}: #{e.message}"
end