Class: ListScraper::LocationScraper

Inherits:
Object
  • Object
show all
Defined in:
lib/storeListScraper/location_scraper.rb

Constant Summary collapse

@@base =
'https://storefound.org'

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(link) ⇒ LocationScraper

Returns a new instance of LocationScraper.



6
7
8
9
10
11
# File 'lib/storeListScraper/location_scraper.rb', line 6

# Sets up a scraper with no pages collected yet.
#
# @param link [Object] stored unchanged in @link; presumably the store's
#   listing path or URL (TODO confirm against the caller)
def initialize(link)
  @link = link
  # Three independent empty arrays, one per page level.
  @state_pages, @city_pages, @loc_pages = Array.new(3) { [] }
end

Instance Attribute Details

#city_pagesObject

Returns the value of attribute city_pages.



3
4
5
# File 'lib/storeListScraper/location_scraper.rb', line 3

# Reader for @city_pages.
#
# @return [Array] the array created empty by the constructor; page_scrape
#   appends hrefs that split into 4 "/"-separated segments to it
def city_pages
  @city_pages
end

#linkObject

Returns the value of attribute link.



3
4
5
# File 'lib/storeListScraper/location_scraper.rb', line 3

# Reader for @link.
#
# @return [Object] the value passed to the constructor, unchanged
def link
  @link
end

#loc_pagesObject

Returns the value of attribute loc_pages.



3
4
5
# File 'lib/storeListScraper/location_scraper.rb', line 3

# Reader for @loc_pages.
#
# @return [Array] the array created empty by the constructor; page_scrape
#   appends hrefs that split into 5 "/"-separated segments to it, and
#   create_stores iterates over it
def loc_pages
  @loc_pages
end

#state_pagesObject

Returns the value of attribute state_pages.



3
4
5
# File 'lib/storeListScraper/location_scraper.rb', line 3

# Reader for @state_pages.
#
# @return [Array] the array created empty by the constructor; page_scrape
#   (type 'all') appends hrefs that split into 3 "/"-separated segments to it
def state_pages
  @state_pages
end

Class Method Details

.baseObject



13
14
15
# File 'lib/storeListScraper/location_scraper.rb', line 13

# Reader for the @@base class variable, which the scraping methods prepend
# to every relative href before fetching it.
#
# @return [String] the site root (listed on this page as 'https://storefound.org')
def self.base
    @@base
end

Instance Method Details

#clean_outObject

When running a store, the first table of links will only ever be states, cities, or locations. On state pages, cities will be picked up at the bottom and need to be removed. clean_out will determine the next steps and de-duplicate the arrays. clean_out can only be used after the first pass.



55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/storeListScraper/location_scraper.rb', line 55

# Decides what to scrape after the first pass and de-duplicates the page lists.
#
# The first table of links on a store page only ever holds states, cities or
# locations. When states are present, any city/location links picked up on the
# same page are discarded and rebuilt by scraping each state, then each city.
# When only cities are present, each city is scraped. Otherwise nothing more
# is fetched.
#
# Must only be called after the first page_scrape pass.
#
# @return [nil]
def clean_out
  [@state_pages, @city_pages, @loc_pages].each(&:uniq!)

  if !@state_pages.empty?
    # State links available: drop whatever else the first pass collected and
    # rebuild the lower levels from the state pages.
    @city_pages.clear
    @loc_pages.clear
    linked_page_scrape(@state_pages, 'State')
    # Several state pages can list the same city; de-duplicate so each city
    # page is fetched only once.
    @city_pages.uniq!
    linked_page_scrape(@city_pages, 'City')
  elsif !@city_pages.empty?
    linked_page_scrape(@city_pages, 'City')
  end

  # The follow-up scrapes append without checking for duplicates, so clean the
  # final location list before it is consumed.
  @loc_pages.uniq!
  nil
end

#create_storesObject



80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/storeListScraper/location_scraper.rb', line 80

# Fetches every collected location page and builds a ListScraper::Store from it.
#
# For each entry in @loc_pages the page at @@base + entry is downloaded and the
# text of the first four "li span" nodes is read as address, city, state and
# zip. A progress line is printed (terminated by "\r") after each store built.
#
# Pages that fail to download or parse, or that have fewer than four matching
# nodes, are skipped silently; the id counter only advances on success, so
# idnum values stay consecutive.
#
# @return [Array] @loc_pages
def create_stores
  total = @loc_pages.length
  i = 1
  @loc_pages.each do |loc|
    begin
      # Block form closes the downloaded stream once parsing has finished.
      st = URI.open("#{@@base}#{loc}") { |io| Nokogiri::HTML5(io) }
      spans = st.css('li span')
      info = {
        idnum: i,
        address: spans[0].text,
        city: spans[1].text,
        state: spans[2].text,
        zip: spans[3].text
      }
      # NOTE(review): the new Store is not kept here; presumably Store tracks
      # its own instances -- confirm.
      ListScraper::Store.new(info)
      print "#{((i.to_f / total.to_f) * 100).round(2)}% | Progress: #{i}/#{total}\r"
      i += 1
    rescue StandardError
      # Best effort: skip this location and move on to the next one.
    end
  end
end

#linked_page_scrape(array, type) ⇒ Object



69
70
71
72
73
74
75
76
77
78
# File 'lib/storeListScraper/location_scraper.rb', line 69

# Scrapes every page in +array+ with page_scrape and reports progress.
#
# Each entry is prefixed with @@base and passed to page_scrape together with
# +type+ (as a string). A progress line is reprinted in place (ending in "\r")
# after each page, and printed once more with a newline when the loop ends.
#
# An empty +array+ is a no-op: nothing is fetched or printed. Without this
# guard the percentage would be computed as 0.0/0.0.
#
# @param array [Array] relative hrefs to scrape
# @param type [#to_s] page level handed to page_scrape and shown in the
#   progress line ('State' or 'City' at the call sites visible here)
# @return [nil]
def linked_page_scrape(array, type)
  total = array.length
  return if total.zero?

  progress = nil
  array.each.with_index(1) do |page, done|
    page_scrape("#{@@base}#{page}", type.to_s)
    progress = "#{((done.to_f / total.to_f) * 100).round(2)}% | #{type} Progress: #{done}/#{total}\r"
    print progress
  end
  puts progress
end

#page_scrape(page, type = 'all') ⇒ Object



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/storeListScraper/location_scraper.rb', line 17

# Downloads +page+ and sorts the links found under ".main-block" into the
# state / city / location lists.
#
# A link's level is the number of pieces its href splits into on "/"
# (a leading "/" contributes an empty first piece): 3 = state, 4 = city,
# 5 = location. +type+ limits which levels are collected:
#
#   'all'   -> states, cities and locations
#   'State' -> cities and locations
#   'City'  -> locations only
#
# Any other +type+ collects nothing. Anchors without an href are skipped
# individually, so they no longer abort the rest of the page. Download or
# parse failures are swallowed (best effort) and leave the lists unchanged.
#
# @param page [String] absolute URL to fetch
# @param type [String] 'all', 'State' or 'City'
# @return [Object, nil] the iterated link collection, or nil if fetching or
#   parsing failed
def page_scrape(page, type = 'all')
  # Segment count => list that receives the href, for the requested type.
  targets = {
    'all' => { 3 => @state_pages, 4 => @city_pages, 5 => @loc_pages },
    'State' => { 4 => @city_pages, 5 => @loc_pages },
    'City' => { 5 => @loc_pages }
  }.fetch(type, {})

  # Block form closes the downloaded stream once parsing has finished.
  doc = URI.open(page) { |io| Nokogiri::HTML5(io) }
  doc.css('.main-block a').each do |lk|
    href = lk['href']
    next if href.nil?

    bucket = targets[href.split('/').length]
    bucket << href if bucket
  end
rescue StandardError
  # Best effort: an unreachable or unparsable page contributes no links.
end