Class: BrewerySearch::Scraper
- Inherits:
-
Object
- Object
- BrewerySearch::Scraper
- Defined in:
- lib/brewery_search/scraper.rb
Class Method Summary collapse
-
.scrape_profile(brewery) ⇒ Object
Accepts a brewery, builds the URL of its Brewbound profile page from the brewery's `site_url`, and scrapes additional details to be displayed when requested.
- .scrape_state(state_input) ⇒ Object
Class Method Details
.scrape_profile(brewery) ⇒ Object
Accepts a brewery, builds the URL of its Brewbound profile page from the brewery's `site_url`, and scrapes additional details to be displayed when requested.
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
# File 'lib/brewery_search/scraper.rb', line 36

# Scrapes a brewery's Brewbound profile page and stores the extra details
# (address, overview, phone, website and social media links) on the brewery.
#
# The page URL is "https://www.brewbound.com" followed by +brewery.site_url+.
# A detail whose page section is missing is left as nil instead of raising
# NoMethodError.
#
# @param brewery [#site_url, #address=, #overview=, #phone=, #website=,
#   #twitter=, #facebook=, #instagram=, #youtube=] the brewery to fill in
# @return [Object] the collection of social media list items that was iterated
def self.scrape_profile(brewery)
  # URI.open rather than Kernel#open: since Ruby 3.0 open-uri no longer makes
  # Kernel#open accept URLs.
  profile = Nokogiri::HTML(URI.open("https://www.brewbound.com#{brewery.site_url}"))
  overview_dds = profile.css("div #overview dl dd")

  # Determine the address based on one of several formats the site can use.
  address_dd = overview_dds[2 + profile_overview_shift(profile, overview_dds, 3)]
  address_href = profile_link_href(address_dd)
  # Strip everything from "https:" through the last "=" so only the address
  # portion of the link remains.
  brewery.address = address_href&.gsub(/\bhttps:.*=(?:,)?/, '')

  # Determine the overview based on one of several formats the site can use.
  # NOTE(review): the address lookup probes <dd> 3 for a digit while the
  # overview lookup probes <dd> 1; both are kept exactly as they were, but the
  # mismatch looks unintentional - confirm which index is correct.
  overview_dd = overview_dds[3 + profile_overview_shift(profile, overview_dds, 1)]
  brewery.overview = overview_dd&.text

  # Determine the phone number; only set when the second contact label is "Phone".
  phone_label = profile.css("div.contact dt")[1]
  if phone_label && phone_label.text == "Phone"
    brewery.phone = profile.css("div.contact dd")[1]&.text
  end

  # Determine the external website (first link in the contact section).
  brewery.website = profile.css("div.contact a").attr("href")&.text

  # Grab social media links depending on what the brewery has available.
  profile.css("div.contact ul.brewer-social-media li").each do |item|
    href = profile_link_href(item)
    next unless href

    if href.include?("twitter")
      brewery.twitter = href
    elsif href.include?("facebook")
      brewery.facebook = href
    elsif href.include?("instagram")
      brewery.instagram = href
    elsif href.include?("youtube")
      brewery.youtube = href
    end
  end
end

# Returns 1 when the overview list uses the layout whose address/overview
# entries sit one <dd> later than the default layout, otherwise 0.
#
# @api private
# @param profile [#css] the parsed profile page
# @param overview_dds [#[]] the result of profile.css("div #overview dl dd")
# @param probe_index [Integer] which <dd> is checked for a digit in the "JOB" case
# @return [Integer] 0 or 1
def self.profile_overview_shift(profile, overview_dds, probe_index)
  first_text = profile_node_text(overview_dds[0])
  first_has_digit = first_text.match?(/[0-9]/)

  nested_labels = profile.css("div #overview dl dd dt").text
  if nested_labels.include?("PARENT") || nested_labels.include?("Founded")
    return first_has_digit ? 1 : 0
  end

  if first_text.include?("JOB") &&
     (first_has_digit || profile_node_text(overview_dds[probe_index]).match?(/[0-9]/))
    return 1
  end

  profile_node_text(profile.css("div #overview dl dt")[2]).include?("TYPE") ? 1 : 0
end

# Returns the text of a node, or "" when the node is missing.
#
# @api private
def self.profile_node_text(node)
  node ? node.text : ""
end

# Returns the href of the first <a> inside a node, or nil when the node or
# the link is missing.
#
# @api private
def self.profile_link_href(node)
  return nil unless node

  node.css("a").attr("href")&.text
end
.scrape_state(state_input) ⇒ Object
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
# File 'lib/brewery_search/scraper.rb', line 7

# Fetches every Brewbound search-result page for a state and instantiates a
# BrewerySearch::Brewery for each row of the results table.
#
# The new Brewery objects are not collected or returned by this method.
# NOTE(review): presumably BrewerySearch::Brewery#initialize registers each
# instance somewhere - confirm against the Brewery class.
#
# @param state_input [String] the state to search for; interpolated verbatim
#   into the request URL
# @return [Array] the parsed result pages, in the order they were fetched
def self.scrape_state(state_input)
  base_url = "https://www.brewbound.com/mvc/Breweries/state/#{state_input}"
  query = "?displayOutOfBiz=False"

  # URI.open rather than Kernel#open: since Ruby 3.0 open-uri no longer makes
  # Kernel#open accept URLs.
  doc = Nokogiri::HTML(URI.open("#{base_url}#{query}"))
  search_result_pages = [doc]

  # Scrape additional search result pages when applicable. Every additional
  # page uses the same URL format, with the page number injected; keep going
  # while the most recently fetched page's footer still offers "Next".
  page = 2
  while doc.css("table.breweries-list tfoot p.text-center").text.include?("Next")
    doc = Nokogiri::HTML(URI.open("#{base_url}/page/#{page}#{query}"))
    search_result_pages << doc
    page += 1
  end

  # Instantiate a new Brewery object for each entry.
  search_result_pages.each do |result_page|
    result_page.css("table.breweries-list tbody tr").each do |tr|
      profile_link = tr.css("td a.accented.hidden-mobile.bold")
      cells = tr.css("td.hidden-mobile")

      new_brewery = BrewerySearch::Brewery.new
      new_brewery.name = profile_link.text.strip
      # The first cell holds comma-separated text; only the part before the
      # first comma is kept as the city. `to_s` covers a blank cell.
      new_brewery.city = cells[0].text.split(",")[0].to_s.strip
      new_brewery.state = state_input
      new_brewery.site_url = profile_link.attr("href").text.strip
      new_brewery.type = cells[1].text.strip
    end
  end
end