Module: Bnm::Scrape
- Defined in:
- lib/bnm/scrape.rb
Class Method Summary collapse
-
.deep_scrape(artists) ⇒ Object
Scrapes each artist's story and score.
-
.init_scrape(url) ⇒ Object
Scrapes the given URL.
-
.scrub(data) ⇒ Object
Scrubs scraped data.
Class Method Details
.deep_scrape(artists) ⇒ Object
Scrapes each artist's story and score.
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
# File 'lib/bnm/scrape.rb', line 32 def self.deep_scrape(artists) artists.each do |artist| doc = Nokogiri::HTML(open("http://pitchfork.com#{artist[:review_url]}")) html = doc.css('div.review-detail') html.each do |i| editorial = scrub(HTMLEntities.new.decode(i.css('div.article-content').text)) = scrub(i.css('a.display-name').text) score = i.css('span.score').text # CHECKS THAT URI IS NOT NIL if !(i.css('.find-it-at a')).empty? listen = i.css('.find-it-at a').attribute('href').text end # NOT EVERY ARTIST HAS THE SAME INFORMATION, SOME ATTRIUBUTES ARE EMPTY, HENCE THE CONDITIONAL artist[:editorial] = (editorial) if !editorial.empty? artist[:score] = (score) if !score.empty? artist[:listen] = (listen) if listen artist[:author] = () if !.empty? # FINDS GENRES FOR ARTIST - SOME HAVE MORE THAN ONE GENRE. i.css('ul.genre-list').each do |i| artist[:genres] = scrub(i.css('a').text) end end end artists.sort! {|a, b| b[:score] <=> a[:score]} end |
.init_scrape(url) ⇒ Object
Scrapes the given URL.
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
# File 'lib/bnm/scrape.rb', line 8
# Scrapes a review listing page into a list of artist hashes.
#
# @param url [String] address of the listing page
# @return [Array<Hash>] one hash per `div.review` node with the keys
#   :review_url, :name, :album and :date; entries in which any of those
#   values came back empty are left out
def self.init_scrape(url)
  # URI.open (from open-uri) is used because Kernel#open stopped accepting
  # URLs in Ruby 3.0.
  # NOTE(review): url is passed through unvalidated - confirm callers only
  # supply trusted URLs.
  doc = Nokogiri::HTML(URI.open(url))

  artists = doc.css('div.review').map do |review|
    link = review.at_css('a')
    {
      # A review without a link (or without an href) yields an empty string,
      # so the entry is dropped below instead of raising.
      review_url: link ? link['href'].to_s : '',
      name: scrub(review.css('.artist-list li').text),
      album: scrub(review.css('.album-artist .title').text),
      date: review.css('.meta .pub-date').text
    }
  end

  # If the scrape comes up empty for any field, discard the entry. Using
  # reject avoids deleting from the array while it is being iterated.
  artists.reject { |artist| artist.value?('') }
end
.scrub(data) ⇒ Object
Scrubs scraped data.
63 64 65 66 67 68 69 70 71 72 73 |
# File 'lib/bnm/scrape.rb', line 63
# Cleans up scraped text.
#
# Removes the first "Find it at:Amoeba Music" fragment and repairs the
# artifacts left behind when UTF-8 text was decoded with the wrong charset.
# The argument itself is never modified, so frozen strings are accepted.
#
# @param data [String, nil] raw scraped text; nil is treated as ""
# @return [String] a new, cleaned string
def self.scrub(data)
  # The artifact patterns are written as \u escapes so that re-encoding the
  # source file cannot silently corrupt them.
  data.to_s
      .sub('Find it at:Amoeba Music', '')  # Amoeba link text inside the article
      .gsub("\u00C3\u00A9", "\u00E9")      # mis-decoded e-acute -> e-acute
      .gsub("\u00E2", "'")                 # remnant of a mis-decoded apostrophe
      .gsub("\u00C2", '')                  # stray capital A-circumflex
end