Class: Scraper
- Inherits:
-
Object
- Object
- Scraper
- Defined in:
- lib/scraper.rb
Constant Summary collapse
- BASE_URL =
"http://www.nytimes.com"
Instance Attribute Summary collapse
-
#author ⇒ Object
Returns the value of attribute author.
-
#story ⇒ Object
Returns the value of attribute story.
-
#title ⇒ Object
Returns the value of attribute title.
-
#url ⇒ Object
Returns the value of attribute url.
Class Method Summary collapse
Instance Attribute Details
#author ⇒ Object
Returns the value of attribute author.
5 6 7 |
# File 'lib/scraper.rb', line 5 def author @author end |
#story ⇒ Object
Returns the value of attribute story.
5 6 7 |
# File 'lib/scraper.rb', line 5 def story @story end |
#title ⇒ Object
Returns the value of attribute title.
5 6 7 |
# File 'lib/scraper.rb', line 5 def title @title end |
#url ⇒ Object
Returns the value of attribute url.
5 6 7 |
# File 'lib/scraper.rb', line 5 def url @url end |
Class Method Details
.scrape_article(url) ⇒ Object
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# File 'lib/scraper.rb', line 29

# Fetches one article page and extracts its headline, byline and body text.
#
# @param url [String] address of the article to fetch
# @return [Hash] with keys :title, :author, :url and :story; :story holds the
#   collected paragraphs and sub-headings, each followed by a blank line
def self.scrape_article(url)
  agent = Mechanize.new
  article = agent.get(url)

  title = article.search("//*[@id='headline']").text
  author = article.search('.byline-author').text

  article_string = ""
  article.search(".story-body *").each do |paragraph|
    text = paragraph.children.text
    # The selector matches every descendant of the story body, so the same
    # text can show up more than once; skip anything already collected.
    # (Empty text is skipped too, since every string includes "".)
    next if article_string.include?(text)

    if paragraph.name == "p" && text != "Advertisement"
      article_string << text + "\n" + "\n"
    elsif paragraph.name == "h4"
      article_string << text + "\n" + "\n"
    end
  end

  { title: title, author: author, url: url, story: article_string }
end
.scrape_front_page ⇒ Object
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
# File 'lib/scraper.rb', line 9

# Collects the headline links found on the page at BASE_URL.
#
# @return [Array<Hash>] one hash per usable ".story-heading" element, with
#   :title (stripped link text) and :url (value of the link's href attribute)
def self.scrape_front_page
  agent = Mechanize.new
  index = agent.get(BASE_URL)

  index.css(".story-heading").each_with_object([]) do |story, front_page_articles|
    links = story.css("a")
    title = links.text.strip
    # Headings with no link text (or whitespace only) have nothing to list.
    next if title.empty?

    # The href is read from the first anchor; skip the heading when that
    # anchor has no href rather than calling #value on nil.
    href = links.attribute("href")
    next unless href

    front_page_articles << { title: title, url: href.value }
  end
end