Class: Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/scraper.rb

Constant Summary collapse

BASE_URL =
"http://www.nytimes.com"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Attribute Details

#author ⇒ Object

Returns the value of attribute author.



5
6
7
# File 'lib/scraper.rb', line 5

# Reader for the @author instance variable.
#
# @return [Object] the current value of @author (nil if it was never assigned)
def author
  @author
end

#story ⇒ Object

Returns the value of attribute story.



5
6
7
# File 'lib/scraper.rb', line 5

# Reader for the @story instance variable.
#
# @return [Object] the current value of @story (nil if it was never assigned)
def story
  @story
end

#title ⇒ Object

Returns the value of attribute title.



5
6
7
# File 'lib/scraper.rb', line 5

# Reader for the @title instance variable.
#
# @return [Object] the current value of @title (nil if it was never assigned)
def title
  @title
end

#url ⇒ Object

Returns the value of attribute url.



5
6
7
# File 'lib/scraper.rb', line 5

# Reader for the @url instance variable.
#
# @return [Object] the current value of @url (nil if it was never assigned)
def url
  @url
end

Class Method Details

.scrape_article(url) ⇒ Object



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/scraper.rb', line 29

# Fetches a single article page and extracts its headline, byline and body.
#
# The body is assembled from the <p> and <h4> elements found under
# ".story-body", in document order, each followed by a blank line.
# Elements with empty text, paragraphs whose text is exactly "Advertisement",
# and elements whose text exactly repeats an earlier one are left out.
#
# @param url [String] address handed to Mechanize#get
# @return [Hash] with the keys :title, :author, :url and :story
def self.scrape_article(url)
  article = Mechanize.new.get(url)

  # Track texts already emitted by exact match. A substring test against the
  # accumulated story would also drop legitimate short paragraphs whose
  # characters merely occur somewhere inside earlier text.
  seen = {}
  article_string = ""

  article.search(".story-body *").each do |node|
    next unless node.name == "p" || node.name == "h4"

    text = node.children.text
    next if text.empty? || seen[text]
    next if node.name == "p" && text == "Advertisement"

    seen[text] = true
    article_string << text << "\n" << "\n"
  end

  {
    :title => article.search("//*[@id='headline']").text,
    :author => article.search('.byline-author').text,
    :url => url,
    :story => article_string
  }
end

.scrape_front_page ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/scraper.rb', line 9

# Fetches the page at BASE_URL and collects the headline links found in its
# ".story-heading" elements.
#
# Headings whose link text is blank, or whose first anchor carries no href,
# are skipped.
#
# @return [Array<Hash>] one hash per usable heading, with :title (stripped
#   link text) and :url (the href value exactly as it appears in the page)
def self.scrape_front_page
  index = Mechanize.new.get(BASE_URL)

  index.css(".story-heading").each_with_object([]) do |story, front_page_articles|
    links = story.css("a")

    # Test the stripped text so whitespace-only headings do not yield an
    # entry with an empty title.
    title = links.text.strip
    next if title.empty?

    # The attribute lookup yields nil when the first anchor has no href;
    # skip the heading instead of calling #value on nil.
    href = links.attribute("href")
    next unless href

    front_page_articles << { :title => title, :url => href.value }
  end
end