Class: Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/CLI_Headline_Scraper/Scraper.rb

Class Method Summary collapse

Class Method Details

.check_msnbc_urls(articles) ⇒ Object



120
121
122
123
124
125
126
127
128
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 120

# Turns site-relative MSNBC links into absolute URLs, in place.
#
# Each entry of +articles+ is a two-element array whose second element is the
# link. A link is treated as relative only when it has no scheme and no host:
# links that are already absolute ("http://...", "https://..."), protocol-relative
# ("//host/...") or written host-first ("www....") are left untouched. Testing for
# the substring "www" alone would corrupt absolute links on other hosts and skip
# relative paths that merely contain "www".
#
# @param articles [Array<Array>] [title, link] pairs; the link is expected to be a String
# @return [Array<Array>] the same +articles+ array, with relative links rewritten
def self.check_msnbc_urls(articles)
  articles.each do |article|
    link = article[1]

    # Absolute ("scheme://...") or protocol-relative ("//...") link: nothing to fix.
    next if link =~ %r{\A(?:[a-z][a-z0-9+.\-]*:)?//}i
    # Host-first form without a scheme; left as-is.
    next if link.start_with?("www.")

    # Make sure exactly one "/" separates the host from a non-empty relative path.
    separator = (link.empty? || link.start_with?("/")) ? "" : "/"
    article[1] = "http://www.msnbc.com#{separator}#{link}"
  end
end

.check_reuters_urls(articles) ⇒ Object



36
37
38
39
40
41
42
43
44
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 36

# Turns site-relative Reuters links into absolute URLs, in place.
#
# Each entry of +articles+ is a two-element array whose second element is the
# link. A link is treated as relative only when it has no scheme and no host:
# links that are already absolute ("http://...", "https://..."), protocol-relative
# ("//host/...") or written host-first ("www....") are left untouched. Testing for
# the substring "www" alone would corrupt absolute links on other hosts and skip
# relative paths that merely contain "www".
#
# @param articles [Array<Array>] [title, link] pairs; the link is expected to be a String
# @return [Array<Array>] the same +articles+ array, with relative links rewritten
def self.check_reuters_urls(articles)
  articles.each do |article|
    link = article[1]

    # Absolute ("scheme://...") or protocol-relative ("//...") link: nothing to fix.
    next if link =~ %r{\A(?:[a-z][a-z0-9+.\-]*:)?//}i
    # Host-first form without a scheme; left as-is.
    next if link.start_with?("www.")

    # Make sure exactly one "/" separates the host from a non-empty relative path.
    separator = (link.empty? || link.start_with?("/")) ? "" : "/"
    article[1] = "https://www.reuters.com#{separator}#{link}"
  end
end

.fox_article(article) ⇒ Object



86
87
88
89
90
91
92
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 86

# Fills in a Fox News article from its own page.
#
# Fetches article.url through get_page, stores the parsed page on article.html,
# then copies the "content" of the description meta tag into article.summary and
# the "content" of the dc.date meta tag into article.date.
#
# @param article [#url, #html=, #html, #summary=, #date=] the article to populate
# @return [String] the date string that was assigned
def self.fox_article(article)
  article.html = get_page(article.url)
  page = article.html

  article.summary = page.css("meta[name='description']").attribute("content").value
  article.date = page.css("meta[name='dc.date']").attribute("content").value
end

.fox_homepage ⇒ Object

<<<<<<<<<<<<<<<<<<FOX SCRAPING METHODS>>>>>>>>>>>>>>>>>>>>>>>>>>>>>



62
63
64
65
66
67
68
69
70
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 62

# Scrapes the Fox News front page and registers what it finds.
#
# Prints a progress line, fetches the homepage with get_page, creates the
# "FOX NEWS" network via Network.create_with_url and stores the fetched page on
# its home_html, then creates one Article (Article.create_with_url) for every
# [headline, link] pair returned by scrape_fox_articles.
#
# @return [Array<Array>] the [headline, link] pairs from scrape_fox_articles
def self.fox_homepage
  puts "scraping Fox homepage"

  home_url = "http://www.foxnews.com"
  page = get_page(home_url)
  network = Network.create_with_url("FOX NEWS", home_url)
  network.home_html = page

  scrape_fox_articles.each do |headline, link|
    Article.create_with_url(headline, "FOX NEWS", link)
  end
end

.get_page(url) ⇒ Object



3
4
5
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 3

# Downloads the resource at +url+ and parses it as HTML with Nokogiri.
#
# Uses URI.open (provided by open-uri, which the previous Kernel#open call on a
# URL already depended on) instead of Kernel#open: on Ruby 3.0+ Kernel#open no
# longer fetches URLs, and it treats a string starting with "|" as a command to
# run. The block form closes the downloaded stream once parsing has finished.
#
# @param url [String] absolute URL of the page to fetch
# @return [Object] the document returned by Nokogiri::HTML
def self.get_page(url)
  URI.open(url) { |io| Nokogiri::HTML(io) }
end

.msnbc_article(article) ⇒ Object



130
131
132
133
134
135
136
137
138
139
140
141
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 130

# Fills in an MSNBC article from its own page.
#
# Fetches article.url through get_page, stores the parsed page on article.html and
# copies the "content" of the description meta tag into article.summary. The date
# comes from the nv:date meta tag when the page has one, otherwise from the
# DC.date.issued meta tag.
#
# @param article [#url, #html=, #html, #summary=, #date=] the article to populate
# @return [String] the date string that was assigned
def self.msnbc_article(article)
  article.html = get_page(article.url)
  page = article.html

  article.summary = page.css("meta[name='description']").attribute("content").value

  # Pick whichever date tag this page actually carries.
  date_selector =
    if page.css("meta[property='nv:date']")[0]
      "meta[property='nv:date']"
    else
      "meta[name = 'DC.date.issued']"
    end
  article.date = page.css(date_selector).attribute("content").value
end

.msnbc_homepage ⇒ Object

<<<<<<<<<<<<<<<MSNBC SCRAPING METHODS>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>



97
98
99
100
101
102
103
104
105
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 97

# Scrapes the MSNBC front page and registers what it finds.
#
# Prints a progress line, fetches the homepage with get_page, creates the
# "MSNBC" network via Network.create_with_url and stores the fetched page on its
# home_html, then creates one Article (Article.create_with_url) for every
# [headline, link] pair returned by scrape_msnbc_articles.
#
# @return [Array<Array>] the [headline, link] pairs from scrape_msnbc_articles
def self.msnbc_homepage
  puts "scraping MSNBC homepage"

  home_url = "http://www.msnbc.com"
  page = get_page(home_url)
  network = Network.create_with_url("MSNBC", home_url)
  network.home_html = page

  scrape_msnbc_articles.each do |headline, link|
    Article.create_with_url(headline, "MSNBC", link)
  end
end

.reuters_article(article) ⇒ Object



46
47
48
49
50
51
52
53
54
55
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 46

# Fills in a Reuters article from its own page.
#
# Fetches article.url through get_page, stores the parsed page on article.html,
# then copies the "content" of the description meta tag into article.summary and
# the "content" of the REVISION_DATE meta tag into article.date.
#
# @param article [#url, #html=, #html, #summary=, #date=] the article to populate
# @return [String] the date string that was assigned
def self.reuters_article(article)
  article.html = get_page(article.url)
  page = article.html

  article.summary = page.css("meta[name='description']").attribute("content").value
  article.date = page.css("meta[name='REVISION_DATE']").attribute("content").value

  # Author extraction is currently switched off:
  # article.authors = article.html.css("meta[name='Author']").attribute("content").value
end

.reuters_homepage ⇒ Object

<<<<<<<<<<<<<<<<<<REUTERS SCRAPING METHODS>>>>>>>>>>>>>>>>>>>>>>>>>>>>>



9
10
11
12
13
14
15
16
17
18
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 9

# Scrapes the Reuters front page and registers what it finds.
#
# Prints a progress line, fetches the homepage with get_page, creates the
# "REUTERS" network via Network.create_with_url and stores the fetched page on
# its home_html, then creates one Article (Article.create_with_url) for every
# [headline, link] pair returned by scrape_reuters_articles.
#
# @return [Array<Array>] the [headline, link] pairs from scrape_reuters_articles
def self.reuters_homepage
  puts "scraping Reuters homepage"

  home_url = "https://www.reuters.com"
  page = get_page(home_url)
  network = Network.create_with_url("REUTERS", home_url)
  network.home_html = page

  scrape_reuters_articles.each do |headline, link|
    Article.create_with_url(headline, "REUTERS", link)
  end
end

.scrape_fox_articles ⇒ Object



72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 72

# Pulls the three featured stories out of the stored Fox News homepage.
#
# Reads home_html from the network found by Network.find_by_name("FOX NEWS") and
# returns [headline, link] pairs for the spotlight story and the first two
# stories of the secondary section. Every link is the href attribute's String
# value; the leader previously returned the attribute node itself, unlike the
# other two entries.
#
# @return [Array<Array<String>>] [[headline, href], ...] for leader, second, third
def self.scrape_fox_articles
  html = Network.find_by_name("FOX NEWS").home_html

  leader_link = html.css("div.collection.collection-spotlight article.article.story-1 header a")
  second_link = html.css("div.main.main-secondary article.article.story-1 h2.title a")
  third_link = html.css("div.main.main-secondary article.article.story-2 h2.title a")

  [
    # Only the spotlight headline is stripped of surrounding whitespace.
    [leader_link.text.strip, leader_link.attribute("href").value],
    [second_link.text, second_link.attribute("href").value],
    [third_link.text, third_link.attribute("href").value]
  ]
end

.scrape_msnbc_articles ⇒ Object



107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 107

# Pulls the three featured stories out of the stored MSNBC homepage.
#
# Reads home_html from the network found by Network.find_by_name("MSNBC"). For
# homepage items 1 to 3 it pairs the slider title text with the href of the
# item's anchor, then passes the pairs through check_msnbc_urls before
# returning them.
#
# @return [Array<Array<String>>] [[headline, href], ...] for items 1, 2 and 3
def self.scrape_msnbc_articles
  html = Network.find_by_name("MSNBC").home_html

  articles = (1..3).map do |position|
    anchor = "a[data-fragment = '#homepage-item-#{position}']"
    headline = html.css("#{anchor} span.featured-slider-menu__item__link__title").text
    [headline, html.css(anchor).attribute("href").value]
  end

  check_msnbc_urls(articles)
  articles
end

.scrape_reuters_articles ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 21

# Pulls the three featured stories out of the stored Reuters homepage.
#
# Reads home_html from the network found by Network.find_by_name("REUTERS"). The
# leader comes from the "right-now" module; the second and third stories are the
# first two matches in the top-news section, with their headlines stripped of
# surrounding whitespace. The pairs are passed through check_reuters_urls before
# being returned.
#
# @return [Array<Array<String>>] [[headline, href], ...] for leader, second, third
def self.scrape_reuters_articles
  html = Network.find_by_name("REUTERS").home_html

  lead_link = html.css("section.right-now-module h2.story-title a")
  top_titles = html.css("section#hp-top-news-top article.story div.story-content a h3.story-title")
  top_links = html.css("section#hp-top-news-top article.story div.story-content a")

  articles = [
    [lead_link.text, lead_link.attribute("href").value],
    [top_titles.first.text.strip, top_links.first.attribute("href").value],
    [top_titles[1].text.strip, top_links[1].attribute("href").value]
  ]

  check_reuters_urls(articles)
  articles
end