Class: Scraper
- Inherits:
-
Object
- Object
- Scraper
- Defined in:
- lib/CLI_Headline_Scraper/Scraper.rb
Class Method Summary collapse
- .check_msnbc_urls(articles) ⇒ Object
- .check_reuters_urls(articles) ⇒ Object
- .fox_article(article) ⇒ Object
-
.fox_homepage ⇒ Object
<<<<<<<<<<<<<<<<<<FOX SCRAPING METHODS>>>>>>>>>>>>>>>>>>>>>>>>>>>>>.
- .get_page(url) ⇒ Object
- .msnbc_article(article) ⇒ Object
-
.msnbc_homepage ⇒ Object
<<<<<<<<<<<<<<<MSNBC SCRAPING METHODS>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>.
- .reuters_article(article) ⇒ Object
-
.reuters_homepage ⇒ Object
<<<<<<<<<<<<<<<<<<REUTERS SCRAPING METHODS>>>>>>>>>>>>>>>>>>>>>>>>>>>>>.
- .scrape_fox_articles ⇒ Object
- .scrape_msnbc_articles ⇒ Object
- .scrape_reuters_articles ⇒ Object
Class Method Details
.check_msnbc_urls(articles) ⇒ Object
120 121 122 123 124 125 126 127 128 |
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 120
# Rewrites site-relative MSNBC links into absolute URLs, in place.
#
# A link is left untouched when it already names a host: it starts with a
# scheme ("http://", "https://", ...), with "//", or with "www.". Any other
# link is treated as a path and prefixed with "http://www.msnbc.com".
#
# @param articles [Array<Array>] pairs whose element at index 1 is the link String
# @return [Array<Array>] the same +articles+ array that was passed in
def self.check_msnbc_urls(articles)
  articles.each do |article|
    link = article[1]
    # Look at how the link *starts* rather than searching for "www" anywhere:
    # absolute URLs on hosts without "www" must not be prefixed, and relative
    # paths that merely contain "www" still need the prefix.
    next if (link =~ %r{\A(?:[a-z][a-z0-9+.-]*:)?//}i) || link.start_with?("www.")

    article[1] = "http://www.msnbc.com#{link}"
  end
end
.check_reuters_urls(articles) ⇒ Object
36 37 38 39 40 41 42 43 44 |
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 36
# Rewrites site-relative Reuters links into absolute URLs, in place.
#
# A link is left untouched when it already names a host: it starts with a
# scheme ("http://", "https://", ...), with "//", or with "www.". Any other
# link is treated as a path and prefixed with "https://www.reuters.com".
#
# @param articles [Array<Array>] pairs whose element at index 1 is the link String
# @return [Array<Array>] the same +articles+ array that was passed in
def self.check_reuters_urls(articles)
  articles.each do |article|
    link = article[1]
    # Look at how the link *starts* rather than searching for "www" anywhere:
    # absolute URLs on hosts without "www" must not be prefixed, and relative
    # paths that merely contain "www" still need the prefix.
    next if (link =~ %r{\A(?:[a-z][a-z0-9+.-]*:)?//}i) || link.start_with?("www.")

    article[1] = "https://www.reuters.com#{link}"
  end
end
.fox_article(article) ⇒ Object
86 87 88 89 90 91 92 |
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 86
# Fetches the page at +article.url+ and fills in the article's html, summary
# and date from it.
#
# The summary comes from the "description" meta tag and the date from the
# "dc.date" meta tag; both are read from the tag's "content" attribute.
#
# @param article [Object] must respond to #url, #html, #html=, #summary= and #date=
# @return [Object] the value assigned to +article.date+
def self.fox_article(article)
  article.html = get_page(article.url)
  page = article.html

  summary_tag = page.css("meta[name='description']")
  article.summary = summary_tag.attribute("content").value

  date_tag = page.css("meta[name='dc.date']")
  article.date = date_tag.attribute("content").value
end
.fox_homepage ⇒ Object
<<<<<<<<<<<<<<<<<<FOX SCRAPING METHODS>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
62 63 64 65 66 67 68 69 70 |
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 62
# Fetches the Fox News front page, registers a "FOX NEWS" Network holding
# that page, and creates one Article per entry from .scrape_fox_articles.
#
# @return [Array<Array>] the [headline, link] pairs from .scrape_fox_articles
def self.fox_homepage
  puts "scraping Fox homepage"
  home_url = "http://www.foxnews.com"

  # Fetch before creating the Network so a failed download registers nothing.
  page = get_page(home_url)
  Network.create_with_url("FOX NEWS", home_url).home_html = page

  scrape_fox_articles.each do |headline, link|
    Article.create_with_url(headline, "FOX NEWS", link)
  end
end
.get_page(url) ⇒ Object
3 4 5 |
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 3
# Downloads +url+ and parses the response body as HTML.
#
# The URL is opened through its parsed URI object instead of Kernel#open, so
# a string such as "| some command" or a local file path is never executed
# or read. The block form closes the downloaded stream once parsing is done.
#
# @param url [String] absolute URL of the page to fetch
# @return [Nokogiri::HTML::Document] the parsed page
def self.get_page(url)
  URI.parse(url).open { |stream| Nokogiri::HTML(stream) }
end
.msnbc_article(article) ⇒ Object
130 131 132 133 134 135 136 137 138 139 140 141 |
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 130
# Fetches the page at +article.url+ and fills in the article's html, summary
# and date from it.
#
# The summary comes from the "description" meta tag. The date comes from the
# "nv:date" meta property when the page has one, otherwise from the
# "DC.date.issued" meta tag.
#
# @param article [Object] must respond to #url, #html, #html=, #summary= and #date=
# @return [Object] the value assigned to +article.date+
def self.msnbc_article(article)
  article.html = get_page(article.url)
  page = article.html

  summary_tag = page.css("meta[name='description']")
  article.summary = summary_tag.attribute("content").value

  date_tag = page.css("meta[property='nv:date']")
  # Fall back to the second date tag only when the first matched nothing.
  date_tag = page.css("meta[name = 'DC.date.issued']") unless date_tag[0]
  article.date = date_tag.attribute("content").value
end
.msnbc_homepage ⇒ Object
<<<<<<<<<<<<<<<MSNBC SCRAPING METHODS>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
97 98 99 100 101 102 103 104 105 |
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 97
# Fetches the MSNBC front page, registers an "MSNBC" Network holding that
# page, and creates one Article per entry from .scrape_msnbc_articles.
#
# @return [Array<Array>] the [headline, link] pairs from .scrape_msnbc_articles
def self.msnbc_homepage
  puts "scraping MSNBC homepage"
  home_url = "http://www.msnbc.com"

  # Fetch before creating the Network so a failed download registers nothing.
  page = get_page(home_url)
  Network.create_with_url("MSNBC", home_url).home_html = page

  scrape_msnbc_articles.each do |headline, link|
    Article.create_with_url(headline, "MSNBC", link)
  end
end
.reuters_article(article) ⇒ Object
46 47 48 49 50 51 52 53 54 55 |
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 46
# Fetches the page at +article.url+ and fills in the article's html, summary
# and date from it.
#
# The summary comes from the "description" meta tag and the date from the
# "REVISION_DATE" meta tag; both are read from the tag's "content" attribute.
#
# @param article [Object] must respond to #url, #html, #html=, #summary= and #date=
# @return [Object] the value assigned to +article.date+
def self.reuters_article(article)
  article.html = get_page(article.url)
  page = article.html

  summary_tag = page.css("meta[name='description']")
  article.summary = summary_tag.attribute("content").value

  date_tag = page.css("meta[name='REVISION_DATE']")
  article.date = date_tag.attribute("content").value

  # article.authors = article.html.css("meta[name='Author']").attribute("content").value
end
.reuters_homepage ⇒ Object
<<<<<<<<<<<<<<<<<<REUTERS SCRAPING METHODS>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
9 10 11 12 13 14 15 16 17 18 |
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 9
# Fetches the Reuters front page, registers a "REUTERS" Network holding that
# page, and creates one Article per entry from .scrape_reuters_articles.
#
# @return [Array<Array>] the [headline, link] pairs from .scrape_reuters_articles
def self.reuters_homepage
  puts "scraping Reuters homepage"
  home_url = "https://www.reuters.com"

  # Fetch before creating the Network so a failed download registers nothing.
  page = get_page(home_url)
  Network.create_with_url("REUTERS", home_url).home_html = page

  scrape_reuters_articles.each do |headline, link|
    Article.create_with_url(headline, "REUTERS", link)
  end
end
.scrape_fox_articles ⇒ Object
72 73 74 75 76 77 78 79 80 81 82 83 |
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 72
# Extracts the three headline stories from the stored Fox News front page.
#
# Reads the page from the "FOX NEWS" Network's +home_html+. The leader comes
# from the spotlight collection (headline stripped of surrounding
# whitespace); the second and third come from the "story-1" and "story-2"
# articles of the secondary main section.
#
# @return [Array<Array<String>>] three [headline, href] pairs, leader first
def self.scrape_fox_articles
  html = Network.find_by_name("FOX NEWS").home_html

  lead_link = html.css("div.collection.collection-spotlight article.article.story-1 header a")
  # Take the attribute's String value, matching the other two entries; the
  # bare attribute object is not a String.
  leader = [lead_link.text.strip, lead_link.attribute("href").value]

  secondary = %w[story-1 story-2].map do |slot|
    link = html.css("div.main.main-secondary article.article.#{slot} h2.title a")
    [link.text, link.attribute("href").value]
  end

  [leader, *secondary]
end
.scrape_msnbc_articles ⇒ Object
107 108 109 110 111 112 113 114 115 116 117 118 |
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 107
# Extracts the three featured stories from the stored MSNBC front page.
#
# Reads the page from the "MSNBC" Network's +home_html+ and takes the anchors
# whose data-fragment is "#homepage-item-1" through "#homepage-item-3". The
# links are then passed through .check_msnbc_urls.
#
# @return [Array<Array<String>>] three [headline, href] pairs in slot order
def self.scrape_msnbc_articles
  page = Network.find_by_name("MSNBC").home_html

  articles = (1..3).map do |slot|
    anchor = "a[data-fragment = '#homepage-item-#{slot}']"
    headline = page.css("#{anchor} span.featured-slider-menu__item__link__title").text
    [headline, page.css(anchor).attribute("href").value]
  end

  check_msnbc_urls(articles)
  articles
end
.scrape_reuters_articles ⇒ Object
21 22 23 24 25 26 27 28 29 30 31 32 33 |
# File 'lib/CLI_Headline_Scraper/Scraper.rb', line 21
# Extracts the three headline stories from the stored Reuters front page.
#
# Reads the page from the "REUTERS" Network's +home_html+. The leader comes
# from the "right-now-module" section; the second and third are the first
# two matches in the "hp-top-news-top" section, with headlines stripped of
# surrounding whitespace. The links are then passed through
# .check_reuters_urls.
#
# @return [Array<Array<String>>] three [headline, href] pairs, leader first
def self.scrape_reuters_articles
  page = Network.find_by_name("REUTERS").home_html

  lead_link = page.css("section.right-now-module h2.story-title a")
  articles = [[lead_link.text, lead_link.attribute("href").value]]

  titles = page.css("section#hp-top-news-top article.story div.story-content a h3.story-title")
  links = page.css("section#hp-top-news-top article.story div.story-content a")
  [0, 1].each do |pos|
    articles << [titles[pos].text.strip, links[pos].attribute("href").value]
  end

  check_reuters_urls(articles)
  articles
end