Class: WebPageParser::NewYorkTimesPageParserV2

Inherits:
BaseParser
  • Object
show all
Defined in:
lib/web-page-parser/parsers/new_york_times_page_parser.rb

Overview

NewYorkTimesPageParserV2 parses New York Times web pages, including the new format change in January 2014.

Instance Attribute Summary

Attributes inherited from BaseParser

#guid, #url

Instance Method Summary collapse

Methods inherited from BaseParser

#hash, #initialize, #page, #retrieve_page

Constructor Details

This class inherits a constructor from WebPageParser::BaseParser

Instance Method Details

#content ⇒ Object



84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/web-page-parser/parsers/new_york_times_page_parser.rb', line 84

# Extracts the article body as a list of paragraphs.
#
# Paragraphs are looked up with the +p.story-content+ selector first; when
# that matches nothing, the +p[itemprop=articleBody]+ selector (the old page
# style) is used instead. The result is stored in +@content+ only once
# extraction has finished, so an exception raised while loading or walking
# the document does not leave an empty or partial array cached for later
# calls.
#
# @return [Array<String>] whitespace-stripped text of each matched paragraph;
#   empty when neither selector matches anything
def content
  return @content if @content

  paragraphs = html_doc.css('p.story-content')
  # old style
  paragraphs = html_doc.css('p[itemprop=articleBody]') if paragraphs.empty?
  @content = paragraphs.map { |paragraph| paragraph.text.strip }
end

#date ⇒ Object



98
99
100
101
102
103
104
# File 'lib/web-page-parser/parsers/new_york_times_page_parser.rb', line 98

# Publication date read from the page's +meta[name=dat]+ element.
#
# The element's +content+ attribute is handed to +DateTime.parse+. Any
# StandardError raised while reading or parsing it is swallowed and treated
# as "no date". Only a successfully parsed value short-circuits later calls;
# a nil result is looked up again on the next call.
#
# @return [DateTime, nil] the parsed date, or nil when the meta element is
#   missing or its content cannot be parsed
def date
  return @date if @date

  meta_tag = html_doc.at_css('meta[name=dat]')
  return @date unless meta_tag

  @date = begin
    DateTime.parse(meta_tag['content'])
  rescue StandardError
    nil
  end
end

#html_doc ⇒ Object



76
77
78
# File 'lib/web-page-parser/parsers/new_york_times_page_parser.rb', line 76

# Parsed document for this page, built on first use and cached in
# +@html_document+.
#
# @return [Object] the value returned by +Nokogiri::HTML+ for +page+; the
#   same object is returned on subsequent calls
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end

#title ⇒ Object



80
81
82
# File 'lib/web-page-parser/parsers/new_york_times_page_parser.rb', line 80

# Headline of the article, taken from +h1[itemprop=headline]+.
#
# The text of the matched element(s) is stripped of surrounding whitespace
# and cached in +@title+.
#
# @return [String] the stripped headline text
def title
  return @title if @title

  headline = html_doc.css('h1[itemprop=headline]')
  @title = headline.text.strip
end