Class: WebPageParser::NewYorkTimesPageParserV2

Inherits:

BaseParser

Object
BaseParser
WebPageParser::NewYorkTimesPageParserV2

show all

Defined in: lib/web-page-parser/parsers/new_york_times_page_parser.rb

Overview

NewYorkTimesPageParserV2 parses New York Times web pages, including the new page format introduced in January 2014.

Instance Attribute Summary

Attributes inherited from BaseParser

#guid, #url

Instance Method Summary collapse

Methods inherited from BaseParser

#hash, #initialize, #page, #retrieve_page

Constructor Details

This class inherits a constructor from WebPageParser::BaseParser

Instance Method Details

#content ⇒ `Object`

# File 'lib/web-page-parser/parsers/new_york_times_page_parser.rb', line 84

# Paragraphs of the article body, one stripped string per paragraph.
#
# Looks for the newer markup first (<tt>p.story-content</tt>) and falls back
# to the old-style markup (<tt>p[itemprop=articleBody]</tt>) when the first
# selector matches nothing.
#
# The result is cached in @content only after extraction succeeds, so a
# failure while reading the document does not leave an empty array cached
# for later calls.
#
# @return [Array<String>] paragraph texts; empty when neither selector matches
def content
  return @content if @content

  paragraphs = html_doc.css('p.story-content')
  # old style
  paragraphs = html_doc.css('p[itemprop=articleBody]') if paragraphs.empty?

  @content = paragraphs.map { |paragraph| paragraph.text.strip }
end

#date ⇒ `Object`

# File 'lib/web-page-parser/parsers/new_york_times_page_parser.rb', line 98

# Publication date of the article, read from the content attribute of the
# page's <tt>meta[name=dat]</tt> element.
#
# A successfully parsed date is cached in @date. Only parse failures are
# treated as "no date"; other errors are allowed to propagate instead of
# being silently swallowed.
#
# @return [DateTime, nil] the parsed date, or nil when the meta element is
#   absent, has no content attribute, or holds text DateTime cannot parse
def date
  return @date if @date

  date_meta = html_doc.at_css('meta[name=dat]')
  return unless date_meta

  begin
    @date = DateTime.parse(date_meta['content'])
  rescue ArgumentError, TypeError
    # ArgumentError (including Date::Error): the text is not a parseable date.
    # TypeError: the content attribute is missing, so nil was passed to parse.
    @date = nil
  end
end

#html_doc ⇒ `Object`



76
77
78

# File 'lib/web-page-parser/parsers/new_york_times_page_parser.rb', line 76

# The page parsed by Nokogiri::HTML, built on first use from +page+ and
# cached in @html_document for subsequent calls.
#
# @return [Object] the value returned by Nokogiri::HTML(page)
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end

#title ⇒ `Object`



80
81
82

# File 'lib/web-page-parser/parsers/new_york_times_page_parser.rb', line 80

# Headline of the article: the text of the document's
# <tt>h1[itemprop=headline]</tt> match, with surrounding whitespace removed.
# The value is cached in @title after the first lookup.
#
# @return [String] the stripped headline text
def title
  return @title if @title

  headline = html_doc.css('h1[itemprop=headline]')
  @title = headline.text.strip
end

Class: WebPageParser::NewYorkTimesPageParserV2

Overview

Instance Attribute Summary

Attributes inherited from BaseParser

Instance Method Summary collapse

Methods inherited from BaseParser

Constructor Details

Instance Method Details

#content ⇒ Object

#date ⇒ Object

#html_doc ⇒ Object

#title ⇒ Object

#content ⇒ `Object`

#date ⇒ `Object`

#html_doc ⇒ `Object`

#title ⇒ `Object`