Class: WebPageParser::BbcNewsPageParserV5

Inherits:

BaseParser

Object
BaseParser
WebPageParser::BbcNewsPageParserV5

show all

Defined in: lib/web-page-parser/parsers/bbc_news_page_parser.rb

Instance Attribute Summary

Attributes inherited from BaseParser

#guid, #url

Instance Method Summary collapse

Methods inherited from BaseParser

#hash, #initialize, #page, #retrieve_page

Constructor Details

This class inherits a constructor from WebPageParser::BaseParser

Instance Method Details

#content ⇒ `Object`

# File 'lib/web-page-parser/parsers/bbc_news_page_parser.rb', line 168

# Extracts the story text from the parsed page.
#
# The story container is looked up with three selectors in turn
# ('div.story-body', then 'div#story-body', then 'td.storybody'); a later
# selector is only tried when the previous match has no children.
#
# Only direct children of the container are considered: every <p> element,
# and every <span> element whose class attribute contains 'cross-head'.
#
# @return [Array<String>] stripped text of each matching child, in document
#   order; empty when nothing matches. The result is memoized in @content.
def content
  return @content if @content

  story_body = html_doc.css('div.story-body')
  # for older bbc articles
  story_body = html_doc.css('div#story-body') if story_body.children.empty?
  # for very old bbc articles
  story_body = html_doc.css('td.storybody') if story_body.children.empty?

  # Collect into a local first so that a failure part-way through does not
  # leave a half-built array memoized in @content.
  paragraphs = story_body.children.each_with_object([]) do |node, collected|
    case node.name
    when 'p'
      collected << node.text.strip
    when 'span'
      # node['class'] is nil for a span without a class attribute, so coerce
      # to a string before testing for the cross-head marker.
      collected << node.text.strip if node['class'].to_s.include?('cross-head')
    end
  end

  @content = paragraphs
end

#date ⇒ `Object`

# File 'lib/web-page-parser/parsers/bbc_news_page_parser.rb', line 190

# Publication date taken from the page's OriginalPublicationDate meta tag.
#
# The tag's content attribute is handed to DateTime.parse; any StandardError
# raised while parsing is swallowed and treated as "no date".
#
# @return [DateTime, nil] the parsed date, or nil when the meta tag is
#   missing or its content cannot be parsed. Only a successfully parsed
#   value is reused on later calls.
def date
  return @date if @date

  meta_tag = html_doc.at_css('meta[name=OriginalPublicationDate]')
  if meta_tag
    @date = begin
      DateTime.parse(meta_tag['content'])
    rescue StandardError
      nil
    end
  end
  @date
end

#html_doc ⇒ `Object`



145
146
147

# File 'lib/web-page-parser/parsers/bbc_news_page_parser.rb', line 145

# Parsed representation of the page, built once and then reused.
#
# @return [Object] the result of Nokogiri::HTML(page), cached in
#   @html_document after the first call.
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end

#title ⇒ `Object`

# File 'lib/web-page-parser/parsers/bbc_news_page_parser.rb', line 149

# Headline of the article.
#
# Tries the 'h1.story-header' element first, then (for older bbc articles)
# 'div#meta-information h1'. If both yield empty text, falls back (for very
# old bbc articles) to the content attribute of the Headline meta tag.
#
# @return [String] the stripped headline; an empty string when no source
#   provides one. The result is memoized in @title.
def title
  return @title if @title

  ['h1.story-header', 'div#meta-information h1'].each do |selector|
    @title = html_doc.css(selector).text.strip
    return @title unless @title.empty?
  end

  headline_meta = html_doc.at_css('meta[name=Headline]')
  @title = headline_meta['content'].to_s.strip if headline_meta
  @title
end

Class: WebPageParser::BbcNewsPageParserV5

Instance Attribute Summary

Attributes inherited from BaseParser

Instance Method Summary collapse

Methods inherited from BaseParser

Constructor Details

Instance Method Details

#content ⇒ Object

#date ⇒ Object

#html_doc ⇒ Object

#title ⇒ Object

#content ⇒ `Object`

#date ⇒ `Object`

#html_doc ⇒ `Object`

#title ⇒ `Object`