Class: WebPageParser::BbcNewsPageParserV5

Inherits:
BaseParser
  • Object
show all
Defined in:
lib/web-page-parser/parsers/bbc_news_page_parser.rb

Instance Attribute Summary

Attributes inherited from BaseParser

#guid, #url

Instance Method Summary collapse

Methods inherited from BaseParser

#hash, #initialize, #page, #retrieve_page

Constructor Details

This class inherits a constructor from WebPageParser::BaseParser

Instance Method Details

#content ⇒ Object



168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# File 'lib/web-page-parser/parsers/bbc_news_page_parser.rb', line 168

# Extracts the article body as a list of text fragments.
#
# Looks up the story container with three selectors in turn
# ('div.story-body', then 'div#story-body', then 'td.storybody'), moving on
# whenever the previous match has no children. From the container's direct
# children it keeps the stripped text of every <p> element and of every
# <span> whose class attribute contains 'cross-head'.
#
# The result is memoized in @content, and only once extraction has finished,
# so a failure part-way through never leaves a partial list cached.
#
# @return [Array<String>] stripped text fragments in document order
def content
  return @content if @content

  story_body = html_doc.css('div.story-body')
  # for older bbc articles
  story_body = html_doc.css('div#story-body') if story_body.children.empty?
  # for very old bbc articles
  story_body = html_doc.css('td.storybody') if story_body.children.empty?

  @content = story_body.children.each_with_object([]) do |node, fragments|
    case node.name
    when 'p'
      fragments << node.text.strip
    when 'span'
      # A span without a class attribute yields nil here; to_s keeps the
      # substring check from raising NoMethodError on such nodes.
      fragments << node.text.strip if node['class'].to_s.include?('cross-head')
    end
  end
end

#date ⇒ Object



190
191
192
193
194
195
196
# File 'lib/web-page-parser/parsers/bbc_news_page_parser.rb', line 190

# Reads the publication date from the page's
# <meta name="OriginalPublicationDate"> tag.
#
# Parsing is best-effort: any StandardError raised while reading or parsing
# the tag's content results in nil. A successfully parsed value is cached in
# @date; a nil result is not cached, so the lookup is repeated on the next call.
#
# @return [DateTime, nil] the parsed date, or nil when the tag is missing or
#   its content cannot be parsed
def date
  return @date if @date

  meta_tag = html_doc.at_css('meta[name=OriginalPublicationDate]')
  if meta_tag
    @date = begin
      DateTime.parse(meta_tag['content'])
    rescue StandardError
      nil
    end
  end
  @date
end

#html_doc ⇒ Object



145
146
147
# File 'lib/web-page-parser/parsers/bbc_news_page_parser.rb', line 145

# Returns the page parsed by Nokogiri::HTML, parsing it on first use and
# reusing the stored document (@html_document) on later calls.
#
# @return [Object] whatever Nokogiri::HTML(page) returns
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end

#title ⇒ Object



149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# File 'lib/web-page-parser/parsers/bbc_news_page_parser.rb', line 149

# Extracts the article headline.
#
# Sources are tried in order, moving on while the result is empty:
#   1. text of 'h1.story-header'
#   2. text of 'div#meta-information h1' (older bbc articles)
#   3. content attribute of <meta name="Headline"> (very old bbc articles)
#
# The value is cached in @title; an empty string is cached as well.
#
# @return [String] the stripped headline, or "" when no source yields one
def title
  return @title if @title

  @title = html_doc.css('h1.story-header').text.strip
  @title = html_doc.css('div#meta-information h1').text.strip if @title.empty?
  return @title unless @title.empty?

  meta_tag = html_doc.at_css('meta[name=Headline]')
  @title = meta_tag['content'].to_s.strip if meta_tag
  @title
end