Class: WebPageParser::BbcNewsPageParserV5
- Inherits:
-
BaseParser
- Object
- BaseParser
- WebPageParser::BbcNewsPageParserV5
show all
- Defined in:
- lib/web-page-parser/parsers/bbc_news_page_parser.rb
Instance Attribute Summary
Attributes inherited from BaseParser
#guid, #url
Instance Method Summary
collapse
Methods inherited from BaseParser
#hash, #initialize, #page, #retrieve_page
Instance Method Details
#content ⇒ Object
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
|
# File 'lib/web-page-parser/parsers/bbc_news_page_parser.rb', line 168
# Extracts the story text as a list of stripped strings.
#
# The story container is looked up with a series of CSS selectors; the first
# one whose matches have any child nodes is used. From its direct children,
# every <p> contributes its text, and every <span> whose class attribute
# contains 'cross-head' contributes its text. All other nodes are ignored.
#
# The result is memoized in @content once extraction has fully succeeded.
#
# @return [Array<String>] paragraph and cross-head texts in document order;
#   empty when no selector matches anything with children
def content
  return @content if @content

  story_body = nil
  ['div.story-body', 'div#story-body', 'td.storybody'].each do |selector|
    story_body = html_doc.css(selector)
    break unless story_body.children.empty?
  end

  # Collect into a local so a failure part-way through never leaves a
  # half-built array memoized in @content.
  paragraphs = []
  story_body.children.each do |node|
    case node.name
    when 'p'
      paragraphs << node.text.strip
    when 'span'
      # A span without a class attribute yields nil here; to_s keeps the
      # substring check from raising NoMethodError on such spans.
      paragraphs << node.text.strip if node['class'].to_s.include?('cross-head')
    end
  end
  @content = paragraphs
end
|
#date ⇒ Object
190
191
192
193
194
195
196
|
# File 'lib/web-page-parser/parsers/bbc_news_page_parser.rb', line 190
# Reads the publication date from the OriginalPublicationDate meta tag.
#
# A successfully parsed value is memoized in @date. A missing tag, or a tag
# whose content cannot be parsed as a date, yields nil.
#
# @return [DateTime, nil] the parsed publication date, or nil when unavailable
def date
  return @date if @date

  date_meta = html_doc.at_css('meta[name=OriginalPublicationDate]')
  return unless date_meta

  @date = begin
    # to_s turns a missing content attribute into '', which DateTime.parse
    # rejects with the same ArgumentError as any other unparseable value.
    DateTime.parse(date_meta['content'].to_s)
  rescue ArgumentError
    # Only parse failures are treated as "no date"; other errors propagate
    # instead of being silently hidden.
    nil
  end
end
|
#html_doc ⇒ Object
145
146
147
|
# File 'lib/web-page-parser/parsers/bbc_news_page_parser.rb', line 145
# Returns the page parsed by Nokogiri::HTML, parsing it on first use and
# reusing the cached document (@html_document) on later calls.
#
# @return [Object] whatever Nokogiri::HTML returns for the page source
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end
|
#title ⇒ Object
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
|
# File 'lib/web-page-parser/parsers/bbc_news_page_parser.rb', line 149
# Determines the story headline, trying progressively older page layouts.
#
# The heading selectors are tried in order and the first non-empty stripped
# text wins. If both are empty, the content of the Headline meta tag is used
# when that tag exists. The result is memoized in @title.
#
# @return [String] the stripped headline; empty when nothing matched
def title
  return @title if @title

  ['h1.story-header', 'div#meta-information h1'].each do |selector|
    @title = html_doc.css(selector).text.strip
    return @title unless @title.empty?
  end

  # Neither heading produced text; fall back to the meta tag if present.
  headline_meta = html_doc.at_css('meta[name=Headline]')
  @title = headline_meta['content'].to_s.strip if headline_meta
  @title
end
|