Class: Parser::NewsFr
- Defined in: lib/fly_parser/sources/news-fr.rb
Instance Method Summary collapse
- #initialize(source, options = {}) ⇒ NewsFr (constructor)
  A new instance of NewsFr.
- #parse_all ⇒ Object
Methods inherited from XmlBase
Constructor Details
#initialize(source, options = {}) ⇒ NewsFr
Returns a new instance of NewsFr.
3 4 5 |
# File 'lib/fly_parser/sources/news-fr.rb', line 3

# Creates a NewsFr parser.
#
# Bare `super` forwards both arguments unchanged to the superclass
# constructor; this class adds no initialization of its own.
# NOTE(review): the superclass is presumably XmlBase (listed under
# "Methods inherited from") -- confirm what it does with `options`.
#
# @param source  passed through to the superclass constructor
# @param options [Hash] passed through to the superclass constructor
def initialize(source, options = {})
  super
end
Instance Method Details
#parse_all ⇒ Object
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
# File 'lib/fly_parser/sources/news-fr.rb', line 7

# Builds one article hash for every feed <item> whose linked page can be
# fetched and which carries a poster image inside `.article-long`.
#
# For each item the linked page is downloaded, the article body is stripped
# of boilerplate nodes and link targets, h1-h3 headings are demoted to h4,
# and a source-attribution paragraph built from @copyright is appended.
#
# Items are skipped (not raised on) when the page cannot be loaded -- the
# error message is printed to stdout -- or when no poster image is found.
#
# @return [Array<Hash>] hashes with :title, :content and :poster_image keys
def parse_all
  items = @source.search('//item')

  # NOTE: filtering items by publication date was already disabled here; the
  # per-item `pubdate` lookup that only fed that filter has been dropped.

  articles = items.map do |item|
    title = item.xpath('title/text()').text

    # The URL is taken from the text node that follows <link>, not from the
    # element's own content.
    # NOTE(review): presumably the feed is parsed with an HTML parser, where
    # <link> is a void element -- confirm against the superclass.
    link = item.xpath('link/following-sibling::text()[1]').first

    begin
      # Fetch through URI#open (open-uri) rather than Kernel#open: the string
      # comes from the remote feed, and Kernel#open would spawn a subprocess
      # for a value starting with "|". A missing or non-HTTP link raises here
      # and is handled like any other fetch failure.
      page = Nokogiri::HTML(URI.parse(link.to_s.strip).open)
    rescue StandardError => e
      puts e.message
      next
    end

    # Guard on the exact node that is read, so a page whose only figure image
    # sits outside .article-long (or lacks a src) is skipped, not a crash.
    poster = page.search('.article-long figure.img img').first
    poster_image = poster && poster['src']
    next if poster_image.nil?

    full_desc = page.search('.article-long .bd')
    ['.modification', 'script', '.ft'].each do |selector|
      full_desc.search(selector).remove
    end
    full_desc.search('a').remove_attr('href')
    full_desc.search('.twitter-tweet').remove

    # Demote headings by rewriting tag names only; a plain /h1|h2|h3/ swap
    # would also mangle body text and attribute values containing "h2" etc.
    desc = full_desc.inner_html.gsub(%r{<(/?)h[1-3]\b}, '<\1h4')

    copyright = "<p>Source: <a href='#{@copyright[:url]}'>#{@copyright[:title]}</a></p>"
    content = desc + copyright

    { title: title, content: content, poster_image: poster_image }
  end

  # Skipped items yielded nil from the block above.
  articles.compact
end