Class: Parser::NewsFr

Inherits:
XmlBase show all
Defined in:
lib/fly_parser/sources/news-fr.rb

Instance Method Summary collapse

Methods inherited from XmlBase

#copyright, #fake_url, #tags

Constructor Details

#initialize(source, options = {}) ⇒ NewsFr

Returns a new instance of NewsFr.



3
4
5
# File 'lib/fly_parser/sources/news-fr.rb', line 3

# Builds a NewsFr parser. No state is set up here; both arguments are
# forwarded unchanged to the parent class initializer.
#
# @param source forwarded as-is to the parent initializer
# @param options [Hash] forwarded as-is to the parent initializer
def initialize(source, options = {})
  super(source, options)
end

Instance Method Details

#parse_all ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/fly_parser/sources/news-fr.rb', line 7

# Builds one article hash for every <item> found in @source.
#
# For each item the linked article page is downloaded and scraped for its
# poster image and body markup. An item is skipped (never raised on) when it
# has no link text, when the page cannot be fetched or parsed, or when no
# poster image with a `src` is found inside `.article-long`.
#
# @return [Array<Hash>] one hash per usable item with the keys
#   :title (item title text), :content (cleaned article HTML followed by a
#   source credit built from @copyright) and :poster_image (the image `src`)
def parse_all
  @source.search('//item').map do |item|
    title = item.xpath('title/text()').text

    # The URL is taken from the text node that follows <link>, not from inside
    # it -- presumably because the feed is parsed with an HTML parser that
    # treats <link> as an empty element (TODO confirm against XmlBase).
    link_node = item.xpath('link/following-sibling::text()[1]').first
    next if link_node.nil?

    begin
      # URI#open (from open-uri) is used instead of Kernel#open so that feed
      # text starting with "|" can never be run as a shell command; the block
      # form closes the downloaded stream once the page is parsed.
      page = URI.parse(link_node.content.strip).open { |io| Nokogiri::HTML(io) }
    rescue StandardError => e
      # Best effort: report the failure and move on to the next item.
      puts e.message
      next
    end

    # Guard and lookup use the same selector, so a page whose only figure
    # sits outside .article-long is skipped instead of raising on nil.
    poster = page.search('.article-long figure.img img').first
    poster_image = poster && poster['src']
    next if poster_image.nil?

    full_desc = page.search('.article-long .bd')
    # Strip page furniture that must not be republished with the article.
    ['.modification', 'script', '.ft', '.twitter-tweet'].each do |selector|
      full_desc.search(selector).remove
    end
    # Keep anchor text but drop the outgoing links.
    full_desc.search('a').remove_attr('href')

    # Demote h1-h3 to h4. Only opening/closing tag names are rewritten, so
    # article text or URLs that merely contain "h1"/"h2"/"h3" stay intact.
    desc = full_desc.inner_html.gsub(%r{<(/?)h[1-3]\b}, '<\1h4')

    copyright = "<p>Source: <a href='#{@copyright[:url]}'>#{@copyright[:title]}</a></p>"
    { title: title, content: "#{desc}#{copyright}", poster_image: poster_image }
  end.compact
end