Class: Feedbase::FetchFeed

Inherits:
Object
  • Object
show all
Defined in:
lib/feedbase/fetch_feed.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(feed_url) ⇒ FetchFeed

Returns a new instance of FetchFeed.



10
11
12
# File 'lib/feedbase/fetch_feed.rb', line 10

# Creates a fetcher for a single feed.
#
# The address is stored as given in @feed_url; no validation or
# normalization happens here.
#
# @param feed_url [String] address of the feed (presumably a String, since
#   #fetch passes it through #fix_url, which matches it against a regexp and
#   concatenates it with "http://")
def initialize(feed_url)
  @feed_url = feed_url
end

Instance Attribute Details

#feed_url ⇒ Object

Returns the value of attribute feed_url.



8
9
10
# File 'lib/feedbase/fetch_feed.rb', line 8

# Reader for the feed address held in @feed_url.
#
# @return [Object] the current value of @feed_url (the value passed to the
#   constructor, unless something else has reassigned the instance variable)
def feed_url
  @feed_url
end

Instance Method Details

#fetch ⇒ Object



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/feedbase/fetch_feed.rb', line 38

# Downloads the feed, converts the body to UTF-8 and parses it.
#
# The response headers come from #headers (which raises when the response is
# not OK); the body is then downloaded with curl, following redirects.
#
# @return [Hash] with the keys
#   :feed_params     - { :feed_url, :title, :web_url }. :feed_url is the last
#                      Location header found in the response headers, or the
#                      original feed_url when no redirect happened.
#   :items           - the :items entry of the FeedParser result
#   :download_params - the #headers hash plus :download_time, the elapsed
#                      wall-clock time in seconds
# @raise [Timeout::Error] if the header lookup plus download exceed 20 seconds
def fetch
  url = fix_url(feed_url)
  agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.3) Gecko/2008092416 Firefox/3.0.3"

  started_at = Time.now
  body = Timeout::timeout(20) do
    # Populate the memoized headers first so a bad response aborts before the
    # body is downloaded.
    headers
    # Argument-array form of popen: no shell is involved, so quotes or other
    # shell metacharacters in the URL cannot inject commands. `--url` keeps a
    # value that starts with "-" from being read as a curl option.
    IO.popen(['curl', '-sL', '-A', agent, '--url', url]) { |io| io.read }
  end
  elapsed = Time.now - started_at

  # The redirect target must NOT be assigned to a local named `feed_url`:
  # that would shadow the reader method and evaluate to nil whenever no
  # redirect occurred. Header names are matched case-insensitively because
  # HTTP/2 responses report them in lowercase.
  redirects = headers[:headers].scan(/^Location: (.*)$/i).flatten
  #puts "Redirected to #{redirects.last}"
  final_url = redirects.empty? ? feed_url : redirects.last

  utf8_body = Iconv.conv("UTF-8//TRANSLIT//IGNORE", (headers[:encoding] || 'iso-8859-1'), body)
  parsed = FeedParser.new(utf8_body).result
  feed_params = {:feed_url => final_url, :title => parsed[:title], :web_url => parsed[:link]}

  { feed_params: feed_params,
    items: parsed[:items],
    download_params: headers.merge(download_time: elapsed) }
end

#fix_url(url) ⇒ Object



66
67
68
69
70
71
# File 'lib/feedbase/fetch_feed.rb', line 66

# Ensures +url+ carries an explicit http/https scheme.
#
# The scheme check is anchored to the very start of the string (\A, not ^,
# which would also match at the start of any later line) and ignores case,
# because URI schemes are case-insensitive.
#
# @param url [String] address with or without a scheme
# @return [String] +url+ itself when it already starts with "http://" or
#   "https://"; otherwise a new string prefixed with "http://"
def fix_url(url)
  return url if url =~ /\Ahttps?:\/\//i

  "http://" + url
end

#headers ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/feedbase/fetch_feed.rb', line 14

# Fetches and parses the HTTP response headers of the feed.
#
# Issues a HEAD request through curl, following redirects. The result is
# memoized in @headers, so the request is made at most once per instance.
#
# @return [Hash] with the keys
#   :headers       - raw header text of every response in the redirect chain,
#                    with CRLF normalized to LF
#   :encoding      - charset from the final response's Content-Type, or nil
#   :etag          - ETag of the final response, or nil
#   :last_modified - DateTime parsed from the final response's Last-Modified
#                    header, or nil
# @raise [RuntimeError] "Response not OK" unless the final response is a 200
# @raise [Timeout::Error] if curl takes longer than 20 seconds
def headers
  return @headers if @headers

  agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.3) Gecko/2008092416 Firefox/3.0.3"
  raw = Timeout::timeout(20) do
    # get headers and any redirects
    # Argument-array form of popen: no shell is involved, so the URL cannot
    # inject commands. fix_url keeps the address consistent with #fetch.
    IO.popen(['curl', '-sIL', '-A', agent, '--url', fix_url(feed_url)]) { |io| io.read }.gsub("\r\n", "\n")
  end

  # With -L curl prints one header block per hop, separated by blank lines.
  # Only the last block describes the document that is actually served.
  final = raw.split(/\n{2,}/).last.to_s

  # Match on the status code only: HTTP/2 status lines ("HTTP/2 200") carry
  # no "OK" reason phrase.
  if final !~ /\AHTTP\/[\d.]+ 200\b/
    warn raw.inspect # diagnostics belong on stderr, not stdout
    raise "Response not OK"
  end

  #TODO check for xml
  # Header names are matched case-insensitively (HTTP/2 reports them in
  # lowercase); the charset capture drops optional quotes and any trailing
  # parameters.
  last_modified = final[/^Last-Modified: (.*)$/i, 1]
  @headers = { headers: raw,
    encoding: final[/^Content-Type:.*charset="?([^\s;"]+)/i, 1],
    etag: final[/^ETag: (.*)$/i, 1],
    last_modified: last_modified && DateTime.parse(last_modified) }
end