Module: Feedisco::Discovery

Defined in:
lib/feedisco/discovery.rb

Instance Method Summary collapse

Instance Method Details

#find(url, args = {}) ⇒ Object

Find RSS/Atom feed URLs by looking around the specified URL.

Raises:

  • (ArgumentError) — if url is nil, or if its protocol is not http(s) or feed


11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/feedisco/discovery.rb', line 11

# Find RSS/Atom feed URLs by looking around the specified URL.
#
# The harmonized URL is opened once. If the response itself has a feed content
# type, that URL is the only result. Otherwise the body is parsed as HTML and
# candidates are collected, without duplicates, in this priority order:
#
# 1. <link> elements whose rel matches alternate/service.feed and whose type
#    is one of Feedisco.feed_content_types;
# 2. <a> elements that look like feeds and are root-relative or mention the
#    page's own host;
# 3. every remaining <a> element that looks like a feed (external domains
#    included).
#
# @param url [String] the URL to inspect
# @param args [Hash] not used by this method; kept so existing callers that
#   pass options keep working
# @return [Array] the discovered feed URLs, most relevant first
# @raise [ArgumentError] if url is nil, or if harmonize_url returns nil for it
def find(url, args = {})
  raise ArgumentError, "url can't be nil!" if url.nil?

  harmonized_url = harmonize_url(url)
  raise ArgumentError, "url's protocol must be 'http(s)' or 'feed' (#{url})" if harmonized_url.nil?

  feeds = []

  # Open the URL to check the content-type or crawl for feed links.
  # NOTE(review): if this resolves to Kernel#open backed by open-uri, be aware
  # that Ruby 3.0 stopped opening URLs through Kernel#open (URI.open replaces
  # it). Left untouched here because callers/specs may rely on `open`.
  open(harmonized_url) do |file|
    if feed_content_type?(file)
      # The URL itself serves a feed content type.
      feeds << harmonized_url
    else
      # Otherwise, parse the page and search it for feed links.
      doc = Nokogiri::HTML(file.read)

      # Check <link> elements.
      doc.css('link').each do |link|
        next unless link[:rel] =~ %r{(alternate|service.feed)}i
        # `to_s` guards against <link rel="alternate"> elements that carry no
        # type attribute (e.g. hreflang alternates); they are simply skipped.
        next unless Feedisco.feed_content_types.include?(link[:type].to_s.downcase.strip)

        discovery = complete_extracted_url(link[:href], harmonized_url)
        feeds << discovery unless discovery.nil? || feeds.include?(discovery)
      end

      # Built once for all anchors. The host is escaped so that its dots only
      # match literal dots; `to_s` keeps a nil host from raising.
      same_host = %r{#{Regexp.escape(URI.parse(harmonized_url).host.to_s)}/}
      anchors = doc.css('a')

      # Check <a> elements that stay on this site: root-relative hrefs or
      # hrefs mentioning the page's host.
      anchors.each do |a|
        href = a[:href]
        next unless looks_like_feed?(href) && (href =~ %r{\A/} || href =~ same_host)

        discovery = complete_extracted_url(href, harmonized_url)
        feeds << discovery unless discovery.nil? || feeds.include?(discovery)
      end

      # Check <a> elements again, less restrictively, so that all discovered
      # feeds are added, even the ones on external domains; those come later
      # in the feeds array.
      anchors.each do |a|
        next unless looks_like_feed?(a[:href])

        discovery = complete_extracted_url(a[:href], harmonized_url)
        feeds << discovery unless discovery.nil? || feeds.include?(discovery)
      end
    end
  end

  feeds
end