Module: FeedParser

Extended by:: FeedParserUtilities

Defined in:: lib/rfeedparser.rb,
lib/rfeedparser/expat_parser.rb,
lib/rfeedparser/time_helpers.rb,
lib/rfeedparser/libxml_parser.rb,
lib/rfeedparser/feedparserdict.rb,
lib/rfeedparser/loose_feed_parser.rb

Defined Under Namespace

Modules: Expat, LibXML

Classes: CharacterEncodingOverride, CharacterEncodingUnknown, FeedParserDict, FeedTimeParser, LooseFeedParser, NonXMLContentType, ThingsNobodyCaresAboutButMe, UndeclaredNamespace

Constant Summary collapse

VERSION =

"0.9.951"

AUTHOR =

"Mark Pilgrim <http://diveintomark.org/>"

PORTER =

"Jeff Hodges <http://somethingsimilar.com>"

CONTRIBUTERS =

["Jason Diamond <http://injektilo.org/>",
"John Beimler <http://john.beimler.org/>",
"Fazal Majid <http://www.majid.info/mylos/weblog/>",
"Aaron Swartz <http://aaronsw.com/>",
"Kevin Marks <http://epeus.blogspot.com/>",
"Jesse Newland <http://jnewland.com/>",
"Charlie Savage <http://cfis.savagexi.com/>",
"Phil Hagelberg <http://technomancy.us>"]

USER_AGENT = HTTP “User-Agent” header to send to servers when downloading feeds. If you are embedding feedparser in a larger application, you should change this to your application name and URL.

"rFeedParser/#{VERSION} +http://rfeedparser.rubyforge.org/"

ACCEPT_HEADER = HTTP “Accept” header to send to servers when downloading feeds. If you don’t want to send an Accept header, set this to nil.

"application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"

SUPPORTED_VERSIONS =

{'' => 'unknown',
  'rss090' => 'RSS 0.90',
  'rss091n' => 'RSS 0.91 (Netscape)',
  'rss091u' => 'RSS 0.91 (Userland)',
  'rss092' => 'RSS 0.92',
  'rss093' => 'RSS 0.93',
  'rss094' => 'RSS 0.94',
  'rss20' => 'RSS 2.0',
  'rss10' => 'RSS 1.0',
  'rss' => 'RSS (unknown version)',
  'atom01' => 'Atom 0.1',
  'atom02' => 'Atom 0.2',
  'atom03' => 'Atom 0.3',
  'atom10' => 'Atom 1.0',
  'atom' => 'Atom (unknown version)',
  'cdf' => 'CDF',
  'hotrss' => 'Hot RSS'
}

Constants included from FeedParserUtilities

FeedParserUtilities::Encoding_Aliases

Class Method Summary collapse

.open_resource(url_file_stream_or_string, options) ⇒ Object

Accepted in options: :agent, :modified, :etag, and :referrer.
.parse(url_file_stream_or_string, options = {}) ⇒ Object

Parse a feed from a URL, file, stream or string.

Methods included from FeedParserUtilities

SanitizerDoc, _ebcdic_to_ascii, extract_tuple, getCharacterEncoding, index_match, parse_date, py2rtime, resolveRelativeURIs, sanitizeHTML, stripDoctype, toUTF8, uconvert, unicode

Class Method Details

.open_resource(url_file_stream_or_string, options) ⇒ `Object`

Accepted in options: :agent, :modified, :etag, and :referrer

# File 'lib/rfeedparser.rb', line 151

# Turn +url_file_stream_or_string+ into something that responds to +read+.
#
# Resolution order:
# 1. an object that already responds to +read+ is returned untouched;
# 2. the string '-' returns $stdin;
# 3. an http/https/ftp URL is fetched through open-uri (an HTTP error
#    response is returned as the error's IO so the caller can still read it);
# 4. otherwise the string is tried as a local file name;
# 5. otherwise the string itself is wrapped in a StringIO.
#
# Accepted in options: :agent, :modified, :etag, and :referrer.
# :modified may be a String (run through parse_date), a Time, or an Array
# (run through py2rtime). options[:handlers] is defaulted to [] in place.
#
# The caller's string is never modified; whitespace is stripped on a copy.
def open_resource(url_file_stream_or_string, options)
  options[:handlers] ||= []

  return url_file_stream_or_string if url_file_stream_or_string.respond_to?(:read)
  return $stdin if url_file_stream_or_string == '-'

  # open-uri freaks out if there's leading spaces. Use the non-destructive
  # strip so frozen strings work and the caller's object is left alone.
  url_file_stream_or_string = url_file_stream_or_string.strip

  # A raw feed document is not necessarily a parseable URI; treat a parse
  # failure as "not a URL" and fall through to the file/string handling.
  uri = begin
    Addressable::URI.parse(url_file_stream_or_string)
  rescue Addressable::URI::InvalidURIError
    nil
  end

  if uri && ['http', 'https', 'ftp'].include?(uri.scheme)
    auth = nil

    if uri.host && uri.password
      # encode64 wraps its output every 60 characters; drop every newline so
      # long credentials still produce a single-line header value.
      auth = Base64.encode64("#{uri.user}:#{uri.password}").delete("\n")
      # Remove the whole userinfo (password first, then user): open-uri
      # refuses URLs that still carry a user name.
      uri.password = nil
      uri.user = nil
      url_file_stream_or_string = uri.to_s
    end

    req_headers = {}
    req_headers["User-Agent"] = options[:agent] || USER_AGENT
    req_headers["If-None-Match"] = options[:etag] if options[:etag]

    modified = options[:modified]
    if modified
      if modified.is_a?(String)
        req_headers["If-Modified-Since"] = parse_date(modified).httpdate
      elsif modified.is_a?(Time)
        req_headers["If-Modified-Since"] = modified.httpdate
      elsif modified.is_a?(Array)
        req_headers["If-Modified-Since"] = py2rtime(modified).httpdate
      end
    end

    req_headers["Referer"] = options[:referrer] if options[:referrer]
    req_headers["Accept-encoding"] = 'gzip, deflate' # FIXME make tests
    req_headers["Authorization"] = "Basic #{auth}" if auth
    req_headers['Accept'] = ACCEPT_HEADER if ACCEPT_HEADER
    req_headers['A-IM'] = 'feed' # RFC 3229 support

    begin
      # Call open-uri directly instead of going through Kernel#open, which
      # no longer dispatches URLs on current Ruby versions.
      return OpenURI.open_uri(url_file_stream_or_string, req_headers)
    rescue OpenURI::HTTPError => e
      return e.io
    rescue
      # Best effort: any other failure falls through to the handling below.
    end
  end

  # Try to open as a local file. File.open (not Kernel#open) is used so a
  # string starting with "|" is never executed as a command.
  begin
    return File.open(url_file_stream_or_string)
  rescue
    # Not a readable file name; fall through.
  end

  # treat url_file_stream_or_string as string
  StringIO.new(url_file_stream_or_string.to_s)
end

.parse(url_file_stream_or_string, options = {}) ⇒ `Object`

Parse a feed from a URL, file, stream or string

# File 'lib/rfeedparser.rb', line 214

# Parse a feed from a URL, file, stream or string.
#
# The source is opened with open_resource (which receives +options+), the
# body is decompressed if the response metadata says gzip or deflate, its
# character encoding is determined and converted with toUTF8, and the data
# is handed to the strict parser, falling back to the loose parser when the
# strict one is unavailable or raises.
#
# Options read here:
#   :compatible - when not nil, replaces the global $compatible
#   :strict     - strict parser class (default StrictFeedParser)
#   :loose      - loose parser class (default LooseFeedParser)
#   :handlers   - defaulted to [] in place
#
# Returns a FeedParserDict. Keys set here: 'feed', 'entries', 'bozo',
# 'version', 'namespaces', 'encoding', and when applicable 'bozo_exception',
# 'etag', 'modified_time', 'modified', 'headers', 'href', 'status' and
# 'debug_message'. Problems are reported through 'bozo'/'bozo_exception'
# rather than raised.
def parse(url_file_stream_or_string, options = {})
  # Use the default compatibility if compatible is nil
  $compatible = options[:compatible].nil? ? $compatible : options[:compatible]

  strictklass = options[:strict] || StrictFeedParser
  looseklass = options[:loose] || LooseFeedParser
  options[:handlers] = options[:handlers] || []

  result = FeedParserDict.new
  result['feed'] = FeedParserDict.new
  result['entries'] = []

  result['bozo'] = false

  begin
    f = open_resource(url_file_stream_or_string, options)
    data = f.read
  rescue => e
    result['bozo'] = true
    result['bozo_exception'] = e
    data = ''
    f = nil
  end

  if f && !(data.nil? || data.empty?) && f.respond_to?(:meta)
    # if feed is gzip-compressed, decompress it
    if f.meta['content-encoding'] == 'gzip'
      begin
        gz = Zlib::GzipReader.new(StringIO.new(data))
        data = gz.read
        gz.close
      rescue => e
        # Some feeds claim to be gzipped but they're not, so
        # we get garbage.  Ideally, we should re-request the
        # feed without the 'Accept-encoding: gzip' header,
        # but we don't.
        result['bozo'] = true
        result['bozo_exception'] = e
        data = ''
      end
    elsif f.meta['content-encoding'] == 'deflate'
      begin
        begin
          # Inflating is done by Zlib::Inflate (Zlib::Deflate only compresses).
          data = Zlib::Inflate.inflate(data)
        rescue Zlib::Error
          # Some servers send a raw deflate stream without the zlib header;
          # negative window bits tell zlib not to expect one.
          raw_inflater = Zlib::Inflate.new(-Zlib::MAX_WBITS)
          begin
            data = raw_inflater.inflate(data)
          ensure
            raw_inflater.close
          end
        end
      rescue => e
        result['bozo'] = true
        result['bozo_exception'] = e
        data = ''
      end
    end
  end

  if f.respond_to?(:meta)
    result['etag'] = f.meta['etag']
    result['modified_time'] = parse_date(f.meta['last-modified'])
    result['modified'] = extract_tuple(result['modified_time'])
    result['headers'] = f.meta
  end

  # FIXME open-uri does not return a non-nil base_uri in its HTTPErrors.
  if f.respond_to?(:base_uri)
    result['href'] = f.base_uri.to_s # URI => String
    result['status'] = '200'
  end

  if f.respond_to?(:status)
    result['status'] = f.status[0]
  end

  # Everything needed from the resource has been copied out. Release it,
  # unless it is the caller's own stream or $stdin, which are not ours to close.
  if f && !f.equal?(url_file_stream_or_string) && !f.equal?($stdin) && f.respond_to?(:close)
    f.close unless f.respond_to?(:closed?) && f.closed?
  end

  # there are four encodings to keep track of:
  # - http_encoding is the encoding declared in the Content-Type HTTP header
  # - xml_encoding is the encoding declared in the <?xml declaration
  # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
  # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
  http_headers = result['headers'] || {}
  result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type =
  getCharacterEncoding(http_headers, data)

  if !(http_headers.nil? || http_headers.empty?) && !acceptable_content_type
    if http_headers['content-type']
      bozo_message = "#{http_headers['content-type']} is not an XML media type"
    else
      bozo_message = 'no Content-type specified'
    end

    result['bozo'] = true
    result['bozo_exception'] = NonXMLContentType.new(bozo_message) # I get to care about this, cuz Mark says I should.
  end

  result['version'], data = stripDoctype(data)

  baseuri = http_headers['content-location'] || result['href']
  baselang = http_headers['content-language']

  # if server sent 304, we're done
  # (the stored status is a String such as '304', so compare numerically)
  if result['status'].to_i == 304
    result['version'] = ''
    result['debug_message'] = "The feed has not changed since you last checked, " +
    "so the server sent no data. This is a feature, not a bug!"
    return result
  end

  # if there was a problem downloading, we're done
  if data.nil? || data.empty?
    return result
  end

  # determine character encoding
  use_strict_parser = false
  known_encoding = false
  tried_encodings = []
  proposed_encoding = nil
  # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
  # The block parameter has its own name: a block parameter called
  # proposed_encoding would shadow the outer local and leave it nil.
  [result['encoding'], xml_encoding, sniffed_xml_encoding].each do |candidate|
    next if candidate.nil? || candidate.empty?
    next if tried_encodings.include?(candidate)
    tried_encodings << candidate
    begin
      data = toUTF8(data, candidate)
      proposed_encoding = candidate
      known_encoding = use_strict_parser = true
      break
    rescue
      # this candidate did not work; try the next one
    end
  end

  # if no luck and we have auto-detection library, try that
  if !known_encoding && $chardet
    begin
      proposed_encoding = CharDet.detect(data)['encoding']
      if proposed_encoding && !tried_encodings.include?(proposed_encoding)
        tried_encodings << proposed_encoding
        data = toUTF8(data, proposed_encoding)
        known_encoding = use_strict_parser = true
      end
    rescue
    end
  end

  # if still no luck and we haven't tried utf-8 yet, try that
  if !known_encoding && !tried_encodings.include?('utf-8')
    begin
      proposed_encoding = 'utf-8'
      tried_encodings << proposed_encoding
      data = toUTF8(data, proposed_encoding)
      known_encoding = use_strict_parser = true
    rescue
    end
  end

  # if still no luck and we haven't tried windows-1252 yet, try that
  if !known_encoding && !tried_encodings.include?('windows-1252')
    begin
      proposed_encoding = 'windows-1252'
      tried_encodings << proposed_encoding
      data = toUTF8(data, proposed_encoding)
      known_encoding = use_strict_parser = true
    rescue
    end
  end

  # NOTE this isn't in FeedParser.py 4.1
  # if still no luck and we haven't tried iso-8859-2 yet, try that.
  #if not known_encoding and not tried_encodings.include?'iso-8859-2'
  #  begin
  #    proposed_encoding = 'iso-8859-2'
  #    tried_encodings << proposed_encoding
  #    data = toUTF8(data, proposed_encoding)
  #    known_encoding = use_strict_parser = true
  #  rescue
  #  end
  #end

  # if still no luck, give up
  if !known_encoding
    result['bozo'] = true
    result['bozo_exception'] = CharacterEncodingUnknown.new("document encoding unknown, I tried #{result['encoding']}, #{xml_encoding}, utf-8 and windows-1252 but nothing worked")
    result['encoding'] = ''
  elsif proposed_encoding != result['encoding']
    result['bozo'] = true
    result['bozo_exception'] = CharacterEncodingOverride.new("documented declared as #{result['encoding']}, but parsed as #{proposed_encoding}")
    result['encoding'] = proposed_encoding
  end

  # Honour the :strict option (it was read above but previously ignored).
  use_strict_parser = false unless strictklass

  feedparser = nil
  if use_strict_parser
    begin
      parser = strictklass.new(baseuri, baselang)
      feedparser = parser.handler
      parser.parse(data)
    rescue => err
      $stderr << "xml parsing failed: #{err.message}\n#{err.backtrace.join("\n")}" if $debug
      result['bozo'] = true
      # feedparser is still nil when the constructor itself raised; in that
      # case report the exception that was actually caught.
      result['bozo_exception'] = (feedparser && feedparser.exc) || err
      use_strict_parser = false
    end
  end

  if !use_strict_parser
    $stderr << "Using LooseFeed\n\n" if $debug
    feedparser = looseklass.new(baseuri, baselang, (known_encoding ? 'utf-8' : ''))
    feedparser.parse(data)
  end

  result['feed'] = feedparser.feeddata
  result['entries'] = feedparser.entries
  result['version'] = result['version'] || feedparser.version
  result['namespaces'] = feedparser.namespacesInUse
  return result
end