Module: FeedParserUtilities

Defined in:
lib/rfeedparser/scrub.rb,
lib/rfeedparser/aliases.rb,
lib/rfeedparser/time_helpers.rb,
lib/rfeedparser/feedparserdict.rb,
lib/rfeedparser/markup_helpers.rb,
lib/rfeedparser/encoding_helpers.rb

Defined Under Namespace

Classes: FeedParserDict, SanitizerDoc

Constant Summary collapse

Encoding_Aliases =

Adapted from python2.4’s encodings/aliases.py

{     
  'unicode'		 => 'utf-16',  

  # MacOSX does not have Unicode as a separate encoding nor even
  # aliased. My Ubuntu box has it as a separate encoding but I cannot
  # for the life of me figure out where the source code for UNICODE.so
  # is (supposedly, in libc6 .deb but that's a damn lie), so I don't
  # know what it expects. After some extensive research, I've decided
  # to alias it to utf-16 much like Python does when it is built with
  # --enable-unicode=ucs2. This could be seriously wrong. I have no idea.

  # ascii codec
  '646'                => 'ascii',
  'ansi_x3.4_1968'     => 'ascii',
  'ansi_x3_4_1968'     => 'ascii', # some email headers use this non-standard name
  'ansi_x3.4_1986'     => 'ascii',
  'cp367'              => 'ascii',
  'csascii'            => 'ascii',
  'ibm367'             => 'ascii',
  'iso646_us'          => 'ascii',
  'iso_646.irv_1991'   => 'ascii',
  'iso_ir_6'           => 'ascii',
  'us'                 => 'ascii',
  'us_ascii'           => 'ascii',

  # big5 codec
  'big5_tw'            => 'big5',
  'csbig5'             => 'big5',

  # big5hkscs codec
  'big5_hkscs'         => 'big5hkscs',
  'hkscs'              => 'big5hkscs',

  # cp037 codec
  '037'                => 'cp037',
  'csibm037'           => 'cp037',
  'ebcdic_cp_ca'       => 'cp037',
  'ebcdic_cp_nl'       => 'cp037',
  'ebcdic_cp_us'       => 'cp037',
  'ebcdic_cp_wt'       => 'cp037',
  'ibm037'             => 'cp037',
  'ibm039'             => 'cp037',

  # cp1026 codec
  '1026'               => 'cp1026',
  'csibm1026'          => 'cp1026',
  'ibm1026'            => 'cp1026',

  # cp1140 codec
  '1140'               => 'cp1140',
  'ibm1140'            => 'cp1140',

  # cp1250 codec
  '1250'               => 'cp1250',
  'windows_1250'       => 'cp1250',

  # cp1251 codec
  '1251'               => 'cp1251',
  'windows_1251'       => 'cp1251',

  # cp1252 codec
  '1252'               => 'cp1252',
  'windows_1252'       => 'cp1252',

  # cp1253 codec
  '1253'               => 'cp1253',
  'windows_1253'       => 'cp1253',

  # cp1254 codec
  '1254'               => 'cp1254',
  'windows_1254'       => 'cp1254',

  # cp1255 codec
  '1255'               => 'cp1255',
  'windows_1255'       => 'cp1255',

  # cp1256 codec
  '1256'               => 'cp1256',
  'windows_1256'       => 'cp1256',

  # cp1257 codec
  '1257'               => 'cp1257',
  'windows_1257'       => 'cp1257',

  # cp1258 codec
  '1258'               => 'cp1258',
  'windows_1258'       => 'cp1258',

  # cp424 codec
  '424'                => 'cp424',
  'csibm424'           => 'cp424',
  'ebcdic_cp_he'       => 'cp424',
  'ibm424'             => 'cp424',

  # cp437 codec
  '437'                => 'cp437',
  'cspc8codepage437'   => 'cp437',
  'ibm437'             => 'cp437',

  # cp500 codec
  '500'                => 'cp500',
  'csibm500'           => 'cp500',
  'ebcdic_cp_be'       => 'cp500',
  'ebcdic_cp_ch'       => 'cp500',
  'ibm500'             => 'cp500',

  # cp775 codec
  '775'              => 'cp775',
  'cspc775baltic'      => 'cp775',
  'ibm775'             => 'cp775',

  # cp850 codec
  '850'                => 'cp850',
  'cspc850multilingual' => 'cp850',
  'ibm850'             => 'cp850',

  # cp852 codec
  '852'                => 'cp852',
  'cspcp852'           => 'cp852',
  'ibm852'             => 'cp852',

  # cp855 codec
  '855'                => 'cp855',
  'csibm855'           => 'cp855',
  'ibm855'             => 'cp855',

  # cp857 codec
  '857'                => 'cp857',
  'csibm857'           => 'cp857',
  'ibm857'             => 'cp857',

  # cp860 codec
  '860'                => 'cp860',
  'csibm860'           => 'cp860',
  'ibm860'             => 'cp860',

  # cp861 codec
  '861'                => 'cp861',
  'cp_is'              => 'cp861',
  'csibm861'           => 'cp861',
  'ibm861'             => 'cp861',

  # cp862 codec
  '862'                => 'cp862',
  'cspc862latinhebrew' => 'cp862',
  'ibm862'             => 'cp862',

  # cp863 codec
  '863'                => 'cp863',
  'csibm863'           => 'cp863',
  'ibm863'             => 'cp863',

  # cp864 codec
  '864'                => 'cp864',
  'csibm864'           => 'cp864',
  'ibm864'             => 'cp864',

  # cp865 codec
  '865'                => 'cp865',
  'csibm865'           => 'cp865',
  'ibm865'             => 'cp865',

  # cp866 codec
  '866'                => 'cp866',
  'csibm866'           => 'cp866',
  'ibm866'             => 'cp866',

  # cp869 codec
  '869'                => 'cp869',
  'cp_gr'              => 'cp869',
  'csibm869'           => 'cp869',
  'ibm869'             => 'cp869',

  # cp932 codec
  '932'                => 'cp932',
  'ms932'              => 'cp932',
  'mskanji'            => 'cp932',
  'ms_kanji'           => 'cp932',

  # cp949 codec
  '949'                => 'cp949',
  'ms949'              => 'cp949',
  'uhc'                => 'cp949',

  # cp950 codec
  '950'                => 'cp950',
  'ms950'              => 'cp950',

  # euc_jp codec
  'euc_jp'             => 'euc-jp',
  'eucjp'              => 'euc-jp',
  'ujis'               => 'euc-jp',
  'u_jis'              => 'euc-jp',

  # euc_kr codec
  'euc_kr'             => 'euc-kr',
  'euckr'              => 'euc-kr',
  'korean'             => 'euc-kr',
  'ksc5601'            => 'euc-kr',
  'ks_c_5601'          => 'euc-kr',
  'ks_c_5601_1987'     => 'euc-kr',
  'ksx1001'            => 'euc-kr',
  'ks_x_1001'          => 'euc-kr',

  # gb18030 codec
  'gb18030_2000'       => 'gb18030',

  # gb2312 codec
  'chinese'            => 'gb2312',
  'csiso58gb231280'    => 'gb2312',
  'euc_cn'             => 'gb2312',
  'euccn'              => 'gb2312',
  'eucgb2312_cn'       => 'gb2312',
  'gb2312_1980'        => 'gb2312',
  'gb2312_80'          => 'gb2312',
  'iso_ir_58'          => 'gb2312',

  # gbk codec
  '936'                => 'gbk',
  'cp936'              => 'gbk',
  'ms936'              => 'gbk',

  # hp-roman8 codec
  'hp_roman8'          => 'hp-roman8',
  'roman8'             => 'hp-roman8',
  'r8'                 => 'hp-roman8',
  'csHPRoman8'         => 'hp-roman8',

  # iso2022_jp codec
  'iso2022_jp'         => 'iso-2022-jp',
  'csiso2022jp'        => 'iso-2022-jp',
  'iso2022jp'          => 'iso-2022-jp',
  'iso_2022_jp'        => 'iso-2022-jp',

  # iso2022_jp_1 codec
  'iso2002_jp_1'       => 'iso-2022-jp-1',
  'iso2022jp_1'        => 'iso-2022-jp-1',
  'iso_2022_jp_1'      => 'iso-2022-jp-1',

  # iso2022_jp_2 codec
  'iso2022_jp_2'       => 'iso-2002-jp-2',
  'iso2022jp_2'        => 'iso-2022-jp-2',
  'iso_2022_jp_2'      => 'iso-2022-jp-2',

  # iso2022_jp_3 codec
  'iso2002_jp_3'       => 'iso-2022-jp-3',
  'iso2022jp_3'        => 'iso-2022-jp-3',
  'iso_2022_jp_3'      => 'iso-2022-jp-3',

  # iso2022_kr codec
  'iso2022_kr'         => 'iso-2022-kr',
  'csiso2022kr'        => 'iso-2022-kr',
  'iso2022kr'          => 'iso-2022-kr',
  'iso_2022_kr'        => 'iso-2022-kr',

  # iso8859_10 codec
  'iso8859_10'         => 'iso-8859-10',
  'csisolatin6'        => 'iso-8859-10',
  'iso_8859_10'        => 'iso-8859-10',
  'iso_8859_10_1992'   => 'iso-8859-10',
  'iso_ir_157'         => 'iso-8859-10',
  'l6'                 => 'iso-8859-10',
  'latin6'             => 'iso-8859-10',

  # iso8859_13 codec
  'iso8859_13'         => 'iso-8859-13',
  'iso_8859_13'        => 'iso-8859-13',

  # iso8859_14 codec
  'iso8859_14'         => 'iso-8859-14',
  'iso_8859_14'        => 'iso-8859-14',
  'iso_8859_14_1998'   => 'iso-8859-14',
  'iso_celtic'         => 'iso-8859-14',
  'iso_ir_199'         => 'iso-8859-14',
  'l8'                 => 'iso-8859-14',
  'latin8'             => 'iso-8859-14',

  # iso8859_15 codec
  'iso8859_15'         => 'iso-8859-15',
  'iso_8859_15'        => 'iso-8859-15',

  # iso8859_1 codec
  'latin_1'            => 'iso-8859-1',
  'cp819'              => 'iso-8859-1',
  'csisolatin1'        => 'iso-8859-1',
  'ibm819'             => 'iso-8859-1',
  'iso8859'            => 'iso-8859-1',
  'iso_8859_1'         => 'iso-8859-1',
  'iso_8859_1_1987'    => 'iso-8859-1',
  'iso_ir_100'         => 'iso-8859-1',
  'l1'                 => 'iso-8859-1',
  'latin'              => 'iso-8859-1',
  'latin1'             => 'iso-8859-1',

  # iso8859_2 codec
  'iso8859_2'          => 'iso-8859-2',
  'csisolatin2'        => 'iso-8859-2',
  'iso_8859_2'         => 'iso-8859-2',
  'iso_8859_2_1987'    => 'iso-8859-2',
  'iso_ir_101'         => 'iso-8859-2',
  'l2'                 => 'iso-8859-2',
  'latin2'             => 'iso-8859-2',

  # iso8859_3 codec
  'iso8859_3'          => 'iso-8859-3',
  'csisolatin3'        => 'iso-8859-3',
  'iso_8859_3'         => 'iso-8859-3',
  'iso_8859_3_1988'    => 'iso-8859-3',
  'iso_ir_109'         => 'iso-8859-3',
  'l3'                 => 'iso-8859-3',
  'latin3'             => 'iso-8859-3',

  # iso8859_4 codec
  'iso8849_4'          => 'iso-8859-4',
  'csisolatin4'        => 'iso-8859-4',
  'iso_8859_4'         => 'iso-8859-4',
  'iso_8859_4_1988'    => 'iso-8859-4',
  'iso_ir_110'         => 'iso-8859-4',
  'l4'                 => 'iso-8859-4',
  'latin4'             => 'iso-8859-4',

  # iso8859_5 codec
  'iso8859_5'          => 'iso-8859-5',
  'csisolatincyrillic' => 'iso-8859-5',
  'cyrillic'           => 'iso-8859-5',
  'iso_8859_5'         => 'iso-8859-5',
  'iso_8859_5_1988'    => 'iso-8859-5',
  'iso_ir_144'         => 'iso-8859-5',

  # iso8859_6 codec
  'iso8859_6'          => 'iso-8859-6',
  'arabic'             => 'iso-8859-6',
  'asmo_708'           => 'iso-8859-6',
  'csisolatinarabic'   => 'iso-8859-6',
  'ecma_114'           => 'iso-8859-6',
  'iso_8859_6'         => 'iso-8859-6',
  'iso_8859_6_1987'    => 'iso-8859-6',
  'iso_ir_127'         => 'iso-8859-6',

  # iso8859_7 codec
  'iso8859_7'          => 'iso-8859-7',
  'csisolatingreek'    => 'iso-8859-7',
  'ecma_118'           => 'iso-8859-7',
  'elot_928'           => 'iso-8859-7',
  'greek'              => 'iso-8859-7',
  'greek8'             => 'iso-8859-7',
  'iso_8859_7'         => 'iso-8859-7',
  'iso_8859_7_1987'    => 'iso-8859-7',
  'iso_ir_126'         => 'iso-8859-7',

  # iso8859_8 codec
  'iso8859_9'          => 'iso8859_8',
  'csisolatinhebrew'   => 'iso-8859-8',
  'hebrew'             => 'iso-8859-8',
  'iso_8859_8'         => 'iso-8859-8',
  'iso_8859_8_1988'    => 'iso-8859-8',
  'iso_ir_138'         => 'iso-8859-8',

  # iso8859_9 codec
  'iso8859_9'          => 'iso-8859-9',
  'csisolatin5'        => 'iso-8859-9',
  'iso_8859_9'         => 'iso-8859-9',
  'iso_8859_9_1989'    => 'iso-8859-9',
  'iso_ir_148'         => 'iso-8859-9',
  'l5'                 => 'iso-8859-9',
  'latin5'             => 'iso-8859-9',

  # iso8859_11 codec
  'iso8859_11'         => 'iso-8859-11',
  'thai'               => 'iso-8859-11',
  'iso_8859_11'        => 'iso-8859-11',
  'iso_8859_11_2001'   => 'iso-8859-11',

  # iso8859_16 codec
  'iso8859_16'         => 'iso-8859-16',
  'iso_8859_16'        => 'iso-8859-16',
  'iso_8859_16_2001'   => 'iso-8859-16',
  'iso_ir_226'         => 'iso-8859-16',
  'l10'                => 'iso-8859-16',
  'latin10'            => 'iso-8859-16',

  # cskoi8r codec 
  'koi8_r'             => 'cskoi8r',

  # mac_cyrillic codec
  'mac_cyrillic'       => 'maccyrillic',

  # shift_jis codec
  'csshiftjis'         => 'shift_jis',
  'shiftjis'           => 'shift_jis',
  'sjis'               => 'shift_jis',
  's_jis'              => 'shift_jis',

  # shift_jisx0213 codec
  'shiftjisx0213'      => 'shift_jisx0213',
  'sjisx0213'          => 'shift_jisx0213',
  's_jisx0213'         => 'shift_jisx0213',

  # utf_16 codec
  'utf_16'             => 'utf-16',
  'u16'                => 'utf-16',
  'utf16'              => 'utf-16',

  # utf_16_be codec
  'utf_16_be'          => 'utf-16be',
  'unicodebigunmarked' => 'utf-16be',
  'utf_16be'           => 'utf-16be',

  # utf_16_le codec
  'utf_16_le'          => 'utf-16le',
  'unicodelittleunmarked' => 'utf-16le',
  'utf_16le'           => 'utf-16le',

  # utf_7 codec
  'utf_7'              => 'utf-7',
  'u7'                 => 'utf-7',
  'utf7'               => 'utf-7',

  # utf_8 codec
  'utf_8'              => 'utf-8',
  'u8'                 => 'utf-8',
  'utf'                => 'utf-8',
  'utf8'               => 'utf-8',
  'utf8_ucs2'          => 'utf-8',
  'utf8_ucs4'          => 'utf-8',
}

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.SanitizerDoc(html) ⇒ Object



195
196
197
# File 'lib/rfeedparser/scrub.rb', line 195

def SanitizerDoc(html)
  SanitizerDoc.new(Hpricot.make(html))
end

Instance Method Details

#_ebcdic_to_ascii(s) ⇒ Object



26
27
28
# File 'lib/rfeedparser/encoding_helpers.rb', line 26

def _ebcdic_to_ascii(s)   
  return Iconv.iconv("iso88591", "ebcdic-cp-be", s)[0]
end

#getCharacterEncoding(feed, xml_data) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/rfeedparser/encoding_helpers.rb', line 30

def getCharacterEncoding(feed, xml_data)
  # Get the character encoding of the XML document
  $stderr << "In getCharacterEncoding\n" if $debug
  sniffed_xml_encoding = nil
  xml_encoding = nil
  true_encoding = nil
  begin 
    http_headers = feed.meta
    http_content_type = feed.meta['content-type'].split(';')[0]
    encoding_scan = feed.meta['content-type'].to_s.scan(/charset\s*=\s*(.*?)(?:"|')*$/)
    http_encoding = encoding_scan.flatten[0].to_s.gsub(/("|')/,'')
    http_encoding = nil if http_encoding.empty?
    # FIXME Open-Uri returns iso8859-1 if there is no charset header,
    # but that doesn't pass the tests. Open-Uri claims its following
    # the right RFC. Are they wrong or do we need to change the tests?
  rescue NoMethodError
    http_headers = {}
    http_content_type = nil
    http_encoding = nil
  end
  # Must sniff for non-ASCII-compatible character encodings before
  # searching for XML declaration.  This heuristic is defined in
  # section F of the XML specification:
  # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
  begin 
    if xml_data[0..3] == "\x4c\x6f\xa7\x94"
	# EBCDIC
	xml_data = _ebcdic_to_ascii(xml_data)
    elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
	# UTF-16BE
	sniffed_xml_encoding = 'utf-16be'
	xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
    elsif xml_data.size >= 4 and xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
	# UTF-16BE with BOM
	sniffed_xml_encoding = 'utf-16be'
	xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
    elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
	# UTF-16LE
	sniffed_xml_encoding = 'utf-16le'
	xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
    elsif xml_data.size >=4 and xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
	# UTF-16LE with BOM
	sniffed_xml_encoding = 'utf-16le'
	xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
    elsif xml_data[0..3] == "\x00\x00\x00\x3c"
	# UTF-32BE
	sniffed_xml_encoding = 'utf-32be'
	xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
    elsif xml_data[0..3] == "\x3c\x00\x00\x00"
	# UTF-32LE
	sniffed_xml_encoding = 'utf-32le'
	xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
    elsif xml_data[0..3] == "\x00\x00\xfe\xff"
	# UTF-32BE with BOM
	sniffed_xml_encoding = 'utf-32be'
	xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
    elsif xml_data[0..3] == "\xff\xfe\x00\x00"
	# UTF-32LE with BOM
	sniffed_xml_encoding = 'utf-32le'
	xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
    elsif xml_data[0..2] == "\xef\xbb\xbf"
	# UTF-8 with BOM
	sniffed_xml_encoding = 'utf-8'
	xml_data = xml_data[3..-1]
    else
	# ASCII-compatible
    end
    xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
  rescue
    xml_encoding_match = nil
  end
  if xml_encoding_match 
    xml_encoding = xml_encoding_match[1].downcase
    xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
    if sniffed_xml_encoding and xencodings.include?xml_encoding
	xml_encoding = sniffed_xml_encoding
    end
  end

  acceptable_content_type = false
  application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
  text_content_types = ['text/xml', 'text/xml-external-parsed-entity']

  if application_content_types.include?(http_content_type) or (/^application\// =~ http_content_type and /\+xml$/ =~ http_content_type)
    acceptable_content_type = true
    true_encoding = http_encoding || xml_encoding || 'utf-8'
  elsif text_content_types.include?(http_content_type) or (/^text\// =~ http_content_type and /\+xml$/ =~ http_content_type)
    acceptable_content_type = true
    true_encoding = http_encoding || 'us-ascii'
  elsif /^text\// =~ http_content_type 
    true_encoding = http_encoding || 'us-ascii'
  elsif http_headers and not http_headers.empty? and not http_headers.has_key?'content-type'
    true_encoding = xml_encoding || 'iso-8859-1'
  else
    true_encoding = xml_encoding || 'utf-8'
  end
  return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
end

#index_match(stri, regexp, offset) ⇒ Object



17
18
19
20
21
22
23
24
# File 'lib/rfeedparser/encoding_helpers.rb', line 17

def index_match(stri,regexp, offset)
  i = stri.index(regexp, offset)

  return nil, nil unless i

  full = stri[i..-1].match(regexp)
  return i, full
end

#py2rtime(pytuple) ⇒ Object



403
404
405
# File 'lib/rfeedparser/time_helpers.rb', line 403

def py2rtime(pytuple)
  return Time.utc(*pytuple[0..5]) unless pytuple.blank? 
end

#resolveRelativeURIs(htmlSource, baseURI, encoding) ⇒ Object



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/rfeedparser/markup_helpers.rb', line 28

def resolveRelativeURIs(htmlSource, baseURI, encoding)
  $stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger
  relative_uris = [ ['a','href'],
    ['applet','codebase'],
    ['area','href'],
    ['blockquote','cite'],
    ['body','background'],
    ['del','cite'],
    ['form','action'],
    ['frame','longdesc'],
    ['frame','src'],
    ['iframe','longdesc'],
    ['iframe','src'],
    ['head','profile'],
    ['img','longdesc'],
    ['img','src'],
    ['img','usemap'],
    ['input','src'],
    ['input','usemap'],
    ['ins','cite'],
    ['link','href'],
    ['object','classid'],
    ['object','codebase'],
    ['object','data'],
    ['object','usemap'],
    ['q','cite'],
    ['script','src'],
  ]
  h = Hpricot(htmlSource)
  relative_uris.each do |l|
    ename, eattr = l
    h.search(ename).each do |elem|
      euri = elem.attributes[eattr]
      # FIXME uses the URI.encode method.  should it?
      if euri and not euri.empty? and ForgivingURI.parse(URI.encode(euri)).relative? 
        elem.attributes[eattr] = urljoin(baseURI, euri)
      end
    end
  end
  return h.to_html
end

#sanitizeHTML(html, encoding) ⇒ Object



200
201
202
203
204
205
206
# File 'lib/rfeedparser/scrub.rb', line 200

def sanitizeHTML(html,encoding)
  # FIXME Tidy not yet supported
  html = html.gsub(/<!((?!DOCTYPE|--|\[))/, '&lt;!\1')
    h = SanitizerDoc(html)
    h = h.scrub
    return h.to_html.strip
end

#stripDoctype(data) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/rfeedparser/markup_helpers.rb', line 3

def stripDoctype(data)
  #Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
  #rss_version may be 'rss091n' or None
  #stripped_data is the same XML document, minus the DOCTYPE
  entity_pattern = /<!ENTITY(.*?)>/m # m is for Regexp::MULTILINE
  
  data = data.gsub(entity_pattern,'')

  doctype_pattern = /<!DOCTYPE(.*?)>/m
  doctype_results = data.scan(doctype_pattern)
  if doctype_results and doctype_results[0]
    doctype = doctype_results[0][0]
  else
    doctype = ''
  end

  if /netscape/ =~ doctype.downcase
    version = 'rss091n'
  else
    version = nil
  end
  data = data.sub(doctype_pattern, '')
  return version, data
end

#toUTF8(data, encoding) ⇒ Object



129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# File 'lib/rfeedparser/encoding_helpers.rb', line 129

def toUTF8(data, encoding)
=begin
  Changes an XML data stream on the fly to specify a new encoding

  data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
  encoding is a string recognized by encodings.aliases
=end
  $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
  # NOTE we must use double quotes when dealing with \x encodings!
  if (data.size >= 4 and data[0..1] == "\xfe\xff" and data[2..3] != "\x00\x00")
    if $debug
	$stderr << "stripping BOM\n"
	if encoding != 'utf-16be'
 $stderr << "string utf-16be instead\n"
	end
    end
    encoding = 'utf-16be'
    data = data[2..-1]
  elsif (data.size >= 4 and data[0..1] == "\xff\xfe" and data[2..3] != "\x00\x00")
    if $debug
	$stderr << "stripping BOM\n"
	$stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
    end
    encoding = 'utf-16le'
    data = data[2..-1]
  elsif (data[0..2] == "\xef\xbb\xbf")
    if $debug
	$stderr << "stripping BOM\n"
	$stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
    end
    encoding = 'utf-8'
    data = data[3..-1]
  elsif (data[0..3] == "\x00\x00\xfe\xff")
    if $debug
	$stderr << "stripping BOM\n"
	if encoding != 'utf-32be'
 $stderr << "trying utf-32be instead\n"
	end
    end
    encoding = 'utf-32be'
    data = data[4..-1]
  elsif (data[0..3] == "\xff\xfe\x00\x00")
    if $debug
	$stderr << "stripping BOM\n"
	if encoding != 'utf-32le'
 $stderr << "trying utf-32le instead\n"
	end
    end
    encoding = 'utf-32le'
    data = data[4..-1]
  end
  begin
    newdata = uconvert(data, encoding, 'utf-8')
  rescue => details
    raise details
  end
  $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
  declmatch = /^<\?xml[^>]*?>/
    newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
    if declmatch =~ newdata
      newdata.sub!(declmatch, newdecl) 
    else
      newdata = newdecl + "\n" + newdata
    end
  return newdata
end

#uconvert(data, from_encoding, to_encoding = 'utf-8') ⇒ Object



11
12
13
14
15
# File 'lib/rfeedparser/encoding_helpers.rb', line 11

def uconvert(data, from_encoding, to_encoding = 'utf-8')
  from_encoding = Encoding_Aliases[from_encoding] || from_encoding
  to_encoding = Encoding_Aliases[to_encoding] || to_encoding
  Iconv.iconv(to_encoding, from_encoding, data)[0]
end

#unicode(data, from_encoding) ⇒ Object



5
6
7
8
9
# File 'lib/rfeedparser/encoding_helpers.rb', line 5

def unicode(data, from_encoding)
  # Takes a single string and converts it from the encoding in 
  # from_encoding to unicode.
  uconvert(data, from_encoding, 'unicode')
end