Class: HTree::Encoder
- Inherits:
-
Object
- Object
- HTree::Encoder
- Defined in:
- lib/htree/encoder.rb
Constant Summary collapse
- ChRef =
{ '&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;', }
- KcodeCharset =
Maps a $KCODE value to the corresponding MIME charset name.
{ 'EUC' => 'EUC-JP', 'SJIS' => 'Shift_JIS', 'UTF8' => 'UTF-8', 'NONE' => 'ISO-8859-1', }
- FirstCharPattern =
{ 'EUC-JP' => /\A(?: [\x00-\x7f] |[\xa1-\xfe][\xa1-\xfe] |\x8e[\xa1-\xfe] |\x8f[\xa1-\xfe][\xa1-\xfe])/nx, 'Shift_JIS' => /\A(?: [\x00-\x7f] |[\x81-\x9f][\x40-\x7e\x80-\xfc] |[\xa1-\xdf] |[\xe0-\xfc][\x40-\x7e\x80-\xfc])/nx, 'UTF-8' => /\A(?: [\x00-\x7f] |[\xc0-\xdf][\x80-\xbf] |[\xe0-\xef][\x80-\xbf][\x80-\xbf] |[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf] |[\xf8-\xfb][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf] |[\xfc-\xfd][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf])/nx, 'ISO-8859-1' => /\A[\x00-\xff]/n }
- SubCharset =
{ 'ISO-2022-JP-2' => ['US-ASCII', 'ISO-2022-JP'], 'ISO-2022-JP-3' => ['US-ASCII', 'ISO-2022-JP'], 'UTF-16BE' => [], 'UTF-16LE' => [], 'UTF-16' => [], }
Class Method Summary collapse
-
.internal_charset ⇒ Object
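HTree::Encoder.internal_charset returns the MIME charset of the internal encoding: the name of Encoding.default_external when Encoding is defined, otherwise the charset corresponding to $KCODE.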
Instance Method Summary collapse
-
#finish ⇒ Object
Closes the converter, appends its final output to the buffer and returns the buffer.
- #finish_with_xmldecl ⇒ Object
- #html_output=(flag) ⇒ Object
-
#html_output? ⇒ Boolean
Returns the HTML output flag.
-
#initialize(output_encoding, internal_encoding = HTree::Encoder.internal_charset) ⇒ Encoder
constructor
A new instance of Encoder.
- #minimal_charset ⇒ Object
- #output_cdata_content(content, context) ⇒ Object
- #output_cdata_content_do(out, pre, body, post) ⇒ Object
- #output_cdata_for_html(*args) ⇒ Object
- #output_dynamic_attvalue(string) ⇒ Object
- #output_dynamic_text(string) ⇒ Object
- #output_slash_if_xml ⇒ Object
- #output_string(internal_str, external_str = @ic.iconv(internal_str)) ⇒ Object
- #output_text(string) ⇒ Object
Constructor Details
#initialize(output_encoding, internal_encoding = HTree::Encoder.internal_charset) ⇒ Encoder
Returns a new instance of Encoder.
22 23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/htree/encoder.rb', line 22
#
# Sets up an encoder that converts text from +internal_encoding+ to
# +output_encoding+ through Iconv.
#
# output_encoding::   charset name used as the Iconv conversion target and
#                     as the key into SubCharset.
# internal_encoding:: charset name of the strings handed to the encoder;
#                     defaults to HTree::Encoder.internal_charset.
#
# One extra converter is created for every sub-charset that SubCharset
# lists for the output encoding.  HTML output starts disabled.
def initialize(output_encoding, internal_encoding=HTree::Encoder.internal_charset)
  @buf = ''
  @internal_encoding = internal_encoding
  @output_encoding = output_encoding
  @html_output = false

  @ic = Iconv.new(output_encoding, internal_encoding)
  @charpat = FirstCharPattern[internal_encoding]

  # Candidate sub-charsets, each paired with its own converter.
  @subcharset_list = SubCharset[output_encoding] || []
  @subcharset_ic = {}
  @subcharset_list.each do |name|
    @subcharset_ic[name] = Iconv.new(name, internal_encoding)
  end
end
Class Method Details
.internal_charset ⇒ Object
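HTree::Encoder.internal_charset returns the MIME charset of the internal encoding. When Encoding is defined it is the name of Encoding.default_external; otherwise it is the charset corresponding to $KCODE: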
-
‘ISO-8859-1’ when $KCODE==‘NONE’
-
‘UTF-8’ when $KCODE==‘UTF8’
-
‘EUC-JP’ when $KCODE==‘EUC’
-
‘Shift_JIS’ when $KCODE==‘SJIS’
This mapping ignores EUC-KR and, at the very least, all single-byte charsets other than ISO-8859-1. This should be fixed once Ruby is m17nized.
14 15 16 17 18 19 20 |
# File 'lib/htree/encoder.rb', line 14
#
# Returns the charset name used for internal strings.
#
# When the Encoding constant is defined, this is the name of
# Encoding.default_external; otherwise it is the KcodeCharset entry
# for the current $KCODE.
def Encoder.internal_charset
  return Encoding.default_external.name if Object.const_defined?(:Encoding)

  KcodeCharset[$KCODE]
end
Instance Method Details
#finish ⇒ Object
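Closes the main converter, appends the string it returns to the output buffer, and returns the buffer. Sub-charset converters whose closing output differs from it (or which fail) are discarded.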
147 148 149 150 151 152 153 154 155 156 157 158 |
# File 'lib/htree/encoder.rb', line 147
#
# Closes the main converter, appends whatever it returns to the buffer and
# returns the buffer.
#
# Every sub-charset converter is closed as well; a sub-charset is dropped
# from the candidates when its closing output differs from the main
# converter's or when closing it raises Iconv::Failure.
def finish
  tail = @ic.close
  @buf << tail
  @subcharset_ic.reject! do |_name, converter|
    begin
      converter.close != tail
    rescue Iconv::Failure
      true
    end
  end
  @buf
end
#finish_with_xmldecl ⇒ Object
160 161 162 163 164 165 |
# File 'lib/htree/encoder.rb', line 160
#
# Finishes the output and returns it prefixed with an XML declaration.
#
# The declaration names the charset reported by minimal_charset and is
# itself converted from US-ASCII to the output encoding.
def finish_with_xmldecl
  # finish must run first: it prunes the sub-charset candidates that
  # minimal_charset consults.
  body = finish
  declaration = "<?xml version=\"1.0\" encoding=\"#{minimal_charset}\"?>"
  Iconv.conv(@output_encoding, 'US-ASCII', declaration) + body
end
#html_output=(flag) ⇒ Object
41 42 43 |
# File 'lib/htree/encoder.rb', line 41
#
# Stores +flag+ as the HTML output flag.  The other output methods read
# this flag to choose between their HTML and non-HTML branches.
def html_output=(flag)
  @html_output = flag
end
#html_output? ⇒ Boolean
Returns the HTML output flag as last set through html_output= (false for a new encoder).
37 38 39 |
# File 'lib/htree/encoder.rb', line 37
#
# Returns the current HTML output flag exactly as it was stored.
def html_output?
  @html_output
end
#minimal_charset ⇒ Object
167 168 169 170 171 172 173 174 |
# File 'lib/htree/encoder.rb', line 167
#
# Returns the first sub-charset, in @subcharset_list order, that still has
# a converter registered in @subcharset_ic.  Falls back to the output
# encoding when no sub-charset remains.
def minimal_charset
  @subcharset_list.find { |name| @subcharset_ic.key?(name) } || @output_encoding
end
#output_cdata_content(content, context) ⇒ Object
62 63 64 65 66 67 68 69 70 71 |
# File 'lib/htree/encoder.rb', line 62
#
# Writes the content of a CDATA element.
#
# Without HTML output, every node in +content+ is asked to output itself
# with this encoder and +context+.  With HTML output, only the HTree::Text
# nodes are kept; they are concatenated and written via output_cdata.
def output_cdata_content(content, context)
  return content.each { |node| node.output(self, context) } unless @html_output

  # xxx: should raise an error for non-text node?
  text_nodes = content.grep(HTree::Text)
  HTree::Text.concat(*text_nodes).output_cdata(self)
end
#output_cdata_content_do(out, pre, body, post) ⇒ Object
45 46 47 48 49 50 51 52 53 54 |
# File 'lib/htree/encoder.rb', line 45
#
# Runs +body+, wrapped by +pre+ and +post+ when HTML output is enabled.
#
# out::  value passed to +post+ and returned unchanged.
# pre::  callable invoked without arguments before +body+ (HTML only).
# body:: callable invoked without arguments in both modes.
# post:: callable invoked with +out+ after +body+ (HTML only).
def output_cdata_content_do(out, pre, body, post)
  # Read the flag once so both wrappers follow the same decision.
  html = @html_output
  pre.call if html
  body.call
  post.call(out) if html
  out
end
#output_cdata_for_html(*args) ⇒ Object
73 74 75 76 77 78 79 |
# File 'lib/htree/encoder.rb', line 73
#
# Joins +args+ into one string and writes it with output_string.
#
# Raises ArgumentError when the joined string contains '</'.
def output_cdata_for_html(*args)
  text = args.join('')
  raise ArgumentError, "cdata contains '</' : #{text.inspect}" if text =~ %r{</}

  output_string text
end
#output_dynamic_attvalue(string) ⇒ Object
137 138 139 140 141 142 143 |
# File 'lib/htree/encoder.rb', line 137
#
# Escapes +string+ for use as an attribute value and writes it with
# output_text.
#
# If +string+ responds to +rcdata+, that value is used and only <, > and "
# are replaced through ChRef (& is left alone).  Otherwise +string.to_s+
# is used and & is replaced as well.
def output_dynamic_attvalue(string)
  if string.respond_to?(:rcdata)
    source, unsafe = string.rcdata, /[<>"]/
  else
    source, unsafe = string.to_s, /[&<>"]/
  end
  output_text(source.gsub(unsafe) { |ch| ChRef[ch] })
end
#output_dynamic_text(string) ⇒ Object
129 130 131 132 133 134 135 |
# File 'lib/htree/encoder.rb', line 129
#
# Escapes +string+ for use as text content and writes it with output_text.
#
# If +string+ responds to +rcdata+, that value is used and only < and >
# are replaced through ChRef (& is left alone).  Otherwise +string.to_s+
# is used and & is replaced as well.
def output_dynamic_text(string)
  if string.respond_to?(:rcdata)
    source, unsafe = string.rcdata, /[<>]/
  else
    source, unsafe = string.to_s, /[&<>]/
  end
  output_text(source.gsub(unsafe) { |ch| ChRef[ch] })
end
#output_slash_if_xml ⇒ Object
56 57 58 59 60 |
# File 'lib/htree/encoder.rb', line 56
#
# Writes a single '/' with output_string unless HTML output is enabled.
def output_slash_if_xml
  output_string('/') unless @html_output
end
#output_string(internal_str, external_str = @ic.iconv(internal_str)) ⇒ Object
81 82 83 84 85 86 87 88 89 90 91 92 |
# File 'lib/htree/encoder.rb', line 81
#
# Appends +external_str+ to the output buffer and returns nil.
#
# internal_str:: the text in the internal encoding.
# external_str:: the same text in the output encoding; defaults to
#                converting +internal_str+ with the main converter.
#
# Each remaining sub-charset converter also converts +internal_str+; a
# sub-charset is dropped when its result differs from +external_str+ or
# when the conversion raises Iconv::Failure.
def output_string(internal_str, external_str=@ic.iconv(internal_str))
  # xxx: should be fixed Ruby itself
  # An empty buffer adopts the encoding of the first chunk appended to it.
  if @buf.empty? && @buf.respond_to?(:force_encoding)
    @buf.force_encoding(external_str.encoding)
  end
  @buf << external_str
  @subcharset_ic.reject! do |_name, converter|
    begin
      converter.iconv(internal_str) != external_str
    rescue Iconv::Failure
      true
    end
  end
  nil
end
#output_text(string) ⇒ Object
94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
# File 'lib/htree/encoder.rb', line 94
#
# Converts +string+ to the output encoding and writes it.
#
# When the converter raises Iconv::IllegalSequence or
# Iconv::InvalidCharacter, the part that did convert is written first.
# The first character of the failed remainder is then written as a numeric
# character reference (&#N;), where N comes from converting that character
# to UTF-8, or as '?' when that conversion fails too.  If no first
# character can be extracted, '?' is written and one element of the
# remainder is skipped.  The method then retries with what is left.
def output_text(string)
  output_string string, @ic.iconv(string)
rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => e
  unconverted = e.failed
  # Emit the prefix that converted, paired with its already-encoded form.
  output_string string[0, string.length - unconverted.length], e.success
  if /\A./m =~ unconverted
    char = $&
    rest = $'
    begin
      ucode = Iconv.conv("UTF-8", @internal_encoding, char).unpack("U")[0]
      char = "&##{ucode};"
    rescue Iconv::IllegalSequence, Iconv::InvalidCharacter
      # xxx: should be configurable?
      char = '?'
    end
    output_string char
    string = rest
  else
    # xxx: should be configurable?
    #raise ArgumentError, "cannot extract first character: #{e.failed.dump}"
    string = unconverted[1, unconverted.length-1]
    output_string '?'
  end
  # Re-run the method body with the remaining text.
  retry
end