Module: Builder::XChar

Defined in:
lib/builder/xchar.rb,
lib/builder/xchar.rb

Overview

:nodoc:

Constant Summary collapse

# Windows-1252 (CP1252) code points in the 0x80..0x9F block, keyed to the
# Unicode code point of the character each one stands for. Code points with
# no entry in this table (0x81, 0x8D, 0x8F, 0x90, 0x9D) are left unmapped.
CP1252 =
{ # :nodoc:
  0x80 => 0x20AC, # euro sign
  0x82 => 0x201A, # single low-9 quotation mark
  0x83 => 0x0192, # latin small letter f with hook
  0x84 => 0x201E, # double low-9 quotation mark
  0x85 => 0x2026, # horizontal ellipsis
  0x86 => 0x2020, # dagger
  0x87 => 0x2021, # double dagger
  0x88 => 0x02C6, # modifier letter circumflex accent
  0x89 => 0x2030, # per mille sign
  0x8A => 0x0160, # latin capital letter s with caron
  0x8B => 0x2039, # single left-pointing angle quotation mark
  0x8C => 0x0152, # latin capital ligature oe
  0x8E => 0x017D, # latin capital letter z with caron
  0x91 => 0x2018, # left single quotation mark
  0x92 => 0x2019, # right single quotation mark
  0x93 => 0x201C, # left double quotation mark
  0x94 => 0x201D, # right double quotation mark
  0x95 => 0x2022, # bullet
  0x96 => 0x2013, # en dash
  0x97 => 0x2014, # em dash
  0x98 => 0x02DC, # small tilde
  0x99 => 0x2122, # trade mark sign
  0x9A => 0x0161, # latin small letter s with caron
  0x9B => 0x203A, # single right-pointing angle quotation mark
  0x9C => 0x0153, # latin small ligature oe
  0x9E => 0x017E, # latin small letter z with caron
  0x9F => 0x0178, # latin capital letter y with diaeresis
}
# Code points of the characters that must be escaped in XML text, keyed to
# the entity reference that replaces each one. The values have to be the
# entity references (not the raw characters): XChar.encode substitutes each
# matched character with PREDEFINED[char.ord], so mapping a character to
# itself would leave the markup characters unescaped.
PREDEFINED =
{
  38 => '&amp;',		# ampersand
  60 => '&lt;',		# left angle bracket
  62 => '&gt;',		# right angle bracket
}
# Code points accepted as XML characters: three individual control
# characters followed by three inclusive ranges. INVALID_XML_CHAR is built
# from this list, so each entry must be either an Integer or a Range.
VALID =
[
  0x9,                # tab
  0xA,                # line feed
  0xD,                # carriage return
  0x20..0xD7FF,
  0xE000..0xFFFD,
  0x10000..0x10FFFF
]
# Character substituted for anything that is not a valid XML character.
# Chosen once, at load time:
# * U+FFFD when String responds to #encode;
# * the UTF-8 byte sequence for U+FFFD when $KCODE is 'UTF8';
# * a plain asterisk otherwise.
REPLACEMENT_CHAR =
case
when String.method_defined?(:encode) then "\uFFFD"
when $KCODE == 'UTF8'                then "\xEF\xBF\xBD"
else                                      '*'
end
# Character-class regexp matching any single character whose code point is a
# key of PREDEFINED.
XML_PREDEFINED =
Regexp.new(
  "[#{Builder::XChar::PREDEFINED.keys.pack('U*').force_encoding('utf-8')}]"
)
# Negated character-class regexp matching any single character that is NOT
# listed in VALID. Each VALID entry contributes either one literal character
# (Integer entries) or a "first-last" span (Range entries) to the class.
#
# Integer is matched rather than Fixnum: Fixnum was deprecated in Ruby 2.4
# and removed in Ruby 3.2, where referencing it raises NameError. Integer
# matches the same values on every Ruby version.
INVALID_XML_CHAR =
Regexp.new('[^'+
  Builder::XChar::VALID.map { |item|
    case item
    when Integer
      [item].pack('U').force_encoding('utf-8')
    when Range
      # first code point, a literal '-', last code point => "a-z" style span
      [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
    end
  }.join +
']')
# Encoding objects that XChar.unicode compares against and converts between,
# looked up once so they are not resolved on every call.
ENCODING_BINARY = Encoding::BINARY
ENCODING_UTF8   = Encoding::UTF_8
ENCODING_ISO1   = Encoding::ISO_8859_1

Class Method Summary collapse

Class Method Details

.encode(string) ⇒ Object

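Encode a string per XML rules: convert it to UTF-8, translate Windows-1252 code points to their Unicode equivalents, replace characters that are invalid in XML, and substitute the predefined markup characters.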



151
152
153
154
155
156
# File 'lib/builder/xchar.rb', line 151

# Encode +string+ for inclusion in XML and return the result.
#
# The steps run in this order:
# 1. normalise the input to UTF-8 via XChar.unicode;
# 2. translate each character in CP1252_DIFFERENCES to the character at the
#    same position in UNICODE_EQUIVALENT;
# 3. replace every character matched by INVALID_XML_CHAR with
#    REPLACEMENT_CHAR;
# 4. replace every character matched by XML_PREDEFINED with the PREDEFINED
#    entry for its code point.
def XChar.encode(string)
  text = unicode(string)
  text = text.tr(CP1252_DIFFERENCES, UNICODE_EQUIVALENT)
  text = text.gsub(INVALID_XML_CHAR, REPLACEMENT_CHAR)
  text.gsub(XML_PREDEFINED) { |char| PREDEFINED[char.ord] }
end

.unicode(string) ⇒ Object

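Convert a string to valid UTF-8, compensating for a number of common errors: input tagged as binary, or tagged as UTF-8 but containing invalid byte sequences, is reinterpreted as ISO-8859-1 when it is not valid UTF-8.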



125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# File 'lib/builder/xchar.rb', line 125

# Return +string+ as UTF-8 text, guessing the real encoding when the tagged
# one is missing or wrong.
#
# * Binary-tagged input: returned untouched when it is ASCII-only; otherwise
#   a copy is re-tagged as UTF-8 and returned if that is valid, else the
#   bytes are transcoded from ISO-8859-1 to UTF-8.
# * UTF-8-tagged input: returned untouched when valid, else the bytes are
#   transcoded from ISO-8859-1 to UTF-8.
# * Anything else: transcoded from its own encoding to UTF-8.
#
# The argument itself is never modified; the binary path works on a clone.
def XChar.unicode(string)
  tagged_as = string.encoding

  if tagged_as == ENCODING_BINARY
    return string if string.ascii_only?

    # Re-tag a copy so the caller's string keeps its original encoding.
    candidate = string.clone.force_encoding(ENCODING_UTF8)
    return candidate if candidate.valid_encoding?

    return candidate.encode(ENCODING_UTF8, ENCODING_ISO1)
  end

  return string.encode(ENCODING_UTF8) unless tagged_as == ENCODING_UTF8
  return string if string.valid_encoding?

  # Tagged UTF-8 but not valid: treat the bytes as ISO-8859-1 instead.
  string.encode(ENCODING_UTF8, ENCODING_ISO1)
end