Module: XMLUtil

Defined in:
lib/xml_util.rb

Overview

This module provides utility methods for working with XML.

Copyright 2008 R. Mark Volkmann

This file is part of WAX.

WAX is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

WAX is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License along with WAX. If not, see www.gnu.org/licenses.

R. Mark Volkmann, Object Computing, Inc.

Constant Summary collapse

DEFAULT_ENCODING =

The default encoding used in XML declarations.

"UTF-8"
BASE_CHAR_PATTERN =

The following regular expressions were taken from the W3C XML Recommendation:

"[\u0041-\u005A]|[\u0061-\u007A]|[\u00C0-\u00D6]|[\u00D8-\u00F6]|" +
"[\u00F8-\u00FF]|[\u0100-\u0131]|[\u0134-\u013E]|[\u0141-\u0148]|" +
"[\u014A-\u017E]|[\u0180-\u01C3]|[\u01CD-\u01F0]|[\u01F4-\u01F5]|" +
"[\u01FA-\u0217]|[\u0250-\u02A8]|[\u02BB-\u02C1]|\u0386|" +
"[\u0388-\u038A]|\u038C|[\u038E-\u03A1]|[\u03A3-\u03CE]|" +
"[\u03D0-\u03D6]|\u03DA|\u03DC|\u03DE|\u03E0|[\u03E2-\u03F3]|" +
"[\u0401-\u040C]|[\u040E-\u044F]|[\u0451-\u045C]|[\u045E-\u0481]|" +
"[\u0490-\u04C4]|[\u04C7-\u04C8]|[\u04CB-\u04CC]|[\u04D0-\u04EB]|" +
"[\u04EE-\u04F5]|[\u04F8-\u04F9]|[\u0531-\u0556]|\u0559|" +
"[\u0561-\u0586]|[\u05D0-\u05EA]|[\u05F0-\u05F2]|[\u0621-\u063A]|" +
"[\u0641-\u064A]|[\u0671-\u06B7]|[\u06BA-\u06BE]|[\u06C0-\u06CE]|" +
"[\u06D0-\u06D3]|\u06D5|[\u06E5-\u06E6]|[\u0905-\u0939]|" +
"\u093D|[\u0958-\u0961]|[\u0985-\u098C]|[\u098F-\u0990]|" +
"[\u0993-\u09A8]|[\u09AA-\u09B0]|\u09B2|[\u09B6-\u09B9]|" +
"[\u09DC-\u09DD]|[\u09DF-\u09E1]|[\u09F0-\u09F1]|[\u0A05-\u0A0A]|" +
"[\u0A0F-\u0A10]|[\u0A13-\u0A28]|[\u0A2A-\u0A30]|[\u0A32-\u0A33]|" +
"[\u0A35-\u0A36]|[\u0A38-\u0A39]|[\u0A59-\u0A5C]|\u0A5E|" +
"[\u0A72-\u0A74]|[\u0A85-\u0A8B]|\u0A8D|[\u0A8F-\u0A91]|" +
"[\u0A93-\u0AA8]|[\u0AAA-\u0AB0]|[\u0AB2-\u0AB3]|[\u0AB5-\u0AB9]|" +
"\u0ABD|\u0AE0|[\u0B05-\u0B0C]|[\u0B0F-\u0B10]|[\u0B13-\u0B28]|" +
"[\u0B2A-\u0B30]|[\u0B32-\u0B33]|[\u0B36-\u0B39]|\u0B3D|" +
"[\u0B5C-\u0B5D]|[\u0B5F-\u0B61]|[\u0B85-\u0B8A]|[\u0B8E-\u0B90]|" +
"[\u0B92-\u0B95]|[\u0B99-\u0B9A]|\u0B9C|[\u0B9E-\u0B9F]|" +
"[\u0BA3-\u0BA4]|[\u0BA8-\u0BAA]|[\u0BAE-\u0BB5]|[\u0BB7-\u0BB9]|" +
"[\u0C05-\u0C0C]|[\u0C0E-\u0C10]|[\u0C12-\u0C28]|[\u0C2A-\u0C33]|" +
"[\u0C35-\u0C39]|[\u0C60-\u0C61]|[\u0C85-\u0C8C]|[\u0C8E-\u0C90]|" +
"[\u0C92-\u0CA8]|[\u0CAA-\u0CB3]|[\u0CB5-\u0CB9]|\u0CDE|" +
"[\u0CE0-\u0CE1]|[\u0D05-\u0D0C]|[\u0D0E-\u0D10]|[\u0D12-\u0D28]|" +
"[\u0D2A-\u0D39]|[\u0D60-\u0D61]|[\u0E01-\u0E2E]|\u0E30|" +
"[\u0E32-\u0E33]|[\u0E40-\u0E45]|[\u0E81-\u0E82]|\u0E84|" +
"[\u0E87-\u0E88]|\u0E8A|\u0E8D|[\u0E94-\u0E97]|[\u0E99-\u0E9F]|" +
"[\u0EA1-\u0EA3]|\u0EA5|\u0EA7|[\u0EAA-\u0EAB]|[\u0EAD-\u0EAE]|" +
"\u0EB0|[\u0EB2-\u0EB3]|\u0EBD|[\u0EC0-\u0EC4]|[\u0F40-\u0F47]|" +
"[\u0F49-\u0F69]|[\u10A0-\u10C5]|[\u10D0-\u10F6]|\u1100|" +
"[\u1102-\u1103]|[\u1105-\u1107]|\u1109|[\u110B-\u110C]|" +
"[\u110E-\u1112]|\u113C|\u113E|\u1140|\u114C|\u114E|\u1150|" +
"[\u1154-\u1155]|\u1159|[\u115F-\u1161]|\u1163|\u1165|\u1167|" +
"\u1169|[\u116D-\u116E]|[\u1172-\u1173]|\u1175|\u119E|\u11A8|" +
"\u11AB|[\u11AE-\u11AF]|[\u11B7-\u11B8]|\u11BA|[\u11BC-\u11C2]|" +
"\u11EB|\u11F0|\u11F9|[\u1E00-\u1E9B]|[\u1EA0-\u1EF9]|" +
"[\u1F00-\u1F15]|[\u1F18-\u1F1D]|[\u1F20-\u1F45]|[\u1F48-\u1F4D]|" +
"[\u1F50-\u1F57]|\u1F59|\u1F5B|\u1F5D|[\u1F5F-\u1F7D]|" +
"[\u1F80-\u1FB4]|[\u1FB6-\u1FBC]|\u1FBE|[\u1FC2-\u1FC4]|" +
"[\u1FC6-\u1FCC]|[\u1FD0-\u1FD3]|[\u1FD6-\u1FDB]|[\u1FE0-\u1FEC]|" +
"[\u1FF2-\u1FF4]|[\u1FF6-\u1FFC]|\u2126|[\u212A-\u212B]|\u212E|" +
"[\u2180-\u2182]|[\u3041-\u3094]|[\u30A1-\u30FA]|[\u3105-\u312C]|" +
"[\uAC00-\uD7A3]"
# Regular-expression source, as a String of "|"-separated alternatives, listing
# the single characters and character ranges treated as combining characters.
# It is spliced into NAME_CHAR_PATTERN rather than used as a Regexp directly.
# NOTE(review): the ranges appear to follow the CombiningChar production of the
# W3C XML 1.0 Recommendation -- confirm against the spec before editing.
COMBINING_CHAR_PATTERN =
"[\u0300-\u0345]|[\u0360-\u0361]|[\u0483-\u0486]|[\u0591-\u05A1]|" +
"[\u05A3-\u05B9]|[\u05BB-\u05BD]|\u05BF|[\u05C1-\u05C2]|\u05C4|" +
"[\u064B-\u0652]|\u0670|[\u06D6-\u06DC]|[\u06DD-\u06DF]|" +
"[\u06E0-\u06E4]|[\u06E7-\u06E8]|[\u06EA-\u06ED]|[\u0901-\u0903]|" +
"\u093C|[\u093E-\u094C]|\u094D|[\u0951-\u0954]|[\u0962-\u0963]|" +
"[\u0981-\u0983]|\u09BC|\u09BE|\u09BF|[\u09C0-\u09C4]|" +
"[\u09C7-\u09C8]|[\u09CB-\u09CD]|\u09D7|[\u09E2-\u09E3]|\u0A02|" +
"\u0A3C|\u0A3E|\u0A3F|[\u0A40-\u0A42]|[\u0A47-\u0A48]|" +
"[\u0A4B-\u0A4D]|[\u0A70-\u0A71]|[\u0A81-\u0A83]|\u0ABC|" +
"[\u0ABE-\u0AC5]|[\u0AC7-\u0AC9]|[\u0ACB-\u0ACD]|[\u0B01-\u0B03]|" +
"\u0B3C|[\u0B3E-\u0B43]|[\u0B47-\u0B48]|[\u0B4B-\u0B4D]|" +
"[\u0B56-\u0B57]|[\u0B82-\u0B83]|[\u0BBE-\u0BC2]|[\u0BC6-\u0BC8]|" +
"[\u0BCA-\u0BCD]|\u0BD7|[\u0C01-\u0C03]|[\u0C3E-\u0C44]|" +
"[\u0C46-\u0C48]|[\u0C4A-\u0C4D]|[\u0C55-\u0C56]|[\u0C82-\u0C83]|" +
"[\u0CBE-\u0CC4]|[\u0CC6-\u0CC8]|[\u0CCA-\u0CCD]|[\u0CD5-\u0CD6]|" +
"[\u0D02-\u0D03]|[\u0D3E-\u0D43]|[\u0D46-\u0D48]|[\u0D4A-\u0D4D]|" +
"\u0D57|\u0E31|[\u0E34-\u0E3A]|[\u0E47-\u0E4E]|\u0EB1|" +
"[\u0EB4-\u0EB9]|[\u0EBB-\u0EBC]|[\u0EC8-\u0ECD]|[\u0F18-\u0F19]|" +
"\u0F35|\u0F37|\u0F39|\u0F3E|\u0F3F|[\u0F71-\u0F84]|" +
"[\u0F86-\u0F8B]|[\u0F90-\u0F95]|\u0F97|[\u0F99-\u0FAD]|" +
"[\u0FB1-\u0FB7]|\u0FB9|[\u20D0-\u20DC]|\u20E1|[\u302A-\u302F]|" +
"\u3099|\u309A"
# Regular-expression source (a String of "|"-separated character ranges)
# covering the digit ranges listed below, starting with ASCII 0-9.
# It is spliced into NAME_CHAR_PATTERN.
DIGIT_PATTERN = [
  "[\u0030-\u0039]", "[\u0660-\u0669]", "[\u06F0-\u06F9]",
  "[\u0966-\u096F]", "[\u09E6-\u09EF]", "[\u0A66-\u0A6F]",
  "[\u0AE6-\u0AEF]", "[\u0B66-\u0B6F]", "[\u0BE7-\u0BEF]",
  "[\u0C66-\u0C6F]", "[\u0CE6-\u0CEF]", "[\u0D66-\u0D6F]",
  "[\u0E50-\u0E59]", "[\u0ED0-\u0ED9]", "[\u0F20-\u0F29]"
].join("|")
# Regular-expression source (a String of "|"-separated alternatives) listing
# the "extender" characters and ranges. It is spliced into NAME_CHAR_PATTERN.
EXTENDER_PATTERN = [
  "\u00B7", "\u02D0", "\u02D1", "\u0387",
  "\u0640", "\u0E46", "\u0EC6", "\u3005",
  "[\u3031-\u3035]", "[\u309D-\u309E]", "[\u30FC-\u30FE]"
].join("|")
# Regular-expression source (a String of "|"-separated alternatives) for the
# ideographic characters and ranges. Combined with BASE_CHAR_PATTERN to form
# LETTER_PATTERN.
IDEOGRAPHIC_PATTERN = ["[\u4E00-\u9FA5]", "\u3007", "[\u3021-\u3029]"].join("|")
# Regular-expression source matching one "letter": any alternative from
# BASE_CHAR_PATTERN or from IDEOGRAPHIC_PATTERN.
LETTER_PATTERN = [BASE_CHAR_PATTERN, IDEOGRAPHIC_PATTERN].join("|")
# Regular-expression source matching one character allowed inside a name:
# a letter, a digit, one of the punctuation characters . - _ :, a combining
# character, or an extender.
#
# The punctuation alternatives are written as plain regex atoms. Wrapping them
# in apostrophes (as in the grammar notation '.' | '-' | '_' | ':') would make
# the regex require literal apostrophes, and an unescaped "." would match any
# character, so the dot is escaped here.
NAME_CHAR_PATTERN =
LETTER_PATTERN + "|" +
DIGIT_PATTERN + "|" +
"\\.|-|_|:|" +
COMBINING_CHAR_PATTERN + "|" +
EXTENDER_PATTERN
LATIN_NAME_PATTERN =

Element and attribute names must be name tokens. This is a regular expression used to determine whether a given string is a valid XML “name token” using only Latin characters.

/^[A-Za-z][A-Za-z0-9\-_\.]*$/
FULL_NAME_PATTERN =

Element and attribute names must be name tokens. This is a regular expression used to determine whether a given string is a valid XML “name token” using any valid Unicode characters.

/^(#{LETTER_PATTERN}|'_')(#{NAME_CHAR_PATTERN})*$/
# Namespace URI of the XML Schema instance vocabulary (conventionally bound to
# the "xsi" prefix, e.g. for xsi:schemaLocation). The W3C XML Schema
# Recommendation defines this as the 2001 URI; the 1999 URI came from an
# obsolete working draft and is not recognized by schema-aware processors.
XMLSCHEMA_INSTANCE_NS =
"http://www.w3.org/2001/XMLSchema-instance"

Class Method Summary collapse

Class Method Details

.escape(text) ⇒ Object

Escapes special characters in XML text.



136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/xml_util.rb', line 136

# Escapes the five characters that have predefined XML entities
# (<, >, ', ", &) in the given text.
#
# Non-String arguments (nil, numbers, ...) are returned unchanged.
# For a String, a new String is returned; the argument is not modified.
#
# The replacement works on characters rather than bytes, so multibyte
# (e.g. UTF-8) text passes through intact. Comparing the Integers yielded by
# each_byte against '<'[0] only works on Ruby 1.8, where String#[] returned a
# byte value; on later Rubies that comparison never matches.
def self.escape(text)
  return text unless text.kind_of?(String)

  text.gsub(/[<>'"&]/) do |char|
    case char
    when "<" then "&lt;"
    when ">" then "&gt;"
    when "'" then "&apos;"
    when '"' then "&quot;"
    else "&amp;" # the only remaining character the pattern can match is "&"
    end
  end
end

.is_comment(text) ⇒ Object

Determines whether given text is a valid comment.



161
162
163
# File 'lib/xml_util.rb', line 161

# Reports whether the given text may be used as the body of an XML comment.
# The only rule checked is that the text contains no double hyphen ("--").
# Returns true or false.
def self.is_comment(text)
  double_hyphen_index = /--/ =~ text
  double_hyphen_index.nil?
end

.is_name(text) ⇒ Object

Determines whether given text is a name token.



166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# File 'lib/xml_util.rb', line 166

# Reports whether the given text is acceptable as an XML name.
#
# Returns false for nil, false for any name beginning with "xml" in any
# letter case (such names are reserved), and otherwise true only when the
# text matches LATIN_NAME_PATTERN.
#
# Names that need non-Latin characters are currently rejected: matching
# against FULL_NAME_PATTERN was disabled because Ruby 1.8 regular expressions
# do not support Unicode, and it has not been re-enabled here.
def self.is_name(text)
  return false if text.nil?

  # The reserved-prefix check must anchor at the START of the text.
  # (A "$" before "xml" anchors at end of line and can never match.)
  return false if text.downcase =~ /\Axml/

  # Only the simpler Latin-only pattern is applied; see the note above.
  !(LATIN_NAME_PATTERN =~ text).nil?
end

.is_uri(text) ⇒ Object

Determines whether given text is a URI.



189
190
191
192
193
194
195
196
# File 'lib/xml_util.rb', line 189

# Reports whether the given text can be parsed as a URI.
# Returns true when URI.parse accepts the text and false when it raises
# URI::InvalidURIError; any other exception propagates to the caller.
def self.is_uri(text)
  URI.parse(text)
  true
rescue URI::InvalidURIError
  false
end

.is_version(text) ⇒ Object

Determines whether given text is a valid XML version.



199
200
201
# File 'lib/xml_util.rb', line 199

# Reports whether the given text is one of the XML version strings this
# module accepts: "1.0", "1.1" or "1.2". Returns true or false.
def self.is_version(text)
  case text
  when "1.0", "1.1", "1.2" then true
  else false
  end
end

.verify_comment(text) ⇒ Object

Verifies that the given text is a valid comment and raises an ArgumentError if it isn’t.



205
206
207
208
209
# File 'lib/xml_util.rb', line 205

# Raises ArgumentError when is_comment rejects the given text;
# otherwise does nothing and returns nil.
def self.verify_comment(text)
  return nil if is_comment(text)

  raise ArgumentError, "\"#{text}\" is an invalid comment"
end

.verify_name(text) ⇒ Object

Verifies that the given text is a valid name token and raises an ArgumentError if it isn’t.



213
214
215
216
217
# File 'lib/xml_util.rb', line 213

# Raises ArgumentError when is_name rejects the given text;
# otherwise does nothing and returns nil.
def self.verify_name(text)
  return nil if is_name(text)

  raise ArgumentError, "\"#{text}\" is an invalid NMTOKEN"
end

.verify_uri(text) ⇒ Object

Verifies that the given text is a valid URI and raises an ArgumentError if it isn’t.



221
222
223
224
225
# File 'lib/xml_util.rb', line 221

# Raises ArgumentError when is_uri rejects the given text;
# otherwise does nothing and returns nil.
def self.verify_uri(text)
  return nil if is_uri(text)

  raise ArgumentError, "\"#{text}\" is an invalid URI"
end

.verify_version(text) ⇒ Object

Verifies that the given text is a valid XML version and raises an ArgumentError if it isn’t.



229
230
231
232
233
# File 'lib/xml_util.rb', line 229

# Raises ArgumentError when is_version rejects the given text;
# otherwise does nothing and returns nil.
def self.verify_version(text)
  return nil if is_version(text)

  raise ArgumentError, "\"#{text}\" is an invalid XML version"
end