Class: Nokogiri::XML::Document
- Defined in:
- lib/nokogiri/xml/document.rb,
ext/nokogiri/xml_document.c,
ext/nokogiri/html_document.c
Overview
Nokogiri::XML::Document is the main entry point for dealing with XML documents. The Document is created by parsing an XML document. See Nokogiri.XML()
For searching a Document, see Nokogiri::XML::Node#css and Nokogiri::XML::Node#xpath
Direct Known Subclasses
Constant Summary
Constants inherited from Node
Node::ATTRIBUTE_DECL, Node::ATTRIBUTE_NODE, Node::CDATA_SECTION_NODE, Node::COMMENT_NODE, Node::DOCB_DOCUMENT_NODE, Node::DOCUMENT_FRAG_NODE, Node::DOCUMENT_NODE, Node::DOCUMENT_TYPE_NODE, Node::DTD_NODE, Node::ELEMENT_DECL, Node::ELEMENT_NODE, Node::ENTITY_DECL, Node::ENTITY_NODE, Node::ENTITY_REF_NODE, Node::HTML_DOCUMENT_NODE, Node::NAMESPACE_DECL, Node::NOTATION_NODE, Node::PI_NODE, Node::TEXT_NODE, Node::XINCLUDE_END, Node::XINCLUDE_START
Instance Attribute Summary collapse
-
#errors ⇒ Object
A list of Nokogiri::XML::SyntaxError found when parsing a document.
Class Method Summary collapse
-
.new(version = default) ⇒ Object
Create a new document with
version
(defaults to “1.0”). -
.parse(string_or_io, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML) {|options| ... } ⇒ Object
Parse an XML file.
-
.read_io(io, url, encoding, options) ⇒ Object
Create a new document from an IO object.
-
.read_memory(string, url, encoding, options) ⇒ Object
Create a new document from a String.
-
.wrap(document) ⇒ Object
JRuby
Wraps Java’s org.w3c.dom.document and returns Nokogiri::XML::Document.
Instance Method Summary collapse
- #add_child(child) ⇒ Object (also: #<<)
-
#collect_namespaces ⇒ Object
Recursively get all namespaces from this node and its subtree and return them as a hash.
-
#create_cdata(text) ⇒ Object
Create a CDATA element containing
text
. -
#create_element(name, *args, &block) ⇒ Object
Create an element with
name
, and optionally setting the content and attributes. -
#create_entity(name, type, external_id, system_id, content) ⇒ Object
Create a new entity named
name
. -
#create_text_node(text, &block) ⇒ Object
Create a text node with
text
. -
#decorate(node) ⇒ Object
Apply any decorators to
node
. -
#decorators(key) ⇒ Object
Get the list of decorators given
key
. -
#document ⇒ Object
A reference to
self
. -
#dup ⇒ Object
(also: #clone)
Copy this Document.
-
#encoding ⇒ Object
Get the encoding for this Document.
-
#encoding=(encoding) ⇒ Object
Set the encoding string for this Document.
-
#fragment(tags = nil) ⇒ Object
Create a Nokogiri::XML::DocumentFragment from
tags
Returns an empty fragment if tags
is nil. -
#initialize(*args) ⇒ Document
constructor
:nodoc:.
-
#name ⇒ Object
The name of this document.
-
#namespaces ⇒ Object
Get the hash of namespaces on the root Nokogiri::XML::Node.
-
#remove_namespaces! ⇒ Object
Remove all namespaces from all nodes in the document.
-
#root ⇒ Object
Get the root node for this document.
-
#root= ⇒ Object
Set the root element on this document.
-
#slop! ⇒ Object
Explore a document with shortcut methods.
-
#to_java ⇒ Object
JRuby
Returns Java’s org.w3c.dom.document of this Document. -
#url ⇒ Object
Get the url name for this document.
-
#validate ⇒ Object
Validate this Document against its DTD.
-
#version ⇒ Object
Get the XML version for this Document.
Methods inherited from Node
#<=>, #==, #>, #[], #[]=, #accept, #add_namespace_definition, #add_next_sibling, #add_previous_sibling, #after, #ancestors, #at, #at_css, #at_xpath, #attribute, #attribute_nodes, #attribute_with_ns, #attributes, #before, #blank?, #cdata?, #child, #children, #children=, #comment?, #content, #content=, #create_external_subset, #create_internal_subset, #css, #css_path, #decorate!, #default_namespace=, #description, #each, #element?, #element_children, #encode_special_chars, #external_subset, #first_element_child, #fragment?, #html?, #inner_html, #inner_html=, #internal_subset, #key?, #keys, #last_element_child, #line, #matches?, #namespace, #namespace=, #namespace_definitions, #namespace_scopes, #namespaced_key?, #next_element, #next_sibling, #node_name, #node_name=, #node_type, #parent, #parent=, #parse, #path, #pointer_id, #previous_element, #previous_sibling, #read_only?, #remove_attribute, #replace, #search, #serialize, #swap, #text?, #to_html, #to_s, #to_xhtml, #traverse, #unlink, #values, #write_html_to, #write_to, #write_xhtml_to, #write_xml_to, #xml?, #xpath
Methods included from PP::Node
Constructor Details
#initialize(*args) ⇒ Document
:nodoc:
39 40 41 42 |
# File 'lib/nokogiri/xml/document.rb', line 39
# Set up the Ruby-side state of a document: an empty list of parse errors
# and no decorators. Any arguments passed in are accepted and ignored here.
def initialize *args # :nodoc:
  @errors     = []
  @decorators = nil
end
Instance Attribute Details
#errors ⇒ Object
A list of Nokogiri::XML::SyntaxError found when parsing a document
37 38 39 |
# File 'lib/nokogiri/xml/document.rb', line 37
# Reader for the @errors instance variable (the list of
# Nokogiri::XML::SyntaxError found when parsing a document).
def errors
  @errors
end
Class Method Details
.new(version = default) ⇒ Object
Create a new document with version
(defaults to “1.0”)
314 315 316 317 318 319 320 321 322 323 324 325 326 327 |
# File 'ext/nokogiri/xml_document.c', line 314
/*
 * Create a new, empty document. The first optional argument is the XML
 * version string; "1.0" is used when it is missing or nil. All arguments
 * are forwarded to the Ruby-level initializer of the wrapped object.
 */
static VALUE new(int argc, VALUE *argv, VALUE klass)
{
  xmlDocPtr doc;
  VALUE version, rest, rb_doc ;

  rb_scan_args(argc, argv, "0*", &rest);
  version = rb_ary_entry(rest, (long)0);
  if (NIL_P(version)) version = rb_str_new2("1.0");

  /* xmlNewDoc expects a C string, so use StringValueCStr (NUL-terminated,
   * rejects embedded NUL bytes) instead of StringValuePtr. */
  doc = xmlNewDoc((xmlChar *)StringValueCStr(version));
  rb_doc = Nokogiri_wrap_xml_document(klass, doc);
  rb_obj_call_init(rb_doc, argc, argv);
  return rb_doc ;
}
|
.parse(string_or_io, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML) {|options| ... } ⇒ Object
Parse an XML file. string_or_io
may be a String, or any object that responds to read and close such as an IO, or StringIO. url
is the resource where this document is located. encoding
is the encoding that should be used when processing the document. options
is a number that sets options in the parser, such as Nokogiri::XML::ParseOptions::RECOVER. See the constants in Nokogiri::XML::ParseOptions.
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/nokogiri/xml/document.rb', line 19
###
# Parse XML from +string_or_io+.
#
# When +options+ is an Integer bit mask it is wrapped in a
# Nokogiri::XML::ParseOptions; the options object is yielded to the block
# (if one is given) before parsing starts. Anything that responds to +read+
# is passed to read_io, using its +path+ as the url when no url was given.
# A nil or empty input returns a blank document from +new+. Everything else
# is passed to read_memory.
def self.parse string_or_io, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML, &block
  # Integer rather than Fixnum: Fixnum no longer exists on current Rubies.
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
  # Give the options to the user
  yield options if block_given?

  if string_or_io.respond_to?(:read)
    url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
    return read_io(string_or_io, url, encoding, options.to_i)
  end

  # read_memory pukes on empty docs
  return new if string_or_io.nil? || string_or_io.empty?

  read_memory(string_or_io, url, encoding, options.to_i)
end
.read_io(io, url, encoding, options) ⇒ Object
Create a new document from an IO object
196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 |
# File 'ext/nokogiri/xml_document.c', line 196
/*
 * Create a new document by reading from the Ruby IO object +io+.
 *
 * +url+ and +encoding+ may be nil (passed to libxml2 as NULL); +options+ is
 * converted with NUM2INT. While parsing, Nokogiri_error_array_pusher is
 * installed as the structured error handler with a fresh Ruby array as its
 * context; that array is stored in the document's @errors.
 *
 * Raises the wrapped libxml2 error when no document is produced, or a
 * RuntimeError when libxml2 recorded no error.
 */
static VALUE read_io( VALUE klass,
                      VALUE io,
                      VALUE url,
                      VALUE encoding,
                      VALUE options )
{
  /* These are consumed as C strings, so require NUL-terminated values. */
  const char * c_url    = NIL_P(url)      ? NULL : StringValueCStr(url);
  const char * c_enc    = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
  VALUE error_list      = rb_ary_new();
  VALUE document;
  xmlDocPtr doc;

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);

  doc = xmlReadIO(
      (xmlInputReadCallback)io_read_callback,
      (xmlInputCloseCallback)io_close_callback,
      (void *)io,
      c_url,
      c_enc,
      (int)NUM2INT(options)
  );
  xmlSetStructuredErrorFunc(NULL, NULL);

  if(doc == NULL) {
    /* Nothing to free here: there is no document. */
    xmlErrorPtr error = xmlGetLastError();
    if(error)
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error));
    else
      rb_raise(rb_eRuntimeError, "Could not parse document");

    return Qnil;
  }

  document = Nokogiri_wrap_xml_document(klass, doc);
  rb_iv_set(document, "@errors", error_list);
  return document;
}
|
.read_memory(string, url, encoding, options) ⇒ Object
Create a new document from a String
246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 |
# File 'ext/nokogiri/xml_document.c', line 246
/*
 * Create a new document by parsing the bytes of the Ruby String +string+.
 *
 * +url+ and +encoding+ may be nil (passed to libxml2 as NULL); +options+ is
 * converted with NUM2INT. While parsing, Nokogiri_error_array_pusher is
 * installed as the structured error handler with a fresh Ruby array as its
 * context; that array is stored in the document's @errors.
 *
 * Raises the wrapped libxml2 error when no document is produced, or a
 * RuntimeError when libxml2 recorded no error.
 */
static VALUE read_memory( VALUE klass,
                          VALUE string,
                          VALUE url,
                          VALUE encoding,
                          VALUE options )
{
  /* The buffer is passed together with its length, so a raw pointer is fine;
   * url and encoding are consumed as C strings and must be NUL-terminated. */
  const char * c_buffer = StringValuePtr(string);
  const char * c_url    = NIL_P(url)      ? NULL : StringValueCStr(url);
  const char * c_enc    = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
  int len               = (int)RSTRING_LEN(string);
  VALUE error_list      = rb_ary_new();
  VALUE document;
  xmlDocPtr doc;

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);
  doc = xmlReadMemory(c_buffer, len, c_url, c_enc, (int)NUM2INT(options));
  xmlSetStructuredErrorFunc(NULL, NULL);

  if(doc == NULL) {
    /* Nothing to free here: there is no document. */
    xmlErrorPtr error = xmlGetLastError();
    if(error)
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error));
    else
      rb_raise(rb_eRuntimeError, "Could not parse document");

    return Qnil;
  }

  document = Nokogiri_wrap_xml_document(klass, doc);
  rb_iv_set(document, "@errors", error_list);
  return document;
}
|
Instance Method Details
#add_child(child) ⇒ Object Also known as: <<
197 198 199 200 201 202 203 204 205 |
# File 'lib/nokogiri/xml/document.rb', line 197
# Add +child+ to the document via the inherited implementation.
#
# Raises a RuntimeError when the document already has a root node. When
# +child+ is a document fragment it may contain at most one child, and that
# child (not the fragment) is what gets added.
def add_child child
  raise "Document already has a root node" if root
  if child.type == Node::DOCUMENT_FRAG_NODE
    raise "Document cannot have multiple root nodes" if child.children.size > 1
    super(child.children.first)
  else
    super
  end
end
#collect_namespaces ⇒ Object
Recursively get all namespaces from this node and its subtree and return them as a hash.
For example, given this document:
<root xmlns:foo="bar">
<bar xmlns:hello="world" />
</root>
This method will return:
{ 'xmlns:foo' => 'bar', 'xmlns:hello' => 'world' }
WARNING: this method will clobber duplicate names in the keys. For example, given this document:
<root xmlns:foo="bar">
<bar xmlns:foo="baz" />
</root>
The hash returned will look like this: { 'xmlns:foo' => 'bar' }
Non-prefixed default namespaces (as in “xmlns=”) are not included in the hash.
Note that this is a very expensive operation in the current implementation, as it traverses the entire graph and also has to bring each node across the libxml bridge into a Ruby object.
123 124 125 126 127 |
# File 'lib/nokogiri/xml/document.rb', line 123
# Merge the +namespaces+ hash of every node visited by +traverse+ into a
# single hash and return it. Later merges overwrite earlier entries that
# share a key.
def collect_namespaces
  ns = {}
  traverse { |j| ns.merge!(j.namespaces) }
  ns
end
#create_cdata(text) ⇒ Object
Create a CDATA element containing text
80 81 82 |
# File 'lib/nokogiri/xml/document.rb', line 80
# Build a Nokogiri::XML::CDATA for this document; +text+ is converted with
# +to_s+ first.
def create_cdata text
  Nokogiri::XML::CDATA.new(self, text.to_s)
end
#create_element(name, *args, &block) ⇒ Object
Create an element with name
, and optionally setting the content and attributes.
doc.create_element "div" # <div></div>
doc.create_element "div", :class => "container" # <div class='container'></div>
doc.create_element "div", "contents" # <div>contents</div>
doc.create_element "div", "contents", :class => "container" # <div class='container'>contents</div>
doc.create_element("div") { |node| node['class'] = "container" } # <div class='container'></div>
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/nokogiri/xml/document.rb', line 53
# Create a Nokogiri::XML::Element called +name+ in this document; the block,
# if any, is forwarded to Element.new.
#
# Each extra argument is handled by type:
# * a Hash sets attributes (keys and values converted with +to_s+), except
#   that keys of the form "xmlns" or "xmlns:prefix" add a namespace
#   definition instead;
# * anything else becomes the element's content.
def create_element name, *args, &block
  elm = Nokogiri::XML::Element.new(name, self, &block)
  args.each do |arg|
    case arg
    when Hash
      arg.each { |k,v|
        key = k.to_s
        if key =~ /^xmlns(:\w+)?$/
          # "xmlns:foo" => prefix "foo"; plain "xmlns" => nil prefix
          ns_name = key.split(":", 2)[1]
          elm.add_namespace_definition ns_name, v
          next
        end
        elm[k.to_s] = v.to_s
      }
    else
      elm.content = arg
    end
  end
  elm
end
#create_entity(name, type, external_id, system_id, content) ⇒ Object
Create a new entity named name
.
type
is an integer representing the type of entity to be created, and it defaults to Nokogiri::XML::EntityDecl::INTERNAL_GENERAL. See the constants on Nokogiri::XML::EntityDecl for more information.
external_id
, system_id
, and content
set the External ID, System ID, and content respectively. All of these parameters are optional.
386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 |
# File 'ext/nokogiri/xml_document.c', line 386
/*
 * Create a new entity declaration on this document.
 *
 * Arguments (one required, four optional): name, type, external_id,
 * system_id, content. A nil type falls back to XML_INTERNAL_GENERAL_ENTITY;
 * nil strings are passed to libxml2 as NULL.
 *
 * Raises the wrapped libxml2 error when the entity cannot be added, or a
 * RuntimeError when libxml2 recorded no error.
 */
static VALUE create_entity(int argc, VALUE *argv, VALUE self)
{
  VALUE name;
  VALUE type;
  VALUE external_id;
  VALUE system_id;
  VALUE content;
  xmlEntityPtr ptr;
  xmlDocPtr doc ;

  Data_Get_Struct(self, xmlDoc, doc);

  rb_scan_args(argc, argv, "14", &name, &type, &external_id, &system_id,
               &content);

  xmlResetLastError();
  /* All string arguments are consumed as C strings, hence StringValueCStr
   * (NUL-terminated, rejects embedded NUL bytes). */
  ptr = xmlAddDocEntity(
      doc,
      (xmlChar *)(NIL_P(name)        ? NULL                        : StringValueCStr(name)),
      (int)      (NIL_P(type)        ? XML_INTERNAL_GENERAL_ENTITY : NUM2INT(type)),
      (xmlChar *)(NIL_P(external_id) ? NULL                        : StringValueCStr(external_id)),
      (xmlChar *)(NIL_P(system_id)   ? NULL                        : StringValueCStr(system_id)),
      (xmlChar *)(NIL_P(content)     ? NULL                        : StringValueCStr(content))
  );

  if(NULL == ptr) {
    xmlErrorPtr error = xmlGetLastError();
    if(error)
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error));
    else
      rb_raise(rb_eRuntimeError, "Could not create entity");

    return Qnil;
  }

  return Nokogiri_wrap_xml_node(cNokogiriXmlEntityDecl, (xmlNodePtr)ptr);
}
|
#create_text_node(text, &block) ⇒ Object
Create a text node with text
75 76 77 |
# File 'lib/nokogiri/xml/document.rb', line 75
# Build a Nokogiri::XML::Text for this document; +text+ is converted with
# +to_s+ and the block, if any, is forwarded to Text.new.
def create_text_node text, &block
  Nokogiri::XML::Text.new(text.to_s, self, &block)
end
#decorate(node) ⇒ Object
Apply any decorators to node
170 171 172 173 174 175 176 |
# File 'lib/nokogiri/xml/document.rb', line 170
# Extend +node+ with every module registered in @decorators under a class
# that +node+ is an instance of. Does nothing (returns nil) when no
# decorators have been registered.
def decorate node
  return unless @decorators
  @decorators.each { |klass,list|
    next unless node.is_a?(klass)
    list.each { |moodule| node.extend(moodule) }
  }
end
#decorators(key) ⇒ Object
Get the list of decorators given key
130 131 132 133 |
# File 'lib/nokogiri/xml/document.rb', line 130
# Return the list of decorators stored under +key+, creating the backing
# hash and an empty list on first use. The returned array is the stored
# one, so appending to it registers a decorator.
def decorators key
  @decorators ||= Hash.new
  @decorators[key] ||= []
end
#document ⇒ Object
A reference to self
90 91 92 |
# File 'lib/nokogiri/xml/document.rb', line 90
# A document is its own document: returns +self+.
def document
  self
end
#dup ⇒ Object Also known as: clone
Copy this Document. An optional depth may be passed in, but it defaults to a deep copy. 0 is a shallow copy, 1 is a deep copy.
291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 |
# File 'ext/nokogiri/xml_document.c', line 291
/*
 * Copy this document. One optional argument gives the copy level passed to
 * xmlCopyDoc; it defaults to 1 when no argument is supplied. Returns nil
 * when libxml2 fails to produce a copy. The copy keeps the original's
 * document type and is wrapped in the receiver's class.
 */
static VALUE duplicate_node(int argc, VALUE *argv, VALUE self)
{
  VALUE level;
  xmlDocPtr original, copy;
  int given;

  given = rb_scan_args(argc, argv, "01", &level);
  if(given == 0)
    level = INT2NUM((long)1);

  Data_Get_Struct(self, xmlDoc, original);

  copy = xmlCopyDoc(original, (int)NUM2INT(level));
  if(NULL == copy) return Qnil;

  copy->type = original->type;
  return Nokogiri_wrap_xml_document(rb_obj_class(self), copy);
}
|
#encoding ⇒ Object
Get the encoding for this Document
166 167 168 169 170 171 172 173 |
# File 'ext/nokogiri/xml_document.c', line 166
/*
 * Return the document's encoding as a Ruby string, or nil when the
 * underlying xmlDoc has no encoding set.
 */
static VALUE encoding(VALUE self)
{
  VALUE result = Qnil;
  xmlDocPtr doc;

  Data_Get_Struct(self, xmlDoc, doc);

  if(doc->encoding)
    result = NOKOGIRI_STR_NEW2(doc->encoding);

  return result;
}
|
#encoding=(encoding) ⇒ Object
Set the encoding string for this Document
150 151 152 153 154 155 156 157 158 |
# File 'ext/nokogiri/xml_document.c', line 150
/*
 * Set the encoding string of this document to a private copy of +encoding+
 * and return +encoding+. The previously stored string is released so that
 * repeated assignments do not leak.
 */
static VALUE set_encoding(VALUE self, VALUE encoding)
{
  xmlDocPtr doc;
  const xmlChar *previous;

  Data_Get_Struct(self, xmlDoc, doc);
  previous = doc->encoding;

  /* Duplicate first: StringValueCStr may raise, and in that case the old
   * value must still be in place. */
  doc->encoding = xmlStrdup((xmlChar *)StringValueCStr(encoding));

  /* Only free the old string when it is not interned in the document's
   * dictionary (dictionary-owned strings must not be passed to xmlFree). */
  if(previous && (!doc->dict || xmlDictOwns(doc->dict, previous) == 0))
    xmlFree((void *)previous);

  return encoding;
}
|
#fragment(tags = nil) ⇒ Object
Create a Nokogiri::XML::DocumentFragment from tags
Returns an empty fragment if tags
is nil.
189 190 191 |
# File 'lib/nokogiri/xml/document.rb', line 189
# Build a DocumentFragment for this document from +tags+ (nil by default),
# passing the current root node as the third constructor argument.
def fragment tags = nil
  DocumentFragment.new(self, tags, self.root)
end
#name ⇒ Object
The name of this document. Always returns “document”
85 86 87 |
# File 'lib/nokogiri/xml/document.rb', line 85
# The name of a document is always the literal string 'document'.
def name
  'document'
end
#namespaces ⇒ Object
Get the hash of namespaces on the root Nokogiri::XML::Node
182 183 184 |
# File 'lib/nokogiri/xml/document.rb', line 182
# Return the +namespaces+ hash of the root node, or an empty hash when the
# document has no root.
def namespaces
  root ? root.namespaces : {}
end
#remove_namespaces! ⇒ Object
Remove all namespaces from all nodes in the document.
This could be useful for developers who either don’t understand namespaces or don’t care about them.
The following example shows a use case, and you can decide for yourself whether this is a good thing or not:
doc = Nokogiri::XML <<-EOXML
<root>
<car xmlns:part="http://general-motors.com/">
<part:tire>Michelin Model XGV</part:tire>
</car>
<bicycle xmlns:part="http://schwinn.com/">
<part:tire>I'm a bicycle tire!</part:tire>
</bicycle>
</root>
EOXML
doc.xpath("//tire").to_s # => ""
doc.xpath("//part:tire", "part" => "http://general-motors.com/").to_s # => "<part:tire>Michelin Model XGV</part:tire>"
doc.xpath("//part:tire", "part" => "http://schwinn.com/").to_s # => "<part:tire>I'm a bicycle tire!</part:tire>"
doc.remove_namespaces!
doc.xpath("//tire").to_s # => "<tire>Michelin Model XGV</tire><tire>I'm a bicycle tire!</tire>"
doc.xpath("//part:tire", "part" => "http://general-motors.com/").to_s # => ""
doc.xpath("//part:tire", "part" => "http://schwinn.com/").to_s # => ""
For more information on why this probably is not a good thing in general, please direct your browser to tenderlovemaking.com/2009/04/23/namespaces-in-xml/
366 367 368 369 370 371 372 373 |
# File 'ext/nokogiri/xml_document.c', line 366
/*
 * Strip namespaces from the whole document by handing the document node to
 * recursively_remove_namespaces_from_node. Returns the receiver.
 */
VALUE remove_namespaces_bang(VALUE self)
{
  xmlDocPtr document;
  xmlNodePtr start;

  Data_Get_Struct(self, xmlDoc, document);

  /* The document itself is the starting node of the walk. */
  start = (xmlNodePtr)document;
  recursively_remove_namespaces_from_node(start);

  return self;
}
|
#root ⇒ Object
Get the root node for this document.
131 132 133 134 135 136 137 138 139 140 141 142 |
# File 'ext/nokogiri/xml_document.c', line 131
/*
 * Return the document's root element wrapped as a Ruby node, or nil when
 * the document has no root element.
 */
static VALUE root(VALUE self)
{
  xmlDocPtr doc;
  xmlNodePtr root_node;

  Data_Get_Struct(self, xmlDoc, doc);

  root_node = xmlDocGetRootElement(doc);
  if(NULL == root_node) return Qnil;

  return Nokogiri_wrap_xml_node(Qnil, root_node) ;
}
|
#root= ⇒ Object
Set the root element on this document
87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
# File 'ext/nokogiri/xml_document.c', line 87
/*
 * Set the root element of this document and return the argument.
 *
 * Passing nil unlinks the current root (if any). A node that belongs to a
 * different document is first copied into this one with xmlDocCopyNode;
 * a RuntimeError is raised when that copy fails. Whatever root element was
 * displaced is handed to NOKOGIRI_ROOT_NODE.
 * NOTE(review): NOKOGIRI_ROOT_NODE presumably records the detached node so
 * it is released with the document -- confirm against its definition.
 */
static VALUE set_root(VALUE self, VALUE root)
{
  xmlDocPtr doc;
  xmlNodePtr new_root;
  xmlNodePtr old_root;

  Data_Get_Struct(self, xmlDoc, doc);

  if(NIL_P(root)) {
    old_root = xmlDocGetRootElement(doc);

    if(old_root) {
      xmlUnlinkNode(old_root);
      NOKOGIRI_ROOT_NODE(old_root);
    }

    return root;
  }

  Data_Get_Struct(root, xmlNode, new_root);

  /* If the new root's document is not the same as the current document,
   * then we need to dup the node in to this document. */
  if(new_root->doc != doc) {
    if (!(new_root = xmlDocCopyNode(new_root, doc, 1))) {
      rb_raise(rb_eRuntimeError, "Could not reparent node (xmlDocCopyNode)");
    }
  }

  /* xmlDocSetRootElement returns the element it displaced (or NULL), so the
   * old root is tracked even when the new root already lived in this
   * document, and never when the node was already the root. */
  old_root = xmlDocSetRootElement(doc, new_root);
  if(old_root) NOKOGIRI_ROOT_NODE(old_root);

  return root;
}
|
#slop! ⇒ Object
Explore a document with shortcut methods. See Nokogiri::Slop for details.
Note that any nodes that have been instantiated before #slop! is called will not be decorated with sloppy behavior. So, if you’re in irb, the preferred idiom is:
irb> doc = Nokogiri::Slop my_markup
and not
irb> doc = Nokogiri::HTML my_markup
... followed by irb's implicit inspect (and therefore instantiation of every node) ...
irb> doc.slop!
... which does absolutely nothing.
159 160 161 162 163 164 165 166 |
# File 'lib/nokogiri/xml/document.rb', line 159
# Register Nokogiri::Decorators::Slop as a decorator for XML::Node and call
# +decorate!+, unless it has already been registered. Returns +self+.
def slop!
  unless decorators(XML::Node).include? Nokogiri::Decorators::Slop
    decorators(XML::Node) << Nokogiri::Decorators::Slop
    decorate!
  end
  self
end
#to_java ⇒ Object
JRuby
Returns Java’s org.w3c.dom.document of this Document.
219 220 221 222 |
# File 'lib/nokogiri/xml/document.rb', line 219
# Return the result of +toJavaDocument+. Raises a RuntimeError
# ("JRuby only method") unless Nokogiri.jruby? is true.
def to_java
  raise "JRuby only method" unless Nokogiri.jruby?
  return toJavaDocument()
end
#url ⇒ Object
Get the url name for this document.
71 72 73 74 75 76 77 78 79 |
# File 'ext/nokogiri/xml_document.c', line 71
/*
 * Return the document's URL as a Ruby string, or nil when the underlying
 * xmlDoc has no URL set.
 */
static VALUE url(VALUE self)
{
  xmlDocPtr doc;

  Data_Get_Struct(self, xmlDoc, doc);

  if(!doc->URL) return Qnil;

  return NOKOGIRI_STR_NEW2(doc->URL);
}
|
#validate ⇒ Object
Validate this Document against its DTD. Returns a list of errors on the document or nil
when there is no DTD.
138 139 140 141 |
# File 'lib/nokogiri/xml/document.rb', line 138
# Validate this document with its internal subset: returns whatever
# +internal_subset.validate(self)+ returns, or nil when there is no
# internal subset.
def validate
  return nil unless internal_subset
  internal_subset.validate self
end
#version ⇒ Object
Get the XML version for this Document
181 182 183 184 185 186 187 188 |
# File 'ext/nokogiri/xml_document.c', line 181
/*
 * Return the document's XML version as a Ruby string, or nil when the
 * underlying xmlDoc has no version set.
 */
static VALUE version(VALUE self)
{
  VALUE result = Qnil;
  xmlDocPtr doc;

  Data_Get_Struct(self, xmlDoc, doc);

  if(doc->version)
    result = NOKOGIRI_STR_NEW2(doc->version);

  return result;
}
|