Class: Nokogiri::XML::Document

Inherits:
Node
  • Object
show all
Defined in:
lib/nokogiri/xml/document.rb,
lib/nokogiri/ffi/xml/document.rb,
ext/nokogiri/xml_document.c,
ext/nokogiri/html_document.c

Overview

Nokogiri::XML::Document is the main entry point for dealing with XML documents. The Document is created by parsing an XML document. See Nokogiri.XML()

For searching a Document, see Nokogiri::XML::Node#css and Nokogiri::XML::Node#xpath

Direct Known Subclasses

HTML::Document

Constant Summary

Constants inherited from Node

Node::ATTRIBUTE_DECL, Node::ATTRIBUTE_NODE, Node::CDATA_SECTION_NODE, Node::COMMENT_NODE, Node::DOCB_DOCUMENT_NODE, Node::DOCUMENT_FRAG_NODE, Node::DOCUMENT_NODE, Node::DOCUMENT_TYPE_NODE, Node::DTD_NODE, Node::ELEMENT_DECL, Node::ELEMENT_NODE, Node::ENTITY_DECL, Node::ENTITY_NODE, Node::ENTITY_REF_NODE, Node::HTML_DOCUMENT_NODE, Node::NAMESPACE_DECL, Node::NOTATION_NODE, Node::PI_NODE, Node::TEXT_NODE, Node::XINCLUDE_END, Node::XINCLUDE_START

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Node

#<=>, #==, #>, #[], #[]=, #accept, #add_namespace, #add_namespace_definition, #add_next_sibling, #add_previous_sibling, #after, #ancestors, #at, #at_css, #at_xpath, #attribute, #attribute_nodes, #attribute_with_ns, #attributes, #before, #blank?, #cdata?, #child, #children, #comment?, #content, #content=, #create_external_subset, #create_internal_subset, #css, #css_path, #decorate!, #default_namespace=, #description, #each, #element?, #encode_special_chars, #external_subset, #has_attribute?, #html?, #inner_html, #inner_html=, #inner_text, #internal_subset, #key?, #keys, #line, #matches?, #name=, #namespace, #namespace=, #namespace_definitions, #namespaced_key?, #next, #next_element, #next_sibling, #node_name, #node_name=, node_properties, #node_type, #parent, #parent=, #path, #pointer_id, #previous, #previous_element, #previous_sibling, #read_only?, #remove, #remove_attribute, #replace, #search, #serialize, #set_attribute, #swap, #text, #text?, #to_html, #to_s, #to_xhtml, #traverse, #type, #unlink, #values, #write_html_to, #write_to, #write_xhtml_to, #write_xml_to, #xml?, #xpath

Methods included from PP::Node

#inspect, #pretty_print

Constructor Details

#initialize(*args) ⇒ Document

Returns a new instance of Document.



39
40
41
# File 'lib/nokogiri/xml/document.rb', line 39

# Set up a new Document's Ruby-side state.
#
# Any arguments are accepted and ignored here; the only effect is clearing
# the decorator table (@decorators), which #decorators builds on demand.
def initialize *args
  @decorators = nil
end

Instance Attribute Details

#cstructObject

Returns the value of attribute cstruct.



6
7
8
# File 'lib/nokogiri/ffi/xml/document.rb', line 6

# Reader for @cstruct, the struct the other methods on this page hand to
# LibXML calls and index for native fields such as :encoding and :version.
def cstruct
  @cstruct
end

#errorsObject

A list of Nokogiri::XML::SyntaxError found when parsing a document



37
38
39
# File 'lib/nokogiri/xml/document.rb', line 37

# Reader for @errors: the list of Nokogiri::XML::SyntaxError objects
# collected while the document was being parsed.
def errors
  @errors
end

Class Method Details

.new(version = default) ⇒ Object

Create a new document with version (defaults to “1.0”)



272
273
274
275
276
277
# File 'ext/nokogiri/xml_document.c', line 272

# Create a new, empty document.
#
# The first argument is the XML version string; "1.0" is used when it is
# missing or nil. All arguments are forwarded unchanged to #initialize on
# the wrapped document, which is then returned.
def new(*args)
  requested_version = args[0]
  native_doc = LibXML.xmlNewDoc(requested_version || "1.0")
  document = wrap(native_doc)
  document.send(:initialize, *args)
  document
end

.parse(string_or_io, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML) {|options| ... } ⇒ Object

Parse an XML file. string_or_io may be a String, or any object that responds to read and close such as an IO, or StringIO. url is the resource where this document is located. encoding is the encoding that should be used when processing the document. options is a number that sets options in the parser, such as Nokogiri::XML::ParseOptions::RECOVER. See the constants in Nokogiri::XML::ParseOptions.

Yields:

  • (options)


19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/nokogiri/xml/document.rb', line 19

# Parse an XML document from a String or an IO-like object.
#
# string_or_io:: a String, or any object responding to +read+.
# url::          location of the document; when nil and the source responds
#                to +path+, that path is used.
# encoding::     encoding to use while processing the document.
# options::      an Integer bit mask or a ParseOptions object. Integers are
#                wrapped in ParseOptions before being yielded.
#
# Yields the ParseOptions object so the caller can adjust it before parsing.
# Returns an empty document (via +new+) for nil or empty string input.
def self.parse(string_or_io, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML, &block)
  # Integer rather than Fixnum: Fixnum no longer exists on current Rubies,
  # and Integer is its ancestor on the older ones.
  options = Nokogiri::XML::ParseOptions.new(options) if options.is_a?(Integer)
  # Give the options to the user
  yield options if block_given?

  if string_or_io.respond_to?(:read)
    url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
    return read_io(string_or_io, url, encoding, options.to_i)
  end

  # read_memory pukes on empty docs
  return new if string_or_io.nil? || string_or_io.empty?

  read_memory(string_or_io, url, encoding, options.to_i)
end

.read_io(io, url, encoding, options) ⇒ Object

Create a new document from an IO object



162
163
164
165
166
# File 'ext/nokogiri/xml_document.c', line 162

# Create a new document by reading from an IO object.
#
# The IO is adapted through IoCallbacks.reader and handed to xmlReadIO; the
# call runs inside wrap_with_error_handling, whose result is returned.
def self.read_io(io, url, encoding, options)
  wrap_with_error_handling {
    reader_callback = IoCallbacks.reader(io)
    LibXML.xmlReadIO(reader_callback, nil, nil, url, encoding, options)
  }
end

.read_memory(string, url, encoding, options) ⇒ Object

Create a new document from a String



208
209
210
211
212
# File 'ext/nokogiri/xml_document.c', line 208

# Create a new document from a String.
#
# The buffer size handed to xmlReadMemory is the string's size in bytes.
# String#length counts characters, which is smaller than the byte count for
# multibyte text and would cut the buffer short.
#
# The call runs inside wrap_with_error_handling, whose result is returned.
def self.read_memory(string, url, encoding, options)
  wrap_with_error_handling do
    LibXML.xmlReadMemory(string, string.bytesize, url, encoding, options)
  end
end

.recursively_remove_namespaces_from_node(node) ⇒ Object



124
125
126
127
128
129
# File 'lib/nokogiri/ffi/xml/document.rb', line 124

# Clear the :ns field on +node+'s cstruct, then do the same for every
# descendant by recursing through node.children.
def recursively_remove_namespaces_from_node(node)
  node.cstruct[:ns] = nil
  node.children.each(&method(:recursively_remove_namespaces_from_node))
end

.wrap(doc_struct) ⇒ Object



81
82
83
84
85
86
87
88
89
90
91
92
93
94
# File 'lib/nokogiri/ffi/xml/document.rb', line 81

# Build a Ruby Document around a native document.
#
# +doc_struct+ may be an FFI::Pointer, which is cast to LibXML::XmlDocument
# (a null pointer yields nil), or a struct that is used as-is.
# The new object is allocated without calling the constructor, linked to the
# struct in both directions, given empty decorator/node-cache state, and
# finally initialized with no arguments.
def wrap(doc_struct)
  if doc_struct.is_a?(FFI::Pointer)
    # a null native pointer has nothing to wrap
    return nil if doc_struct.null?
    doc_struct = LibXML::XmlDocument.new(doc_struct)
  end

  wrapped = allocate
  wrapped.cstruct = doc_struct
  wrapped.cstruct.ruby_doc = wrapped
  wrapped.instance_variable_set(:@decorators, nil)
  wrapped.instance_variable_set(:@node_cache, [])
  wrapped.send(:initialize)
  wrapped
end

.wrap_with_error_handling(&block) ⇒ Object



100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/nokogiri/ffi/xml/document.rb', line 100

# Run a parsing block with libxml2 structured-error collection enabled and
# wrap the resulting native document.
#
# The block must return the native document pointer. While it runs, errors
# reported by libxml2 are pushed onto a fresh list, which is stored on the
# returned document as +errors+.
#
# Raises the wrapped libxml2 error when the block returns a null pointer and
# xmlGetLastError reports one; otherwise raises RuntimeError.
def wrap_with_error_handling(&block)
  error_list = []
  LibXML.xmlInitParser()
  LibXML.xmlResetLastError()
  LibXML.xmlSetStructuredErrorFunc(nil, SyntaxError.error_array_pusher(error_list))

  begin
    ptr = yield
  ensure
    # Always uninstall the callback, even when the block raises, so it does
    # not stay registered after this call.
    LibXML.xmlSetStructuredErrorFunc(nil, nil)
  end

  if ptr.null?
    error = LibXML.xmlGetLastError()
    raise SyntaxError.wrap(error) if error
    raise RuntimeError, "Could not parse document"
  end

  document = wrap(ptr)
  document.errors = error_list
  document
end

Instance Method Details

#add_child(child) ⇒ Object Also known as: <<



145
146
147
148
149
150
151
152
153
# File 'lib/nokogiri/xml/document.rb', line 145

# Add +child+ as this document's root.
#
# Raises when the document already has a root, or when +child+ is a document
# fragment holding more than one node. For a fragment, its first child is
# added in its place; anything else is passed straight to the parent
# implementation.
def add_child(child)
  raise "Document already has a root node" if root
  return super unless child.type == Node::DOCUMENT_FRAG_NODE

  raise "Document cannot have multiple root nodes" if child.children.size > 1
  super(child.children.first)
end

#cloneObject



127
# File 'lib/nokogiri/xml/document.rb', line 127

# #clone is the same method as #dup.
alias clone dup

#collect_namespacesObject

Recursively get all namespaces from this node and its subtree and return them as a hash.

For example, given this document:

<root xmlns:foo="bar">
  <bar xmlns:hello="world" />
</root>

This method will return:

{ 'xmlns:foo' => 'bar', 'xmlns:hello' => 'world' }

WARNING: this method will clobber duplicate names in the keys. For example, given this document:

<root xmlns:foo="bar">
  <bar xmlns:foo="baz" />
</root>

The hash returned will look like this: { 'xmlns:foo' => 'bar' }



85
86
87
88
89
# File 'lib/nokogiri/xml/document.rb', line 85

# Gather the namespaces of every node visited by #traverse into one Hash.
#
# Entries are merged in traversal order, so when two nodes declare the same
# key the one visited later wins.
def collect_namespaces
  collected = {}
  traverse do |node|
    collected.update(node.namespaces)
  end
  collected
end

#create_element(name, &block) ⇒ Object

Create an element with name



44
45
46
# File 'lib/nokogiri/xml/document.rb', line 44

# Create an element called +name+ that belongs to this document.
# An optional block is forwarded to the Element constructor.
def create_element(name, &block)
  element_class = Nokogiri::XML::Element
  element_class.new(name, self, &block)
end

#create_text_node(text, &block) ⇒ Object

Create a text node with text



49
50
51
# File 'lib/nokogiri/xml/document.rb', line 49

# Create a text node belonging to this document.
# +text+ is converted with #to_s first; an optional block is forwarded to
# the Text constructor.
def create_text_node(text, &block)
  content = text.to_s
  Nokogiri::XML::Text.new(content, self, &block)
end

#decorate(node) ⇒ Object

Apply any decorators to node



118
119
120
121
122
123
124
# File 'lib/nokogiri/xml/document.rb', line 118

# Apply the registered decorators to +node+.
#
# Does nothing when no decorator table exists. Otherwise, for every class in
# the table that +node+ is an instance of, the node is extended with each
# module registered under that class.
def decorate(node)
  return unless @decorators

  @decorators.each do |klass, modules|
    modules.each { |mod| node.extend(mod) } if node.is_a?(klass)
  end
end

#decorators(key) ⇒ Object

Get the list of decorators given key



92
93
94
95
# File 'lib/nokogiri/xml/document.rb', line 92

# Return the list of decorators registered under +key+.
# The table, and the list for +key+, are created on first use.
def decorators(key)
  registry = (@decorators ||= {})
  registry[key] ||= []
end

#documentObject

A reference to self



59
60
61
# File 'lib/nokogiri/xml/document.rb', line 59

# A Document is its own document: returns self.
def document
  self
end

#dupObject

Copy this Document. An optional depth may be passed in, but it defaults to a deep copy. 0 is a shallow copy, 1 is a deep copy.



249
250
251
252
253
254
255
256
257
258
# File 'ext/nokogiri/xml_document.c', line 249

# Copy this Document.
#
# +deep+ is passed through to xmlCopyDoc and defaults to 1.
# Returns nil when xmlCopyDoc yields a null pointer, otherwise the copy
# wrapped by this document's class.
def dup(deep = 1)
  copied_ptr = LibXML.xmlCopyDoc(cstruct, deep)
  return nil if copied_ptr.null?

  # xmlCopyDoc does not preserve the document type, so stamp it on the copy.
  copy_struct = LibXML::XmlDocumentCast.new(copied_ptr)
  copy_struct[:type] = self.type

  self.class.wrap(copied_ptr)
end

#encodingObject

Get the encoding for this Document



132
133
134
135
# File 'ext/nokogiri/xml_document.c', line 132

# Get the encoding for this Document.
# Returns nil when the native :encoding pointer is null, otherwise the
# string read from it.
def encoding
  encoding_ptr = cstruct[:encoding]
  return nil if encoding_ptr.null?

  encoding_ptr.read_string
end

#encoding=(encoding) ⇒ Object

Set the encoding string for this Document



116
117
118
119
# File 'ext/nokogiri/xml_document.c', line 116

# Set the encoding string for this Document.
# The string is duplicated with xmlStrdup and stored in the :encoding field.
def encoding=(encoding)
  # TODO: if :encoding is already set, then it's probably getting leaked.
  native_copy = LibXML.xmlStrdup(encoding)
  cstruct[:encoding] = native_copy
end

#fragment(tags = nil) ⇒ Object

Create a Nokogiri::XML::DocumentFragment from tags. Returns an empty fragment if tags is nil.



137
138
139
# File 'lib/nokogiri/xml/document.rb', line 137

# Build a DocumentFragment owned by this document.
# +tags+ is handed to DocumentFragment.new as-is and defaults to nil.
def fragment tags = nil
  DocumentFragment.new(self, tags)
end

#nameObject

The name of this document. Always returns “document”



54
55
56
# File 'lib/nokogiri/xml/document.rb', line 54

# The name of this node: always the literal string 'document'.
def name
  'document'
end

#namespacesObject

Get the hash of namespaces on the root Nokogiri::XML::Node



130
131
132
# File 'lib/nokogiri/xml/document.rb', line 130

# Get the hash of namespaces declared on the root node.
# Returns an empty Hash when the document has no root.
def namespaces
  return {} unless root

  root.namespaces
end

#remove_namespaces!Object

Remove all namespaces from all nodes in the document.

This could be useful for developers who either don’t understand namespaces or don’t care about them.

The following example shows a use case, and you can decide for yourself whether this is a good thing or not:

doc = Nokogiri::XML <<-EOXML
   <root>
     <car xmlns:part="http://general-motors.com/">
       <part:tire>Michelin Model XGV</part:tire>
     </car>
     <bicycle xmlns:part="http://schwinn.com/">
       <part:tire>I'm a bicycle tire!</part:tire>
     </bicycle>
   </root>
   EOXML

doc.xpath("//tire").to_s # => ""
doc.xpath("//part:tire", "part" => "http://general-motors.com/").to_s # => "<part:tire>Michelin Model XGV</part:tire>"
doc.xpath("//part:tire", "part" => "http://schwinn.com/").to_s # => "<part:tire>I'm a bicycle tire!</part:tire>"

doc.remove_namespaces!

doc.xpath("//tire").to_s # => "<tire>Michelin Model XGV</tire><tire>I'm a bicycle tire!</tire>"
doc.xpath("//part:tire", "part" => "http://general-motors.com/").to_s # => ""
doc.xpath("//part:tire", "part" => "http://schwinn.com/").to_s # => ""

For more information on why this probably is not a good thing in general, please direct your browser to tenderlovemaking.com/2009/04/23/namespaces-in-xml/



323
324
325
# File 'ext/nokogiri/xml_document.c', line 323

# Remove all namespaces from all nodes in the document, starting at the
# root node.
#
# A document with no root has nothing to strip: nil is returned instead of
# handing nil to the recursive helper, which would fail on nil.cstruct.
# Otherwise the helper's return value is passed through unchanged.
def remove_namespaces!
  root_node = root
  return nil unless root_node

  self.class.recursively_remove_namespaces_from_node(root_node)
end

#rootObject

Get the root node for this document.



99
100
101
102
# File 'ext/nokogiri/xml_document.c', line 99

# Get the root node for this document.
# Returns nil when xmlDocGetRootElement yields a null pointer.
def root
  root_ptr = LibXML.xmlDocGetRootElement(cstruct)
  return nil if root_ptr.null?

  Node.wrap(LibXML::XmlNode.new(root_ptr))
end

#root=Object

Set the root element on this document



69
70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'ext/nokogiri/xml_document.c', line 69

# Set the root element on this document.
#
# When +new_root+ belongs to a different document (its cstruct[:doc] differs
# from ours) it is first copied into this document with xmlDocCopyNode, and
# the copy becomes the root.
#
# Returns the node installed as root (the copy, when one was made).
# Raises RuntimeError when xmlDocCopyNode returns a null pointer.
def root=(new_root)
  old_root_ptr = nil
  if new_root.cstruct[:doc] != cstruct[:doc]
    # Remember the current root before it is replaced.
    old_root_ptr = LibXML.xmlDocGetRootElement(cstruct)
    new_root_ptr = LibXML.xmlDocCopyNode(new_root.cstruct, cstruct, 1)
    # The comma matters: without it Ruby calls a method named RuntimeError.
    raise RuntimeError, "Could not reparent node (xmlDocCopyNode)" if new_root_ptr.null?
    new_root = Node.wrap(new_root_ptr)
  end
  LibXML.xmlDocSetRootElement(cstruct, new_root.cstruct)
  if old_root_ptr && !old_root_ptr.null?
    # NOTE(review): presumably keeps the displaced root alive for as long as
    # the document is - confirm against keep_reference_from_document!.
    LibXML::XmlNode.new(old_root_ptr).keep_reference_from_document!
  end
  new_root
end

#slop!Object

Explore a document with shortcut methods.



107
108
109
110
111
112
113
114
# File 'lib/nokogiri/xml/document.rb', line 107

# Explore a document with shortcut methods.
#
# Registers Nokogiri::Decorators::Slop for XML::Node and re-decorates, unless
# it is already registered. Always returns self.
def slop!
  return self if decorators(XML::Node).include?(Nokogiri::Decorators::Slop)

  decorators(XML::Node) << Nokogiri::Decorators::Slop
  decorate!
  self
end

#urlObject

Get the url name for this document.



53
54
55
# File 'ext/nokogiri/xml_document.c', line 53

# Get the url name for this document: the :URL field of the native struct.
def url
  cstruct[:URL]
end

#validateObject

Validate this Document against its DTD. Returns a list of errors on the document or nil when there is no DTD.



100
101
102
103
# File 'lib/nokogiri/xml/document.rb', line 100

# Validate this Document against its DTD.
# Returns nil when there is no internal subset; otherwise returns whatever
# the internal subset's #validate returns for this document.
def validate
  internal_subset ? internal_subset.validate(self) : nil
end

#versionObject

Get the XML version for this Document



147
148
149
# File 'ext/nokogiri/xml_document.c', line 147

# Get the XML version for this Document: the :version field of the native struct.
def version
  cstruct[:version]
end