Class: Nokogiri::HTML::Document
- Inherits:
-
XML::Document
- Object
- XML::Node
- XML::Document
- Nokogiri::HTML::Document
- Defined in:
- lib/nokogiri/html/document.rb,
ext/nokogiri/html_document.c
Defined Under Namespace
Classes: EncodingFoundException, EncodingReader
Constant Summary
Constants inherited from XML::Node
XML::Node::ATTRIBUTE_DECL, XML::Node::ATTRIBUTE_NODE, XML::Node::CDATA_SECTION_NODE, XML::Node::COMMENT_NODE, XML::Node::DOCB_DOCUMENT_NODE, XML::Node::DOCUMENT_FRAG_NODE, XML::Node::DOCUMENT_NODE, XML::Node::DOCUMENT_TYPE_NODE, XML::Node::DTD_NODE, XML::Node::ELEMENT_DECL, XML::Node::ELEMENT_NODE, XML::Node::ENTITY_DECL, XML::Node::ENTITY_NODE, XML::Node::ENTITY_REF_NODE, XML::Node::HTML_DOCUMENT_NODE, XML::Node::NAMESPACE_DECL, XML::Node::NOTATION_NODE, XML::Node::PI_NODE, XML::Node::TEXT_NODE, XML::Node::XINCLUDE_END, XML::Node::XINCLUDE_START
Instance Attribute Summary
Attributes inherited from XML::Document
Class Method Summary collapse
-
.new ⇒ Object
Create a new document.
-
.parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML) {|options| ... } ⇒ Object
Parse HTML.
-
.read_io(io, url, encoding, options) ⇒ Object
Read the HTML document from io with given url, encoding, and options.
-
.read_memory(string, url, encoding, options) ⇒ Object
Read the HTML document contained in string with given url, encoding, and options.
Instance Method Summary collapse
-
#fragment(tags = nil) ⇒ Object
Create a Nokogiri::XML::DocumentFragment from tags.
-
#meta_encoding ⇒ Object
Get the meta tag encoding for this document.
-
#meta_encoding=(encoding) ⇒ Object
Set the meta tag encoding for this document.
-
#serialize(options = {}) ⇒ Object
Serialize Node using options.
-
#title ⇒ Object
Get the title string of this document.
-
#title=(text) ⇒ Object
Set the title string of this document.
-
#type ⇒ Object
The type for this document.
Methods inherited from XML::Document
#add_child, #collect_namespaces, #create_cdata, #create_element, #create_entity, #create_text_node, #decorate, #decorators, #document, #dup, #encoding, #encoding=, #initialize, #name, #namespaces, #remove_namespaces!, #root, #root=, #slop!, #to_java, #url, #validate, #version, wrap
Methods inherited from XML::Node
#<<, #<=>, #==, #>, #[], #[]=, #accept, #add_child, #add_namespace_definition, #add_next_sibling, #add_previous_sibling, #after, #ancestors, #at, #at_css, #at_xpath, #attribute, #attribute_nodes, #attribute_with_ns, #attributes, #before, #blank?, #cdata?, #child, #children, #children=, #comment?, #content, #content=, #create_external_subset, #create_internal_subset, #css, #css_path, #decorate!, #default_namespace=, #description, #document, #dup, #each, #element?, #element_children, #encode_special_chars, #external_subset, #first_element_child, #fragment?, #html?, #initialize, #inner_html, #inner_html=, #internal_subset, #key?, #keys, #last_element_child, #line, #matches?, #namespace, #namespace=, #namespace_definitions, #namespace_scopes, #namespaced_key?, #namespaces, #next_element, #next_sibling, #node_name, #node_name=, #node_type, #parent, #parent=, #parse, #path, #pointer_id, #previous_element, #previous_sibling, #read_only?, #remove_attribute, #replace, #search, #swap, #text?, #to_html, #to_s, #to_xhtml, #to_xml, #traverse, #unlink, #values, #write_html_to, #write_to, #write_xhtml_to, #write_xml_to, #xml?, #xpath
Methods included from XML::PP::Node
Constructor Details
This class inherits a constructor from Nokogiri::XML::Document
Class Method Details
.new ⇒ Object
Create a new document
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
# File 'ext/nokogiri/html_document.c', line 9
/*
 * Build an empty HTML document and wrap it in a Ruby object of class +klass+.
 *
 * Every argument is gathered into one array. Its first two entries, when
 * present and truthy, are converted to C strings and handed to htmlNewDoc();
 * otherwise NULL is passed in their place. The complete, untouched argument
 * list is then forwarded to the wrapped object's initializer.
 */
static VALUE new(int argc, VALUE *argv, VALUE klass)
{
  VALUE all_args, uri_arg, ext_id_arg, wrapped;
  const xmlChar *c_uri = NULL;
  const xmlChar *c_ext_id = NULL;
  htmlDocPtr html_doc;

  /* "0*": no mandatory arguments, everything lands in all_args. */
  rb_scan_args(argc, argv, "0*", &all_args);
  uri_arg = rb_ary_entry(all_args, (long)0);
  ext_id_arg = rb_ary_entry(all_args, (long)1);

  if (RTEST(uri_arg)) {
    c_uri = (const xmlChar *)StringValuePtr(uri_arg);
  }
  if (RTEST(ext_id_arg)) {
    c_ext_id = (const xmlChar *)StringValuePtr(ext_id_arg);
  }

  html_doc = htmlNewDoc(c_uri, c_ext_id);
  wrapped = Nokogiri_wrap_xml_document(klass, html_doc);

  /* Run the Ruby-level initialize with the caller's original arguments. */
  rb_obj_call_init(wrapped, argc, argv);
  return wrapped;
}
|
.parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML) {|options| ... } ⇒ Object
Parse HTML. string_or_io may be a String, or any object that responds to read and close, such as an IO or StringIO. url is the resource where this document is located. encoding is the encoding that should be used when processing the document. options is a number that sets options in the parser, such as Nokogiri::XML::ParseOptions::RECOVER. See the constants in Nokogiri::XML::ParseOptions.
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
# File 'lib/nokogiri/html/document.rb', line 80
#
# Parse HTML from +string_or_io+.
#
# string_or_io - a String, or an object responding to +read+ (it is then
#                parsed through read_io instead of read_memory).
# url          - location of the document; when nil and the input responds
#                to +path+, that path is used.
# encoding     - encoding name; when nil it is taken from the input's own
#                encoding (unless that is ASCII-8BIT) or detected.
# options      - an Integer bit mask or a ParseOptions object.
#
# Yields the ParseOptions object when a block is given.
# Returns whatever read_io / read_memory / new return.
def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
  # Integer rather than Fixnum: Fixnum no longer exists on current Rubies,
  # and Integer also covers every value Fixnum used to match.
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options

  # Give the options to the user
  yield options if block_given?

  if string_or_io.respond_to?(:encoding)
    unless string_or_io.encoding.name == "ASCII-8BIT"
      encoding ||= string_or_io.encoding.name
    end
  end

  if string_or_io.respond_to?(:read)
    url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
    unless encoding
      # Perform advanced encoding detection that libxml2 does
      # not do.
      string_or_io = EncodingReader.new(string_or_io)
      begin
        return read_io(string_or_io, url, encoding, options.to_i)
      rescue EncodingFoundException => e
        # A retry is required because libxml2 has a problem in
        # that it cannot switch encoding well in the middle of
        # parsing, especially if it has already seen a
        # non-ASCII character when it finds an encoding hint.
        encoding = e.encoding
      end
    end
    return read_io(string_or_io, url, encoding, options.to_i)
  end

  # read_memory pukes on empty docs
  return new if string_or_io.nil? || string_or_io.empty?

  encoding = EncodingReader.detect_encoding(string_or_io) unless encoding

  read_memory(string_or_io, url, encoding, options.to_i)
end
.read_io(io, url, encoding, options) ⇒ Object
Read the HTML document from io with given url, encoding, and options. See Nokogiri::HTML.parse.
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'ext/nokogiri/html_document.c', line 34
/*
 * Parse an HTML document by pulling data from the Ruby object +io+.
 *
 * klass    - Ruby class used to wrap the parsed document.
 * io       - passed to htmlReadIO() as the opaque context for
 *            io_read_callback / io_close_callback.
 * url      - String or nil; forwarded to htmlReadIO() (NULL when nil).
 * encoding - String or nil; forwarded to htmlReadIO() (NULL when nil).
 * options  - Integer parser option bits.
 *
 * While parsing, libxml2 structured errors are routed to
 * Nokogiri_error_array_pusher with a fresh Ruby array as context; that array
 * is stored on the returned document as @errors.
 *
 * Raises the wrapped libxml2 error when htmlReadIO() yields no document, or
 * RuntimeError ("Could not parse document") when libxml2 recorded no error.
 */
static VALUE read_io( VALUE klass,
                      VALUE io,
                      VALUE url,
                      VALUE encoding,
                      VALUE options )
{
  const char * c_url = NIL_P(url)      ? NULL : StringValuePtr(url);
  const char * c_enc = NIL_P(encoding) ? NULL : StringValuePtr(encoding);
  /* Convert before the error handler is installed: NUM2INT() raises on a
   * non-numeric or out-of-range value, and raising while the handler is set
   * would leave libxml2 holding a pointer to error_list after this frame is
   * gone. */
  int c_options      = (int)NUM2INT(options);
  VALUE error_list   = rb_ary_new();
  VALUE document;
  htmlDocPtr doc;

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);

  doc = htmlReadIO(
      io_read_callback,
      io_close_callback,
      (void *)io,
      c_url,
      c_enc,
      c_options
  );
  xmlSetStructuredErrorFunc(NULL, NULL);

  if(doc == NULL) {
    /* Nothing to free here: doc is NULL on this path. */
    xmlErrorPtr error = xmlGetLastError();

    if(error)
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error));
    else
      rb_raise(rb_eRuntimeError, "Could not parse document");

    return Qnil; /* not reached: both branches above raise */
  }

  document = Nokogiri_wrap_xml_document(klass, doc);
  rb_iv_set(document, "@errors", error_list);
  return document;
}
|
.read_memory(string, url, encoding, options) ⇒ Object
Read the HTML document contained in string with given url, encoding, and options. See Nokogiri::HTML.parse.
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
# File 'ext/nokogiri/html_document.c', line 85
/*
 * Parse the HTML document held in the Ruby String +string+.
 *
 * klass    - Ruby class used to wrap the parsed document.
 * string   - the document source; its bytes and byte length are handed to
 *            htmlReadMemory().
 * url      - String or nil; forwarded to htmlReadMemory() (NULL when nil).
 * encoding - String or nil; forwarded to htmlReadMemory() (NULL when nil).
 * options  - Integer parser option bits.
 *
 * While parsing, libxml2 structured errors are routed to
 * Nokogiri_error_array_pusher with a fresh Ruby array as context; that array
 * is stored on the returned document as @errors.
 *
 * Raises the wrapped libxml2 error when htmlReadMemory() yields no document,
 * or RuntimeError ("Could not parse document") when libxml2 recorded no error.
 */
static VALUE read_memory( VALUE klass,
                          VALUE string,
                          VALUE url,
                          VALUE encoding,
                          VALUE options )
{
  const char * c_buffer = StringValuePtr(string);
  const char * c_url    = NIL_P(url)      ? NULL : StringValuePtr(url);
  const char * c_enc    = NIL_P(encoding) ? NULL : StringValuePtr(encoding);
  /* NOTE(review): the length is narrowed to int, so sources longer than
   * INT_MAX bytes are not handled here. */
  int len               = (int)RSTRING_LEN(string);
  /* Convert before the error handler is installed: NUM2INT() raises on a
   * non-numeric or out-of-range value, and raising while the handler is set
   * would leave libxml2 holding a pointer to error_list after this frame is
   * gone. */
  int c_options         = (int)NUM2INT(options);
  VALUE error_list      = rb_ary_new();
  VALUE document;
  htmlDocPtr doc;

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);

  doc = htmlReadMemory(c_buffer, len, c_url, c_enc, c_options);
  xmlSetStructuredErrorFunc(NULL, NULL);

  if(doc == NULL) {
    /* Nothing to free here: doc is NULL on this path. */
    xmlErrorPtr error = xmlGetLastError();

    if(error)
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error));
    else
      rb_raise(rb_eRuntimeError, "Could not parse document");

    return Qnil; /* not reached: both branches above raise */
  }

  document = Nokogiri_wrap_xml_document(klass, doc);
  rb_iv_set(document, "@errors", error_list);
  return document;
}
|
Instance Method Details
#fragment(tags = nil) ⇒ Object
Create a Nokogiri::XML::DocumentFragment from tags
67 68 69 |
# File 'lib/nokogiri/html/document.rb', line 67
#
# Create a DocumentFragment for this document from +tags+.
#
# tags - markup for the fragment, or nil (the default).
#
# The document itself and its root are passed along to
# DocumentFragment.new together with +tags+.
def fragment(tags = nil)
  DocumentFragment.new(self, tags, self.root)
end
#meta_encoding ⇒ Object
Get the meta tag encoding for this document. If there is no meta tag, then nil is returned.
7 8 9 10 |
# File 'lib/nokogiri/html/document.rb', line 7
#
# Get the meta tag encoding for this document.
#
# Returns the charset named in the meta tag's +content+ attribute, or nil
# when there is no meta tag, no content attribute, or no charset in it.
def meta_encoding
  # NOTE(review): the helper name was lost from this listing; it is restored
  # here as +meta_content_type+ (as in upstream Nokogiri) - confirm.
  meta = meta_content_type
  content = meta && meta['content']
  # String#[] with a capture index yields nil on no match, instead of
  # raising the way MatchData indexing on a failed match would.
  content && content[/charset\s*=\s*([\w-]+)/i, 1]
end
#meta_encoding=(encoding) ⇒ Object
Set the meta tag encoding for this document. If there is no meta content tag, the encoding is not set.
15 16 17 18 |
# File 'lib/nokogiri/html/document.rb', line 15
#
# Set the meta tag encoding for this document.
#
# encoding - the charset name to write into the meta tag's +content+.
#
# When there is no meta content tag, nothing is changed.
def meta_encoding=(encoding)
  # NOTE(review): the helper name was lost from this listing; it is restored
  # here as +meta_content_type+ (as in upstream Nokogiri) - confirm.
  meta = meta_content_type
  meta['content'] = "text/html; charset=%s" % encoding if meta
end
#serialize(options = {}) ⇒ Object
Serialize Node using options. Save options can also be set using a block. See SaveOptions.
These two statements are equivalent:
node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)
or
node.serialize(:encoding => 'UTF-8') do |config|
config.format.as_xml
end
60 61 62 63 |
# File 'lib/nokogiri/html/document.rb', line 60
#
# Serialize this document using +options+.
#
# options - Hash of serialization options. When :save_with is missing (or
#           nil/false) it is set to SaveOptions::DEFAULT_HTML; the hash
#           passed in is updated in place.
#
# Delegates to the inherited serialize with the (possibly updated) options.
def serialize(options = {})
  options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
  super
end
#title ⇒ Object
Get the title string of this document. Return nil if there is no title tag.
30 31 32 |
# File 'lib/nokogiri/html/document.rb', line 30
#
# Get the title string of this document.
#
# Returns the inner text of the node found for 'title', or nil when there is
# no such node.
def title
  # A separate local name keeps the node from shadowing this method's name.
  title_node = at('title')
  title_node && title_node.inner_text
end
#title=(text) ⇒ Object
Set the title string of this document. If there is no head element, the title is not set.
37 38 39 40 41 42 43 44 |
# File 'lib/nokogiri/html/document.rb', line 37
#
# Set the title string of this document.
#
# text - the new title text.
#
# An existing 'title' node is reused. Otherwise a new one is created and
# appended to the 'head' node. When there is no 'head' node either, nothing
# is changed and nil is returned.
def title=(text)
  title_node = at('title')

  unless title_node
    head = at('head')
    return nil unless head

    title_node = Nokogiri::XML::Node.new('title', self)
    head << title_node
  end

  title_node.children = XML::Text.new(text, self)
end
#type ⇒ Object
The type for this document
130 131 132 133 134 135 |
# File 'ext/nokogiri/html_document.c', line 130
/*
 * Return this document's +type+ field as a Ruby Integer.
 *
 * The underlying xmlDoc pointer is unwrapped from +self+ and its type member
 * is converted with INT2NUM.
 */
static VALUE type(VALUE self)
{
  htmlDocPtr html_doc;
  long doc_type;

  Data_Get_Struct(self, xmlDoc, html_doc);
  doc_type = (long)html_doc->type;

  return INT2NUM(doc_type);
}
|