Class: LibXML::XML::HTMLParser
- Inherits:
-
Object
- Object
- LibXML::XML::HTMLParser
- Defined in:
- ext/libxml/ruby_xml_html_parser.c,
lib/libxml/html_parser.rb,
ext/libxml/ruby_xml_html_parser.c
Overview
The HTML parser implements an HTML 4.0 non-verifying parser with an API compatible with the XML::Parser. In contrast with the XML::Parser, it can parse “real world” HTML, even if it is severely broken from a specification point of view.
The HTML parser creates an in-memory document object that consists of any number of XML::Node instances. This is a simple and powerful model, but it has the major limitation that the size of the document that can be processed is limited by the amount of memory available.
Using the html parser is simple:
parser = XML::HTMLParser.file('my_file')
doc = parser.parse
You can also parse documents (see XML::HTMLParser.document), strings (see XML::HTMLParser.string) and io objects (see XML::HTMLParser.io).
Defined Under Namespace
Modules: Options Classes: Context
Instance Attribute Summary collapse
-
#input ⇒ Object
readonly
Attributes.
Class Method Summary collapse
-
.file(path, options = {}) ⇒ Object
call-seq: XML::HTMLParser.file(path) -> XML::HTMLParser XML::HTMLParser.file(path, :encoding => XML::Encoding::UTF_8, :options => XML::HTMLParser::Options::NOENT) -> XML::HTMLParser.
-
.io(io, options = {}) ⇒ Object
call-seq: XML::HTMLParser.io(io) -> XML::HTMLParser XML::HTMLParser.io(io, :encoding => XML::Encoding::UTF_8, :options => XML::HTMLParser::Options::NOENT, :base_uri => "http://libxml.org") -> XML::HTMLParser.
-
.string(string, options = {}) ⇒ Object
call-seq: XML::HTMLParser.string(string) -> XML::HTMLParser XML::HTMLParser.string(string, :encoding => XML::Encoding::UTF_8, :options => XML::HTMLParser::Options::NOENT, :base_uri => "http://libxml.org") -> XML::HTMLParser.
Instance Method Summary collapse
-
#file=(value) ⇒ Object
:enddoc:.
-
#XML::HTMLParser.initialize ⇒ Object
constructor
Initializes a new parser instance with no pre-determined source.
- #io=(value) ⇒ Object
-
#parse ⇒ XML::Document
Parse the input XML and create an XML::Document with its content.
- #string=(value) ⇒ Object
Constructor Details
#XML::HTMLParser.initialize ⇒ Object
Initializes a new parser instance with no pre-determined source.
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
# File 'ext/libxml/ruby_xml_html_parser.c', line 37
/*
 * call-seq:
 *    XML::HTMLParser.new(context = nil) -> XML::HTMLParser
 *
 * Stores the parser context on the new instance under CONTEXT_ATTR.
 *
 * argc/argv - Ruby call arguments; at most one (the context) is accepted.
 * self      - the instance being initialized; also the return value.
 *
 * When no context (or nil) is given, a deprecation warning is emitted and
 * a fresh cXMLParserContext instance is created to stand in for it.
 */
static VALUE rxml_html_parser_initialize(int argc, VALUE *argv, VALUE self)
{
  VALUE parser_context = Qnil;

  /* "01": no required arguments, one optional argument. */
  rb_scan_args(argc, argv, "01", &parser_context);

  /* Common case: the caller handed us a context, so just remember it. */
  if (parser_context != Qnil)
  {
    rb_ivar_set(self, CONTEXT_ATTR, parser_context);
    return self;
  }

  /* Legacy call style: warn first, then fall back to a default context. */
  rb_warn("Passing no parameters to XML::HTMLParser.new is deprecated. Pass an instance of XML::Parser::Context instead.");
  parser_context = rb_class_new_instance(0, NULL, cXMLParserContext);
  rb_ivar_set(self, CONTEXT_ATTR, parser_context);

  return self;
}
|
Instance Attribute Details
#input ⇒ Object (readonly)
Attributes
Class Method Details
.file(path, options = {}) ⇒ Object
call-seq:
XML::HTMLParser.file(path) -> XML::HTMLParser
XML::HTMLParser.file(path, :encoding => XML::Encoding::UTF_8,
:options => XML::HTMLParser::Options::NOENT) -> XML::HTMLParser
Creates a new parser by parsing the specified file or uri.
You may provide an optional hash table to control how the parsing is performed. Valid options are:
encoding - The document encoding, defaults to nil. Valid values
are the encoding constants defined on XML::Encoding.
options - Parser options. Valid values are the constants defined on
XML::HTMLParser::Options. Multiple options can be combined
by using Bitwise OR (|).
21 22 23 24 25 26 |
# File 'lib/libxml/html_parser.rb', line 21
#
# Builds an XML::HTMLParser::Context for +path+, applies the optional
# settings and returns a new parser wrapping that context.
#
# path    - passed unchanged to XML::HTMLParser::Context.file.
# options - Hash; only the :encoding and :options keys are read here.
#           A key that is missing (or falsy) leaves the context untouched.
def self.file(path, options = {})
  context = XML::HTMLParser::Context.file(path)
  context.encoding = options[:encoding] if options[:encoding]
  context.options = options[:options] if options[:options]
  self.new(context)
end
.io(io, options = {}) ⇒ Object
call-seq:
XML::HTMLParser.io(io) -> XML::HTMLParser
XML::HTMLParser.io(io, :encoding => XML::Encoding::UTF_8,
:options => XML::HTMLParser::Options::NOENT,
:base_uri => "http://libxml.org") -> XML::HTMLParser
Creates a new parser by parsing the specified io object.
Parameters:
io - io object that contains the HTML to parse
base_uri - The base url for the parsed document.
encoding - The document encoding, defaults to nil. Valid values
are the encoding constants defined on XML::Encoding.
options - Parser options. Valid values are the constants defined on
XML::HTMLParser::Options. Multiple options can be combined
by using Bitwise OR (|).
45 46 47 48 49 50 51 |
# File 'lib/libxml/html_parser.rb', line 45
#
# Builds an XML::HTMLParser::Context around +io+, applies the optional
# settings and returns a new parser wrapping that context.
#
# io      - passed unchanged to XML::HTMLParser::Context.io.
# options - Hash; only the :base_uri, :encoding and :options keys are read
#           here. A key that is missing (or falsy) leaves the context
#           untouched.
def self.io(io, options = {})
  context = XML::HTMLParser::Context.io(io)
  context.base_uri = options[:base_uri] if options[:base_uri]
  context.encoding = options[:encoding] if options[:encoding]
  context.options = options[:options] if options[:options]
  self.new(context)
end
.string(string, options = {}) ⇒ Object
call-seq:
XML::HTMLParser.string(string)
XML::HTMLParser.string(string, :encoding => XML::Encoding::UTF_8,
:options => XML::HTMLParser::Options::NOENT,
:base_uri => "http://libxml.org") -> XML::HTMLParser
Creates a new parser by parsing the specified string.
You may provide an optional hash table to control how the parsing is performed. Valid options are:
base_uri - The base url for the parsed document.
encoding - The document encoding, defaults to nil. Valid values
are the encoding constants defined on XML::Encoding.
options - Parser options. Valid values are the constants defined on
XML::HTMLParser::Options. Multiple options can be combined
by using Bitwise OR (|).
70 71 72 73 74 75 76 |
# File 'lib/libxml/html_parser.rb', line 70
#
# Builds an XML::HTMLParser::Context from +string+, applies the optional
# settings and returns a new parser wrapping that context.
#
# string  - passed unchanged to XML::HTMLParser::Context.string.
# options - Hash; only the :base_uri, :encoding and :options keys are read
#           here. A key that is missing (or falsy) leaves the context
#           untouched.
def self.string(string, options = {})
  context = XML::HTMLParser::Context.string(string)
  context.base_uri = options[:base_uri] if options[:base_uri]
  context.encoding = options[:encoding] if options[:encoding]
  context.options = options[:options] if options[:options]
  self.new(context)
end
Instance Method Details
#file=(value) ⇒ Object
:enddoc:
80 81 82 83 |
# File 'lib/libxml/html_parser.rb', line 80
#
# Deprecated writer. Emits a deprecation warning, then replaces @context
# with the context that XML::HTMLParser::Context.file creates for +value+.
def file=(value)
  warn "XML::HTMLParser#file is deprecated. Use XML::HTMLParser.file instead"
  replacement = XML::HTMLParser::Context.file(value)
  @context = replacement
end
#io=(value) ⇒ Object
85 86 87 88 |
# File 'lib/libxml/html_parser.rb', line 85
#
# Deprecated writer. Emits a deprecation warning, then replaces @context
# with the context that XML::HTMLParser::Context.io creates for +value+.
def io=(value)
  warn "XML::HTMLParser#io is deprecated. Use XML::HTMLParser.io instead"
  replacement = XML::HTMLParser::Context.io(value)
  @context = replacement
end
#parse ⇒ XML::Document
Parse the input XML and create an XML::Document with its content. If an error occurs, XML::Parser::ParseError is thrown.
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
# File 'ext/libxml/ruby_xml_html_parser.c', line 61
/*
 * call-seq:
 *    parser.parse -> XML::Document
 *
 * Runs htmlParseDocument on the libxml2 parser context held by the Ruby
 * context object stored under CONTEXT_ATTR, then wraps the resulting
 * document for Ruby.
 *
 * self - the XML::HTMLParser instance.
 *
 * Returns the wrapped ctxt->myDoc on success, or when the parse reported
 * -1 but the context is in recovery mode.
 * On a hard failure (-1 and not recovering) any partially built document
 * is freed and the context's last error is passed to rxml_raise.
 */
static VALUE rxml_html_parser_parse(VALUE self)
{
  xmlParserCtxtPtr ctxt;
  VALUE context = rb_ivar_get(self, CONTEXT_ATTR);

  Data_Get_Struct(context, xmlParserCtxt, ctxt);

  if (htmlParseDocument(ctxt) == -1 && !ctxt->recovery)
  {
    if (ctxt->myDoc)
    {
      xmlFreeDoc(ctxt->myDoc);
      /* The context object stays referenced by the parser's ivar after
       * this call, so clear the pointer: leaving it dangling would let a
       * later use of the same context touch or re-free the released
       * document. */
      ctxt->myDoc = NULL;
    }

    /* NOTE(review): rxml_raise is presumed to raise a Ruby exception and
     * not return here - confirm against its definition. */
    rxml_raise(&ctxt->lastError);
  }

  /* Invoke the Ruby-level #close on the context object before handing
   * the document back. */
  rb_funcall(context, rb_intern("close"), 0);

  return rxml_document_wrap(ctxt->myDoc);
}
|
#string=(value) ⇒ Object
90 91 92 93 |
# File 'lib/libxml/html_parser.rb', line 90
#
# Deprecated writer. Emits a deprecation warning, then replaces @context
# with the context that XML::HTMLParser::Context.string creates for +value+.
def string=(value)
  warn "XML::HTMLParser#string is deprecated. Use XML::HTMLParser.string instead"
  replacement = XML::HTMLParser::Context.string(value)
  @context = replacement
end