Module: Nokogumbo
- Defined in:
- lib/nokogumbo.rb,
lib/nokogumbo/version.rb,
ext/nokogumbo/nokogumbo.c
Constant Summary collapse
- DEFAULT_MAX_ATTRIBUTES =
The default maximum number of attributes per element.
400
- DEFAULT_MAX_ERRORS =
The default maximum number of errors for parsing a document or a fragment.
0
- DEFAULT_MAX_TREE_DEPTH =
The default maximum depth of the DOM tree produced by parsing a document or fragment.
400
- VERSION =
"2.0.5"
- LINE_SUPPORTED =
Private constant used for testing.
line_supported
Class Method Summary collapse
- .fragment ⇒ Object
-
.parse(input, url, max_attributes, max_errors, max_depth) ⇒ Object
Parse a string using gumbo_parse into a Nokogiri document.
Class Method Details
.fragment ⇒ Object
596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 |
# File 'ext/nokogumbo/nokogumbo.c', line 596
/*
 * Parse `tags` as an HTML fragment and hand the result to fragment_continue.
 *
 * self           - receiver (unused in this function).
 * doc_fragment   - object responding to `document`; forwarded as url_or_frag.
 * tags           - the input passed to perform_parse.
 * ctx            - the fragment context, one of:
 *                    nil     -> HTML "body";
 *                    String  -> "tag" or "ns:tag", where ns is svg, html or
 *                               math (case-insensitive); a bare "svg" or
 *                               "math" is placed in its own namespace;
 *                    other   -> an object responding to `name`, whose
 *                               ancestors are walked looking for an HTML form.
 * max_attributes - converted with NUM2INT into options.max_attributes.
 * max_errors     - converted with NUM2INT into options.max_errors.
 * max_depth      - converted with NUM2INT; negative becomes -1, otherwise
 *                  one is added to account for the HTML element.
 *
 * Returns Qnil.
 * Raises ArgumentError when a String ctx has an unrecognised namespace prefix.
 */
static VALUE fragment (
  VALUE self,
  VALUE doc_fragment,
  VALUE tags,
  VALUE ctx,
  VALUE max_attributes,
  VALUE max_errors,
  VALUE max_depth
) {
  ID name = rb_intern_const("name");
  const char *ctx_tag;
  GumboNamespaceEnum ctx_ns;
  GumboQuirksModeEnum quirks_mode;
  bool form = false;
  const char *encoding = NULL;

  if (NIL_P(ctx)) {
    ctx_tag = "body";
    ctx_ns = GUMBO_NAMESPACE_HTML;
  } else if (TYPE(ctx) == T_STRING) {
    ctx_tag = StringValueCStr(ctx);
    ctx_ns = GUMBO_NAMESPACE_HTML;
    size_t len = RSTRING_LEN(ctx);
    const char *colon = memchr(ctx_tag, ':', len);
    if (colon) {
      // The length of the prefix before ':' selects the candidate namespaces.
      switch (colon - ctx_tag) {
      case 3:
        if (st_strncasecmp(ctx_tag, "svg", 3) != 0)
          goto error;
        ctx_ns = GUMBO_NAMESPACE_SVG;
        break;
      case 4:
        if (st_strncasecmp(ctx_tag, "html", 4) == 0)
          ctx_ns = GUMBO_NAMESPACE_HTML;
        else if (st_strncasecmp(ctx_tag, "math", 4) == 0)
          ctx_ns = GUMBO_NAMESPACE_MATHML;
        else
          goto error;
        break;
      default:
      error:
        // "%.*s" (precision) limits the output to the prefix before the
        // colon; "%*s" would only set a minimum width and print "ns:tag".
        rb_raise(rb_eArgError, "Invalid context namespace '%.*s'", (int)(colon - ctx_tag), ctx_tag);
      }
      // Skip past the "ns:" prefix so ctx_tag names only the element.
      ctx_tag = colon+1;
    } else {
      // For convenience, put 'svg' and 'math' in their namespaces.
      if (len == 3 && st_strncasecmp(ctx_tag, "svg", 3) == 0)
        ctx_ns = GUMBO_NAMESPACE_SVG;
      else if (len == 4 && st_strncasecmp(ctx_tag, "math", 4) == 0)
        ctx_ns = GUMBO_NAMESPACE_MATHML;
    }

    // Check if it's a form.
    form = ctx_ns == GUMBO_NAMESPACE_HTML && st_strcasecmp(ctx_tag, "form") == 0;
  } else {
    ID element_ = rb_intern_const("element?");

    // Context fragment name.
    VALUE tag_name = rb_funcall(ctx, name, 0);
    assert(RTEST(tag_name));
    Check_Type(tag_name, T_STRING);
    ctx_tag = StringValueCStr(tag_name);

    // Context fragment namespace.
    ctx_ns = lookup_namespace(ctx, true);

    // Check for a form ancestor, including self.
    for (VALUE node = ctx;
         !NIL_P(node);
         node = rb_respond_to(node, parent) ? rb_funcall(node, parent, 0) : Qnil) {
      if (!RTEST(rb_funcall(node, element_, 0)))
        continue;
      VALUE element_name = rb_funcall(node, name, 0);
      // The length is already known to be 4, so a bounded compare suffices
      // and does not depend on the string buffer being NUL-terminated.
      if (RSTRING_LEN(element_name) == 4
          && !st_strncasecmp(RSTRING_PTR(element_name), "form", 4)
          && lookup_namespace(node, false) == GUMBO_NAMESPACE_HTML) {
        form = true;
        break;
      }
    }

    // Encoding.
    if (RSTRING_LEN(tag_name) == 14
        && !st_strcasecmp(ctx_tag, "annotation-xml")) {
      // rb_funcall takes (recv, mid, argc, args...); the explicit argc of 1
      // is required for the single "encoding" argument to be passed.
      VALUE enc = rb_funcall(ctx, rb_intern_const("[]"),
                             1,
                             rb_utf8_str_new_static("encoding", 8));
      if (RTEST(enc)) {
        Check_Type(enc, T_STRING);
        encoding = StringValueCStr(enc);
      }
    }
  }

  // Quirks mode.
  VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
  VALUE dtd = rb_funcall(doc, internal_subset, 0);
  if (NIL_P(dtd)) {
    quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
  } else {
    VALUE dtd_name = rb_funcall(dtd, name, 0);
    VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0);
    VALUE sysid = rb_funcall(dtd, rb_intern_const("system_id"), 0);
    // nil components are passed to Gumbo as NULL.
    quirks_mode = gumbo_compute_quirks_mode (
      NIL_P(dtd_name)? NULL:StringValueCStr(dtd_name),
      NIL_P(pubid)? NULL:StringValueCStr(pubid),
      NIL_P(sysid)? NULL:StringValueCStr(sysid)
    );
  }

  // Perform a fragment parse.
  int depth = NUM2INT(max_depth);
  GumboOptions options = kGumboDefaultOptions;
  options.max_attributes = NUM2INT(max_attributes);
  options.max_errors = NUM2INT(max_errors);
  // Add one to account for the HTML element.
  options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
  options.fragment_context = ctx_tag;
  options.fragment_namespace = ctx_ns;
  options.fragment_encoding = encoding;
  options.quirks_mode = quirks_mode;
  options.fragment_context_has_form_ancestor = form;

  GumboOutput *output = perform_parse(&options, tags);
  ParseArgs args = {
    .output = output,
    .input = tags,
    .url_or_frag = doc_fragment,
    .doc = (xmlDocPtr)extract_xml_node(doc),
  };
  VALUE parse_args = wrap_parse_args(&args);
  // parse_cleanup is registered as the ensure handler for fragment_continue;
  // both receive the same wrapped arguments.
  rb_ensure(fragment_continue, parse_args, parse_cleanup, parse_args);
  return Qnil;
}
|
.parse(input, url, max_attributes, max_errors, max_depth) ⇒ Object
Parse a string using gumbo_parse into a Nokogiri document.
517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 |
# File 'ext/nokogumbo/nokogumbo.c', line 517
/*
 * Parse `input` as a whole document and continue in parse_continue.
 *
 * self           - receiver (unused in this function).
 * input          - the input passed to perform_parse.
 * url            - forwarded to parse_continue as url_or_frag.
 * max_attributes - converted with NUM2INT into the Gumbo option of that name.
 * max_errors     - converted with NUM2INT into the Gumbo option of that name.
 * max_depth      - converted with NUM2INT into max_tree_depth.
 *
 * Returns the value rb_ensure yields for parse_continue.
 */
static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth) {
  // Start from Gumbo's defaults and override only the caller-supplied limits.
  GumboOptions opts = kGumboDefaultOptions;
  opts.max_attributes = NUM2INT(max_attributes);
  opts.max_errors = NUM2INT(max_errors);
  opts.max_tree_depth = NUM2INT(max_depth);

  // No document is supplied for a full parse, so .doc is NIL.
  ParseArgs state = {
    .output = perform_parse(&opts, input),
    .input = input,
    .url_or_frag = url,
    .doc = NIL,
  };

  // parse_cleanup is registered as the ensure handler for parse_continue;
  // both receive the same wrapped state.
  VALUE wrapped = wrap_parse_args(&state);
  VALUE result = rb_ensure(parse_continue, wrapped, parse_cleanup, wrapped);
  return result;
}
|