Module: Nokogumbo

Defined in:
lib/nokogumbo.rb,
lib/nokogumbo/version.rb,
ext/nokogumbo/nokogumbo.c

Constant Summary collapse

DEFAULT_MAX_ATTRIBUTES =

The default maximum number of attributes per element.

400
DEFAULT_MAX_ERRORS =

The default maximum number of errors for parsing a document or a fragment.

0
DEFAULT_MAX_TREE_DEPTH =

The default maximum depth of the DOM tree produced by parsing a document or fragment.

400
VERSION =
"2.0.5"
LINE_SUPPORTED =

Add private constant for testing.

line_supported

Class Method Summary collapse

Class Method Details

.fragment(doc_fragment, tags, ctx, max_attributes, max_errors, max_depth) ⇒ Object



596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
# File 'ext/nokogumbo/nokogumbo.c', line 596

/*
 * Parse `tags` as an HTML fragment and run fragment_continue on the result.
 *
 * self           - receiver (unused in this function).
 * doc_fragment   - object responding to `document`; it is also handed to the
 *                  continuation as `url_or_frag`.
 * tags           - the input handed to perform_parse.
 * ctx            - the fragment context, one of:
 *                    nil     -> context element "body" in the HTML namespace;
 *                    String  -> context element name, optionally prefixed with
 *                               "svg:", "math:" or "html:" (case-insensitive);
 *                               a bare "svg" or "math" is placed in its own
 *                               namespace for convenience;
 *                    other   -> a node-like object that is sent `name`,
 *                               `element?`, `[]` and (if it responds to it)
 *                               `parent`.
 * max_attributes - Integer copied into GumboOptions.max_attributes.
 * max_errors     - Integer copied into GumboOptions.max_errors.
 * max_depth      - Integer; a negative value is passed on as -1, otherwise
 *                  one is added to account for the HTML element.
 *
 * Returns Qnil.
 * Raises ArgumentError when a String context carries an unrecognised
 * namespace prefix; conversion/type errors from NUM2INT, Check_Type and
 * StringValueCStr propagate unchanged.
 */
static VALUE fragment (
  VALUE self,
  VALUE doc_fragment,
  VALUE tags,
  VALUE ctx,
  VALUE max_attributes,
  VALUE max_errors,
  VALUE max_depth
) {
  ID name = rb_intern_const("name");
  const char *ctx_tag;
  GumboNamespaceEnum ctx_ns;
  GumboQuirksModeEnum quirks_mode;
  bool form = false;
  const char *encoding = NULL;
  // ctx_tag and encoding may point into the buffers of these Ruby strings.
  // They live at function scope so they can be GC-guarded until the parse,
  // which is the last consumer of those raw pointers, has finished.
  VALUE tag_name = Qnil;
  VALUE enc = Qnil;

  if (NIL_P(ctx)) {
    ctx_tag = "body";
    ctx_ns = GUMBO_NAMESPACE_HTML;
  } else if (TYPE(ctx) == T_STRING) {
    ctx_tag = StringValueCStr(ctx);
    ctx_ns = GUMBO_NAMESPACE_HTML;
    size_t len = RSTRING_LEN(ctx);
    const char *colon = memchr(ctx_tag, ':', len);
    if (colon) {
      // The prefix length selects which namespace names are candidates.
      switch (colon - ctx_tag) {
      case 3:
        if (st_strncasecmp(ctx_tag, "svg", 3) != 0)
          goto error;
        ctx_ns = GUMBO_NAMESPACE_SVG;
        break;
      case 4:
        if (st_strncasecmp(ctx_tag, "html", 4) == 0)
          ctx_ns = GUMBO_NAMESPACE_HTML;
        else if (st_strncasecmp(ctx_tag, "math", 4) == 0)
          ctx_ns = GUMBO_NAMESPACE_MATHML;
        else
          goto error;
        break;
      default:
      error:
        // "%.*s" (precision) limits the output to the prefix before the
        // colon; "%*s" would only set a minimum width and print everything.
        rb_raise(rb_eArgError, "Invalid context namespace '%.*s'", (int)(colon - ctx_tag), ctx_tag);
      }
      // Skip past "prefix:" so ctx_tag is the bare element name.
      ctx_tag = colon+1;
    } else {
      // For convenience, put 'svg' and 'math' in their namespaces.
      if (len == 3 && st_strncasecmp(ctx_tag, "svg", 3) == 0)
        ctx_ns = GUMBO_NAMESPACE_SVG;
      else if (len == 4 && st_strncasecmp(ctx_tag, "math", 4) == 0)
        ctx_ns = GUMBO_NAMESPACE_MATHML;
    }

    // Check if it's a form.
    form = ctx_ns == GUMBO_NAMESPACE_HTML && st_strcasecmp(ctx_tag, "form") == 0;
  } else {
    ID element_ = rb_intern_const("element?");

    // Context fragment name.
    tag_name = rb_funcall(ctx, name, 0);
    assert(RTEST(tag_name));
    Check_Type(tag_name, T_STRING);
    ctx_tag = StringValueCStr(tag_name);

    // Context fragment namespace.
    ctx_ns = lookup_namespace(ctx, true);

    // Check for a form ancestor, including self.
    for (VALUE node = ctx;
         !NIL_P(node);
         node = rb_respond_to(node, parent) ? rb_funcall(node, parent, 0) : Qnil) {
      if (!RTEST(rb_funcall(node, element_, 0)))
        continue;
      VALUE element_name = rb_funcall(node, name, 0);
      // Validate before touching the string internals.
      Check_Type(element_name, T_STRING);
      // The length is known to be 4, so a bounded compare is sufficient and
      // does not rely on the buffer being NUL-terminated.
      if (RSTRING_LEN(element_name) == 4
          && !st_strncasecmp(RSTRING_PTR(element_name), "form", 4)
          && lookup_namespace(node, false) == GUMBO_NAMESPACE_HTML) {
        form = true;
        break;
      }
    }

    // Encoding: only read the "encoding" attribute of an annotation-xml
    // context element.
    if (RSTRING_LEN(tag_name) == 14
        && !st_strcasecmp(ctx_tag, "annotation-xml")) {
      enc = rb_funcall(ctx, rb_intern_const("[]"),
                       rb_utf8_str_new_static("encoding", 8));
      if (RTEST(enc)) {
        Check_Type(enc, T_STRING);
        encoding = StringValueCStr(enc);
      }
    }
  }

  // Quirks mode: derived from the owning document's internal subset, or
  // no-quirks when the document has none.
  VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
  VALUE dtd = rb_funcall(doc, internal_subset, 0);
  if (NIL_P(dtd)) {
    quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
  } else {
    VALUE dtd_name = rb_funcall(dtd, name, 0);
    VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0);
    VALUE sysid = rb_funcall(dtd, rb_intern_const("system_id"), 0);
    quirks_mode = gumbo_compute_quirks_mode (
      NIL_P(dtd_name)? NULL:StringValueCStr(dtd_name),
      NIL_P(pubid)? NULL:StringValueCStr(pubid),
      NIL_P(sysid)? NULL:StringValueCStr(sysid)
    );
  }

  // Perform a fragment parse.
  int depth = NUM2INT(max_depth);
  GumboOptions options = kGumboDefaultOptions;
  options.max_attributes = NUM2INT(max_attributes);
  options.max_errors = NUM2INT(max_errors);
  // Add one to account for the HTML element.
  options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
  options.fragment_context = ctx_tag;
  options.fragment_namespace = ctx_ns;
  options.fragment_encoding = encoding;
  options.quirks_mode = quirks_mode;
  options.fragment_context_has_form_ancestor = form;

  GumboOutput *output = perform_parse(&options, tags);
  // Keep the strings backing ctx_tag / encoding reachable up to this point.
  RB_GC_GUARD(tag_name);
  RB_GC_GUARD(enc);

  ParseArgs args = {
    .output = output,
    .input = tags,
    .url_or_frag = doc_fragment,
    .doc = (xmlDocPtr)extract_xml_node(doc),
  };
  VALUE parse_args = wrap_parse_args(&args);
  // parse_cleanup is the ensure handler, so it runs even if
  // fragment_continue raises.
  rb_ensure(fragment_continue, parse_args, parse_cleanup, parse_args);
  return Qnil;
}

.parse(input, url, max_attributes, max_errors, max_depth) ⇒ Object

Parse a string with gumbo_parse and build a Nokogiri document from the result.



517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
# File 'ext/nokogumbo/nokogumbo.c', line 517

/*
 * Parse `input` as a whole document.
 *
 * The three limits are converted with NUM2INT (which raises on values that do
 * not fit an int) and copied over the default Gumbo options. The input is
 * then parsed with perform_parse, and parse_continue is run on the wrapped
 * arguments with parse_cleanup registered as the ensure handler.
 *
 * Returns whatever rb_ensure yields for parse_continue.
 */
static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth) {
  // Convert in argument order so the first invalid limit is the one reported.
  const int attribute_limit = NUM2INT(max_attributes);
  const int error_limit = NUM2INT(max_errors);
  const int depth_limit = NUM2INT(max_depth);

  GumboOptions opts = kGumboDefaultOptions;
  opts.max_attributes = attribute_limit;
  opts.max_errors = error_limit;
  opts.max_tree_depth = depth_limit;

  // No document exists yet for a full parse, hence the NIL doc.
  ParseArgs state = {
    .output = perform_parse(&opts, input),
    .input = input,
    .url_or_frag = url,
    .doc = NIL,
  };
  VALUE wrapped = wrap_parse_args(&state);

  return rb_ensure(parse_continue, wrapped, parse_cleanup, wrapped);
}