Module: Rejectu

Defined in:
ext/rejectu/rejectu.c

Class Method Summary collapse

Class Method Details

.scrub(*args) ⇒ Object



169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# File 'ext/rejectu/rejectu.c', line 169

/*
 * Rejectu.scrub(input [, token])
 *
 * Takes one required argument (the input) and one optional argument (the
 * replacement token), per the "11" rb_scan_args format. Input that is_valid
 * accepts is returned as-is; otherwise the result of do_scrub is returned.
 * When the token is omitted or nil, defaultToken is used in its place.
 */
static VALUE
scrub(int argc, VALUE *argv, VALUE self)
{
  VALUE input, token;

  rb_scan_args(argc, argv, "11", &input, &token);

  if (is_valid(self, input) != Qtrue) {
    /* Fall back to the module-wide default when no token was supplied. */
    return do_scrub(input, (Qnil == token) ? defaultToken : token);
  }

  return input;
}

.scrub!(*args) ⇒ Object



186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# File 'ext/rejectu/rejectu.c', line 186

/*
 * Rejectu.scrub!(input [, token])
 *
 * In-place counterpart of scrub: takes one required argument (the input) and
 * one optional argument (the replacement token). When is_valid rejects the
 * input, the contents of input are replaced with the non-nil result of
 * do_scrub. The same input object is always returned.
 */
static VALUE
scrub_bang(int argc, VALUE *argv, VALUE self)
{
  VALUE input, token, repl;

  rb_scan_args(argc, argv, "11", &input, &token);

  /* Nothing to rewrite when the input already passes validation. */
  if (is_valid(self, input)) {
    return input;
  }

  /* An omitted or nil token falls back to defaultToken. */
  repl = do_scrub(input, NIL_P(token) ? defaultToken : token);
  if (NIL_P(repl)) {
    return input;
  }

  rb_str_replace(input, repl);
  return input;
}

.valid?(str) ⇒ Boolean

Returns:

  • (Boolean)


34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'ext/rejectu/rejectu.c', line 34

/*
 * Rejectu.valid?(str)
 *
 * Returns Qfalse as soon as a byte whose top four bits are all set is found
 * (0xf0..0xff, i.e. the lead byte of a four-byte UTF-8 sequence that encodes
 * a supplementary plane character), and Qtrue when no such byte exists.
 *
 * str is first passed to validate_utf8_input (defined elsewhere in this file;
 * presumably it raises for unsuitable input -- confirm against its
 * definition). self is the method receiver and is not used.
 *
 * When compiled with SSE2 the string is scanned in three phases: a bytewise
 * prefix up to the first 16-byte aligned address, aligned 16-byte chunks, and
 * a bytewise tail. Without SSE2 the whole string goes through the bytewise
 * loop.
 */
static VALUE
is_valid(VALUE self, VALUE str)
{
  unsigned char *p, *end;
#ifdef __SSE2__
  __m128i chunk;
  int mask;
#endif

  validate_utf8_input(str);

  /* RSTRING_PTR/RSTRING_END yield char *; bytes are inspected as unsigned. */
  p = (unsigned char *) RSTRING_PTR(str);
  end = (unsigned char *) RSTRING_END(str);

#ifdef __SSE2__
  /* advance p until it's 16 byte aligned, as _mm_load_si128 requires */
  while (((uintptr_t) p & 0xf) != 0 && p < end) {
    if ((*p & 0xf0) == 0xf0) {
      return Qfalse;
    }
    p++;
  }

  /* only full 16-byte chunks are loaded, so no read goes past end */
  while (end - p >= 16) {
    chunk = _mm_load_si128((__m128i *) p);
    /* check if the top bit of any of the bytes is set, which is 1 if the character is multibyte */
    mask = _mm_movemask_epi8(chunk);
    if (mask) {
      /*
       * If there's a multi-byte character somewhere in this chunk, we need to check if it's a codepoint
       * from the supplementary plane (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx).
       *
       * 1) Unpack the chunk into two halves (16-bit integers)
       * 2) Shift each 16-bit integer 4 bits to the right
       * 3) Check if the value is 0xf (first four bits set to 1)
       * 4) Check the high bit of each 8-bit integer
       *
       * If the result of step 4 is non-zero, the part has a supplementary plane character.
       *
       * Example: the string "hello test! \xf0\x9f\x98\x80" (13 characters, 16 bytes)
       *
       * UTF-8 representation:
       * h        e        l        l        o        <space>  t        e
       * 01101000 01100101 01101100 01101100 01101111 00100000 01110100 01100101
       *
       * s        t        !        <space>  😀 GRINNING FACE (1F600)
       * 01110011 01110100 00100001 00100000 11110000 10011111 10011000 10000000
       *
       * Low part:
       *
       * 1) Unpack the low part into 16 bit values    = 0x00680065006c006c006f002000740065
       * 2) Shift each 16 bit value to the right by 4 = 0x00060006000600060006000200070006
       * 3) Compare each 16 bit value to 0xf          = 0x00000000000000000000000000000000
       * 4) Check the high bit of each 8-bit value    = 0
       *
       * No supplementary plane characters in this part
       *
       * High part:
       *
       * 1) Unpack the high part into 16 bit values   = 0x007300740021002000f0009f00980080
       * 2) Shift each 16 bit value to the right by 4 = 0x0007000700020002000f000900090008
       * 3) Compare each 16 bit value to 0xf          = 0x0000000000000000ffff000000000000
       * 4) Check the high bit of each 8-bit value    = 0xc0 (0b0000000011000000)
       *
       * The result is non-zero, so this part has a supplementary plane character.
       *
       */
      if (has_utf8_supplementary_planes(_mm_unpacklo_epi8(chunk, _mm_setzero_si128())) ||
          has_utf8_supplementary_planes(_mm_unpackhi_epi8(chunk, _mm_setzero_si128()))) {
        return Qfalse;
      }
    }

    p += 16;
  }
#endif

  /* bytewise scan of whatever is left (the whole string without SSE2) */
  for (; p < end; p++) {
    if ((*p & 0xf0) == 0xf0) {
      return Qfalse;
    }
  }

  return Qtrue;
}