Module: Gonzui::TextTokenizer
- Defined in:
- ext/texttokenizer/texttokenizer.c
Class Method Summary collapse
-
.each_word(text) ⇒ Object
Iterate over each word.
Class Method Details
.each_word(text) ⇒ Object
Iterate over each word. A word is either a run of ASCII word characters ([a-zA-Z0-9_]+) or a single multi-byte UTF-8 character.
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
# File 'ext/texttokenizer/texttokenizer.c', line 60
/*
 * Iterate over each word of +text+.
 *
 * +text+ is converted with rb_obj_as_string() and scanned byte by byte.
 * A word is either
 *   - a run of ASCII bytes matching [a-zA-Z0-9_]+, or
 *   - a single multi-byte UTF-8 character (length decided by utf8len()).
 * Bytes between words are passed over by skip().
 *
 * For every word the block is yielded two values: a new String holding the
 * word, and the word's byte offset from the start of the string.
 *
 * Always returns nil.
 *
 * NOTE(review): raw pointers into the string buffer are held across the
 * yield, so a block that resizes the very String it is iterating would
 * invalidate them -- callers should not mutate +text+ from the block.
 */
static VALUE texttokenizer_each_word(VALUE obj, VALUE text)
{
    /*
     * rb_obj_as_string() may create a brand-new String when +text+ is not
     * one already.  After the pointers below are taken, `str` itself is
     * never referenced again, so an optimizing compiler could drop it and
     * let the GC reclaim the buffer while we are still walking it.
     * `volatile` keeps the VALUE visible to the conservative stack scan
     * for the lifetime of this function.
     */
    volatile VALUE str = rb_obj_as_string(text);
    unsigned char *beg = (unsigned char *)RSTRING_PTR(str);
    unsigned char *eot = beg + RSTRING_LEN(str);
    unsigned char *s = skip(beg, eot);

    while (s < eot) {
        unsigned char *b = s;

        if (*s >= 0x80) {
            /* Lead byte of a multi-byte character: one character, one word. */
            s += utf8len(s, eot);
        } else {
            /*
             * ASCII word run.  The explicit `< 0x80` test stops the run at
             * any non-ASCII byte: isalnum() is locale-dependent and may
             * accept high bytes in single-byte locales, which would glue
             * UTF-8 bytes onto the preceding ASCII word.
             */
            while (s < eot && *s < 0x80 && (isalnum(*s) || *s == '_')) {
                s++;
            }
        }

        if (s == b) {
            /*
             * Defensive: nothing was consumed (would require skip() to stop
             * on a non-word ASCII byte or utf8len() to return 0 -- neither
             * helper is visible here).  Step over the byte instead of
             * yielding an empty word at the same offset forever.
             */
            s = skip(b + 1, eot);
            continue;
        }

        rb_yield_values(2, rb_str_new((const char *)b, s - b),
                        INT2FIX(b - beg));
        s = skip(s, eot);
    }
    return Qnil;
}
|