Module: Gonzui::TextTokenizer

Defined in:
ext/texttokenizer/texttokenizer.c

Class Method Summary collapse

Class Method Details

.each_word(text) ⇒ Object

Iterate over each word, yielding the word and its byte offset within the text. word: [a-zA-Z0-9_]+ or a single multi-byte UTF-8 character



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'ext/texttokenizer/texttokenizer.c', line 60

/*
 * Iterate over each word of +text+, yielding two values to the block:
 * the word (a new String) and its byte offset from the start of the text.
 *
 * A word is either a run of ASCII bytes matching isalnum() or '_', or the
 * byte sequence starting at a byte >= 0x80 whose length is reported by
 * utf8len().
 *
 * text is converted with rb_obj_as_string(), so non-String arguments are
 * accepted. Always returns nil.
 *
 * NOTE(review): skip() and utf8len() are defined elsewhere in this file.
 * This loop assumes skip() stops only on a byte that starts a word and that
 * utf8len() returns at least 1 without running past eot -- confirm.
 * NOTE(review): the raw buffer pointers are held across the yield; a block
 * that mutates the original String could invalidate them -- confirm whether
 * callers can do that.
 */
static VALUE texttokenizer_each_word(VALUE obj, VALUE text)
{
  VALUE str = rb_obj_as_string(text);
  unsigned char *beg = (unsigned char *)RSTRING_PTR(str);
  unsigned char *eot = beg + RSTRING_LEN(str);
  unsigned char *s = skip(beg, eot);

  while (s < eot) {
    unsigned char *b = s;
    if (*s >= 0x80) {
      /* Non-ASCII lead byte: the whole multi-byte sequence is one word. */
      s += utf8len(s, eot);
    } else {
      /* ASCII: consume the run of alphanumerics and underscores. */
      while (s < eot && (isalnum(*s) || *s == '_')) {
        s++;
      }
    }
    /* LONG2NUM range-checks the offset, unlike INT2FIX. */
    rb_yield_values(2, rb_str_new((const char *)b, s - b), LONG2NUM(b - beg));
    s = skip(s, eot);
  }

  /*
   * Only pointers into str's buffer are used above; keep the VALUE itself
   * visible to the GC until iteration is finished, since str may be a
   * temporary created by rb_obj_as_string().
   */
  RB_GC_GUARD(str);
  return Qnil;
}