Module: UTF8Proc

Includes:
JRuby
Defined in:
lib/utf8_proc.rb,
lib/utf8_proc/jruby.rb,
lib/utf8_proc/version.rb,
lib/utf8_proc/benchmark.rb,
lib/utf8_proc/core_ext/string.rb,
lib/utf8_proc/core_ext/string_jruby.rb,
ext/utf8_proc/utf8_proc.c

Overview

Unicode string normalization library using UTF8Proc

Defined Under Namespace

Modules: Benchmark, JRuby, StringExtension

Constant Summary collapse

VERSION =

The gem version

"0.6.0".freeze
LIBRARY_VERSION =

The version of the utf8proc library in use

rb_str_freeze(
  rb_enc_str_new(libVersion, strlen(libVersion), enc_utf8)
)

Class Method Summary collapse

Class Method Details

.NFC(string) ⇒ String Also known as: nfc

Normalizes a string using NFC (Canonical Decomposition, followed by Canonical Composition)

Parameters:

  • string (String)

    the String to normalize

Returns:

  • (String)

    a normalized string

Raises:

  • (EncodingError)

    if string is not encoded in UTF-8 or US-ASCII


77
78
79
# File 'ext/utf8_proc/utf8_proc.c', line 77

/*
 * Backs the NFC module method: hands the given string to normInternal
 * together with the STABLE and COMPOSE option flags and returns whatever
 * normInternal returns. The receiver (self) is not used.
 */
static VALUE toNFC(VALUE self, VALUE string) {
  const int nfc_options = UTF8PROC_STABLE | UTF8PROC_COMPOSE;

  return normInternal(&string, nfc_options);
}

.NFD(string) ⇒ String Also known as: nfd

Normalizes a string using NFD (Canonical Decomposition)

Parameters:

  • string (String)

    the String to normalize

Returns:

  • (String)

    a normalized string

Raises:

  • (EncodingError)

    if string is not encoded in UTF-8 or US-ASCII


100
101
102
# File 'ext/utf8_proc/utf8_proc.c', line 100

/*
 * Backs the NFD module method: hands the given string to normInternal
 * together with the STABLE and DECOMPOSE option flags and returns whatever
 * normInternal returns. The receiver (self) is not used.
 */
static VALUE toNFD(VALUE self, VALUE string) {
  const int nfd_options = UTF8PROC_STABLE | UTF8PROC_DECOMPOSE;

  return normInternal(&string, nfd_options);
}

.NFKC(string) ⇒ String Also known as: nfkc

Normalizes a string using NFKC (Compatibility Decomposition, followed by Canonical Composition)

Parameters:

  • string (String)

    the String to normalize

Returns:

  • (String)

    a normalized string

Raises:

  • (EncodingError)

    if string is not encoded in UTF-8 or US-ASCII


123
124
125
# File 'ext/utf8_proc/utf8_proc.c', line 123

/*
 * Backs the NFKC module method: hands the given string to normInternal
 * together with the STABLE, COMPOSE and COMPAT option flags and returns
 * whatever normInternal returns. The receiver (self) is not used.
 */
static VALUE toNFKC(VALUE self, VALUE string) {
  const int nfkc_options =
      UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT;

  return normInternal(&string, nfkc_options);
}

.NFKC_CF(string) ⇒ String Also known as: nfkc_cf

Normalizes a string using NFKC (Compatibility Decomposition, followed by Canonical Composition) with case-folding

Parameters:

  • string (String)

    the String to normalize

Returns:

  • (String)

    a normalized string

Raises:

  • (EncodingError)

    if string is not encoded in UTF-8 or US-ASCII


169
170
171
# File 'ext/utf8_proc/utf8_proc.c', line 169

/*
 * Backs the NFKC_CF module method: hands the given string to normInternal
 * together with the STABLE, COMPOSE, COMPAT and CASEFOLD option flags and
 * returns whatever normInternal returns. The receiver (self) is not used.
 */
static VALUE toNFKC_CF(VALUE self, VALUE string) {
  const int nfkc_cf_options =
      UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD;

  return normInternal(&string, nfkc_cf_options);
}

.NFKD(string) ⇒ String Also known as: nfkd

Normalizes a string using NFKD (Compatibility Decomposition)

Parameters:

  • string (String)

    the String to normalize

Returns:

  • (String)

    a normalized string

Raises:

  • (EncodingError)

    if string is not encoded in UTF-8 or US-ASCII


146
147
148
# File 'ext/utf8_proc/utf8_proc.c', line 146

/*
 * Backs the NFKD module method: hands the given string to normInternal
 * together with the STABLE, DECOMPOSE and COMPAT option flags and returns
 * whatever normInternal returns. The receiver (self) is not used.
 */
static VALUE toNFKD(VALUE self, VALUE string) {
  const int nfkd_options =
      UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT;

  return normInternal(&string, nfkd_options);
}

.normalize(string, form = :nfc) ⇒ String

Normalizes a string according to one of the 5 possible forms

Parameters:

  • string (String)

    the String to normalize

  • form (:nfc, :nfd, :nfkc, :nfkd, :nfkc_cf) (defaults to: :nfc)

    the normalization form

Returns:

  • (String)

    a normalized string

Raises:

  • (EncodingError)

    if string is not encoded in UTF-8 or US-ASCII

  • (ArgumentError)

    if form is not one of the 5 valid forms


195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# File 'ext/utf8_proc/utf8_proc.c', line 195

/*
 * Backs the normalize(string, form = :nfc) module method.
 *
 * argv carries one required argument (the string) and one optional argument
 * (the form), as declared by the "11" rb_scan_args format. A missing or nil
 * form uses the same option flags as the NFC branch. Otherwise the form's ID
 * is compared against NFC, NFD, NFKC, NFKD and NFKC_CF (defined elsewhere in
 * this file; presumably the interned IDs of the corresponding symbols) and
 * the string is passed to normInternal with the matching option flags.
 *
 * Raises ArgumentError when form is neither nil nor one of the five accepted
 * symbols. This includes non-Symbol values such as Strings, which are
 * rejected before SYM2ID is applied so that the error is the same for every
 * invalid form. The receiver (self) is not used.
 */
static VALUE toNorm(int argc, VALUE *argv, VALUE self) {
  VALUE string;
  VALUE form;
  rb_scan_args(argc, argv, "11", &string, &form);

  if (NIL_P(form)) {
    return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_COMPOSE);
  }

  /* SYM2ID is only meaningful for Symbols; anything else falls through to
   * the ArgumentError below. */
  if (SYMBOL_P(form)) {
    const ID s_form = SYM2ID(form);

    if (s_form == NFC) {
      return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_COMPOSE);
    }
    if (s_form == NFD) {
      return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE);
    }
    if (s_form == NFKC) {
      return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
    }
    if (s_form == NFKD) {
      return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
    }
    if (s_form == NFKC_CF) {
      return normInternal(&string, UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD);
    }
  }

  /* rb_raise does not return, so no return statement follows it. */
  rb_raise(rb_eArgError, "%s",
           "Second argument must be one of [:nfc (default), :nfd, :nfkc, " \
           ":nfkd, :nfkc_cf]");
}