Class: ICU::UCharsetDetector

Inherits:
Object
  • Object
show all
Defined in:
lib/uchardet.rb,
ext/uchardet.c

Overview

:main: README

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#new(text = nil, declared_encoding = nil) ⇒ Object

Create a new charset detector. Optionally set the input text and declared encoding.



162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# File 'ext/uchardet.c', line 162

/*
 * Initializer behind ICU::UCharsetDetector#new.
 *
 * Takes zero, one or two Ruby arguments (text, declared_encoding). A nil or
 * omitted argument stores nil through the plain attribute writer; a non-nil
 * argument is routed through the set_text / set_declared_encoding helpers
 * (defined elsewhere in this file -- presumably they also hand the value to
 * the underlying ICU detector; confirm against their definitions).
 *
 * Returns self.
 */
static VALUE
UCharsetDetector_initialize(int argc, VALUE *argv, VALUE self)
{
    VALUE input_text;
    VALUE encoding_hint;

    /* "02": no required arguments, up to two optional ones. */
    rb_scan_args(argc, argv, "02", &input_text, &encoding_hint);

    if (!NIL_P(input_text)) {
        set_text(self, input_text);
    } else {
        UCharsetDetector_set_text(self, Qnil);
    }

    if (!NIL_P(encoding_hint)) {
        set_declared_encoding(self, encoding_hint);
    } else {
        UCharsetDetector_set_declared_encoding(self, Qnil);
    }

    return self;
}

Class Method Details

.detect(*args) ⇒ Object

Shortcut for ICU::UCharsetDetector#detect



7
8
9
# File 'lib/uchardet.rb', line 7

# Class-level convenience wrapper: builds a fresh detector and forwards
# every argument unchanged to its #detect.
def self.detect(*args)
  detector = new
  detector.detect(*args)
end

.detect_all(*args) ⇒ Object

Shortcut for ICU::UCharsetDetector#detect_all



12
13
14
# File 'lib/uchardet.rb', line 12

# Class-level convenience wrapper: builds a fresh detector and forwards
# every argument unchanged to its #detect_all.
def self.detect_all(*args)
  detector = new
  detector.detect_all(*args)
end

.detectable_charsets ⇒ Object

Shortcut for ICU::UCharsetDetector#detectable_charsets



17
18
19
# File 'lib/uchardet.rb', line 17

# Class-level convenience wrapper: builds a fresh detector and returns the
# result of its #detectable_charsets.
def self.detectable_charsets
  detector = new
  detector.detectable_charsets
end

Instance Method Details

#declared_encoding ⇒ Object

Get the declared encoding for charset detection.



101
102
103
104
105
# File 'ext/uchardet.c', line 101

/*
 * Reader for the declared encoding: returns whatever is currently stored in
 * the receiver's @declared_encoding instance variable.
 */
static VALUE
UCharsetDetector_get_declared_encoding(VALUE self)
{
    VALUE stored_encoding = rb_iv_get(self, "@declared_encoding");

    return stored_encoding;
}

#declared_encoding= ⇒ Object

Set the declared encoding for charset detection. The declared encoding of an input text is an encoding obtained by the user from an HTTP header or XML declaration or similar source that can be provided as an additional hint to the charset detector.



116
117
118
119
120
# File 'ext/uchardet.c', line 116

/*
 * Writer for the declared encoding: stores declared_encoding in the
 * receiver's @declared_encoding instance variable and returns the result of
 * rb_iv_set. Only the instance variable is touched here.
 */
static VALUE
UCharsetDetector_set_declared_encoding(VALUE self, VALUE declared_encoding)
{
    VALUE stored_encoding = rb_iv_set(self, "@declared_encoding", declared_encoding);

    return stored_encoding;
}

#detect(text = nil, declared_encoding = nil) ⇒ Object

Return the charset that best matches the supplied input data. If no match could be found, this method returns nil.

Note though, that because the detection only looks at the start of the input data, there is a possibility that the returned charset will fail to handle the full set of input data.

The function will fail if

  • no charset appears to match the data

  • no input text has been provided (with text or set with #text= )



198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
# File 'ext/uchardet.c', line 198

/*
 * Implementation of ICU::UCharsetDetector#detect.
 *
 * Accepts up to two optional Ruby arguments (text, declared_encoding), passes
 * both to the set_text / set_declared_encoding helpers, then asks ICU for the
 * single best match.
 *
 * Returns a Hash with the keys :encoding, :confidence and :language. When
 * ucsdet_detect yields no match, the entries keep their defaults ("", 0, "").
 *
 * ensure(status) is a helper defined elsewhere in this file; presumably it
 * raises a Ruby exception when status reports an ICU failure -- confirm.
 */
static VALUE
UCharsetDetector_detect(int argc, VALUE *argv, VALUE self)
{
    VALUE input_text;
    VALUE encoding_hint;
    VALUE result = rb_hash_new();
    UCharsetDetector *icu_detector;
    UErrorCode icu_status = U_ZERO_ERROR;
    const UCharsetMatch *best_match = NULL;
    const char *name = "";
    const char *language = "";
    int32_t confidence = 0;

    /* "02": no required arguments, up to two optional ones. */
    rb_scan_args(argc, argv, "02", &input_text, &encoding_hint);
    set_text(self, input_text);
    set_declared_encoding(self, encoding_hint);

    Data_Get_Struct(self, UCharsetDetector, icu_detector);

    best_match = ucsdet_detect(icu_detector, &icu_status);
    ensure(icu_status);

    if (best_match != NULL) {
        /* Check the status after every ICU accessor, as each one can fail. */
        name = ucsdet_getName(best_match, &icu_status);
        ensure(icu_status);

        confidence = ucsdet_getConfidence(best_match, &icu_status);
        ensure(icu_status);

        language = ucsdet_getLanguage(best_match, &icu_status);
        ensure(icu_status);
    }

    rb_hash_aset(result, ID2SYM(rb_intern("encoding")), rb_str_new2(name));
    rb_hash_aset(result, ID2SYM(rb_intern("confidence")), INT2NUM(confidence));
    rb_hash_aset(result, ID2SYM(rb_intern("language")), rb_str_new2(language));

    return result;
}

#detect_all(text = nil, declared_encoding = nil) ⇒ Object

Find all charset matches that appear to be consistent with the input, returning an array of results. The results are ordered with the best quality match first.

Because the detection only looks at a limited amount of the input byte data, some of the returned charsets may fail to handle all of the input data.

Return an error if

  • no charset appears to match the data

  • no input text has been provided (with text or set with #text= )



252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
# File 'ext/uchardet.c', line 252

/*
 * Implementation of ICU::UCharsetDetector#detect_all.
 *
 * Accepts up to two optional Ruby arguments (text, declared_encoding), passes
 * both to the set_text / set_declared_encoding helpers, then asks ICU for
 * every candidate match.
 *
 * Returns an Array with one Hash per match reported by ucsdet_detectAll, in
 * the order ICU returned them. Each Hash has the keys :encoding, :confidence
 * and :language; a NULL slot in the match array produces a Hash holding the
 * defaults ("", 0, "").
 *
 * ensure(status) is a helper defined elsewhere in this file; presumably it
 * raises a Ruby exception when status reports an ICU failure -- confirm.
 */
static VALUE
UCharsetDetector_detect_all(int argc, VALUE *argv, VALUE self)
{
    VALUE input_text;
    VALUE encoding_hint;
    VALUE results = rb_ary_new();
    UCharsetDetector *icu_detector;
    UErrorCode icu_status = U_ZERO_ERROR;
    const UCharsetMatch **candidates = NULL;
    int32_t candidate_count = 0;
    int idx;

    /* "02": no required arguments, up to two optional ones. */
    rb_scan_args(argc, argv, "02", &input_text, &encoding_hint);
    set_text(self, input_text);
    set_declared_encoding(self, encoding_hint);

    Data_Get_Struct(self, UCharsetDetector, icu_detector);

    candidates = ucsdet_detectAll(icu_detector, &candidate_count, &icu_status);
    ensure(icu_status);

    for (idx = 0; idx < candidate_count; idx++) {
        const char *name = "";
        const char *language = "";
        int32_t confidence = 0;
        VALUE entry = rb_hash_new();

        if (candidates[idx] != NULL) {
            /* Check the status after every ICU accessor, as each one can fail. */
            name = ucsdet_getName(candidates[idx], &icu_status);
            ensure(icu_status);

            confidence = ucsdet_getConfidence(candidates[idx], &icu_status);
            ensure(icu_status);

            language = ucsdet_getLanguage(candidates[idx], &icu_status);
            ensure(icu_status);
        }

        rb_hash_aset(entry, ID2SYM(rb_intern("encoding")), rb_str_new2(name));
        rb_hash_aset(entry, ID2SYM(rb_intern("confidence")), INT2NUM(confidence));
        rb_hash_aset(entry, ID2SYM(rb_intern("language")), rb_str_new2(language));

        rb_ary_push(results, entry);
    }

    return results;
}

#detectable_charsets ⇒ Object

Get array of names of all detectable charsets that are known to the charset detection service.



304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
# File 'ext/uchardet.c', line 304

/*
 * Implementation of ICU::UCharsetDetector#detectable_charsets.
 *
 * Asks ICU for the enumeration of detectable charset names and copies each
 * name into a new Ruby Array of Strings, which is returned.
 *
 * Error handling: the status is checked after EVERY uenum_next call,
 * including the one that returns NULL, so a failure that ends the iteration
 * is no longer mistaken for a normal end of enumeration. On failure the
 * enumeration is closed before ensure() runs, so it is not leaked if ensure()
 * raises (ensure is defined elsewhere in this file; presumably it raises a
 * Ruby exception on ICU failure -- confirm).
 */
static VALUE
UCharsetDetector_get_detectable_charsets(VALUE self)
{
    UCharsetDetector *detector;
    UErrorCode status = U_ZERO_ERROR;
    UEnumeration *charsets = NULL;
    const char *charset_name = "";
    int32_t result_length = 0;
    VALUE ary = rb_ary_new();

    Data_Get_Struct(self, UCharsetDetector, detector);

    charsets = ucsdet_getAllDetectableCharsets(detector, &status);
    ensure(status);

    for (;;) {
        charset_name = uenum_next(charsets, &result_length, &status);

        /* status > U_ZERO_ERROR is the same test as ICU's U_FAILURE(status). */
        if (status > U_ZERO_ERROR) {
            /* Release the enumeration first: ensure() may not return. */
            uenum_close(charsets);
            charsets = NULL;
        }
        ensure(status);

        /* Stop on a failure ensure() returned from, or at end of enumeration. */
        if (charsets == NULL || charset_name == NULL) {
            break;
        }

        rb_ary_push(ary, rb_str_new2(charset_name));
    }

    /* uenum_close is documented to do nothing when given NULL. */
    uenum_close(charsets);

    return ary;
}

#input_filtered= ⇒ Object

Enable filtering of input text. If filtering is enabled, text within angle brackets (“<” and “>”) will be removed before detection, which will remove most HTML or XML markup.



61
62
63
64
65
66
67
68
69
# File 'ext/uchardet.c', line 61

/*
 * Implementation of ICU::UCharsetDetector#input_filtered=.
 *
 * Turns the ICU input filter on when flag is truthy by Ruby's rules (RTEST)
 * and off otherwise. Returns self.
 */
static VALUE
UCharsetDetector_set_input_filtered(VALUE self, VALUE flag)
{
    UCharsetDetector *icu_detector;

    Data_Get_Struct(self, UCharsetDetector, icu_detector);

    if (RTEST(flag)) {
        ucsdet_enableInputFilter(icu_detector, TRUE);
    } else {
        ucsdet_enableInputFilter(icu_detector, FALSE);
    }

    return self;
}

#input_filteredBoolean

Return the input filtering flag value for this charset detector.

Returns:

  • (Boolean)


44
45
46
47
48
49
50
51
# File 'ext/uchardet.c', line 44

/*
 * Reports whether the ICU input filter is enabled for this detector.
 *
 * Returns Qtrue when ucsdet_isInputFilterEnabled is non-zero, Qfalse
 * otherwise.
 */
static VALUE
UCharsetDetector_get_input_filtered(VALUE self)
{
    UCharsetDetector *icu_detector;

    Data_Get_Struct(self, UCharsetDetector, icu_detector);

    if (ucsdet_isInputFilterEnabled(icu_detector)) {
        return Qtrue;
    }
    return Qfalse;
}

#text ⇒ Object

Get input text for this detector.



77
78
79
80
81
# File 'ext/uchardet.c', line 77

/*
 * Reader for the input text: returns whatever is currently stored in the
 * receiver's @text instance variable.
 */
static VALUE
UCharsetDetector_get_text(VALUE self)
{
    VALUE stored_text = rb_iv_get(self, "@text");

    return stored_text;
}

#text= ⇒ Object

Set input text for this detector.



89
90
91
92
93
# File 'ext/uchardet.c', line 89

/*
 * Writer for the input text: stores text in the receiver's @text instance
 * variable and returns the result of rb_iv_set. Only the instance variable
 * is touched here.
 */
static VALUE
UCharsetDetector_set_text(VALUE self, VALUE text)
{
    VALUE stored_text = rb_iv_set(self, "@text", text);

    return stored_text;
}