Module: CompactEncDet
- Defined in:
- lib/compact_enc_det/version.rb,
ext/compact_enc_det/compact_enc_det.cc
Defined Under Namespace
Classes: DetectEncodingResult
Constant Summary collapse
- VERSION =
"1.0.0"
Class Method Summary collapse
-
.detect_encoding(*args) ⇒ Object
Ruby binding for the CompactEncDet::DetectEncoding C++ function.
Class Method Details
.detect_encoding(*args) ⇒ Object
Ruby binding for the CompactEncDet::DetectEncoding C++ function.
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
# File 'ext/compact_enc_det/compact_enc_det.cc', line 35
/*
 * Ruby binding for the CompactEncDet::DetectEncoding C++ function.
 *
 * Ruby-level arguments, in order. Only `text` is required; every other
 * argument may be omitted or passed as nil to get the default shown.
 *
 *   text                        String whose bytes are examined.
 *   text_length                 Integer number of bytes of `text` to examine
 *                               (default: the byte length of `text`).
 *   url_hint                    String forwarded to the detector (default: null).
 *   http_charset_hint           String forwarded to the detector (default: null).
 *   meta_charset_hint           String forwarded to the detector (default: null).
 *   encoding_hint               Integer (default: UNKNOWN_ENCODING).
 *   language_hint               Integer cast to Language
 *                               (default: UNKNOWN_LANGUAGE).
 *   corpus_type                 Integer cast to CompactEncDet::TextCorpusType
 *                               (default: CompactEncDet::WEB_CORPUS).
 *   ignore_7bit_mail_encodings  Truthy/falsy flag (default: false).
 *
 * Returns a new DetectEncodingResult whose @encoding, @bytes_consumed and
 * @is_reliable instance variables hold the detector's outputs.
 *
 * Raises TypeError when `text` is not a String, and ArgumentError when
 * `text_length` is negative or larger than the byte length of `text`.
 */
static VALUE detect_encoding(int argc, VALUE *argv, VALUE self)
{
    VALUE text = Qnil,
          text_length = Qnil,
          url_hint = Qnil,
          http_charset_hint = Qnil,
          meta_charset_hint = Qnil,
          encoding_hint = Qnil,
          language_hint = Qnil,
          corpus_type = Qnil,
          ignore_7bit_mail_encodings = Qnil;

    // One required argument plus eight optional ones. The format string has
    // to match the number of output pointers below, otherwise the trailing
    // VALUE is never assigned by rb_scan_args.
    rb_scan_args(argc, argv, "18",
                 &text,
                 &text_length,
                 &url_hint,
                 &http_charset_hint,
                 &meta_charset_hint,
                 &encoding_hint,
                 &language_hint,
                 &corpus_type,
                 &ignore_7bit_mail_encodings);

    // Ensure the text argument is a Ruby string
    Check_Type(text, T_STRING);

    // Convert every hint before looking at the text buffer: these conversions
    // may raise, and may call back into Ruby (to_str / to_int) for non-native
    // argument types, so the text pointer is only taken once they are done.
    const char *c_url_hint =
        NIL_P(url_hint) ? nullptr : StringValueCStr(url_hint);
    const char *c_http_charset_hint =
        NIL_P(http_charset_hint) ? nullptr : StringValueCStr(http_charset_hint);
    const char *c_meta_charset_hint =
        NIL_P(meta_charset_hint) ? nullptr : StringValueCStr(meta_charset_hint);
    const int c_encoding_hint =
        NIL_P(encoding_hint) ? UNKNOWN_ENCODING : NUM2INT(encoding_hint);
    const Language c_language_hint =
        NIL_P(language_hint) ? UNKNOWN_LANGUAGE
                             : static_cast<Language>(NUM2INT(language_hint));
    const CompactEncDet::TextCorpusType c_corpus_type =
        NIL_P(corpus_type)
            ? CompactEncDet::WEB_CORPUS
            : static_cast<CompactEncDet::TextCorpusType>(NUM2INT(corpus_type));
    const bool c_ignore_7bit = RTEST(ignore_7bit_mail_encodings); // nil => false

    // RSTRING_LENINT raises instead of silently truncating a length that does
    // not fit in an int.
    const int c_text_length =
        NIL_P(text_length) ? RSTRING_LENINT(text) : NUM2INT(text_length);

    // Never let the detector read outside the Ruby string's buffer.
    if (c_text_length < 0 || c_text_length > RSTRING_LEN(text)) {
        rb_raise(rb_eArgError,
                 "text_length (%d) must be between 0 and the byte length of text (%ld)",
                 c_text_length, RSTRING_LEN(text));
    }

    const char *c_text = RSTRING_PTR(text);

    // Output variables filled in by the detector
    int bytes_consumed = 0;
    bool is_reliable = false;

    // Detect the encoding using CompactEncDet::DetectEncoding
    Encoding encoding = CompactEncDet::DetectEncoding(
        c_text,
        c_text_length,
        c_url_hint,
        c_http_charset_hint,
        c_meta_charset_hint,
        c_encoding_hint,
        c_language_hint,
        c_corpus_type,
        c_ignore_7bit,
        &bytes_consumed,
        &is_reliable);

    // Convert the encoding enum to its MIME name, then look that name up via
    // Ruby's Encoding.find.
    // NOTE(review): Encoding.find presumably raises for names Ruby does not
    // know; confirm whether every MimeEncodingName result is resolvable.
    const char *encoding_mime_name = MimeEncodingName(encoding);
    VALUE rb_encoding_mime_name = rb_str_new_cstr(encoding_mime_name);
    VALUE rb_encoding = rb_funcall(rb_cEncoding, rb_intern("find"), 1, rb_encoding_mime_name);

    // Package the three outputs into a DetectEncodingResult instance
    VALUE result = rb_class_new_instance(0, NULL, rb_cDetectEncodingResult);
    rb_iv_set(result, "@encoding", rb_encoding);
    rb_iv_set(result, "@bytes_consumed", rb_int_new(bytes_consumed));
    rb_iv_set(result, "@is_reliable", is_reliable ? Qtrue : Qfalse);
    return result;
}
|