Module: CompactEncDet

Defined in:
lib/compact_enc_det/version.rb,
ext/compact_enc_det/compact_enc_det.cc

Defined Under Namespace

Classes: DetectEncodingResult

Constant Summary collapse

VERSION =
"1.0.0"

Class Method Summary collapse

Class Method Details

.detect_encoding(*args) ⇒ Object

Ruby wrapper for the CompactEncDet::DetectEncoding C++ function.



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'ext/compact_enc_det/compact_enc_det.cc', line 35

/*
 * Ruby binding for CompactEncDet::DetectEncoding.
 *
 * Ruby-level arguments (only the first is required; nil selects the default):
 *   text                        String whose bytes are examined.
 *   text_length                 Number of leading bytes of text to examine;
 *                               defaults to the full byte length of text.
 *   url_hint                    String passed through as the URL hint.
 *   http_charset_hint           String passed through as the HTTP charset hint.
 *   meta_charset_hint           String passed through as the meta charset hint.
 *   encoding_hint               Integer Encoding enum value; defaults to
 *                               UNKNOWN_ENCODING.
 *   language_hint               Integer Language enum value; defaults to
 *                               UNKNOWN_LANGUAGE.
 *   corpus_type                 Integer TextCorpusType enum value; defaults to
 *                               CompactEncDet::WEB_CORPUS.
 *   ignore_7bit_mail_encodings  Truthy/falsy flag; defaults to false.
 *
 * Returns a new DetectEncodingResult with @encoding, @bytes_consumed and
 * @is_reliable set from the detector's outputs.
 *
 * Raises TypeError if text is not a String, ArgumentError if text_length is
 * negative or exceeds the byte length of text, and RangeError if the byte
 * length of text does not fit in an int.
 */
static VALUE detect_encoding(int argc, VALUE *argv, VALUE self)
{
  VALUE text,
      text_length,
      url_hint,
      http_charset_hint,
      meta_charset_hint,
      encoding_hint,
      language_hint,
      corpus_type,
      ignore_7bit_mail_encodings;

  // Parse the Ruby arguments: 1 required + 8 optional. The format must match
  // the nine destinations below; with "17" the last destination was never
  // written and was subsequently read uninitialised.
  rb_scan_args(argc, argv, "18",
               &text,
               &text_length,
               &url_hint,
               &http_charset_hint,
               &meta_charset_hint,
               &encoding_hint,
               &language_hint,
               &corpus_type,
               &ignore_7bit_mail_encodings);

  // Ensure the text argument is a Ruby string
  Check_Type(text, T_STRING);

  // Convert the hints first. These conversions may raise or invoke
  // user-defined to_str/to_int, so the raw text pointer is captured only
  // after all of them have completed.
  const char* c_url_hint =
      NIL_P(url_hint) ? nullptr : StringValueCStr(url_hint);
  const char* c_http_charset_hint =
      NIL_P(http_charset_hint) ? nullptr : StringValueCStr(http_charset_hint);
  const char* c_meta_charset_hint =
      NIL_P(meta_charset_hint) ? nullptr : StringValueCStr(meta_charset_hint);
  const int c_encoding_hint =
      NIL_P(encoding_hint) ? static_cast<int>(UNKNOWN_ENCODING) : NUM2INT(encoding_hint);
  const Language c_language_hint =
      NIL_P(language_hint) ? UNKNOWN_LANGUAGE : static_cast<Language>(NUM2INT(language_hint));
  const CompactEncDet::TextCorpusType c_corpus_type =
      NIL_P(corpus_type) ? CompactEncDet::WEB_CORPUS
                         : static_cast<CompactEncDet::TextCorpusType>(NUM2INT(corpus_type));
  // RTEST(nil) is already false, so no separate nil check is needed.
  const bool c_ignore_7bit_mail_encodings = RTEST(ignore_7bit_mail_encodings) ? true : false;
  const bool has_text_length = !NIL_P(text_length);
  const int requested_length = has_text_length ? NUM2INT(text_length) : 0;

  // The detector takes an int length; rb_long2int raises RangeError rather
  // than silently truncating an oversized string length.
  const int available_length = rb_long2int(RSTRING_LEN(text));

  // Never let the detector read outside the Ruby string's buffer.
  if (has_text_length && (requested_length < 0 || requested_length > available_length)) {
    rb_raise(rb_eArgError,
             "text_length (%d) must be between 0 and the byte length of text (%d)",
             requested_length, available_length);
  }

  const int c_text_length = has_text_length ? requested_length : available_length;
  const char* c_text = RSTRING_PTR(text);

  // Output variables, given defined values in case the detector leaves them untouched
  int bytes_consumed = 0;
  bool is_reliable = false;

  // Detect the encoding using CompactEncDet::DetectEncoding
  Encoding encoding = CompactEncDet::DetectEncoding(
      c_text,
      c_text_length,
      c_url_hint,
      c_http_charset_hint,
      c_meta_charset_hint,
      c_encoding_hint,
      c_language_hint,
      c_corpus_type,
      c_ignore_7bit_mail_encodings,
      &bytes_consumed,
      &is_reliable);

  // Convert the encoding enum to string using MimeEncodingName
  const char* encoding_mime_name = MimeEncodingName(encoding);
  VALUE rb_encoding_mime_name = rb_str_new_cstr(encoding_mime_name);

  // Look up the matching Ruby Encoding object by name.
  // NOTE(review): Encoding.find raises ArgumentError for names Ruby does not
  // know; confirm whether every MIME name the detector can return is known.
  VALUE rb_encoding = rb_funcall(rb_cEncoding, rb_intern("find"), 1, rb_encoding_mime_name);

  // Package the detector outputs into a DetectEncodingResult instance
  VALUE result = rb_class_new_instance(0, NULL, rb_cDetectEncodingResult);
  rb_iv_set(result, "@encoding", rb_encoding);
  rb_iv_set(result, "@bytes_consumed", rb_int_new(bytes_consumed));
  rb_iv_set(result, "@is_reliable", is_reliable ? Qtrue : Qfalse);
  return result;
}