Module: CMess::GuessEncoding::Automatic::EncodingGuessers

Includes:
Encoding
Defined in:
lib/cmess/guess_encoding/automatic.rb

Overview

Definition of guessing heuristics. Order matters!

Instance Method Summary

Methods included from Encoding

#[], #all_encodings

Instance Method Details

#encoding_01_ASCII ⇒ Object

ASCII, if every byte is within the lower 128 byte values (0x00..0x7f). Unfortunately, we have to read the whole file to make that decision.



252
253
254
# File 'lib/cmess/guess_encoding/automatic.rb', line 252

def encoding_01_ASCII
  ASCII if eof? && byte_count_sum(0x00..0x7f) == byte_total
end

#encoding_02_UTF_32_and_UTF_16BE_and_UTF_16LE_and_UTF_16 ⇒ Object

UTF-16 / UTF-32, if lots of NULL bytes are present.



259
260
261
262
263
264
265
266
267
268
# File 'lib/cmess/guess_encoding/automatic.rb', line 259

# UTF-32 / UTF-16 variants, if the relative count of NULL bytes exceeds
# 0.25. The concrete variant is then chosen from the first byte:
# 0x00 => UTF_32, 0xfe => UTF_16BE, 0xff => UTF_16LE, anything else
# => UTF_16. Returns nil when the NULL byte share is not high enough.
def encoding_02_UTF_32_and_UTF_16BE_and_UTF_16LE_and_UTF_16
  null_ratio = relative_byte_count(byte_count[0])
  return unless null_ratio > 0.25

  case first_byte
  when 0x00 then UTF_32
  when 0xfe then UTF_16BE
  when 0xff then UTF_16LE
  else UTF_16
  end
end

#encoding_03_UTF_8 ⇒ Object

UTF-8, if the number of following bytes implied by the escape (lead) bytes matches the number of following bytes (0x80..0xbf) actually present.



272
273
274
275
276
277
278
279
280
281
# File 'lib/cmess/guess_encoding/automatic.rb', line 272

# UTF-8, if the number of following bytes announced by the escape (lead)
# bytes is non-zero and equals the number of bytes in 0x80..0xbf.
# Returns nil otherwise.
def encoding_03_UTF_8
  # 110xxxxx 10xxxxxx                   => one following byte each
  two_byte_leads = byte_count_sum(0xc0..0xdf)
  # 1110xxxx 10xxxxxx 10xxxxxx          => two following bytes each
  three_byte_leads = byte_count_sum(0xe0..0xef)
  # 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx => three following bytes each
  four_byte_leads = byte_count_sum(0xf0..0xf7)

  expected_followers = two_byte_leads + three_byte_leads * 2 + four_byte_leads * 3

  # Without any escape byte there is nothing that speaks for UTF-8.
  return unless expected_followers > 0

  UTF_8 if expected_followers == byte_count_sum(0x80..0xbf)
end

#encoding_04_TEST_ENCODINGS ⇒ Object

TEST_ENCODINGS, if frequency of TEST_CHARS exceeds TEST_THRESHOLD_DIRECT (direct match) or TEST_THRESHOLD_APPROX (approximate match).



285
286
287
288
289
290
291
292
293
294
295
# File 'lib/cmess/guess_encoding/automatic.rb', line 285

# One of TEST_ENCODINGS, chosen by the relative frequency of that
# encoding's TEST_CHARS:
#
# * direct match: the first encoding (in TEST_ENCODINGS order) whose
#   ratio reaches TEST_THRESHOLD_DIRECT is returned immediately;
# * approximate match: otherwise, the encoding with the highest ratio is
#   returned, provided that ratio reaches TEST_THRESHOLD_APPROX. Among
#   encodings sharing the highest ratio, the earliest one wins.
#
# Returns nil if neither threshold is reached, including the case where
# there are no candidates at all (previously this raised NoMethodError
# because the best ratio was nil).
def encoding_04_TEST_ENCODINGS
  best_ratio = best_encoding = nil

  TEST_ENCODINGS.each do |encoding|
    ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
    return encoding if ratio >= TEST_THRESHOLD_DIRECT

    # Track the maximum on the fly instead of collecting and sorting all
    # ratios; strict '>' keeps the earliest encoding on equal ratios.
    if best_ratio.nil? || ratio > best_ratio
      best_ratio = ratio
      best_encoding = encoding
    end
  end

  best_encoding if best_ratio && best_ratio >= TEST_THRESHOLD_APPROX
end