Module: Condenser::EncodingUtils

Included in:: Asset

Defined in:: lib/condenser/encoding_utils.rb

Constant Summary collapse

BOM = Internal: Mapping unicode encodings to byte order markers.

{
  # Entry order is significant: the UTF-32LE marker starts with the same two
  # bytes (FF FE) as the UTF-16LE marker, so a lookup that walks this hash in
  # order and stops at the first prefix match must see the longer marker first.
  ::Encoding::UTF_32LE => [0xFF, 0xFE, 0x00, 0x00],
  ::Encoding::UTF_32BE => [0x00, 0x00, 0xFE, 0xFF],
  ::Encoding::UTF_8    => [0xEF, 0xBB, 0xBF],
  ::Encoding::UTF_16LE => [0xFF, 0xFE],
  ::Encoding::UTF_16BE => [0xFE, 0xFF]
}

CHARSET_START = Internal: @charset bytes

# ASCII byte values of the ten-character prefix: @charset "
[0x40, 0x63, 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x20, 0x22]

CHARSET_SIZE =

# Number of bytes in the CHARSET_START prefix.
CHARSET_START.size

Instance Method Summary collapse

#detect(str) ⇒ Object

Public: Basic string detector.
#detect_css(str) ⇒ Object

Public: Detect and strip @charset from CSS style sheet.
#detect_html(str) ⇒ Object

Public: Detect charset from HTML document.
#detect_unicode(str) ⇒ Object

Public: Detect Unicode string.
#detect_unicode_bom(str) ⇒ Object

Public: Detect and strip BOM from possible unicode string.
#scan_css_charset(str) ⇒ Object

Internal: Scan binary CSS string for @charset encoding name.

Instance Method Details

#detect(str) ⇒ `Object`

Public: Basic string detector.

Attempts to parse any Unicode BOM; otherwise falls back to the environment’s external encoding.

str - ASCII-8BIT encoded String

Returns encoded String.

# File 'lib/condenser/encoding_utils.rb', line 21

# Public: Basic string detector.
#
# Strips a leading Unicode BOM (via detect_unicode_bom). If the result is
# still tagged as binary, it is re-tagged with the environment's external
# encoding.
#
# str - ASCII-8BIT encoded String. When the string is binary and is returned
#       unchanged by detect_unicode_bom, it is re-tagged in place
#       (force_encoding mutates its receiver).
#
# Returns encoded String.
def detect(str)
  decoded = detect_unicode_bom(str)
  return decoded unless decoded.encoding == Encoding::BINARY

  # No BOM settled the question: assume the environment's external encoding.
  decoded.force_encoding(Encoding.default_external)
end

#detect_css(str) ⇒ `Object`

Public: Detect and strip @charset from CSS style sheet.

str - String.

Returns an encoded String.

# File 'lib/condenser/encoding_utils.rb', line 77

# Public: Detect and strip @charset from CSS style sheet.
#
# A leading Unicode BOM is removed first (detect_unicode_bom). If
# scan_css_charset then reports an encoding name, a copy of the string is
# tagged with that encoding and the `@charset "...";` rule is cut from its
# start. A string that is still binary afterwards is tagged as UTF-8.
#
# str - String.
#
# Returns an encoded String.
# NOTE(review): Encoding.find is not rescued here, so an unrecognised charset
# name propagates as an ArgumentError.
def detect_css(str)
  css = detect_unicode_bom(str)

  charset = scan_css_charset(css)
  if charset
    declared = Encoding.find(charset)
    css = css.dup.force_encoding(declared)
    # Length is measured in characters of the declared encoding, matching
    # how slice! counts on the re-tagged copy.
    rule_length = "@charset \"#{charset}\";".encode(declared).size
    css.slice!(0, rule_length)
  end

  # Fallback to UTF-8
  css.encoding == Encoding::BINARY ? css.force_encoding(Encoding::UTF_8) : css
end

#detect_html(str) ⇒ `Object`

Public: Detect charset from HTML document.

Attempts to parse any Unicode BOM; otherwise falls back to the environment’s external encoding. (Charlock detection is mentioned historically, but the implementation listed here does not perform it.)

str - String.

Returns an encoded String.

# File 'lib/condenser/encoding_utils.rb', line 143

# Public: Detect charset from HTML document.
#
# Removes a leading Unicode BOM (detect_unicode_bom); a string that is still
# binary afterwards is tagged with the environment's external encoding.
#
# str - String.
#
# Returns an encoded String.
def detect_html(str)
  html = detect_unicode_bom(str)

  # Fallback to environment's external encoding
  html.force_encoding(Encoding.default_external) if html.encoding == Encoding::BINARY
  html
end

#detect_unicode(str) ⇒ `Object`

Public: Detect Unicode string.

Attempts to parse Unicode BOM and falls back to UTF-8.

str - ASCII-8BIT encoded String

Returns encoded String.

# File 'lib/condenser/encoding_utils.rb', line 39

# Public: Detect Unicode string.
#
# Removes a leading Unicode BOM (detect_unicode_bom); a string that is still
# binary afterwards is tagged as UTF-8.
#
# str - ASCII-8BIT encoded String
#
# Returns encoded String.
def detect_unicode(str)
  text = detect_unicode_bom(str)
  still_binary = text.encoding == Encoding::BINARY

  # Fallback to UTF-8
  still_binary ? text.force_encoding(Encoding::UTF_8) : text
end

#detect_unicode_bom(str) ⇒ `Object`

Public: Detect and strip BOM from possible unicode string.

str - ASCII-8BIT encoded String

Returns UTF 8/16/32 encoded String without BOM or the original String if no BOM was present.

# File 'lib/condenser/encoding_utils.rb', line 56

# Public: Detect and strip BOM from possible unicode string.
#
# Compares the first (up to four) bytes of the string against each marker in
# BOM, in the hash's order, and uses the first marker that is a prefix.
#
# str - ASCII-8BIT encoded String
#
# Returns a copy of the string with the marker bytes removed and the matching
# encoding applied, or the original String object if no marker matched.
def detect_unicode_bom(str)
  head = str.byteslice(0, 4).bytes

  encoding, marker = BOM.find { |_, bytes| head.first(bytes.size) == bytes }
  return str unless encoding

  stripped = str.dup
  # Treat the copy as raw bytes so slice! removes exactly the marker bytes.
  stripped.force_encoding(Encoding::BINARY)
  stripped.slice!(0, marker.size)
  stripped.force_encoding(encoding)
end

#scan_css_charset(str) ⇒ `Object`

Internal: Scan binary CSS string for @charset encoding name.

str - ASCII-8BIT encoded String

Returns encoding String name or nil.

# File 'lib/condenser/encoding_utils.rb', line 106

# Internal: Scan binary CSS string for @charset encoding name.
#
# Walks the string byte by byte, ignoring NUL bytes, and stops at the first
# CR or LF. The first CHARSET_SIZE counted bytes must equal CHARSET_START
# (`@charset "`); the bytes that follow, up to the next double quote, are the
# encoding name.
#
# str - ASCII-8BIT encoded String
#
# Returns encoding String name or nil.
def scan_css_charset(str)
  collected = []
  seen = 0

  str.each_byte do |byte|
    # Halt on line breaks
    break if byte == 0x0D || byte == 0x0A

    # NUL bytes are skipped and not counted.
    # NOTE(review): presumably this lets UTF-16/32 text be read as if it were
    # ASCII - confirm.
    next if byte.zero?

    if seen == CHARSET_SIZE
      # The prefix has been fully read: give up unless it is `@charset "`,
      # otherwise start collecting the encoding name.
      break unless collected == CHARSET_START

      collected = []
    elsif seen > CHARSET_SIZE && byte == 0x22
      # Closing double quote ends the name.
      return collected.pack('C*')
    end

    collected << byte
    seen += 1
  end

  nil
end

Module: Condenser::EncodingUtils

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#detect(str) ⇒ Object

#detect_css(str) ⇒ Object

#detect_html(str) ⇒ Object

#detect_unicode(str) ⇒ Object

#detect_unicode_bom(str) ⇒ Object

#scan_css_charset(str) ⇒ Object

#detect(str) ⇒ `Object`

#detect_css(str) ⇒ `Object`

#detect_html(str) ⇒ `Object`

#detect_unicode(str) ⇒ `Object`

#detect_unicode_bom(str) ⇒ `Object`

#scan_css_charset(str) ⇒ `Object`