Class: HTML5::EncodingParser

Inherits:

Object

Object
HTML5::EncodingParser

show all

Defined in: lib/html5/inputstream.rb

Overview

Mini parser for detecting character encoding from meta elements

Constant Summary collapse

# Matches one byte in the ranges 0x09-0x0D, 0x20-0x2F, 0x3A-0x40, 0x5B-0x60
# or 0x7B-0x7E, i.e. ASCII whitespace and punctuation. Used to strip those
# bytes from encoding names before they are compared.
ASCII_PUNCTUATION =

%r{[\x09-\x0D\x20-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]}

ENCODINGS = a (hopefully) temporary hack to deal with the fact that Ruby doesn't have a built-in encodings library

# Maps each name's lookup key (the name downcased with ASCII_PUNCTUATION
# bytes removed, e.g. 'utf8') to the name as spelled in the list ('utf-8').
['euc_jp', 'utf-8', "iso8859-2", "iso-8859-1", "utf-16", "UTF-16LE", "UTF-16BE"].inject({}){|m, v| m[v.downcase.gsub(ASCII_PUNCTUATION, '')] = v; m}

# Prefix/handler pairs consulted by get_encoding, which tries them in this
# order and dispatches to the first prefix that matches. Longer prefixes are
# therefore listed before the shorter prefixes they begin with ('<!--' before
# '<!', and every other entry before the bare '<').
@@method_dispatch =

[
  ['<!--', :handle_comment],
  ['<meta', :handle_meta],
  ['</', :handle_possible_end_tag],
  ['<!', :handle_other],
  ['<?', :handle_other],
  ['<', :handle_possible_start_tag]
]

Instance Method Summary collapse

#codec_name(encoding) ⇒ Object
#get_attribute ⇒ Object

Return a [name, value] pair for the next attribute in the stream if one is found, or nil otherwise.
#get_encoding ⇒ Object
#handle_comment ⇒ Object

Skip over comments.
#handle_meta ⇒ Object
#handle_other ⇒ Object
#handle_possible_end_tag ⇒ Object
#handle_possible_start_tag ⇒ Object
#handle_possible_tag(end_tag) ⇒ Object
#initialize(data) ⇒ EncodingParser constructor

data - the data to work on for encoding detection (converted to a string).

Constructor Details

#initialize(data) ⇒ `EncodingParser`

data - the data to work on for encoding detection (converted to a string)

# File 'lib/html5/inputstream.rb', line 485

# Set up the parser over the bytes that will be scanned for an encoding
# declaration.
#
# data - the input to inspect; it is converted with #to_s and wrapped in an
#        EncodingBytes object.
def initialize(data)
  source = data.to_s
  @data = EncodingBytes.new(source)
  # Stays nil until a meta element supplies a recognised encoding.
  @encoding = nil
end

Instance Method Details

#codec_name(encoding) ⇒ `Object`

# File 'lib/html5/inputstream.rb', line 676

# Look up the registered name for an encoding label.
#
# encoding - the label to resolve; anything that is not a String (including
#            nil) resolves to nil.
#
# The label is downcased and stripped of ASCII_PUNCTUATION bytes before being
# used as a key into ENCODINGS. Returns the matching ENCODINGS value, or nil
# when the label is unknown.
def codec_name(encoding)
  return nil unless encoding.kind_of?(String)

  lookup_key = encoding.downcase.gsub(ASCII_PUNCTUATION, '')
  ENCODINGS[lookup_key]
end

#get_attribute ⇒ `Object`

Return a [name, value] pair for the next attribute in the stream if one is found, or nil otherwise

# File 'lib/html5/inputstream.rb', line 595

# Return a [name, value] pair for the next attribute in the stream, if one is
# found, or nil.
#
# ASCII uppercase bytes in both the name and the value are lowercased. An
# attribute that has no value is returned with '' as its value. nil is
# returned when, after skipping whitespace and slashes, the current byte is
# '<' or '>'; in the '<' case the position is first moved back one byte
# (presumably so the caller re-reads the '<' -- confirm against the caller's
# iteration).
def get_attribute
  # Skip whitespace and '/' separators in front of the attribute.
  @data.skip(SPACE_CHARACTERS + ['/'])

  if @data.current_byte == '<'
    @data.position -= 1
    return nil
  elsif @data.current_byte == '>'
    return nil
  end

  attr_name = []
  attr_value = []
  space_found = false
  #Step 5 attribute name
  while true
    # '=' only terminates the name once the name is non-empty. The emptiness
    # check must be explicit: an empty Array is truthy in Ruby, so testing
    # the array itself would end the name on a leading '='.
    if @data.current_byte == '=' && !attr_name.empty?
      break
    elsif SPACE_CHARACTERS.include?(@data.current_byte)
      space_found = true
      break
    elsif ['/', '<', '>'].include?(@data.current_byte)
      # The tag ends (or a new one starts) before any value: valueless attribute.
      return [attr_name.join(''), '']
    elsif ASCII_UPPERCASE.include?(@data.current_byte)
      attr_name.push(@data.current_byte.downcase)
    else
      attr_name.push(@data.current_byte)
    end
    #Step 6
    @data.position += 1
  end
  #Step 7
  if space_found
    @data.skip
    #Step 8
    unless @data.current_byte == '='
      # No '=' after the whitespace: the attribute has no value. Step back so
      # the byte that was just inspected is not consumed.
      @data.position -= 1
      return [attr_name.join(''), '']
    end
  end
  #XXX need to advance position in both spaces and value case
  #Step 9
  @data.position += 1
  #Step 10
  @data.skip
  #Step 11
  if ["'", '"'].include?(@data.current_byte)
    #11.1
    quote_char = @data.current_byte
    while true
      @data.position+=1
      #11.3
      if @data.current_byte == quote_char
        # Closing quote: move past it and return the collected value.
        @data.position += 1
        return [attr_name.join(''), attr_value.join('')]
      #11.4
      elsif ASCII_UPPERCASE.include?(@data.current_byte)
        attr_value.push(@data.current_byte.downcase)
      #11.5
      else
        attr_value.push(@data.current_byte)
      end
    end
  elsif ['>', '<'].include?(@data.current_byte)
    return [attr_name.join(''), '']
  elsif ASCII_UPPERCASE.include?(@data.current_byte)
    attr_value.push(@data.current_byte.downcase)
  else
    attr_value.push(@data.current_byte)
  end
  # Unquoted value: collect bytes until whitespace, '>' or '<'.
  while true
    @data.position += 1
    if (SPACE_CHARACTERS + ['>', '<']).include?(@data.current_byte)
      return [attr_name.join(''), attr_value.join('')]
    elsif ASCII_UPPERCASE.include?(@data.current_byte)
      attr_value.push(@data.current_byte.downcase)
    else
      attr_value.push(@data.current_byte)
    end
  end
end

#get_encoding ⇒ `Object`

# File 'lib/html5/inputstream.rb', line 499

# Scan the data for an encoding declaration and return it.
#
# At each step of @data.each the prefixes in @@method_dispatch are tried in
# order; the handler paired with the first matching prefix is invoked, and
# scanning stops as soon as a handler returns a false value.
#
# Returns the detected encoding with surrounding whitespace stripped (UTF-16
# and UTF-32 variants are reported as 'utf-8'), or nil when none was found.
def get_encoding
  @data.each do |_byte|
    _prefix, handler = @@method_dispatch.find { |(prefix, _)| @data.match_bytes(prefix, true) }
    next if handler.nil?
    break unless send(handler)
  end

  return @encoding if @encoding.nil?

  @encoding = @encoding.strip
  normalized = @encoding.downcase.gsub(ASCII_PUNCTUATION, '')
  @encoding = 'utf-8' if %w[utf16 utf16be utf16le utf32 utf32be utf32le].include?(normalized)
  @encoding
end

#handle_comment ⇒ `Object`

Skip over comments



522
523
524

# File 'lib/html5/inputstream.rb', line 522

# Skip over a comment by jumping to its closing '-->'.
#
# Returns whatever @data.jump_to returns.
def handle_comment
  terminator = '-->'
  @data.jump_to(terminator)
end

#handle_meta ⇒ `Object`

# File 'lib/html5/inputstream.rb', line 526

# Inspect the attributes of a meta element for an encoding declaration.
#
# A 'charset' attribute supplies the candidate label directly; a 'content'
# attribute is run through ContentAttrParser to extract one. The first
# candidate that codec_name recognises is stored in @encoding.
#
# Returns false once an encoding has been stored, true otherwise.
def handle_meta
  # "<meta" not followed by a space is not a meta element, so just keep going.
  return true unless SPACE_CHARACTERS.include?(@data.current_byte)

  # A valid meta element: walk its attributes one at a time.
  while true
    pair = get_attribute
    return true if pair.nil?

    attr_name, attr_value = pair[0], pair[1]
    next unless attr_name == 'charset' || attr_name == 'content'

    label = attr_value
    label = ContentAttrParser.new(EncodingBytes.new(attr_value)).parse if attr_name == 'content'

    codec = codec_name(label)
    next unless codec

    @encoding = codec
    return false
  end
end

#handle_other ⇒ `Object`



589
590
591

# File 'lib/html5/inputstream.rb', line 589

# Skip the rest of a markup construct by jumping to the next '>'.
#
# Returns whatever @data.jump_to returns.
def handle_other
  terminator = '>'
  @data.jump_to(terminator)
end

#handle_possible_end_tag ⇒ `Object`

# File 'lib/html5/inputstream.rb', line 559

# Handle a "</" sequence: advance the position by one byte, then process the
# rest as a possible end tag.
#
# Returns the result of handle_possible_tag(true).
def handle_possible_end_tag
  @data.position = @data.position + 1
  handle_possible_tag(true)
end

#handle_possible_start_tag ⇒ `Object`



555
556
557

# File 'lib/html5/inputstream.rb', line 555

# Handle a "<" sequence by processing it as a possible start tag.
#
# Returns the result of handle_possible_tag(false).
def handle_possible_start_tag
  is_end_tag = false
  handle_possible_tag(is_end_tag)
end

#handle_possible_tag(end_tag) ⇒ `Object`

# File 'lib/html5/inputstream.rb', line 564

# Process what may be a start or end tag at the current position.
#
# end_tag - true when called for a "</" sequence, false for a plain "<".
#
# When the current byte is an ASCII letter the tag name is skipped and the
# tag's attributes are read and discarded. Otherwise the fragment is ignored
# for a start tag, or handed to handle_other (after stepping back one byte)
# for an end tag.
#
# Always returns true.
def handle_possible_tag(end_tag)
  if ASCII_LETTERS.include?(@data.current_byte)
    # Move to the first whitespace, '<' or '>' after the tag name.
    @data.find_next(SPACE_CHARACTERS + ['<', '>'])

    if @data.current_byte == '<'
      # Return to the first step in the overall "two step" algorithm,
      # reprocessing the < byte.
      @data.position -= 1
    else
      # Read all attributes, discarding them.
      pair = get_attribute
      pair = get_attribute until pair.nil?
    end
  elsif end_tag
    # Not a tag name after "</": treat it according to handle_other.
    @data.position -= 1
    handle_other
  end

  true
end

Class: HTML5::EncodingParser

Overview

Constant Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(data) ⇒ EncodingParser

Instance Method Details

#codec_name(encoding) ⇒ Object

#get_attribute ⇒ Object

#get_encoding ⇒ Object

#handle_comment ⇒ Object

#handle_meta ⇒ Object

#handle_other ⇒ Object

#handle_possible_end_tag ⇒ Object

#handle_possible_start_tag ⇒ Object

#handle_possible_tag(end_tag) ⇒ Object