Class: HTML5::EncodingParser

Inherits:
Object
  • Object
show all
Defined in:
lib/html5/inputstream.rb

Overview

Mini parser for detecting character encoding from meta elements

Constant Summary collapse

# Matches one byte from the ranges \x09-\x0D, \x20-\x2F, \x3A-\x40,
# \x5B-\x60 or \x7B-\x7E. In this class it is used to strip such bytes
# from an encoding label before the label is looked up in ENCODINGS.
ASCII_PUNCTUATION =
%r{[\x09-\x0D\x20-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]}
ENCODINGS =

A (hopefully) temporary hack to deal with the fact that Ruby doesn't have a built-in encodings library.
['euc_jp', 'utf-8', "iso8859-2", "iso-8859-1", "utf-16", "UTF-16LE", "UTF-16BE"].inject({}){|m, v| m[v.downcase.gsub(ASCII_PUNCTUATION, '')] = v; m}
# Ordered table of [byte prefix, handler method name] pairs. get_encoding
# walks it top to bottom and invokes the handler of the FIRST prefix that
# matches at the current position, so the order is significant: '<!--' must
# precede '<!', and every multi-byte prefix must precede the bare '<'.
@@method_dispatch =
[
  ['<!--', :handle_comment],
  ['<meta', :handle_meta],
  ['</', :handle_possible_end_tag],
  ['<!', :handle_other],
  ['<?', :handle_other],
  ['<', :handle_possible_start_tag]
]

Instance Method Summary collapse

Constructor Details

#initialize(data) ⇒ EncodingParser

data - the string to work on for encoding detection



485
486
487
488
# File 'lib/html5/inputstream.rb', line 485

# Build a parser over +data+.
#
# data - the input to scan; it is converted with #to_s and wrapped in an
#        EncodingBytes instance.
#
# @encoding starts out as nil, i.e. no encoding has been detected yet.
def initialize(data)
  @encoding = nil
  @data = EncodingBytes.new(data.to_s)
end

Instance Method Details

#codec_name(encoding) ⇒ Object



676
677
678
679
680
681
682
683
684
685
# File 'lib/html5/inputstream.rb', line 676

# Map an encoding label to the name registered in ENCODINGS.
#
# encoding - the candidate label; anything that is not a String (including
#            nil) is rejected.
#
# The label is lower-cased and stripped of every byte matched by
# ASCII_PUNCTUATION before the lookup.
#
# Returns the ENCODINGS entry for the normalised label, or nil when the
# argument is not a String or the label is unknown.
def codec_name(encoding)
  return nil unless encoding.kind_of?(String)

  lookup_key = encoding.downcase.gsub(ASCII_PUNCTUATION, '')
  ENCODINGS[lookup_key]
end

#get_attribute ⇒ Object

Return a name,value pair for the next attribute in the stream, if one is found, or nil



595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
# File 'lib/html5/inputstream.rb', line 595

# Read the next attribute from @data, starting at the current position.
#
# Returns a two-element Array [name, value] of Strings (ASCII upper-case
# bytes lower-cased in both; value is '' when the attribute has none), or
# nil when the next significant byte is '<' or '>' and there is therefore
# no attribute to read.
#
# Side effect: advances @data.position past the bytes that were consumed.
# Any exception raised by @data (e.g. on running off the end of the input)
# is propagated unchanged.
def get_attribute
  # Skip whitespace and slashes in front of the attribute.
  @data.skip(SPACE_CHARACTERS + ['/'])

  case @data.current_byte
  when '<'
    # Step back one byte before giving up -- presumably so that the caller's
    # scan re-reads this '<' as the start of new markup (TODO confirm).
    @data.position -= 1
    return nil
  when '>'
    return nil
  end

  # ASCII upper-case bytes are lower-cased; every other byte is kept as is.
  fold = lambda { |byte| ASCII_UPPERCASE.include?(byte) ? byte.downcase : byte }

  attr_name = []
  attr_value = []
  space_found = false

  #Step 5 attribute name
  while true
    byte = @data.current_byte
    # An '=' only ends the name when a name has actually been collected.
    # (An Array is always truthy in Ruby, so the emptiness must be tested
    # explicitly; a leading '=' is treated as part of the name.)
    if byte == '=' && !attr_name.empty?
      break
    elsif SPACE_CHARACTERS.include?(byte)
      space_found = true
      break
    elsif ['/', '<', '>'].include?(byte)
      return [attr_name.join(''), '']
    else
      attr_name.push(fold.call(byte))
    end
    #Step 6
    @data.position += 1
  end

  #Step 7
  if space_found
    @data.skip
    #Step 8
    unless @data.current_byte == '='
      # No value follows; back up one byte before returning the bare name.
      @data.position -= 1
      return [attr_name.join(''), '']
    end
  end

  #Step 9: move past the '='
  @data.position += 1
  #Step 10
  @data.skip

  #Step 11
  first_byte = @data.current_byte
  if ["'", '"'].include?(first_byte)
    #11.1 quoted value: collect bytes up to the matching quote
    quote_char = first_byte
    while true
      @data.position += 1
      byte = @data.current_byte
      #11.3
      if byte == quote_char
        @data.position += 1
        return [attr_name.join(''), attr_value.join('')]
      end
      #11.4 / 11.5
      attr_value.push(fold.call(byte))
    end
  end

  # Unquoted value: an immediate '>' or '<' means the value is empty.
  return [attr_name.join(''), ''] if ['>', '<'].include?(first_byte)

  attr_value.push(fold.call(first_byte))
  value_terminators = SPACE_CHARACTERS + ['>', '<']
  while true
    @data.position += 1
    byte = @data.current_byte
    if value_terminators.include?(byte)
      return [attr_name.join(''), attr_value.join('')]
    end
    attr_value.push(fold.call(byte))
  end
end

#get_encoding ⇒ Object



499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
# File 'lib/html5/inputstream.rb', line 499

# Scan @data for an encoding declaration.
#
# At each position yielded by @data.each, the prefixes in @@method_dispatch
# are tried in order; the handler of the first one that matches is invoked
# and its return value decides whether scanning goes on (truthy) or stops
# (falsy). When no prefix matches, scanning simply continues.
#
# Returns @encoding: nil when nothing was detected, otherwise the detected
# name with surrounding whitespace stripped. Any UTF-16 / UTF-32 variant is
# replaced by 'utf-8'.
def get_encoding
  @data.each do |_byte|
    entry = @@method_dispatch.find { |prefix, _handler| @data.match_bytes(prefix, true) }
    continue_scan = entry ? send(entry[1]) : true
    break unless continue_scan
  end

  return @encoding if @encoding.nil?

  @encoding = @encoding.strip
  normalized = @encoding.downcase.gsub(ASCII_PUNCTUATION, '')
  wide_unicode = ["utf16", "utf16be", "utf16le", "utf32", "utf32be", "utf32le"]
  @encoding = 'utf-8' if wide_unicode.include?(normalized)

  @encoding
end

#handle_comment ⇒ Object

Skip over comments



522
523
524
# File 'lib/html5/inputstream.rb', line 522

# Skip over a comment by asking @data to jump to the closing '-->'.
#
# Returns whatever @data.jump_to returns; get_encoding keeps scanning only
# when that value is truthy.
def handle_comment
  @data.jump_to('-->')
end

#handle_meta ⇒ Object



526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
# File 'lib/html5/inputstream.rb', line 526

# Inspect a "<meta" match for an encoding declaration.
#
# Attributes are read one at a time with get_attribute. The value of a
# 'charset' attribute is used directly; the value of a 'content' attribute
# is first run through ContentAttrParser. The first candidate that
# codec_name recognises is stored in @encoding.
#
# Returns false once an encoding has been stored (telling get_encoding to
# stop scanning), and true otherwise -- either because "<meta" is not
# followed by a space character or because the attributes ran out.
def handle_meta
  # "<meta" not followed by a space character: nothing to do here, keep going.
  return true unless SPACE_CHARACTERS.include?(@data.current_byte)

  until (pair = get_attribute).nil?
    attr_key, attr_text = pair

    candidate =
      case attr_key
      when 'charset'
        attr_text
      when 'content'
        ContentAttrParser.new(EncodingBytes.new(attr_text)).parse
      else
        next
      end

    codec = codec_name(candidate)
    next unless codec

    @encoding = codec
    return false
  end

  true
end

#handle_other ⇒ Object



589
590
591
# File 'lib/html5/inputstream.rb', line 589

# Skip the current construct by asking @data to jump to the next '>'.
#
# Returns whatever @data.jump_to returns; get_encoding keeps scanning only
# when that value is truthy.
def handle_other
  @data.jump_to('>')
end

#handle_possible_end_tag ⇒ Object



559
560
561
562
# File 'lib/html5/inputstream.rb', line 559

# Handle a '</' match: advance @data by one byte, then process the rest as
# an end tag.
#
# Returns the result of handle_possible_tag(true).
def handle_possible_end_tag
  @data.position = @data.position + 1
  handle_possible_tag(true)
end

#handle_possible_start_tag ⇒ Object



555
556
557
# File 'lib/html5/inputstream.rb', line 555

# Handle a '<' match by processing what follows as a start tag.
#
# Returns the result of handle_possible_tag(false).
def handle_possible_start_tag
  handle_possible_tag(false)
end

#handle_possible_tag(end_tag) ⇒ Object



564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
# File 'lib/html5/inputstream.rb', line 564

# Process what may be a start or end tag at the current position.
#
# end_tag - true when called for a '</' match, false for a plain '<'.
#
# When the current byte is an ASCII letter, the tag name is skipped up to
# the next space character, '<' or '>'. Landing on '<' steps back one byte
# so that the '<' is reprocessed by the overall algorithm; otherwise all
# attributes are read and discarded.
#
# When the current byte is not an ASCII letter, a possible start tag is
# ignored, while a possible end tag is stepped back one byte and handed to
# handle_other.
#
# Always returns true.
def handle_possible_tag(end_tag)
  if ASCII_LETTERS.include?(@data.current_byte)
    @data.find_next(SPACE_CHARACTERS + ['<', '>'])

    if @data.current_byte == '<'
      # Back to the first step of the "two step" algorithm, reprocessing '<'.
      @data.position -= 1
    else
      # Consume every attribute; the values themselves are not needed.
      nil until get_attribute.nil?
    end
  elsif end_tag
    @data.position -= 1
    handle_other
  end

  true
end