Class: HTML5::EncodingParser
- Inherits:
-
Object
- Object
- HTML5::EncodingParser
- Defined in:
- lib/html5/inputstream.rb
Overview
Mini parser for detecting character encoding from meta elements
Constant Summary collapse
- ASCII_PUNCTUATION =
%r{[\x09-\x0D\x20-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]}
- ENCODINGS =
A (hopefully) temporary hack to deal with the fact that Ruby doesn't have a built-in encodings
library.
['euc_jp', 'utf-8', "iso8859-2", "iso-8859-1", "utf-16", "UTF-16LE", "UTF-16BE"].inject({}){|m, v| m[v.downcase.gsub(ASCII_PUNCTUATION, '')] = v; m}
- @@method_dispatch =
[ ['<!--', :handle_comment], ['<meta', :handle_meta], ['</', :handle_possible_end_tag], ['<!', :handle_other], ['<?', :handle_other], ['<', :handle_possible_start_tag] ]
Instance Method Summary collapse
- #codec_name(encoding) ⇒ Object
-
#get_attribute ⇒ Object
Return a name,value pair for the next attribute in the stream, if one is found, or nil.
- #get_encoding ⇒ Object
-
#handle_comment ⇒ Object
Skip over comments.
- #handle_meta ⇒ Object
- #handle_other ⇒ Object
- #handle_possible_end_tag ⇒ Object
- #handle_possible_start_tag ⇒ Object
- #handle_possible_tag(end_tag) ⇒ Object
-
#initialize(data) ⇒ EncodingParser
constructor
data - the data to work on for encoding detection (converted with to_s).
Constructor Details
#initialize(data) ⇒ EncodingParser
data - the data to work on for encoding detection (converted with to_s)
485 486 487 488 |
# File 'lib/html5/inputstream.rb', line 485
# Build a parser over +data+.
#
# data - the data to work on for encoding detection; it is converted with
#        to_s and wrapped in an EncodingBytes instance.
#
# No encoding is known until get_encoding has run, so @encoding starts nil.
def initialize(data)
  @encoding = nil
  @data = EncodingBytes.new(data.to_s)
end
Instance Method Details
#codec_name(encoding) ⇒ Object
676 677 678 679 680 681 682 683 684 685 |
# File 'lib/html5/inputstream.rb', line 676
# Look up an encoding label in the ENCODINGS table.
#
# The label is lower-cased and stripped of every character matched by
# ASCII_PUNCTUATION before the lookup.
#
# encoding - the candidate label; anything that is not a String (including
#            nil) yields nil.
#
# Returns the ENCODINGS entry for the normalised label, or nil when there is
# no such entry.
def codec_name(encoding)
  return nil unless encoding.kind_of?(String)

  normalized = encoding.downcase.gsub(ASCII_PUNCTUATION, '')
  ENCODINGS[normalized]
end
#get_attribute ⇒ Object
Return a name,value pair for the next attribute in the stream, if one is found, or nil
595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 |
# File 'lib/html5/inputstream.rb', line 595
# Return a [name, value] pair for the next attribute in the stream, if one
# is found, or nil.
#
# Bytes listed in ASCII_UPPERCASE are lower-cased in both the name and the
# value. An attribute that has no value is returned with '' as its value.
#
# Returns nil when the scan is positioned on '<' (after stepping the position
# back by one) or on '>'.
def get_attribute
  # Skip separators in front of the attribute.
  @data.skip(SPACE_CHARACTERS + ['/'])

  if @data.current_byte == '<'
    @data.position -= 1
    return nil
  elsif @data.current_byte == '>'
    return nil
  end

  attr_name = []
  attr_value = []
  space_found = false

  #Step 5 attribute name
  while true
    # '=' only terminates the name once at least one name byte has been
    # collected. attr_name is an Array (always truthy), so emptiness has to
    # be tested explicitly; a leading '=' falls through and is appended to
    # the name by the final else branch.
    if @data.current_byte == '=' && !attr_name.empty?
      break
    elsif SPACE_CHARACTERS.include?(@data.current_byte)
      space_found = true
      break
    elsif ['/', '<', '>'].include?(@data.current_byte)
      return [attr_name.join(''), '']
    elsif ASCII_UPPERCASE.include?(@data.current_byte)
      attr_name.push(@data.current_byte.downcase)
    else
      attr_name.push(@data.current_byte)
    end
    #Step 6
    @data.position += 1
  end

  #Step 7
  if space_found
    # NOTE(review): skip with no argument presumably skips whitespace - confirm
    # against EncodingBytes#skip.
    @data.skip
    #Step 8
    unless @data.current_byte == '='
      # No '=' after the name: step back and report a value-less attribute.
      @data.position -= 1
      return [attr_name.join(''), '']
    end
  end

  #XXX need to advance position in both spaces and value case
  #Step 9
  @data.position += 1
  #Step 10
  @data.skip

  #Step 11
  if ["'", '"'].include?(@data.current_byte)
    #11.1
    quote_char = @data.current_byte
    while true
      @data.position += 1
      #11.3
      if @data.current_byte == quote_char
        # Move past the closing quote before returning.
        @data.position += 1
        return [attr_name.join(''), attr_value.join('')]
      #11.4
      elsif ASCII_UPPERCASE.include?(@data.current_byte)
        attr_value.push(@data.current_byte.downcase)
      #11.5
      else
        attr_value.push(@data.current_byte)
      end
    end
  elsif ['>', '<'].include?(@data.current_byte)
    return [attr_name.join(''), '']
  elsif ASCII_UPPERCASE.include?(@data.current_byte)
    attr_value.push(@data.current_byte.downcase)
  else
    attr_value.push(@data.current_byte)
  end

  # Unquoted value: collect bytes until whitespace, '>' or '<'.
  while true
    @data.position += 1
    if (SPACE_CHARACTERS + ['>', '<']).include?(@data.current_byte)
      return [attr_name.join(''), attr_value.join('')]
    elsif ASCII_UPPERCASE.include?(@data.current_byte)
      attr_value.push(@data.current_byte.downcase)
    else
      attr_value.push(@data.current_byte)
    end
  end
end
#get_encoding ⇒ Object
499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 |
# File 'lib/html5/inputstream.rb', line 499
# Walk the data looking for an encoding declaration.
#
# At every step the first @@method_dispatch entry whose byte prefix matches
# (match_bytes is called with true as its second argument) has its handler
# invoked via send; a falsy handler result stops the walk.
#
# Returns @encoding, which is nil when nothing was detected. A detected
# value is stripped, and any UTF-16/UTF-32 variant is replaced by 'utf-8'.
def get_encoding
  @data.each do |_byte|
    matched = @@method_dispatch.find { |(prefix, _handler)| @data.match_bytes(prefix, true) }
    next if matched.nil?

    break unless send(matched[1])
  end

  return @encoding if @encoding.nil?

  @encoding = @encoding.strip
  squashed = @encoding.downcase.gsub(ASCII_PUNCTUATION, '')
  wide_encodings = ["utf16", "utf16be", "utf16le", "utf32", "utf32be", "utf32le"]
  @encoding = 'utf-8' if wide_encodings.include?(squashed)
  @encoding
end
#handle_comment ⇒ Object
Skip over comments
522 523 524 |
# File 'lib/html5/inputstream.rb', line 522
# Skip over comments by jumping to the closing '-->' marker.
#
# Returns whatever @data.jump_to returns; get_encoding treats a falsy result
# as "stop parsing".
def handle_comment
  @data.jump_to('-->')
end
#handle_meta ⇒ Object
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 |
# File 'lib/html5/inputstream.rb', line 526
# Inspect the attributes of a <meta element for an encoding declaration.
#
# A 'charset' attribute supplies the candidate encoding directly; a 'content'
# attribute is handed to ContentAttrParser to extract one. The first
# candidate that codec_name recognises is stored in @encoding.
#
# Returns false once an encoding has been stored (stop parsing), true
# otherwise (keep parsing).
def handle_meta
  # if we have <meta not followed by a space so just keep going
  return true unless SPACE_CHARACTERS.include?(@data.current_byte)

  #We have a valid meta element we want to search for attributes
  while true
    #Try to find the next attribute after the current position
    attribute = get_attribute
    return true if attribute.nil?

    name, value = attribute
    if name == 'charset'
      tentative_encoding = value
    elsif name == 'content'
      tentative_encoding = ContentAttrParser.new(EncodingBytes.new(value)).parse
    else
      # Any other attribute carries no encoding information.
      next
    end

    codec = codec_name(tentative_encoding)
    if codec
      @encoding = codec
      return false
    end
  end
end
#handle_other ⇒ Object
589 590 591 |
# File 'lib/html5/inputstream.rb', line 589
# Skip the current construct by jumping to the next '>'.
#
# Returns whatever @data.jump_to returns; get_encoding treats a falsy result
# as "stop parsing".
def handle_other
  @data.jump_to('>')
end
#handle_possible_end_tag ⇒ Object
559 560 561 562 |
# File 'lib/html5/inputstream.rb', line 559
# Handle a '</' match: advance the position by one byte, then process the
# rest as a possible tag in end-tag mode.
#
# Returns the result of handle_possible_tag(true).
def handle_possible_end_tag
  @data.position += 1
  handle_possible_tag(true)
end
#handle_possible_start_tag ⇒ Object
555 556 557 |
# File 'lib/html5/inputstream.rb', line 555
# Handle a '<' match by processing it as a possible tag in start-tag mode.
#
# Returns the result of handle_possible_tag(false).
def handle_possible_start_tag
  handle_possible_tag(false)
end
#handle_possible_tag(end_tag) ⇒ Object
564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 |
# File 'lib/html5/inputstream.rb', line 564
# Process a possible start or end tag at the current position.
#
# end_tag - true when called for a '</' match, false for a '<' match.
#
# When the current byte is an ASCII letter, the scan moves to the next
# whitespace, '<' or '>' byte; on '<' the position steps back one byte so the
# '<' is reprocessed, otherwise every attribute of the tag is consumed.
# When the current byte is not an ASCII letter, a possible start tag is
# ignored, while a possible end tag is stepped back one byte and handed to
# handle_other.
#
# Always returns true (keep parsing).
def handle_possible_tag(end_tag)
  if ASCII_LETTERS.include?(@data.current_byte)
    @data.find_next(SPACE_CHARACTERS + ['<', '>'])
    if @data.current_byte == '<'
      #return to the first step in the overall "two step" algorithm
      #reprocessing the < byte
      @data.position -= 1
    else
      #Read all attributes
      while true
        break if get_attribute.nil?
      end
    end
  elsif end_tag
    #Not an ascii letter after '</': treat it according to handle_other
    @data.position -= 1
    handle_other
  end
  true
end