Class: HTML5::HTMLParser
- Inherits:
-
Object
- Object
- HTML5::HTMLParser
- Defined in:
- lib/html5/html5parser.rb
Overview
HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML
Direct Known Subclasses
Constant Summary collapse
- @@phases =
%w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd )
- @@new_modes =
{ 'select' => :inSelect, 'td' => :inCell, 'th' => :inCell, 'tr' => :inRow, 'tbody' => :inTableBody, 'thead' => :inTableBody, 'tfoot' => :inTableBody, 'caption' => :inCaption, 'colgroup' => :inColumnGroup, 'table' => :inTable, 'head' => :inBody, 'body' => :inBody, 'frameset' => :inFrameset }
Instance Attribute Summary collapse
-
#errors ⇒ Object
readonly
Returns the value of attribute errors.
-
#first_start_tag ⇒ Object
Returns the value of attribute first_start_tag.
-
#inner_html ⇒ Object
Returns the value of attribute inner_html.
-
#insert_from_table ⇒ Object
Returns the value of attribute insert_from_table.
-
#last_phase ⇒ Object
Returns the value of attribute last_phase.
-
#phase ⇒ Object
Returns the value of attribute phase.
-
#phases ⇒ Object
readonly
Returns the value of attribute phases.
-
#tokenizer ⇒ Object
readonly
Returns the value of attribute tokenizer.
-
#tree ⇒ Object
readonly
Returns the value of attribute tree.
Class Method Summary collapse
Instance Method Summary collapse
- #_(string) ⇒ Object
- #_parse(stream, inner_html, encoding, container = 'div') ⇒ Object
-
#initialize(options = {}) ⇒ HTMLParser
constructor
:strict - raise an exception when a parse error is encountered. :tree - a treebuilder class controlling the type of tree that will be returned.
-
#normalize_token(token) ⇒ Object
HTML5 specific normalizations to the token stream.
-
#parse(stream, encoding = nil) ⇒ Object
Parse an HTML document into a well-formed tree.
- #parse_error(code = 'XXX-undefined-error', data = {}) ⇒ Object
-
#parse_fragment(stream, container = 'div', encoding = nil) ⇒ Object
container - name of the element whose inner_html property is being set; if nil, defaults to ‘div’.
- #reset_insertion_mode ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ HTMLParser
:strict - raise an exception when a parse error is encountered. :tree - a treebuilder class controlling the type of tree that will be returned. Built-in treebuilders can be accessed through HTML5::TreeBuilders.
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# File 'lib/html5/html5parser.rb', line 41
#
# Builds a parser.
#
# options - Hash of settings; every key/value pair is copied onto the
#           instance variable of the same name (so :strict sets @strict,
#           :tree sets @tree, :tokenizer sets @tokenizer, ...).
#           Defaults applied before the copy: @strict = false,
#           @tokenizer = HTMLTokenizer,
#           @tree = TreeBuilders::REXML::TreeBuilder.
def initialize(options = {})
  @strict = false
  @errors = []
  @tokenizer = HTMLTokenizer
  @tree = TreeBuilders::REXML::TreeBuilder

  options.each { |name, value| instance_variable_set("@#{name}", value) }

  # Only default these to nil when the caller did not supply them.
  # instance_variable_defined? is used instead of
  # instance_variables.include?("@name"): on Ruby >= 1.9 instance_variables
  # returns Symbols, so the String comparison never matched and the
  # caller's values were always overwritten with nil.
  @lowercase_attr_name = nil unless instance_variable_defined?("@lowercase_attr_name")
  @lowercase_element_name = nil unless instance_variable_defined?("@lowercase_element_name")

  # @tree holds a treebuilder class up to this point; instantiate it.
  @tree = @tree.new

  # Map each phase name (as a Symbol) to an instance of the matching
  # HTML5::<Name>Phase class, e.g. 'inBody' => HTML5::InBodyPhase.
  @phases = @@phases.inject({}) do |phases, phase_name|
    phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
    phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
    phases
  end
end
Instance Attribute Details
#errors ⇒ Object (readonly)
Returns the value of attribute errors.
21 22 23 |
# File 'lib/html5/html5parser.rb', line 21
# Reader for @errors.
def errors
  @errors
end
#first_start_tag ⇒ Object
Returns the value of attribute first_start_tag.
19 20 21 |
# File 'lib/html5/html5parser.rb', line 19
# Reader for @first_start_tag.
def first_start_tag
  @first_start_tag
end
#inner_html ⇒ Object
Returns the value of attribute inner_html.
19 20 21 |
# File 'lib/html5/html5parser.rb', line 19
# Reader for @inner_html.
def inner_html
  @inner_html
end
#insert_from_table ⇒ Object
Returns the value of attribute insert_from_table.
19 20 21 |
# File 'lib/html5/html5parser.rb', line 19
# Reader for @insert_from_table.
def insert_from_table
  @insert_from_table
end
#last_phase ⇒ Object
Returns the value of attribute last_phase.
19 20 21 |
# File 'lib/html5/html5parser.rb', line 19
# Reader for @last_phase.
def last_phase
  @last_phase
end
#phase ⇒ Object
Returns the value of attribute phase.
19 20 21 |
# File 'lib/html5/html5parser.rb', line 19
# Reader for @phase.
def phase
  @phase
end
#phases ⇒ Object (readonly)
Returns the value of attribute phases.
21 22 23 |
# File 'lib/html5/html5parser.rb', line 21
# Reader for @phases.
def phases
  @phases
end
#tokenizer ⇒ Object (readonly)
Returns the value of attribute tokenizer.
21 22 23 |
# File 'lib/html5/html5parser.rb', line 21
# Reader for @tokenizer.
def tokenizer
  @tokenizer
end
#tree ⇒ Object (readonly)
Returns the value of attribute tree.
21 22 23 |
# File 'lib/html5/html5parser.rb', line 21
# Reader for @tree.
def tree
  @tree
end
Class Method Details
.parse(stream, options = {}) ⇒ Object
23 24 25 26 |
# File 'lib/html5/html5parser.rb', line 23
#
# Convenience entry point: builds a parser from +options+ and parses
# +stream+ with it.
#
# stream  - passed straight through to #parse.
# options - Hash; :encoding is extracted and passed to #parse, every other
#           entry is handed to the constructor.
#
# Returns whatever #parse returns.
def self.parse(stream, options = {})
  # Work on a copy so the caller's hash is not modified by delete.
  options = options.dup
  encoding = options.delete(:encoding)
  new(options).parse(stream, encoding)
end
.parse_fragment(stream, options = {}) ⇒ Object
28 29 30 31 32 |
# File 'lib/html5/html5parser.rb', line 28
#
# Convenience entry point: builds a parser from +options+ and parses
# +stream+ as a fragment with it.
#
# stream  - passed straight through to #parse_fragment.
# options - Hash; :container (default 'div') and :encoding are extracted and
#           passed to #parse_fragment, every other entry is handed to the
#           constructor.
#
# Returns whatever #parse_fragment returns.
def self.parse_fragment(stream, options = {})
  # Work on a copy so the caller's hash is not modified by delete.
  options = options.dup
  container = options.delete(:container) || 'div'
  encoding = options.delete(:encoding)
  new(options).parse_fragment(stream, container, encoding)
end
Instance Method Details
#_(string) ⇒ Object
245 |
# File 'lib/html5/html5parser.rb', line 245
# Returns +string+ unchanged.
def _(string); string; end
#_parse(stream, inner_html, encoding, container = 'div') ⇒ Object
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
# File 'lib/html5/html5parser.rb', line 61
#
# Shared driver behind #parse and #parse_fragment: resets state, builds a
# tokenizer for +stream+ and feeds every token to the current phase.
#
# stream     - handed to the tokenizer constructor.
# inner_html - truthy for fragment parsing, falsy for whole documents.
# encoding   - handed to the tokenizer as :encoding.
# container  - element name used for fragment parsing; it is downcased and
#              stored in @inner_html, and selects the tokenizer's content
#              model flag.
#
# Returns the result of the final phase's process_eof.
def _parse(stream, inner_html, encoding, container = 'div')
  @tree.reset
  @first_start_tag = false
  @errors = []

  # After a previous run @tokenizer holds an instance; go back to its class
  # so a fresh tokenizer can be built.
  @tokenizer = @tokenizer.class unless Class === @tokenizer
  @tokenizer = @tokenizer.new(stream,
                              :encoding => encoding,
                              :parseMeta => !inner_html,
                              :lowercase_attr_name => @lowercase_attr_name,
                              :lowercase_element_name => @lowercase_element_name)

  if inner_html
    case @inner_html = container.downcase
    when 'title', 'textarea'
      @tokenizer.content_model_flag = :RCDATA
    when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
      @tokenizer.content_model_flag = :CDATA
    when 'plaintext'
      @tokenizer.content_model_flag = :PLAINTEXT
    else
      # Any other container is parsed as PCDATA; the flag is set explicitly.
      @tokenizer.content_model_flag = :PCDATA
    end

    @phase = @phases[:rootElement]
    @phase.insert_html_element
    reset_insertion_mode
  else
    @inner_html = false
    @phase = @phases[:initial]
  end

  # We only seem to have InBodyPhase testcases where the following is
  # relevant ... need others too
  @last_phase = nil

  # XXX This is temporary for the moment so there isn't any other
  # changes needed for the parser to work with the iterable tokenizer
  @tokenizer.each do |token|
    token = normalize_token(token)

    # Handler name on the phase, e.g. :StartTag -> "processStartTag".
    method = 'process%s' % token[:type]

    case token[:type]
    when :Characters, :SpaceCharacters, :Comment
      @phase.send method, token[:data]
    when :StartTag
      @phase.send method, token[:name], token[:data]
    when :EndTag
      @phase.send method, token[:name]
    when :Doctype
      @phase.send method, token[:name], token[:publicId],
                  token[:systemId], token[:correct]
    else
      # Every other token type is recorded as a parse error.
      parse_error(token[:data], token[:datavars])
    end
  end

  # When the loop finishes it's EOF
  @phase.process_eof
end
#normalize_token(token) ⇒ Object
HTML5 specific normalizations to the token stream
157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |
# File 'lib/html5/html5parser.rb', line 157
#
# Applies HTML5-specific normalizations to one token, in place.
#
# token - Hash with at least :type; tag tokens also carry :name and :data.
#
# * :EmptyTag becomes :StartTag (with a parse error unless the name is in
#   VOID_ELEMENTS).
# * :StartTag names are downcased and a non-empty attribute list is turned
#   into a Hash with downcased names, the first occurrence of a duplicate
#   attribute winning.
# * :EndTag names are downcased; attributes on an end tag are a parse error.
#
# Returns the same token object.
def normalize_token(token)
  if token[:type] == :EmptyTag
    # A trailing solidus is only tolerated on void elements; on any other
    # element it is reported as a parse error. Either way the token is then
    # handled as an ordinary start tag.
    unless VOID_ELEMENTS.include?(token[:name])
      parse_error("incorrectly-placed-solidus")
    end

    token[:type] = :StartTag
  end

  if token[:type] == :StartTag
    token[:name] = token[:name].downcase

    # Remove duplicate attributes and convert the pair list to a Hash, so
    # that [["x", "y"], ["x", "z"]] becomes {"x" => "y"}. Reversing first
    # makes the earliest occurrence the last one written, so it wins.
    unless token[:data].empty?
      data = token[:data].reverse.map { |attr, value| [attr.downcase, value] }
      token[:data] = Hash[data]
    end

  elsif token[:type] == :EndTag
    parse_error("attributes-in-end-tag") unless token[:data].empty?
    token[:name] = token[:name].downcase
  end

  token
end
#parse(stream, encoding = nil) ⇒ Object
Parse an HTML document into a well-formed tree
stream - a filelike object or string containing the HTML to be parsed
The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element)
129 130 131 132 |
# File 'lib/html5/html5parser.rb', line 129
#
# Parses a whole document.
#
# stream   - the HTML to parse; handed to _parse.
# encoding - optional encoding, handed to _parse (nil when omitted).
#
# Returns the tree builder's document (@tree.get_document).
def parse(stream, encoding = nil)
  _parse(stream, false, encoding)
  @tree.get_document
end
#parse_error(code = 'XXX-undefined-error', data = {}) ⇒ Object
150 151 152 153 154 |
# File 'lib/html5/html5parser.rb', line 150
#
# Records a parse error and, in strict mode, aborts parsing.
#
# code - error code (defaults to 'XXX-undefined-error').
# data - extra data stored alongside the code (defaults to {}).
#
# Appends [tokenizer stream position, code, data] to @errors.
# Raises ParseError when @strict is truthy.
def parse_error(code = 'XXX-undefined-error', data = {})
  # XXX The idea is to make data mandatory.
  @errors.push([@tokenizer.stream.position, code, data])
  raise ParseError if @strict
end
#parse_fragment(stream, container = 'div', encoding = nil) ⇒ Object
container - name of the element whose inner_html property is being set; if nil, defaults to ‘div’
stream - a filelike object or string containing the HTML to be parsed
The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element)
145 146 147 148 |
# File 'lib/html5/html5parser.rb', line 145
#
# Parses a fragment as if it were the content of a +container+ element.
#
# stream    - the HTML to parse; handed to _parse.
# container - name of the context element; nil falls back to 'div'.
# encoding  - optional encoding, handed to _parse (nil when omitted).
#
# Returns the tree builder's fragment (@tree.get_fragment).
def parse_fragment(stream, container = 'div', encoding = nil)
  # Honour the documented nil => 'div' fallback here; _parse calls
  # container.downcase and would fail on nil.
  _parse(stream, true, encoding, container || 'div')
  @tree.get_fragment
end
#reset_insertion_mode ⇒ Object
207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 |
# File 'lib/html5/html5parser.rb', line 207
#
# Picks the current phase (@phase) from the stack of open elements, walking
# it from the most recently opened element back to the first one and
# stopping at the first element that determines a phase.
def reset_insertion_mode
  # The name of this method is mostly historical. (It's also used in the
  # specification.)
  last = false

  @tree.open_elements.reverse_each do |node|
    node_name = node.name

    if node == @tree.open_elements.first
      last = true
      unless ['td', 'th'].include?(node_name)
        # For the first open element the fragment's container name
        # (@inner_html) is used instead of the node's own name.
        # XXX
        # assert @inner_html
        node_name = @inner_html
      end
    end

    # 'select', 'colgroup', 'head' and 'frameset' should only be reached
    # here in the inner_html case.
    # XXX
    # assert @inner_html

    if @@new_modes.has_key?(node_name)
      @phase = @phases[@@new_modes[node_name]]
    elsif node_name == 'html'
      @phase = @phases[@tree.head_pointer.nil? ? :beforeHead : :afterHead]
    elsif last
      @phase = @phases[:inBody]
    else
      # This element does not determine a phase; look at the next one.
      next
    end

    break
  end
end