Module: Newstile::Parser::Html::Parser

Includes:
Constants
Included in:
Newstile::Parser::Html, Newstile
Defined in:
lib/newstile/parser/html.rb

Overview

Contains the parsing methods. This module can be mixed into any parser to get HTML parsing functionality. The only things that must be provided by the including class are the instance variable @stack, for storing needed state, and the instance variable @src (an instance of StringScanner), for the actual parsing.

Constant Summary collapse

HTML_RAW_START =
/(?=<(#{REXML::Parsers::BaseParser::UNAME_STR}|\/|!--|\?))/

Constants included from Constants

Constants::HTML_ATTRIBUTE_RE, Constants::HTML_BLOCK_ELEMENTS, Constants::HTML_COMMENT_RE, Constants::HTML_DOCTYPE_RE, Constants::HTML_ELEMENTS_WITHOUT_BODY, Constants::HTML_ENTITY_RE, Constants::HTML_INSTRUCTION_RE, Constants::HTML_PARSE_AS, Constants::HTML_PARSE_AS_BLOCK, Constants::HTML_PARSE_AS_RAW, Constants::HTML_PARSE_AS_SPAN, Constants::HTML_SPAN_ELEMENTS, Constants::HTML_TAG_CLOSE_RE, Constants::HTML_TAG_RE

Instance Method Summary collapse

Instance Method Details

#handle_html_script_tag ⇒ Object



101
102
103
104
105
106
107
108
109
110
# File 'lib/newstile/parser/html.rb', line 101

# Consumes the body of a 'script' element as raw text.
#
# Everything between the current scanner position and the next
# '</script>' end tag is added, via add_text with type :raw, to the last
# child of @tree (handle_html_start_tag appends the script element there
# right before calling this method). The end tag itself is then consumed.
#
# If no end tag exists, the whole remaining source is added as raw text
# and a warning is issued.
def handle_html_script_tag
  start_pos = @src.pos
  # The look-ahead stops the scanner directly in front of the end tag, so
  # the end tag is not part of the extracted script body.
  end_tag_ahead = @src.scan_until(/(?=<\/script\s*>)/m)

  if end_tag_ahead
    script_body = extract_string(start_pos...@src.pos, @src)
    add_text(script_body, @tree.children.last, :raw)
    @src.scan(HTML_TAG_CLOSE_RE)
  else
    remainder = @src.scan(/.*/m)
    add_text(remainder, @tree.children.last, :raw)
    warning("Found no end tag for 'script' - auto-closing it")
  end
end

#handle_html_start_tag ⇒ Object

Process the HTML start tag that has already been scanned/checked. Does the common processing steps and then yields to the caller for further processing.



80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# File 'lib/newstile/parser/html.rb', line 80

# Processes the HTML start tag most recently matched by @src.
#
# Reads the tag name (group 1), the attribute string (group 2) and the
# self-closing marker (group 4) from the scanner's last match, appends a
# new :html_element Element to @tree.children and yields the element
# together with a flag telling whether it is already closed.
#
# The flag is true when the tag was self-closed, when the element is listed
# in HTML_ELEMENTS_WITHOUT_BODY (a warning is issued if such a tag was not
# self-closed), or when the tag is 'script' -- in that case the script body
# has already been consumed by handle_html_script_tag before yielding.
def handle_html_start_tag
  tag_name = @src[1]
  self_closed = !@src[4].nil?

  attributes = Utils::OrderedHash.new
  @src[2].scan(HTML_ATTRIBUTE_RE) do |key, _sep, value|
    attributes[key] = value
  end

  element = Element.new(:html_element, tag_name, attributes, :category => :block)
  @tree.children << element

  unless self_closed
    if HTML_ELEMENTS_WITHOUT_BODY.include?(element.value)
      warning("The HTML tag '#{element.value}' cannot have any content - auto-closing it")
      self_closed = true
    end
  end

  return yield(element, self_closed) unless tag_name == 'script'

  # The script content is consumed here, so the caller always sees a
  # closed element.
  handle_html_script_tag
  yield(element, true)
end

#parse_raw_html(el, &block) ⇒ Object

Parse raw HTML from the current source position, storing the found elements in el. Parsing continues until one of the following criteria is fulfilled:

  • The end of the document is reached.

  • The matching end tag for the element el is found (only used if el is an HTML element).

When an HTML start tag is found, processing is deferred to #handle_html_start_tag, providing the block given to this method.



123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# File 'lib/newstile/parser/html.rb', line 123

# Parses raw HTML from the current source position, storing the found
# elements in +el+.
#
# While parsing, +el+ becomes the current @tree; the previous tree is kept
# on @stack and restored before returning. Parsing stops when the end of
# the source is reached or when the end tag matching +el+ is found.
#
# Text is added via add_text with type :text, comments and processing
# instructions are appended to the tree as :xml_comment / :xml_pi elements,
# and start tags are handed to handle_html_start_tag together with the
# given block. Closing tags that do not match +el+ are skipped with a
# warning.
#
# If +el+ is an :html_element and the source ends without its end tag, a
# warning is issued exactly once. This also covers input that ends
# directly after a nested tag, comment or instruction, and a source that
# is already exhausted on entry.
def parse_raw_html(el, &block)
  @stack.push(@tree)
  @tree = el

  end_tag_found = false
  until @src.eos? || end_tag_found
    if (text = @src.scan_until(HTML_RAW_START))
      add_text(text, @tree, :text)
      if (comment = @src.scan(HTML_COMMENT_RE))
        @tree.children << Element.new(:xml_comment, comment, nil, :category => :block)
      elsif (instruction = @src.scan(HTML_INSTRUCTION_RE))
        @tree.children << Element.new(:xml_pi, instruction, nil, :category => :block)
      elsif @src.scan(HTML_TAG_RE)
        handle_html_start_tag(&block)
      elsif @src.scan(HTML_TAG_CLOSE_RE)
        if @tree.value == @src[1]
          end_tag_found = true
        else
          warning("Found invalidly used HTML closing tag for '#{@src[1]}' - ignoring it")
        end
      else
        # HTML_RAW_START matched but nothing above did: treat the single
        # character at the current position as plain text and move on.
        add_text(@src.scan(/./), @tree, :text)
      end
    else
      # No further markup start in the source: the rest is plain text.
      add_text(@src.scan(/.*/m), @tree, :text)
      break
    end
  end

  # Warn on every path that ends without the matching end tag, not only
  # when trailing text was left over.
  if !end_tag_found && @tree.type == :html_element
    warning("Found no end tag for '#{@tree.value}' - auto-closing it")
  end

  @tree = @stack.pop
end