Class: ParseHTML

Inherits:

Object

Object
ParseHTML

Defined in: lib/parsehtml.rb

Constant Summary collapse

VERSION =

'1.12.2'

EMPTY_TAGS = tags which are always empty (<br />, etc.)

%w(br hr input img area link meta param)

PREFORMATTED_TAGS = tags with preformatted text - whitespace won’t be touched in them

%w(script style pre code)

BLOCK_ELEMENTS = list of block elements tag_name => bool (is block level)

{'address' => true,
'blockquote' => true,
'center' => true,
# NOTE(review): 'del' is listed a second time in the inline section below
# with the value false. In a Ruby hash literal the last occurrence of a
# duplicated key wins, so the effective value of 'del' is false.
'del' => true,
'dir' => true,
'div' => true,
'dl' => true,
'fieldset' => true,
'form' => true,
'h1' => true,
'h2' => true,
'h3' => true,
'h4' => true,
'h5' => true,
'h6' => true,
'hr' => true,
# NOTE(review): 'ins' is duplicated in the same way as 'del'; the later
# entry (false) is the effective one.
'ins' => true,
'isindex' => true,
'menu' => true,
'noframes' => true,
'noscript' => true,
'ol' => true,
'p' => true,
'pre' => true,
'table' => true,
'ul' => true,
# set table elements and list items to block as well
'thead' => true,
'tbody' => true,
'tfoot' => true,
'td' => true,
'tr' => true,
'th' => true,
'li' => true,
'dd' => true,
'dt' => true,
# header items and html / body as well
'html' => true,
'body' => true,
'head' => true,
'meta' => true,
'link' => true,
'style' => true,
'title' => true,
# media tags to render as block
'map' => true,
'object' => true,
'param' => true,
'embed' => true,
'area' => true,
# inline elements
'a' => false,
'abbr' => false,
'acronym' => false,
'applet' => false,
'b' => false,
'basefont' => false,
'bdo' => false,
'big' => false,
'br' => false,
'button' => false,
'cite' => false,
'code' => false,
# overrides the earlier 'del' => true entry (duplicate key, last one wins)
'del' => false,
'dfn' => false,
'em' => false,
'font' => false,
'i' => false,
'img' => false,
# overrides the earlier 'ins' => true entry (duplicate key, last one wins)
'ins' => false,
'input' => false,
'iframe' => false,
'kbd' => false,
'label' => false,
'q' => false,
'samp' => false,
'script' => false,
'select' => false,
'small' => false,
'span' => false,
'strong' => false,
'sub' => false,
'sup' => false,
'textarea' => false,
'tt' => false,
'var' => false}

Instance Attribute Summary collapse

#html ⇒ Object

html to be parsed.
#is_block_element ⇒ Object readonly

whether the current tag is a block level element.
#is_empty_tag ⇒ Object readonly

whether current node is an empty tag (<br />) or not (<a></a>).
#is_start_tag ⇒ Object readonly

whether the current node is an opening tag (<a>) or not (</a>) - set to nil if current node is not a tag - NOTE: empty tags (<br />) set this to true as well!
#keep_whitespace ⇒ Object readonly

keep whitespace formatting.
#no_tags_in_code ⇒ Object

suppress HTML tags inside preformatted tags.
#node ⇒ Object

current node context - either a simple string (text node) or something like - <tag attrib=“value”…>.
#node_type ⇒ Object readonly

node type: - tag (see is_start_tag) - text (includes cdata) - comment - doctype - pi (processing instruction).
#open_tags ⇒ Object readonly

list of open tags (array) - count this to get current depth.
#tag_attributes ⇒ Object readonly

attributes of current_tag (in hash).
#tag_name ⇒ Object readonly

tag name.

Instance Method Summary collapse

#initialize(html = '') ⇒ ParseHTML constructor

A new instance of ParseHTML.
#next_node ⇒ Object

get next node.
#normalize_node ⇒ Object

normalize self.node.

Constructor Details

#initialize(html = '') ⇒ `ParseHTML`

Returns a new instance of ParseHTML.

# File 'lib/parsehtml.rb', line 146

# Builds a parser positioned at the start of +html+.
#
# @param html [String] the markup to walk through (defaults to an empty string)
#
# All per-node state starts out "blank": no open tags, empty node / node
# type / tag name strings, every flag false, no attribute hash yet and a
# preformatted-nesting counter of zero.
def initialize(html = '')
  # input and nesting bookkeeping
  @html            = html
  @open_tags       = []
  @keep_whitespace = 0

  # description of the current node (each gets its own empty string)
  @node_type      = ''
  @node           = ''
  @tag_name       = ''
  @tag_attributes = nil

  # flags describing the current tag / parser options
  @is_start_tag     = false
  @is_empty_tag     = false
  @is_block_element = false
  @no_tags_in_code  = false
end

Instance Attribute Details

#html ⇒ `Object`

html to be parsed



103
104
105

# File 'lib/parsehtml.rb', line 103

# Reader for the HTML still to be parsed; returns the current value of @html.
def html
  @html
end

#is_block_element ⇒ `Object` (readonly)

whether the current tag is a block level element



130
131
132

# File 'lib/parsehtml.rb', line 130

# Reader for @is_block_element: whether the current tag is a block level
# element. Starts out as false.
def is_block_element
  @is_block_element
end

#is_empty_tag ⇒ `Object` (readonly)

whether current node is an empty tag (<br />) or not (<a></a>)



127
128
129

# File 'lib/parsehtml.rb', line 127

# Reader for @is_empty_tag: whether the current node is an empty tag rather
# than a tag with separate opening and closing parts. Starts out as false.
def is_empty_tag
  @is_empty_tag
end

#is_start_tag ⇒ `Object` (readonly)

whether the current node is an opening tag (<a>) or not (</a>)

set to nil if current node is not a tag
NOTE: empty tags (<br />) set this to true as well!



124
125
126

# File 'lib/parsehtml.rb', line 124

# Reader for @is_start_tag: whether the current node is an opening tag.
# Documented as nil when the current node is not a tag, and as true for
# empty tags as well. Starts out as false.
def is_start_tag
  @is_start_tag
end

#keep_whitespace ⇒ `Object` (readonly)

keep whitespace formatting



139
140
141

# File 'lib/parsehtml.rb', line 139

# Reader for @keep_whitespace, an Integer counter (not a boolean): it starts
# at 0 and next_node adds 1 for every opening tag listed in
# PREFORMATTED_TAGS. Remember that 0 is truthy in Ruby when testing it.
def keep_whitespace
  @keep_whitespace
end

#no_tags_in_code ⇒ `Object`

suppress HTML tags inside preformatted tags



119
120
121

# File 'lib/parsehtml.rb', line 119

# Reader for @no_tags_in_code: option to suppress HTML tags inside
# preformatted tags. Starts out as false.
def no_tags_in_code
  @no_tags_in_code
end

#node ⇒ `Object`

current node context

either a simple string (text node) or something like
<tag attrib=“value”…>



116
117
118

# File 'lib/parsehtml.rb', line 116

# Reader for @node, the text of the current node: either plain text (text
# node) or the markup of a tag, as rebuilt e.g. by normalize_node.
def node
  @node
end

#node_type ⇒ `Object` (readonly)

node type:

tag (see is_start_tag)
text (includes cdata)
comment
doctype
pi (processing instruction)



111
112
113

# File 'lib/parsehtml.rb', line 111

# Reader for @node_type, the kind of the current node. next_node assigns
# 'pi', 'comment', 'doctype' and 'text' (CDATA is reported as 'text');
# tags are documented to use 'tag'.
def node_type
  @node_type
end

#open_tags ⇒ `Object` (readonly)

list of open tags (array)

count this to get current depth



143
144
145

# File 'lib/parsehtml.rb', line 143

# Reader for @open_tags, the Array of currently open tag names; its size is
# the current nesting depth. next_node appends the name of each non-empty
# opening tag.
def open_tags
  @open_tags
end

#tag_attributes ⇒ `Object` (readonly)

attributes of current_tag (in hash)



136
137
138

# File 'lib/parsehtml.rb', line 136

# Reader for @tag_attributes, the attributes of the current tag as a hash.
# It is nil until a tag has been parsed (initialize sets it to nil).
def tag_attributes
  @tag_attributes
end

#tag_name ⇒ `Object` (readonly)

tag name



133
134
135

# File 'lib/parsehtml.rb', line 133

# Reader for @tag_name, the name of the current tag. Starts out as an empty
# string.
def tag_name
  @tag_name
end

Instance Method Details

#next_node ⇒ `Object`

get next node

# File 'lib/parsehtml.rb', line 156

def next_node
  return false if (@html.nil? || @html.empty?)

  skip_whitespace = true # FIXME: should probably be a class variable?
  if (@is_start_tag && !@is_empty_tag)
    @open_tags << @tag_name
    @keep_whitespace += 1 if PREFORMATTED_TAGS.include?(@tag_name)
  end
  
  if (@html[0,1] == '<')
    token = html[0,9]
    if (token[0,2] == '<?')
      # xml, prolog, or other pi's
      # TODO: trigger error (this might need some work)
      pos = @html.index('>')
      set_node('pi', pos+1)
      return true;
    end # end pi tag
    if (token[0,4] == '<!--')
      # HTML comment
      pos = @html.index('-->')
      if pos.nil?
        # could not find a closing -->, use next gt tag instead
        # this is what firefox does with its parsing
        pos = @html.index('>') + 1
      else
        pos += 3
      end
      set_node('comment', pos)
      return true
    end # end comment tag
    if (token == '<!DOCTYPE')
      # doctype
      set_node('doctype', @html.index('>')+1)
      @skip_whitespace = true
      return true
    end # end <!DOCTYPE tag
    if (token == '<![CDATA[')
      # cdata, use text mode
      
      # remove leading <![CDATA[
      @html = @html[9, @html.size-9]
      set_node('text', @html.index(']]>')+3)
      
      # remove trailing ]]> and trim
      @node = @node[0, -3]
      handle_whitespaces
      
      @skip_whitespace = true
      return true
    end # end cdata
    if (parse_tag)
      # seems to be a tag so handle whitespaces
      skip_whitespace = @is_block_element ? true : false
      return true
    end # end parse_tag
  end
  
  skip_whitespace = false if @keep_whitespace
  
  # when we get here it seems to be a text node
  pos = @html.index('<') || @html.size
  
  set_node('text', pos)
  handle_whitespaces
  return next_node if (skip_whitespace && @node == ' ')
  skip_whitespace = false
  return true
end

#normalize_node ⇒ `Object`

normalize self.node

# File 'lib/parsehtml.rb', line 227

# Rebuilds @node as canonical markup from the parsed tag state.
#
# * closing tag (@is_start_tag falsy): "</name>"
# * opening tag: "<name attr="value" ...>", attributes in hash order with
#   every double quote inside a value replaced by &quot;
# * empty tag: as above, terminated by " />"
#
# A missing attribute hash (@tag_attributes is nil until a tag has been
# parsed) is treated as "no attributes", and attribute values are converted
# with to_s so non-String values do not raise.
#
# @return [String, nil] the rebuilt node for opening/empty tags, nil for
#   closing tags (callers should read @node)
def normalize_node
  # String.new gives a mutable buffer even when string literals are frozen
  @node = String.new('<')
  unless @is_start_tag
    @node << "/#{@tag_name}>"
    return
  end

  @node << @tag_name
  (@tag_attributes || {}).each do |name, value|
    @node << " #{name}=\"#{value.to_s.gsub('"', '&quot;')}\""
  end
  @node << ' /' if @is_empty_tag
  @node << '>'
end

Class: ParseHTML

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(html = '') ⇒ ParseHTML

Instance Attribute Details

#html ⇒ Object

#is_block_element ⇒ Object (readonly)

#is_empty_tag ⇒ Object (readonly)

#is_start_tag ⇒ Object (readonly)

#keep_whitespace ⇒ Object (readonly)

#no_tags_in_code ⇒ Object

#node ⇒ Object

#node_type ⇒ Object (readonly)

#open_tags ⇒ Object (readonly)

#tag_attributes ⇒ Object (readonly)

#tag_name ⇒ Object (readonly)