Class: ParseHTML

Inherits:
Object
  • Object
show all
Defined in:
lib/parsehtml/parsehtml.rb

Overview

Version 1.12 - Aug 20, 2008

Constant Summary collapse

EMPTY_TAGS =

tags which are always empty (<br />, etc.)

%w(br hr input img area link meta param)
PREFORMATTED_TAGS =

tags with preformatted text - whitespace won’t be touched in them

%w(script style pre code)
BLOCK_ELEMENTS =

list of block elements

  • tag_name => bool (is block level)

{'address' => true,
'blockquote' => true,
'center' => true,
'dir' => true,
'div' => true,
'dl' => true,
'fieldset' => true,
'form' => true,
'h1' => true,
'h2' => true,
'h3' => true,
'h4' => true,
'h5' => true,
'h6' => true,
'hr' => true,
'isindex' => true,
'menu' => true,
'noframes' => true,
'noscript' => true,
'ol' => true,
'p' => true,
'pre' => true,
'table' => true,
'ul' => true,
# set table elements and list items to block as well
'thead' => true,
'tbody' => true,
'tfoot' => true,
'td' => true,
'tr' => true,
'th' => true,
'li' => true,
'dd' => true,
'dt' => true,
# header items and html / body as well
'html' => true,
'body' => true,
'head' => true,
'meta' => true,
'link' => true,
'style' => true,
'title' => true,
# media tags to render as block
'map' => true,
'object' => true,
'param' => true,
'embed' => true,
'area' => true,
# inline elements
# NOTE: 'del' and 'ins' used to be listed in the block section as well
# (=> true). In a hash literal the later entry wins, so both always
# resolved to false; only that effective entry is kept to avoid the
# duplicated-key warning.
'a' => false,
'abbr' => false,
'acronym' => false,
'applet' => false,
'b' => false,
'basefont' => false,
'bdo' => false,
'big' => false,
'br' => false,
'button' => false,
'cite' => false,
'code' => false,
'del' => false,
'dfn' => false,
'em' => false,
'font' => false,
'i' => false,
'img' => false,
'ins' => false,
'input' => false,
'iframe' => false,
'kbd' => false,
'label' => false,
'q' => false,
'samp' => false,
'script' => false,
'select' => false,
'small' => false,
'span' => false,
'strong' => false,
'sub' => false,
'sup' => false,
'textarea' => false,
'tt' => false,
'var' => false}

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(html = '') ⇒ ParseHTML

Returns a new instance of ParseHTML.



146
147
148
149
150
151
152
153
# File 'lib/parsehtml/parsehtml.rb', line 146

# Creates a parser for +html+ and puts every piece of per-node state into
# its starting condition.
#
# html - the markup to parse; defaults to an empty string.
def initialize(html = '')
  @html = html
  @open_tags = []

  # each text field gets its own fresh empty string
  @node_type = ''
  @node = ''
  @tag_name = ''

  # tag flags, all off until a tag has been parsed
  @is_start_tag = false
  @is_empty_tag = false
  @is_block_element = false
  @no_tags_in_code = false

  @tag_attributes = nil
  # counter, not a flag: next_node increments it for preformatted tags
  @keep_whitespace = 0
end

Instance Attribute Details

#htmlObject

html to be parsed



103
104
105
# File 'lib/parsehtml/parsehtml.rb', line 103

# Returns @html, the HTML string given to the constructor and read by
# next_node.
def html
  @html
end

#is_block_elementObject (readonly)

whether the current tag is a block level element



130
131
132
# File 'lib/parsehtml/parsehtml.rb', line 130

# Returns @is_block_element, the block-level flag for the current tag
# (initialised to false by the constructor).
def is_block_element
  @is_block_element
end

#is_empty_tagObject (readonly)

whether current node is an empty tag (<br />) or not (<a></a>)



127
128
129
# File 'lib/parsehtml/parsehtml.rb', line 127

# Returns @is_empty_tag, the empty-tag flag for the current node
# (initialised to false by the constructor).
def is_empty_tag
  @is_empty_tag
end

#is_start_tagObject (readonly)

whether the current node is an opening tag (<a>) or not (</a>)

  • set to nil if current node is not a tag

  • NOTE: empty tags (<br />) set this to true as well!



124
125
126
# File 'lib/parsehtml/parsehtml.rb', line 124

# Returns @is_start_tag, the opening-tag flag for the current node
# (initialised to false by the constructor).
def is_start_tag
  @is_start_tag
end

#keep_whitespaceObject (readonly)

keep whitespace formatting



139
140
141
# File 'lib/parsehtml/parsehtml.rb', line 139

# Returns @keep_whitespace, a counter that starts at 0 and that next_node
# increments when the previously parsed start tag is in PREFORMATTED_TAGS.
def keep_whitespace
  @keep_whitespace
end

#no_tags_in_codeObject

suppress HTML tags inside preformatted tags



119
120
121
# File 'lib/parsehtml/parsehtml.rb', line 119

# Returns @no_tags_in_code (initialised to false by the constructor).
def no_tags_in_code
  @no_tags_in_code
end

#nodeObject

current node context

  • either a simple string (text node) or something like

  • <tag attrib="value"...>



116
117
118
# File 'lib/parsehtml/parsehtml.rb', line 116

# Returns @node, the string for the current node; normalize_node rewrites
# it for tags.
def node
  @node
end

#node_typeObject (readonly)

node type:

  • tag (see is_start_tag)

  • text (include cdata)

  • comment

  • doctype

  • pi (processing instruction)



111
112
113
# File 'lib/parsehtml/parsehtml.rb', line 111

# Returns @node_type (an empty string until a node has been read).
# For non-tag nodes next_node passes 'pi', 'comment', 'doctype' or 'text'
# to set_node.
def node_type
  @node_type
end

#open_tagsObject (readonly)

list of open tags (array)

  • count this to get current depth



143
144
145
# File 'lib/parsehtml/parsehtml.rb', line 143

# Returns @open_tags, the array onto which next_node pushes the name of
# each non-empty start tag.
def open_tags
  @open_tags
end

#tag_attributesObject (readonly)

attributes of the current tag (as a hash)



136
137
138
# File 'lib/parsehtml/parsehtml.rb', line 136

# Returns @tag_attributes; nil after construction, iterated as name/value
# pairs by normalize_node.
def tag_attributes
  @tag_attributes
end

#tag_nameObject (readonly)

tag name



133
134
135
# File 'lib/parsehtml/parsehtml.rb', line 133

# Returns @tag_name, the name of the current tag (an empty string after
# construction).
def tag_name
  @tag_name
end

Instance Method Details

#next_nodeObject

get next node



156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# File 'lib/parsehtml/parsehtml.rb', line 156

# Advances the parser to the next node at the head of @html.
#
# First records the previously parsed start tag: a non-empty start tag is
# pushed onto @open_tags, and @keep_whitespace is incremented when that tag
# is one of PREFORMATTED_TAGS. Then, in this order, it recognises
# processing instructions, comments, DOCTYPE, CDATA sections, tags (via
# parse_tag) and finally plain text up to the next '<'.
#
# Unterminated markup (no closing '>', '-->' or ']]>') consumes the rest of
# the input instead of raising.
#
# Returns false when @html is nil or empty, true once a node has been set.
def next_node
  return false if @html.nil? || @html.empty?

  # Whether a whitespace-only text node may be dropped depends on the node
  # seen by the *previous* call, so the flag lives in an instance variable
  # (a per-call local would be reset every time). Skipping starts enabled.
  @skip_whitespace = true if @skip_whitespace.nil?

  if @is_start_tag && !@is_empty_tag
    @open_tags << @tag_name
    @keep_whitespace += 1 if PREFORMATTED_TAGS.include?(@tag_name)
  end

  if @html[0, 1] == '<'
    token = @html[0, 9]

    # Offset just past the first '>'. Falls back to the end of the input so
    # that markup without any '>' does not end in nil + 1.
    gt = @html.index('>')
    past_gt = gt ? gt + 1 : @html.size

    if token[0, 2] == '<?'
      # xml, prolog, or other pi's
      # TODO: trigger error (this might need some work)
      set_node('pi', past_gt)
      return true
    end # end pi tag

    if token[0, 4] == '<!--'
      # HTML comment; if there is no closing -->, end at the next '>'
      # instead - this is what firefox does with its parsing
      comment_end = @html.index('-->')
      set_node('comment', comment_end ? comment_end + 3 : past_gt)
      return true
    end # end comment tag

    if token == '<!DOCTYPE'
      set_node('doctype', past_gt)
      @skip_whitespace = true
      return true
    end # end <!DOCTYPE tag

    if token == '<![CDATA['
      # cdata, use text mode

      # remove leading <![CDATA[
      @html = @html[9, @html.size - 9]
      cdata_end = @html.index(']]>')
      if cdata_end
        set_node('text', cdata_end + 3)
        # remove trailing ]]> - a range is required here, a negative
        # length as in @node[0, -3] yields nil
        @node = @node[0...-3]
      else
        # unterminated section: everything that is left is the text
        set_node('text', @html.size)
      end
      handle_whitespaces

      @skip_whitespace = true
      return true
    end # end cdata

    if parse_tag
      # seems to be a tag; only whitespace that follows a block element
      # may be skipped
      @skip_whitespace = @is_block_element ? true : false
      return true
    end # end parse_tag
  end

  # @keep_whitespace is a counter and 0 is truthy in Ruby, so it has to be
  # compared explicitly
  @skip_whitespace = false if @keep_whitespace > 0

  # when we get here it seems to be a text node
  pos = @html.index('<') || @html.size

  set_node('text', pos)
  handle_whitespaces
  return next_node if @skip_whitespace && @node == ' '

  @skip_whitespace = false
  true
end

#normalize_nodeObject

normalize self.node



227
228
229
230
231
232
233
234
235
236
237
238
239
240
# File 'lib/parsehtml/parsehtml.rb', line 227

# Rewrites @node as a canonical tag string built from the current tag state.
#
# A closing tag becomes "</name>". A start tag becomes "<name", then one
# ' key="value"' pair per entry of @tag_attributes (double quotes inside a
# value are written as &quot;), closed with " />" for an empty tag and ">"
# otherwise.
#
# Returns the new @node for start tags and nil for closing tags.
def normalize_node
  if @is_start_tag
    attribute_text = @tag_attributes.map do |key, raw|
      " #{key}=\"" + raw.gsub(/\"/, '&quot;') + "\""
    end.join
    terminator = @is_empty_tag ? ' />' : '>'
    @node = '<' + @tag_name + attribute_text + terminator
  else
    @node = "</#{@tag_name}>"
    nil
  end
end