Class: ParseHTML
- Inherits:
-
Object
- Object
- ParseHTML
- Defined in:
- lib/parsehtml/parsehtml.rb
Overview
Version 1.12 - Aug 20, 2008
Constant Summary collapse
# Tags which are always empty (<br />, etc.).
# Frozen so the shared list cannot be modified by accident at runtime.
EMPTY_TAGS = %w[br hr input img area link meta param].freeze
# Tags with preformatted text - whitespace won't be touched in them.
# Frozen so the shared list cannot be modified by accident at runtime.
PREFORMATTED_TAGS = %w[script style pre code].freeze
# List of known elements: tag_name => bool (is block level).
#
# 'del' and 'ins' used to be listed twice (once as true, once as false).
# Ruby keeps the last value of a duplicated key, so they were effectively
# inline; only that effective entry is kept here.
BLOCK_ELEMENTS = {
  # block elements
  'address' => true, 'blockquote' => true, 'center' => true, 'dir' => true,
  'div' => true, 'dl' => true, 'fieldset' => true, 'form' => true,
  'h1' => true, 'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true, 'h6' => true,
  'hr' => true, 'isindex' => true, 'menu' => true, 'noframes' => true,
  'noscript' => true, 'ol' => true, 'p' => true, 'pre' => true,
  'table' => true, 'ul' => true,
  # set table elements and list items to block as well
  'thead' => true, 'tbody' => true, 'tfoot' => true, 'td' => true, 'tr' => true,
  'th' => true, 'li' => true, 'dd' => true, 'dt' => true,
  # header items and html / body as well
  'html' => true, 'body' => true, 'head' => true, 'meta' => true, 'link' => true,
  'style' => true, 'title' => true,
  # media tags to render as block
  'map' => true, 'object' => true, 'param' => true, 'embed' => true, 'area' => true,
  # inline elements
  'a' => false, 'abbr' => false, 'acronym' => false, 'applet' => false, 'b' => false,
  'basefont' => false, 'bdo' => false, 'big' => false, 'br' => false, 'button' => false,
  'cite' => false, 'code' => false, 'del' => false, 'dfn' => false, 'em' => false,
  'font' => false, 'i' => false, 'img' => false, 'ins' => false, 'input' => false,
  'iframe' => false, 'kbd' => false, 'label' => false, 'q' => false, 'samp' => false,
  'script' => false, 'select' => false, 'small' => false, 'span' => false,
  'strong' => false, 'sub' => false, 'sup' => false, 'textarea' => false,
  'tt' => false, 'var' => false
}.freeze
Instance Attribute Summary collapse
-
#html ⇒ Object
html to be parsed.
-
#is_block_element ⇒ Object
readonly
whether the current tag is a block level element.
-
#is_empty_tag ⇒ Object
readonly
whether current node is an empty tag (<br />) or not (<a></a>).
-
#is_start_tag ⇒ Object
readonly
whether the current node is an opening tag (<a>) or not (</a>) - set to nil if current node is not a tag - NOTE: empty tags (<br />) set this to true as well!
-
#keep_whitespace ⇒ Object
readonly
keep whitespace formatting.
-
#no_tags_in_code ⇒ Object
suppress HTML tags inside preformatted tags.
-
#node ⇒ Object
current node context - either a simple string (text node) or something like - <tag attrib="value"…>.
-
#node_type ⇒ Object
readonly
node type: - tag (see #is_start_tag) - text (includes cdata) - comment - doctype - pi (processing instruction).
-
#open_tags ⇒ Object
readonly
list of open tags (array) - count this to get current depth.
-
#tag_attributes ⇒ Object
readonly
attributes of current_tag (in hash).
-
#tag_name ⇒ Object
readonly
tag name.
Instance Method Summary collapse
-
#initialize(html = '') ⇒ ParseHTML
constructor
A new instance of ParseHTML.
-
#next_node ⇒ Object
get next node.
-
#normalize_node ⇒ Object
normalize self.node.
Constructor Details
#initialize(html = '') ⇒ ParseHTML
Returns a new instance of ParseHTML.
146 147 148 149 150 151 152 153 |
# File 'lib/parsehtml/parsehtml.rb', line 146

# Creates a parser for the given markup and resets all per-node state.
#
# html - the HTML string to be parsed (defaults to an empty string).
def initialize(html = '')
  @html = html
  @open_tags = []

  # description of the current node; filled in while parsing
  @node_type = ''
  @node = ''
  @tag_name = ''
  @is_start_tag = false
  @is_empty_tag = false
  @is_block_element = false
  @no_tags_in_code = false
  @tag_attributes = nil

  # counter, not a flag: incremented by next_node for every open preformatted tag
  @keep_whitespace = 0
  # next_node assigns this variable, so give it a defined starting value
  @skip_whitespace = true
end
Instance Attribute Details
#html ⇒ Object
html to be parsed
103 104 105 |
# File 'lib/parsehtml/parsehtml.rb', line 103

# Reader for @html, the html to be parsed.
def html
  @html
end
#is_block_element ⇒ Object (readonly)
whether the current tag is a block level element
130 131 132 |
# File 'lib/parsehtml/parsehtml.rb', line 130

# Reader for @is_block_element: whether the current tag is a block level element.
def is_block_element
  @is_block_element
end
#is_empty_tag ⇒ Object (readonly)
whether current node is an empty tag (<br />) or not (<a></a>)
127 128 129 |
# File 'lib/parsehtml/parsehtml.rb', line 127

# Reader for @is_empty_tag: whether the current node is an empty tag (<br />)
# or not (<a></a>).
def is_empty_tag
  @is_empty_tag
end
#is_start_tag ⇒ Object (readonly)
whether the current node is an opening tag (<a>) or not (</a>)
-
set to nil if current node is not a tag
-
NOTE: empty tags (<br />) set this to true as well!
124 125 126 |
# File 'lib/parsehtml/parsehtml.rb', line 124

# Reader for @is_start_tag: whether the current node is an opening tag (<a>)
# or not (</a>).
def is_start_tag
  @is_start_tag
end
#keep_whitespace ⇒ Object (readonly)
keep whitespace formatting
139 140 141 |
# File 'lib/parsehtml/parsehtml.rb', line 139

# Reader for @keep_whitespace (keep whitespace formatting).
def keep_whitespace
  @keep_whitespace
end
#no_tags_in_code ⇒ Object
suppress HTML tags inside preformatted tags
119 120 121 |
# File 'lib/parsehtml/parsehtml.rb', line 119

# Reader for @no_tags_in_code (suppress HTML tags inside preformatted tags).
# The method name was missing from the listing and is restored here.
def no_tags_in_code
  @no_tags_in_code
end
#node ⇒ Object
current node context
-
either a simple string (text node) or something like
-
<tag attrib="value"…>
116 117 118 |
# File 'lib/parsehtml/parsehtml.rb', line 116

# Reader for @node, the current node context.
def node
  @node
end
#node_type ⇒ Object (readonly)
node type:
-
tag (see #is_start_tag)
-
text (include cdata)
-
comment
-
doctype
-
pi (processing instruction)
111 112 113 |
# File 'lib/parsehtml/parsehtml.rb', line 111

# Reader for @node_type, the type of the current node.
def node_type
  @node_type
end
#open_tags ⇒ Object (readonly)
list of open tags (array)
-
count this to get current depth
143 144 145 |
# File 'lib/parsehtml/parsehtml.rb', line 143

# Reader for @open_tags, the list of open tags (array).
# The method name was missing from the listing and is restored here.
def open_tags
  @open_tags
end
#tag_attributes ⇒ Object (readonly)
attributes of current_tag (in hash)
136 137 138 |
# File 'lib/parsehtml/parsehtml.rb', line 136

# Reader for @tag_attributes, the attributes of the current tag.
def tag_attributes
  @tag_attributes
end
#tag_name ⇒ Object (readonly)
tag name
133 134 135 |
# File 'lib/parsehtml/parsehtml.rb', line 133

# Reader for @tag_name, the name of the current tag.
def tag_name
  @tag_name
end
Instance Method Details
#next_node ⇒ Object
get next node
156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 |
# File 'lib/parsehtml/parsehtml.rb', line 156

# Advances to the next node of @html.
#
# Recognises processing instructions, comments, doctypes, CDATA sections and
# tags (via parse_tag); anything else is handed to set_node as a text node.
#
# Returns false when @html is nil or empty, true once a node has been set.
#
# Changes compared with the previous listing:
# * the skip-whitespace state is kept in @skip_whitespace only, so it survives
#   between calls (previously a local variable shadowed it and was reset on
#   every call);
# * @keep_whitespace is a counter, and 0 is truthy in Ruby, so it is now
#   compared against 0 instead of being used as a condition;
# * the trailing "]]>" of a CDATA section is cut with a range
#   (String#[] with a negative length returns nil);
# * unterminated "<?", "<!--", "<!DOCTYPE" and "<![CDATA[" constructs consume
#   the rest of the input instead of raising NoMethodError on nil.
def next_node
  return false if @html.nil? || @html.empty?

  # first call on this parser: start out skipping whitespace-only text
  @skip_whitespace = true if @skip_whitespace.nil?

  # the node handled by the previous call was an opening tag: track it now
  if @is_start_tag && !@is_empty_tag
    @open_tags << @tag_name
    @keep_whitespace += 1 if PREFORMATTED_TAGS.include?(@tag_name)
  end

  if @html[0, 1] == '<'
    token = @html[0, 9]

    if token[0, 2] == '<?'
      # xml, prolog, or other pi's
      # TODO: trigger error (this might need some work)
      gt = @html.index('>')
      set_node('pi', gt ? gt + 1 : @html.size)
      return true
    end

    if token[0, 4] == '<!--'
      # HTML comment
      close = @html.index('-->')
      if close
        set_node('comment', close + 3)
      else
        # could not find a closing -->, use next gt tag instead
        # this is what firefox does with its parsing
        gt = @html.index('>')
        set_node('comment', gt ? gt + 1 : @html.size)
      end
      return true
    end

    if token == '<!DOCTYPE'
      gt = @html.index('>')
      set_node('doctype', gt ? gt + 1 : @html.size)
      @skip_whitespace = true
      return true
    end

    if token == '<![CDATA['
      # cdata, use text mode
      # remove leading <![CDATA[
      @html = @html[9, @html.size - 9]
      close = @html.index(']]>')
      if close
        set_node('text', close + 3)
        # remove trailing ]]>
        @node = @node[0...-3]
      else
        # unterminated section: everything that is left is its content
        set_node('text', @html.size)
      end
      handle_whitespaces
      @skip_whitespace = true
      return true
    end

    if parse_tag
      # seems to be a tag so handle whitespaces:
      # whitespace-only text after a block element is skipped
      @skip_whitespace = @is_block_element ? true : false
      return true
    end
  end

  # inside preformatted tags whitespace is never skipped
  @skip_whitespace = false if @keep_whitespace > 0

  # when we get here it seems to be a text node
  pos = @html.index('<') || @html.size
  set_node('text', pos)
  handle_whitespaces
  return next_node if @skip_whitespace && @node == ' '

  @skip_whitespace = false
  true
end
#normalize_node ⇒ Object
normalize self.node
227 228 229 230 231 232 233 234 235 236 237 238 239 240 |
# File 'lib/parsehtml/parsehtml.rb', line 227

# Rebuilds @node from @tag_name, @tag_attributes and the tag flags.
#
# A closing tag becomes "</name>". An opening tag becomes "<name", followed by
# every attribute as name="value", followed by " /" for empty tags and ">".
# Double quotes inside attribute values are escaped as &quot; so the generated
# markup stays well-formed (the previous listing replaced a quote with itself).
# A nil @tag_attributes (the constructor default) is treated as "no attributes".
#
# Returns nil for closing tags and the new @node string otherwise.
def normalize_node
  unless @is_start_tag
    @node = "</#{@tag_name}>"
    return
  end

  attributes = (@tag_attributes || {}).map do |name, value|
    " #{name}=\"#{value.to_s.gsub('"', '&quot;')}\""
  end.join

  @node = "<#{@tag_name}#{attributes}#{' /' if @is_empty_tag}>"
end