Class: Feedbase::FeedHtmlListener
- Inherits:
-
Object
- Object
- Feedbase::FeedHtmlListener
- Includes:
- REXML::StreamListener
- Defined in:
- lib/feedbase/html_simplifier.rb
Constant Summary collapse
- STRIP_TAGS =
%w[ body font ]
- BLOCK_TAGS =
%w[ p div ]
- HEADER_TAGS =
%w[ h1 h2 h3 h4 h5 h6 ]
- UNIFORM_HEADER_TAG =
"h4"
Instance Method Summary collapse
-
#initialize ⇒ FeedHtmlListener
constructor
A new instance of FeedHtmlListener.
- #path ⇒ Object
- #result ⇒ Object
- #start_of_block? ⇒ Boolean
- #strip_empty_tags(line) ⇒ Object
- #tag_end(name) ⇒ Object
- #tag_start(name, attrs) ⇒ Object
- #text(text) ⇒ Object
Constructor Details
#initialize ⇒ FeedHtmlListener
Returns a new instance of FeedHtmlListener.
77 78 79 80 |
# File 'lib/feedbase/html_simplifier.rb', line 77
# Builds a listener with no open tags and a single empty output chunk,
# so the callbacks that append to the last chunk always have a target.
def initialize
  # Stack of tag names that are currently open.
  @nested_tags = []
  # Output chunks, in document order.
  @content = [""]
end
Instance Method Details
#path ⇒ Object
156 157 158 |
# File 'lib/feedbase/html_simplifier.rb', line 156
# Describes the current position in the document as the names of all
# open tags, outermost first, separated by slashes.
#
# @return [String] e.g. "div/p/a"; empty when no tag is open
def path
  open_tags = @nested_tags
  open_tags.join('/')
end
#result ⇒ Object
82 83 84 85 86 87 |
# File 'lib/feedbase/html_simplifier.rb', line 82
# Assembles the simplified markup collected so far.
#
# Every chunk goes through strip_empty_tags twice so that an empty tag
# nested inside another tag (e.g. "<p><b></b></p>") is removed together
# with the wrapper it leaves empty. This is not foolproof (deeper nesting
# survives) but good enough for now. Chunks that end up blank are dropped.
#
# @return [String] the remaining chunks separated by blank lines
def result
  cleaned = @content.map { |line| strip_empty_tags(strip_empty_tags(line).strip) }
  cleaned.reject(&:empty?).join("\n\n")
end
#start_of_block? ⇒ Boolean
152 153 154 |
# File 'lib/feedbase/html_simplifier.rb', line 152
# Tells whether the innermost open tag is one of BLOCK_TAGS.
#
# @return [Boolean] false when no tag is open
def start_of_block?
  innermost = @nested_tags.last
  BLOCK_TAGS.include?(innermost)
end
#strip_empty_tags(line) ⇒ Object
89 90 91 |
# File 'lib/feedbase/html_simplifier.rb', line 89
# Removes elements that contain nothing but optional whitespace, such as
# "<p> </p>" or "<a href='x'></a>". Only one level is removed per call:
# a wrapper that becomes empty as a result is left for a second pass.
#
# @param line [String] markup to clean
# @return [String] a new string; +line+ itself is not modified
def strip_empty_tags(line)
  # \1 ties the closing tag to the name captured from the opening tag.
  line.gsub(%r{<(\w+)[^>]*>\s*</\1>}, '')
end
#tag_end(name) ⇒ Object
123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
# File 'lib/feedbase/html_simplifier.rb', line 123
# Stream callback for a closing tag.
#
# Pops the tag stack, then emits the closing markup for the tags this
# listener knows about. Header tags are all closed as UNIFORM_HEADER_TAG
# and BLOCK_TAGS are all closed as </p>; any other tag produces no output.
#
# @param name [String] name of the tag being closed
def tag_end(name)
  @nested_tags.pop

  case name
  when 'a'
    @content.last << "</a>"
  when *HEADER_TAGS
    @content.last << "</#{UNIFORM_HEADER_TAG}>"
  when 'blockquote'
    # Goes into a chunk of its own rather than onto the current one.
    @content.push('</blockquote>')
  when 'ul', 'ol', 'dl', 'strong', 'em'
    @content.last << "</#{name}>"
  when 'li', 'dt', 'dd'
    # List items are closed with a leading space.
    @content.last << " </#{name}>"
  when *BLOCK_TAGS
    @content.last << "</p>"
  when 'pre'
    @content.last << "</pre>"
  end
end
#tag_start(name, attrs) ⇒ Object
93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
# File 'lib/feedbase/html_simplifier.rb', line 93
# Stream callback for an opening tag.
#
# Pushes the tag onto the stack, then emits simplified opening markup.
# Inline tags are appended to the current chunk; headers, line breaks,
# block quotes, lists, block tags and <pre> begin a new chunk. Header tags
# are all opened as UNIFORM_HEADER_TAG and BLOCK_TAGS as <p>; any other tag
# produces no output.
#
# @param name [String] name of the tag being opened
# @param attrs [Hash] attribute values keyed by attribute name
def tag_start(name, attrs)
  @nested_tags.push(name)

  case name
  when 'a'
    # Only href is carried over, which drops every other attribute
    # (style included).
    # NOTE(review): href is interpolated unescaped inside single quotes;
    # confirm the values cannot contain a quote character.
    @content.last << "<a href='#{attrs['href']}'>"
  when 'img'
    # Images become a text placeholder, and only when an alt is present.
    alt = attrs['alt']
    if alt
      label = alt.strip.empty? ? 'image ' : "image:#{alt} "
      @content.last << label
    end
  when *HEADER_TAGS
    @content.push("<#{UNIFORM_HEADER_TAG}>")
  when 'br'
    # NOTE(review): this branch used to carry a "skip" remark although a
    # <br/> chunk is emitted; confirm which of the two is intended.
    @content.push("<br/>")
  when 'blockquote', 'ul', 'ol', 'dl'
    @content.push("<#{name}>")
  when 'li', 'dt', 'dd'
    # List items are opened with a leading space.
    @content.last << " <#{name}>"
  when 'strong', 'em'
    @content.last << "<#{name}>"
  when *BLOCK_TAGS
    @content.push("<p>")
  when 'pre'
    @content.push("<pre>")
  end
end
#text(text) ⇒ Object
145 146 147 148 149 150 |
# File 'lib/feedbase/html_simplifier.rb', line 145
# Stream callback for character data: appends +text+ to the current
# chunk, ignoring text that is empty or consists only of whitespace.
#
# @param text [String] character data reported by the parser
def text(text)
  # \A and \z anchor the whole string. The earlier pattern started with
  # \a, which is the BEL character in a Ruby regexp rather than an anchor,
  # so whitespace-only text was not being skipped.
  return if text =~ /\A\s*\z/ # probably slow, but ok for now

  @content[-1] << text
end