Class: Feedbase::HtmlSimplifier
- Inherits:
-
Object
- Object
- Feedbase::HtmlSimplifier
- Includes:
- FileUtils::Verbose
- Defined in:
- lib/feedbase/html_simplifier.rb
Instance Attribute Summary collapse
-
#result ⇒ Object
readonly
Returns the value of attribute result.
Class Method Summary collapse
Instance Method Summary collapse
-
#initialize(html, orig_encoding) ⇒ HtmlSimplifier
constructor
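Takes the HTML markup to simplify and its original encoding (the generated summary "Takes feed data as hash" does not match the signature).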
- #parse ⇒ Object
- #pre_cleanup(html) ⇒ Object
- #tidy(html) ⇒ Object
Constructor Details
#initialize(html, orig_encoding) ⇒ HtmlSimplifier
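Takes the HTML markup to simplify (`html`) and its original encoding (`orig_encoding`). The original description, "Takes feed data as hash. Generate this with FeedParser", does not match this signature; the HTML passed in presumably comes from feed data produced by FeedParser.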
24 25 26 27 28 |
# File 'lib/feedbase/html_simplifier.rb', line 24
# Runs the whole simplification pipeline and stores the outcome in @result.
#
# The markup is passed through #pre_cleanup, then #tidy, and the tidied
# document is kept in @xml for #parse to consume.
#
# @param html [String] markup to simplify
# @param orig_encoding [Object] stored in @orig_encoding and later handed to
#   the class-level tidy by #tidy
def initialize(html, orig_encoding)
  @orig_encoding = orig_encoding
  cleaned = pre_cleanup(html)
  @xml = tidy(cleaned)
  simplified = parse
  # Remove any angle-bracketed token that begins with "http" (e.g. "<http://...>").
  @result = simplified.gsub(/<http[^>]+>/, "")
end
Instance Attribute Details
#result ⇒ Object (readonly)
Returns the value of attribute result.
21 22 23 |
# File 'lib/feedbase/html_simplifier.rb', line 21
# Reader for the value the constructor stores in @result.
#
# @return [Object] the current value of @result
def result
  @result
end
Class Method Details
.tidy(html, orig_encoding) ⇒ Object
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
# File 'lib/feedbase/html_simplifier.rb', line 41
# Wraps +html+ in a minimal XHTML 1.0 Transitional page, pipes that page
# through the external `tidy` command-line program, and returns whatever tidy
# writes to its standard output.
#
# Commented-out alternatives in the original source used Open3.popen3 and
# tidy's -latin1 / -raw flags; the active command passes -utf8.
# NOTE(review): an older comment said the input is assumed to be Latin-1,
# which does not match the -utf8 flag -- confirm which one holds.
#
# @param html [#to_s] markup interpolated verbatim into the <body> element
# @param orig_encoding [Object] accepted but not referenced by this method
# @return [String] everything read from tidy's stdout
def self.tidy(html, orig_encoding)
  # The shell redirect discards tidy's stderr, so its diagnostics are never seen.
  command = "tidy -q -wrap 120 -n -utf8 -asxml 2>/dev/null"
  document = <<-END
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title></title></head><body>#{html}</body></html>
  END
  IO.popen(command, "r+") do |io|
    io.puts document
    # Close our write end so tidy sees end-of-input before we read its output.
    io.close_write
    io.read
  end
end
Instance Method Details
#parse ⇒ Object
30 31 32 33 34 |
# File 'lib/feedbase/html_simplifier.rb', line 30
# Stream-parses the tidied document held in @xml with a fresh
# FeedHtmlListener (also stored in @listener) and returns the listener's
# result followed by two newlines.
#
# @return [String] presumably a String, since the constructor calls #gsub on
#   it -- FeedHtmlListener#result is defined elsewhere, so confirm there
def parse
  handler = FeedHtmlListener.new
  @listener = handler
  REXML::Document.parse_stream(@xml, handler)
  handler.result + "\n\n"
end
#pre_cleanup(html) ⇒ Object
36 37 38 39 |
# File 'lib/feedbase/html_simplifier.rb', line 36
# Strips every empty "<o:p></o:p>" element from the markup before it is
# handed to tidy.
#
# Uses the non-mutating String#gsub so the caller's string is left untouched
# and frozen strings are accepted; the in-place gsub! variant altered the
# argument and raised FrozenError on frozen input.
#
# @param html [String] markup to clean
# @return [String] a new string with all "<o:p></o:p>" occurrences removed
def pre_cleanup(html)
  html.gsub("<o:p></o:p>", "")
end
#tidy(html) ⇒ Object
62 63 64 |
# File 'lib/feedbase/html_simplifier.rb', line 62
# Instance-level convenience wrapper: forwards +html+ together with the
# encoding stored in @orig_encoding to the class-level tidy method.
#
# @param html [Object] markup passed through unchanged
# @return [Object] whatever the class-level tidy returns
def tidy(html)
  self.class.tidy(html, @orig_encoding)
end