Class: Feedbase::HtmlSimplifier

Inherits:
Object
  • Object
show all
Includes:
FileUtils::Verbose
Defined in:
lib/feedbase/html_simplifier.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(html, orig_encoding) ⇒ HtmlSimplifier

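Takes an HTML string and its original encoding. The HTML is pre-cleaned, run through tidy, and parsed; the simplified output is then available via #result.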



24
25
26
27
28
# File 'lib/feedbase/html_simplifier.rb', line 24

# Builds the simplifier and computes the simplified output right away.
#
# The pipeline is: pre_cleanup -> tidy -> parse. Afterwards every substring
# of the form "<http...>" (an angle-bracketed token starting with "http") is
# removed from the parsed text, and the outcome is stored in @result.
#
# html          - markup to simplify.
# orig_encoding - stored in @orig_encoding and later forwarded by #tidy.
def initialize(html, orig_encoding)
  # Must be set first: #tidy reads @orig_encoding.
  @orig_encoding = orig_encoding
  cleaned = pre_cleanup(html)
  @xml = tidy(cleaned)
  parsed = parse
  @result = parsed.gsub(/<http[^>]+>/, "")
end

Instance Attribute Details

#result ⇒ Object (readonly)

Returns the value of attribute result.



21
22
23
# File 'lib/feedbase/html_simplifier.rb', line 21

# Reader for @result, which the constructor assigns after parsing.
def result
  @result
end

Class Method Details

.tidy(html, orig_encoding) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/feedbase/html_simplifier.rb', line 41

# Pipes +html+ through the external `tidy` command and returns whatever tidy
# writes to its standard output.
#
# The fragment is placed inside the <body> of a minimal XHTML 1.0
# Transitional document before being sent to tidy. tidy is run with -utf8
# and -asxml, and its standard error is discarded (2>/dev/null).
#
# html          - markup fragment to wrap and hand to tidy.
# orig_encoding - accepted for interface compatibility; not used here.
#
# Earlier invocations, kept for reference:
#   Open3.popen3("tidy -q -n -wrap 120 -asxml -latin1")
#   IO.popen("tidy -q -n -wrap 120 -asxml -latin1", "r+")
#   IO.popen("tidy -q -wrap 120 -raw -asxml ", "r+")   # if from latin1
def self.tidy(html, orig_encoding)
  # <<- keeps the leading whitespace of each content line, so the payload
  # sent to tidy is indented by two spaces per line.
  document = <<-END
  <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
  <html xmlns="http://www.w3.org/1999/xhtml">
  <head><title></title></head><body>#{html}</body></html>
  END

  command = "tidy -q -wrap 120 -n -utf8 -asxml 2>/dev/null"

  # IO.popen with a block returns the block's value: tidy's full stdout.
  IO.popen(command, "r+") do |tidy_io|
    tidy_io.puts document
    # Close our write end so tidy sees EOF and emits its output.
    tidy_io.close_write
    tidy_io.read
  end
end

Instance Method Details

#parse ⇒ Object



30
31
32
33
34
# File 'lib/feedbase/html_simplifier.rb', line 30

# Stream-parses @xml with a fresh FeedHtmlListener and returns the
# listener's result followed by two newlines.
#
# The listener is also kept in @listener.
def parse
  listener = FeedHtmlListener.new
  @listener = listener
  REXML::Document.parse_stream(@xml, listener)
  listener.result + "\n\n"
end

#pre_cleanup(html) ⇒ Object



36
37
38
39
# File 'lib/feedbase/html_simplifier.rb', line 36

# Strips every empty "<o:p></o:p>" element from +html+ before it is tidied.
#
# Returns a new String; the argument is left untouched, so frozen strings
# (e.g. literals under `# frozen_string_literal: true`) are accepted and the
# caller's copy is never modified behind its back.
#
# html - markup String to clean.
def pre_cleanup(html)
  html.gsub("<o:p></o:p>", "")
end

#tidy(html) ⇒ Object



62
63
64
# File 'lib/feedbase/html_simplifier.rb', line 62

# Instance-level convenience wrapper: forwards +html+ together with the
# stored @orig_encoding to the class-level tidy method and returns its value.
def tidy(html)
  self.class.tidy(html, @orig_encoding)
end