Class: Feedbase::FeedHtmlListener

Inherits:
Object
  • Object
show all
Includes:
REXML::StreamListener
Defined in:
lib/feedbase/html_simplifier.rb

Constant Summary collapse

# Tag names intended to be stripped; not referenced by the methods shown
# here (NOTE(review): confirm usage elsewhere in html_simplifier.rb).
STRIP_TAGS =
%w[ body font ].freeze
# Tags that tag_start opens as a new "<p>" entry and tag_end closes with "</p>".
BLOCK_TAGS =
%w[ p div ].freeze
# Heading tags; all of them are rewritten to UNIFORM_HEADER_TAG.
HEADER_TAGS =
%w[ h1 h2 h3 h4 h5 h6 ].freeze
# The single heading tag emitted in place of any tag in HEADER_TAGS.
UNIFORM_HEADER_TAG =
"h4".freeze

Instance Method Summary collapse

Constructor Details

#initialize ⇒ FeedHtmlListener

Returns a new instance of FeedHtmlListener.



77
78
79
80
# File 'lib/feedbase/html_simplifier.rb', line 77

# Sets up an empty tag stack and a content list holding one empty entry,
# so handlers that append to the last entry always have one to write to.
def initialize
  @content = ['']
  @nested_tags = []
end

Instance Method Details

#path ⇒ Object



156
157
158
# File 'lib/feedbase/html_simplifier.rb', line 156

# Describes the currently open tags as a slash-separated string,
# outermost tag first (e.g. "div/p/em").
#
# @return [String] empty when no tag is open
def path
  open_tags = @nested_tags
  open_tags.join('/')
end

#result ⇒ Object



82
83
84
85
86
87
# File 'lib/feedbase/html_simplifier.rb', line 82

# Assembles the simplified HTML collected so far.
#
# Each content entry has its empty tags removed and its surrounding
# whitespace stripped; entries that end up empty are dropped and the rest
# are joined with a blank line between them.
#
# @return [String] the simplified markup, "" when nothing remains
def result
  cleaned = @content.map do |line|
    current = strip_empty_tags(line).strip
    # Removing an empty tag can leave its parent empty (<p><em></em></p>),
    # so keep stripping until a pass changes nothing. Every effective pass
    # shortens the string, so the loop always terminates.
    loop do
      stripped = strip_empty_tags(current)
      break current if stripped == current

      current = stripped
    end
  end

  cleaned.reject(&:empty?).join("\n\n")
end

#start_of_block? ⇒ Boolean

Returns:

  • (Boolean)


152
153
154
# File 'lib/feedbase/html_simplifier.rb', line 152

# Reports whether the innermost open tag is one of BLOCK_TAGS.
#
# @return [Boolean] false when no tag is open
def start_of_block?
  innermost = @nested_tags.last
  BLOCK_TAGS.include?(innermost)
end

#strip_empty_tags(line) ⇒ Object



89
90
91
# File 'lib/feedbase/html_simplifier.rb', line 89

# Removes every element whose body is empty or whitespace-only, such as
# "<p></p>" or "<a href='x'> </a>". It makes a single pass, so a parent
# that only becomes empty through this removal is left in place.
#
# @param line [String] markup to clean
# @return [String] a new string; +line+ itself is not modified
def strip_empty_tags(line)
  empty_element = %r{<(\w+)[^>]*>\s*</\1>}
  line.gsub(empty_element, '')
end

#tag_end(name) ⇒ Object



123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# File 'lib/feedbase/html_simplifier.rb', line 123

# Handles a closing tag: pops the tag stack and emits the simplified
# closing markup for the tags this listener keeps.
#
# Header tags close as UNIFORM_HEADER_TAG, BLOCK_TAGS close as "</p>",
# list items get a two-space prefix, and tags not listed emit nothing.
#
# @param name [String] name of the tag being closed
def tag_end(name)
  @nested_tags.pop

  # blockquote is the one closing tag that becomes its own content entry.
  return @content << '</blockquote>' if name == 'blockquote'

  closing =
    if HEADER_TAGS.include?(name)
      "</#{UNIFORM_HEADER_TAG}>"
    elsif %w[li dt dd].include?(name)
      "  </#{name}>"
    elsif BLOCK_TAGS.include?(name)
      "</p>"
    elsif %w[a ul ol dl strong em pre].include?(name)
      "</#{name}>"
    end

  @content.last << closing if closing
end

#tag_start(name, attrs) ⇒ Object



93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/feedbase/html_simplifier.rb', line 93

# Handles an opening tag: pushes it on the tag stack and emits the
# simplified opening markup for the tags this listener keeps.
#
# Headers, br, blockquote, lists, BLOCK_TAGS and pre start a new content
# entry; a, img, list items and strong/em append to the current entry.
# Tags not listed emit nothing.
#
# @param name [String] name of the tag being opened
# @param attrs [Hash] attribute values keyed by attribute name
def tag_start(name, attrs)
  @nested_tags.push name
  case name 
  when 'a'
    # Only href is carried over, which drops every other attribute
    # (style, class, ...). The value sits inside single quotes, so a
    # literal single quote is escaped to stop it ending the attribute early.
    href = attrs['href'].to_s.gsub("'", '&#39;')
    @content[-1] << "<a href='#{href}'>"
  when 'img'
    # Images are replaced by their alt text; without alt nothing is emitted.
    if attrs['alt']
      text = (attrs['alt'].strip == '') ? 'image ' : "image:#{attrs['alt']} "
      @content[-1] << text
    end
  when *HEADER_TAGS
    @content << "<#{UNIFORM_HEADER_TAG}>" 
  when 'br'
    # Line breaks are kept, as the start of a new content entry.
    @content << "<br/>"
  when 'blockquote'
    @content << "<blockquote>"
  when 'ul', 'ol', 'dl'
    @content << "<#{name}>"
  when 'li', 'dt', 'dd'
    @content[-1] << "  <#{name}>"
  when 'strong', 'em'
    @content[-1] << "<#{name}>"
  when *BLOCK_TAGS
    @content << "<p>"
  when 'pre'
    @content << "<pre>"
  end
end

#text(text) ⇒ Object



145
146
147
148
149
150
# File 'lib/feedbase/html_simplifier.rb', line 145

# Appends character data to the content entry currently being built.
# Text that is empty or made up only of whitespace is ignored.
#
# @param text [String] the text to append
# @return [String, nil] the updated entry, or nil when the text was skipped
def text(text)
  # \A and \z anchor the whole string. The earlier pattern started with \a,
  # which matches the BEL control character rather than start-of-string, so
  # whitespace-only text was never actually skipped.
  return if text =~ /\A\s*\z/

  # probably slow, but ok for now
  @content[-1] << text
end