Class: Readability::Document

Inherits:
Object
  • Object
show all
Defined in:
lib/pismo/readability.rb

Constant Summary collapse

# Default minimum text length; #content and #sanitize fall back to it when
# options[:min_text_length] is not given.
TEXT_LENGTH_THRESHOLD = 25

# Default article length below which #content re-parses the input and retries
# without removing unlikely candidates (overridable via options[:retry_length]).
RETRY_LENGTH = 250

# Patterns used while scoring and cleaning the document. The keys keep their
# original camelCase names because other methods look them up by symbol.
REGEXES = {
    # class/id values that mark an element as probably not article content
    :unlikelyCandidatesRe => /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i,
    # class/id values that rescue an element matched by :unlikelyCandidatesRe
    :okMaybeItsACandidateRe => /and|article|body|column|main/i,
    # class/id values that add weight in #class_weight
    :positiveRe => /article|body|content|entry|hentry|page|pagination|post|story|text/i,
    # class/id values that subtract weight in #class_weight
    :negativeRe => /combx|comment|contact|foot|box_wrap|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags/i,
    # opening tags that stop a <div> from being turned into a <p>
    :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
    :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
    :replaceFontsRe => /<(\/?)font[^>]*>/i,
    :trimRe => /^\s+|\s+$/,
    :normalizeRe => /\s{2,}/,
    :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
    # accepts both http and https video URLs (previously http only)
    :videoRe => /https?:\/\/(www\.)?(youtube|vimeo)\.com/i
}

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input, options = {}) ⇒ Document

Returns a new instance of Document.



26
27
28
29
30
# File 'lib/pismo/readability.rb', line 26

# Stores the raw input and the options hash, then parses the input right
# away through make_html.
#
# @param input [Object] markup handed to Nokogiri::HTML by make_html
# @param options [Hash] settings read later through #options
def initialize(input, options = {})
  @input, @options = input, options
  make_html
end

Instance Attribute Details

#html ⇒ Object

Returns the value of attribute html.



24
25
26
# File 'lib/pismo/readability.rb', line 24

# Reader for @html, the parsed document that make_html assigns.
def html
  @html
end

#options ⇒ Object

Returns the value of attribute options.



24
25
26
# File 'lib/pismo/readability.rb', line 24

# Reader for @options, the options hash given to the constructor.
def options
  @options
end

Instance Method Details

#class_weight(e) ⇒ Object



164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# File 'lib/pismo/readability.rb', line 164

# Weighs an element by its class and id attributes.
#
# Each attribute is judged on its own: a match against REGEXES[:negativeRe]
# subtracts 25 and a match against REGEXES[:positiveRe] adds 25, so one
# attribute matching both patterns contributes nothing. Missing or empty
# attributes are skipped.
#
# @param e [#[]] element answering e[:class] and e[:id]
# @return [Integer] the accumulated weight
def class_weight(e)
  [:class, :id].inject(0) do |total, attribute|
    value = e[attribute]
    next total unless value && value != ""

    total -= 25 if value =~ REGEXES[:negativeRe]
    total += 25 if value =~ REGEXES[:positiveRe]
    total
  end
end

#content(remove_unlikely_candidates = true) ⇒ Object



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/pismo/readability.rb', line 50

# Extracts the article from the parsed document and returns it as cleaned markup.
#
# Steps: drop <script>/<style> nodes, optionally strip unlikely candidates,
# turn leaf <div>s into <p>s, score the paragraphs, pick the best candidate,
# gather it with its related siblings, and sanitize the result. If unlikely
# candidates were stripped and the gathered article text is shorter than
# options[:retry_length] (default RETRY_LENGTH), the input is re-parsed and
# the extraction is repeated once without stripping.
#
# @param remove_unlikely_candidates [Boolean] whether to strip unlikely nodes first
# @return [String] the cleaned article markup
def content(remove_unlikely_candidates = true)
  @html.css("script, style").each { |node| node.remove }

  remove_unlikely_candidates! if remove_unlikely_candidates
  transform_misused_divs_into_paragraphs!

  scored = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
  winner = select_best_candidate(scored)
  article = get_article(scored, winner)

  # sanitize mutates the article node, so the length check below sees the
  # sanitized text, exactly as before.
  result = sanitize(article, scored, options)
  result.gsub!(/^\s+\n/, "\n")
  result.gsub!(/[\ \t]+/, ' ')
  result.gsub!(/^\s+/, '')
  result.gsub!(/\<\!\-\-.*?\-\-\>/m, '')

  too_short = remove_unlikely_candidates &&
              article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
  return result unless too_short

  make_html
  content(false)
end

#debug(str) ⇒ Object



204
205
206
# File 'lib/pismo/readability.rb', line 204

# Prints a diagnostic line with puts, but only when options[:debug] is truthy.
#
# @param str [String] message to print
# @return [nil]
def debug(str)
  return unless options[:debug]

  puts str
end

#get_article(candidates, best_candidate) ⇒ Object



71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/pismo/readability.rb', line 71

# Collects the best candidate and any related siblings into a fresh <div>.
#
# A sibling is kept when it is the best candidate itself, when its candidate
# score reaches the threshold (the larger of 10 and 20% of the best score),
# or when it is a <p> that looks like prose: longer than 80 characters with
# link density under 0.25, or shorter than 80 characters with no links and a
# period followed by a space or end of line. Kept nodes that are neither
# <div> nor <p> are renamed to <div> before being moved into the result.
#
# @param candidates [Hash] node => { :content_score, :elem } as built by score_paragraphs
# @param best_candidate [Hash] the entry chosen by select_best_candidate
# @return [Nokogiri::XML::Node] the assembled <div> (empty if there is no best element)
def get_article(candidates, best_candidate)
  min_sibling_score = [10, best_candidate[:content_score] * 0.2].max
  wrapper = Nokogiri::XML::Node.new('div', @html)

  top = best_candidate[:elem]
  return wrapper unless top

  top.parent.children.each do |node|
    keep = node == top
    scored = candidates[node]
    keep = true if scored && scored[:content_score] >= min_sibling_score

    if node.name.downcase == "p"
      density = get_link_density(node)
      text = node.text
      length = text.length

      long_prose = length > 80 && density < 0.25
      short_sentence = length < 80 && density == 0 && text =~ /\.( |$)/
      keep = true if long_prose || short_sentence
    end

    next unless keep

    node.name = "div" unless %w[div p].include?(node.name.downcase)
    wrapper << node
  end

  wrapper
end


120
121
122
123
124
# File 'lib/pismo/readability.rb', line 120

# Computes the share of an element's text that sits inside <a> descendants.
#
# An element with no text yields 0.0. Dividing 0 by 0.0 would otherwise give
# NaN, which poisons every score it is multiplied into and can make score
# comparisons return nil.
#
# @param elem [#text, #css] element to measure
# @return [Float] link text length divided by total text length, or 0.0 for empty text
def get_link_density(elem)
  text_length = elem.text.length
  return 0.0 if text_length.zero?

  link_length = elem.css("a").map { |link| link.text }.join("").length
  link_length / text_length.to_f
end

#make_html ⇒ Object



32
33
34
# File 'lib/pismo/readability.rb', line 32

# (Re)parses @input with Nokogiri::HTML and stores the result in @html.
# Called from the constructor and again by #content before its retry pass,
# which discards any mutations made to the previous tree.
def make_html
  @html = Nokogiri::HTML(@input) #, nil, 'UTF-8')
end

#remove_unlikely_candidates! ⇒ Object



208
209
210
211
212
213
214
215
216
# File 'lib/pismo/readability.rb', line 208

# Removes elements whose class/id suggest they are not article content.
#
# The class and id values are concatenated; the element is removed when that
# string matches REGEXES[:unlikelyCandidatesRe] and does not match
# REGEXES[:okMaybeItsACandidateRe]. The <html> and <body> elements are never
# removed: dropping either would discard the whole document (previously only
# <body> was protected, so a root like <html class="has-sidebar"> was removed).
def remove_unlikely_candidates!
  @html.css("*").each do |elem|
    next if %w[html body].include?(elem.name.downcase)

    str = "#{elem[:class]}#{elem[:id]}"
    next unless str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe]

    debug("Removing unlikely candidate - #{str}")
    elem.remove
  end
end

#sanitize(node, candidates, options = {}) ⇒ Object



239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
# File 'lib/pismo/readability.rb', line 239

# Cleans the assembled article node in place and returns its markup.
#
# Passes, in order: drop low-value headers, drop form/object/iframe/embed
# elements, drop empty <p>/<div>, conditionally drop <table>/<ul>/<div> that
# look like boilerplate, reduce everything to whitelisted tags, and finally
# collapse whitespace in the serialized HTML.
#
# NOTE(review): the tag whitelist and the kept attributes are read from
# @options, not from the +options+ argument; the argument is only consulted
# for :min_text_length. #content passes the same hash for both.
#
# @param node [Nokogiri::XML::Node] article node to clean (mutated)
# @param candidates [Hash] node => { :content_score, :elem } scores
# @param options [Hash] may carry :min_text_length
# @return [String] the node's HTML with whitespace runs collapsed
def sanitize(node, candidates, options = {})
  # Headers with a negative class/id weight or mostly made of links are dropped.
  node.css("h1, h2, h3, h4, h5, h6").each do |header|
    header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
  end

  node.css("form, object, iframe, embed").each do |elem|
    elem.remove
  end

  # Remove empty <p> tags
  node.css("p").each do |elem|
    elem.remove if elem.content.strip.empty?
  end

  # Remove empty <div> tags
  node.css("div").each do |elem|
    elem.remove if elem.content.strip.empty?
  end
  
  

  # Conditionally clean <table>s, <ul>s, and <div>s
  node.css("table, ul, div").each do |el|
    weight = class_weight(el)
    content_score = candidates[el] ? candidates[el][:content_score] : 0
    name = el.name.downcase

    if weight + content_score < 0
      el.remove
      debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
    # Only elements with fewer than 10 commas get the heuristics below; on
    # Ruby 1.9 the text is counted as raw bytes.
    elsif (IS_RUBY19 && el.text.force_encoding("ASCII-8BIT").count(",") < 10) || (!IS_RUBY19 && el.text.count(",") < 10)
      counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
      # Bias the <li> count down by 100 before it is compared with the <p> count.
      counts["li"] -= 100

      content_length = el.text.length
      link_density = get_link_density(el)
      to_remove = false
      reason = ""

      # The first matching rule decides; the reason is only used for the debug line.
      if counts["img"] > counts["p"]
        reason = "too many images"
        to_remove = true
      elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
        reason = "more <li>s than <p>s"
        to_remove = true
      elsif counts["input"] > (counts["p"] / 3).to_i
        reason = "less than 3x <p>s than <input>s"
        to_remove = true
      elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
        reason = "too short a content length without a single image"
        to_remove = true
      elsif weight < 25 && link_density > 0.2
        reason = "too many links for its weight (#{weight})"
        to_remove = true
      elsif weight >= 25 && link_density > 0.5
        reason = "too many links for its weight (#{weight})"
        to_remove = true
      elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
        reason = "<embed>s with too short a content length, or too many <embed>s"
        to_remove = true
      end

      if to_remove
        debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
        el.remove
      end
    end
  end

  # We'll sanitize all elements using a whitelist
  # (defaults to <p> only when @options[:tags] is not set)
  whitelist = @options[:tags] || %w[p]

  # Use a hash for speed (don't want to make a million calls to include?)
  whitelist = Hash[ whitelist.zip([true] * whitelist.size) ]

  ([node] + node.css("*")).each do |el|

    # If element is in whitelist, delete all its attributes
    # (except those listed in @options[:attributes])
    if whitelist[el.node_name]
      el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }

      # Otherwise, replace the element with its contents
    else
      begin
        el.swap(el.text)
      rescue => e
        # On Ruby 1.9 retry once with the text re-tagged as binary; otherwise re-raise.
        raise e unless IS_RUBY19
        el.swap(el.text.force_encoding("ASCII-8BIT"))
      end
    end

  end

  # Get rid of duplicate whitespace
  begin
    node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
  rescue => e
    # Same Ruby 1.9 fallback: redo the substitutions on the binary-tagged string.
    raise e unless IS_RUBY19
    node.to_html.force_encoding("ASCII-8BIT").gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
  end
end

#score_node(elem) ⇒ Object



189
190
191
192
193
194
195
196
197
198
199
200
201
202
# File 'lib/pismo/readability.rb', line 189

# Builds the initial candidate record for an element.
#
# The starting score is the element's class/id weight plus a tag adjustment:
# +5 for div, +3 for blockquote, -3 for form, -5 for th, nothing otherwise.
#
# @param elem [#name] element to score (also passed to class_weight)
# @return [Hash] { :content_score => Numeric, :elem => elem }
def score_node(elem)
  base = class_weight(elem)
  tag_bonus = { "div" => 5, "blockquote" => 3, "form" => -3, "th" => -5 }
  adjustment = tag_bonus.fetch(elem.name.downcase, 0)

  { :content_score => base + adjustment, :elem => elem }
end

#score_paragraphs(min_text_length) ⇒ Object



126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# File 'lib/pismo/readability.rb', line 126

# Scores the parents and grandparents of every sufficiently long <p>/<td>.
#
# Each qualifying paragraph earns 1 point, plus the number of comma-separated
# pieces in its text, plus one point per 100 characters (capped at 3). The
# parent receives the full amount and the grandparent half. Afterwards every
# candidate's score is multiplied by (1 - link density).
#
# @param min_text_length [Integer] paragraphs with shorter text are ignored
# @return [Hash] node => { :content_score, :elem } for every scored ancestor
def score_paragraphs(min_text_length)
  scored = {}

  @html.css("p,td").each do |paragraph|
    parent = paragraph.parent
    grandparent = parent.respond_to?(:parent) ? parent.parent : nil
    text = paragraph.text

    # Too short to be meaningful content: skip without creating candidates.
    next if text.length < min_text_length

    scored[parent] ||= score_node(parent)
    scored[grandparent] ||= score_node(grandparent) if grandparent

    points = 1

    begin
      points += text.split(',').length
      points += [(text.length / 100).to_i, 3].min
    rescue => e
      # Ruby 1.9 fallback: re-tag the text as binary and count again.
      raise e unless IS_RUBY19
      text.force_encoding('ASCII-8BIT')
      points += text.split(',').length
      points += [(text.length / 100).to_i, 3].min
    end

    scored[parent][:content_score] += points
    scored[grandparent][:content_score] += points / 2.0 if grandparent
  end

  # Link-heavy containers are scaled down; low link density leaves the
  # score almost unchanged.
  scored.each_pair do |node, entry|
    entry[:content_score] *= (1 - get_link_density(node))
  end

  scored
end

#select_best_candidate(candidates) ⇒ Object



106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/pismo/readability.rb', line 106

# Picks the highest-scoring candidate.
#
# Candidates are ordered by descending :content_score and the top five are
# reported through debug. With no candidates at all, the document's first
# <body> element is returned with a score of 0.
#
# @param candidates [Hash] node => { :content_score, :elem }
# @return [Hash] the winning { :content_score, :elem } entry
def select_best_candidate(candidates)
  ranked = candidates.values.sort { |left, right| right[:content_score] <=> left[:content_score] }

  debug("Top 5 canidates:")
  ranked.first(5).each do |entry|
    node = entry[:elem]
    debug("Candidate #{node.name}##{node[:id]}.#{node[:class]} with score #{entry[:content_score]}")
  end

  ranked.first || { :elem => @html.css("body").first, :content_score => 0 }
end

#transform_misused_divs_into_paragraphs! ⇒ Object



218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
# File 'lib/pismo/readability.rb', line 218

# Renames every <div> that contains no block-level markup to <p>.
#
# A <div> counts as "misused" when its inner HTML has no opening tag listed
# in REGEXES[:divToPElementsRe]. Elements other than <div> are left alone
# (wrapping their bare text nodes in <p> is currently disabled).
def transform_misused_divs_into_paragraphs!
  @html.css("*").each do |node|
    next unless node.name.downcase == "div"

    markup = node.inner_html
    # On Ruby 1.9 match against a binary-tagged copy of the markup.
    markup = markup.dup.force_encoding('ASCII-8BIT') if IS_RUBY19
    next if markup =~ REGEXES[:divToPElementsRe]

    debug("Altering div(##{node[:id]}.#{node[:class]}) to p")
    node.name = "p"
  end
end