Class: Readability::Document

Inherits:

Object

Object
Readability::Document

show all

Defined in: lib/pismo/readability.rb

Constant Summary collapse

TEXT_LENGTH_THRESHOLD =

RETRY_LENGTH =

REGEXES =

{
    # Patterns keyed by purpose. Of the methods visible in this listing, only
    # remove_unlikely_candidates!, class_weight and
    # transform_misused_divs_into_paragraphs! read from this table; the other
    # entries are not referenced by any method shown here.

    # Tested against an element's concatenated class+id; a match marks the
    # element for removal in remove_unlikely_candidates!.
    :unlikelyCandidatesRe => /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i,
    # Veto for the pattern above: a class+id that also matches this is kept.
    :okMaybeItsACandidateRe => /and|article|body|column|main/i,
    # class/id values matching this earn +25 in class_weight.
    :positiveRe => /article|body|content|entry|hentry|page|pagination|post|story|text/i,
    # class/id values matching this cost -25 in class_weight.
    :negativeRe => /combx|comment|contact|foot|box_wrap|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags/i,
    # Opening tags that stop a <div> from being renamed to <p> in
    # transform_misused_divs_into_paragraphs!.
    :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
    # Two or more consecutive <br> tags, optionally separated by whitespace.
    :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
    # Opening or closing <font> tags.
    :replaceFontsRe => /<(\/?)font[^>]*>/i,
    # Whitespace at the start or end of a line.
    :trimRe => /^\s+|\s+$/,
    # Runs of two or more whitespace characters.
    :normalizeRe => /\s{2,}/,
    # One or more <br> tags, each optionally followed by whitespace or &nbsp.
    :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
    # http:// URLs on youtube.com or vimeo.com (with or without www.).
    :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
}

Instance Attribute Summary collapse

#html ⇒ Object

Returns the value of attribute html.
#options ⇒ Object

Returns the value of attribute options.

Instance Method Summary collapse

Constructor Details

#initialize(input, options = {}) ⇒ `Document`

Returns a new instance of Document.

# File 'lib/pismo/readability.rb', line 26

# Builds a document around the given markup and parses it straight away.
#
# input   - raw markup; kept in @input so make_html can parse it.
# options - Hash of settings exposed through #options (default: {}).
def initialize(input, options = {})
  @options = options
  @input = input
  make_html
end

Instance Attribute Details

#html ⇒ `Object`

Returns the value of attribute html.



24
25
26

# File 'lib/pismo/readability.rb', line 24

# Reader for @html, the parsed document that make_html assigns.
def html
  @html
end

#options ⇒ `Object`

Returns the value of attribute options.



24
25
26

# File 'lib/pismo/readability.rb', line 24

# Reader for @options, the settings hash handed to the constructor.
def options
  @options
end

Instance Method Details

#class_weight(e) ⇒ `Object`

# File 'lib/pismo/readability.rb', line 164

# Scores an element from its class and id attributes.
#
# Each attribute that is present and non-empty is checked independently:
# a match against REGEXES[:negativeRe] subtracts 25 and a match against
# REGEXES[:positiveRe] adds 25 (both can apply to the same value).
#
# e - object answering e[:class] and e[:id].
#
# Returns the summed weight.
def class_weight(e)
  [:class, :id].inject(0) do |total, attr|
    value = e[attr]
    next total unless value && value != ""

    total -= 25 if value =~ REGEXES[:negativeRe]
    total += 25 if value =~ REGEXES[:positiveRe]
    total
  end
end

#content(remove_unlikely_candidates = true) ⇒ `Object`

# File 'lib/pismo/readability.rb', line 50

# Extracts the main article from the parsed document as a cleaned string.
#
# Steps: drop <script>/<style>, optionally prune unlikely candidates, turn
# misused <div>s into <p>s, score paragraphs, pick the best candidate,
# assemble the article and sanitize it, then tidy whitespace and strip
# HTML comments from the resulting string.
#
# remove_unlikely_candidates - when true, prune unlikely elements first; if
#   the assembled article's text then comes out shorter than
#   options[:retry_length] (or RETRY_LENGTH), the input is re-parsed and the
#   extraction is repeated once without pruning.
#
# Returns the cleaned article String.
def content(remove_unlikely_candidates = true)
  @html.css("script, style").each { |node| node.remove }

  remove_unlikely_candidates! if remove_unlikely_candidates
  transform_misused_divs_into_paragraphs!

  min_length = options[:min_text_length] || TEXT_LENGTH_THRESHOLD
  candidates = score_paragraphs(min_length)
  article = get_article(candidates, select_best_candidate(candidates))

  # Whitespace-only lines, runs of blanks/tabs, leading whitespace and
  # HTML comments are cleaned up in that order.
  cleaned_article = sanitize(article, candidates, options).
    gsub(/^\s+\n/, "\n").
    gsub(/[\ \t]+/, ' ').
    gsub(/^\s+/, '').
    gsub(/\<\!\-\-.*?\-\-\>/m, '')

  return cleaned_article unless remove_unlikely_candidates
  return cleaned_article unless article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)

  # Too little text survived pruning: start over from the raw input.
  make_html
  content(false)
end

#debug(str) ⇒ `Object`



204
205
206

# File 'lib/pismo/readability.rb', line 204

# Writes str to standard output, but only when options[:debug] is truthy.
#
# Returns nil in either case.
def debug(str)
  return unless options[:debug]

  puts str
end

#get_article(candidates, best_candidate) ⇒ `Object`

# File 'lib/pismo/readability.rb', line 71

# Collects the best candidate and its related siblings into a new <div>.
#
# A sibling of the best candidate is kept when it is the best candidate
# itself, when its candidate score reaches the sibling threshold (the larger
# of 10 and 20% of the best score), or when it is a <p> that is either long
# (> 80 chars) with link density below 0.25, or short (< 80 chars) with no
# links and a sentence-ending period.
#
# candidates     - Hash of node => { :content_score, :elem }.
# best_candidate - Hash with :elem and :content_score; :elem may be nil.
#
# Returns the new <div> node (empty when best_candidate[:elem] is nil).
def get_article(candidates, best_candidate)
  threshold = [10, best_candidate[:content_score] * 0.2].max
  output = Nokogiri::XML::Node.new('div', @html)

  top = best_candidate[:elem]
  return output unless top

  top.parent.children.each do |sibling|
    scored = candidates[sibling]
    keep = sibling == top || (scored && scored[:content_score] >= threshold)

    # Paragraphs get a second chance based on their length and link density.
    if sibling.name.downcase == "p"
      density = get_link_density(sibling)
      text = sibling.text
      size = text.length

      long_prose = size > 80 && density < 0.25
      short_sentence = size < 80 && density == 0 && text =~ /\.( |$)/
      keep = true if long_prose || short_sentence
    end

    next unless keep

    # Anything other than <div>/<p> is renamed so the output holds only those.
    sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
    output << sibling
  end

  output
end

#get_link_density(elem) ⇒ `Object`

# File 'lib/pismo/readability.rb', line 120

# Computes the share of an element's text that sits inside <a> descendants.
#
# elem - object answering #text and #css("a") (each link answering #text).
#
# Returns a Float: combined link text length divided by total text length.
# An element with no text yields 0.0. Dividing 0 by 0.0 would give NaN,
# which then poisons every score it is multiplied into and makes the
# candidate sort fail on comparison.
def get_link_density(elem)
  text_length = elem.text.length
  return 0.0 if text_length.zero?

  link_length = elem.css("a").map { |link| link.text }.join("").length
  link_length / text_length.to_f
end

#make_html ⇒ `Object`



32
33
34

# File 'lib/pismo/readability.rb', line 32

# (Re)parses @input with Nokogiri::HTML and stores the result in @html,
# replacing any previously parsed document. Called from the constructor and
# again by #content before it retries extraction.
def make_html
  # NOTE(review): the trailing comment looks like a disabled explicit
  # url/encoding argument list (nil, 'UTF-8') — confirm before re-enabling.
  @html = Nokogiri::HTML(@input) #, nil, 'UTF-8')
end

#remove_unlikely_candidates! ⇒ `Object`

# File 'lib/pismo/readability.rb', line 208

# Removes elements whose class/id suggest they are not article content.
#
# For every element, class and id are concatenated into one string. The
# element is removed when that string matches REGEXES[:unlikelyCandidatesRe],
# does not match REGEXES[:okMaybeItsACandidateRe], and the element is not
# <body>. Each removal is reported through #debug.
def remove_unlikely_candidates!
  @html.css("*").each do |elem|
    signature = "#{elem[:class]}#{elem[:id]}"

    next unless signature =~ REGEXES[:unlikelyCandidatesRe]
    next if signature =~ REGEXES[:okMaybeItsACandidateRe]
    next if elem.name.downcase == 'body'

    debug("Removing unlikely candidate - #{signature}")
    elem.remove
  end
end

#sanitize(node, candidates, options = {}) ⇒ `Object`

# File 'lib/pismo/readability.rb', line 239

# Strips non-article markup from node (mutating it) and serialises the result.
#
# node       - article node to clean; its descendants are removed or replaced
#              in place.
# candidates - Hash of node => { :content_score, ... }; consulted for the
#              content score of each <table>/<ul>/<div>.
# options    - Hash; only :min_text_length is read from this parameter.
#              NOTE(review): the whitelist settings (:tags, :attributes) are
#              read from @options instead, not from this argument — confirm
#              that the mix is intentional.
#
# Returns the node's HTML as a String with line breaks collapsed to "\n",
# runs of tabs/spaces collapsed to one space, and "&nbsp;" turned into " ".
#
# NOTE(review): IS_RUBY19 is not defined in this listing; it is presumably a
# Ruby-version flag that enables the ASCII-8BIT fallbacks below.
def sanitize(node, candidates, options = {})
  # Drop headings that look like boilerplate (negative class/id weight) or
  # that are mostly link text.
  node.css("h1, h2, h3, h4, h5, h6").each do |header|
    header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
  end

  # Forms and embedded objects are always removed.
  node.css("form, object, iframe, embed").each do |elem|
    elem.remove
  end

  # Remove empty <p> tags
  node.css("p").each do |elem|
    elem.remove if elem.content.strip.empty?
  end

  # Remove empty <div> tags
  node.css("div").each do |elem|
    elem.remove if elem.content.strip.empty?
  end
  
  

  # Conditionally clean <table>s, <ul>s, and <div>s
  node.css("table, ul, div").each do |el|
    weight = class_weight(el)
    # Elements that were never scored count as 0.
    content_score = candidates[el] ? candidates[el][:content_score] : 0
    name = el.name.downcase

    if weight + content_score < 0
      el.remove
      debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
    # Only elements with fewer than 10 commas go through the heuristics below;
    # under IS_RUBY19 the text is counted as raw bytes.
    elsif (IS_RUBY19 && el.text.force_encoding("ASCII-8BIT").count(",") < 10) || (!IS_RUBY19 && el.text.count(",") < 10)
      # Number of descendants of each kind, keyed by tag name.
      counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
      # Handicap the <li> count by 100 so the "more <li>s than <p>s" rule
      # below only fires for elements with a very large number of list items.
      counts["li"] -= 100

      content_length = el.text.length
      link_density = get_link_density(el)
      to_remove = false
      reason = ""

      # First matching rule wins; reason is only used for the debug message.
      if counts["img"] > counts["p"]
        reason = "too many images"
        to_remove = true
      elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
        reason = "more <li>s than <p>s"
        to_remove = true
      elsif counts["input"] > (counts["p"] / 3).to_i
        reason = "less than 3x <p>s than <input>s"
        to_remove = true
      # Short text is tolerated only when the element holds one or two images.
      elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
        reason = "too short a content length without a single image"
        to_remove = true
      # The allowed link density depends on the class/id weight: 0.2 below
      # a weight of 25, 0.5 at or above it.
      elsif weight < 25 && link_density > 0.2
        reason = "too many links for its weight (#{weight})"
        to_remove = true
      elsif weight >= 25 && link_density > 0.5
        reason = "too many links for its weight (#{weight})"
        to_remove = true
      elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
        reason = "<embed>s with too short a content length, or too many <embed>s"
        to_remove = true
      end

      if to_remove
        debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
        el.remove
      end
    end
  end

  # We'll sanitize all elements using a whitelist
  # (defaults to <p> only when @options[:tags] is not set)
  whitelist = @options[:tags] || %w[p]

  # Use a hash for speed (don't want to make a million calls to include?)
  whitelist = Hash[ whitelist.zip([true] * whitelist.size) ]

  # Visit the root node itself as well as every descendant.
  ([node] + node.css("*")).each do |el|

    # If element is in whitelist, delete all its attributes
    # (except those named in @options[:attributes], when given)
    if whitelist[el.node_name]
      el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }

      # Otherwise, replace the element with its contents
    else
      begin
        el.swap(el.text)
      rescue => e
        # Retry with the text re-tagged as binary; re-raise when IS_RUBY19 is
        # false.
        raise e unless IS_RUBY19
        el.swap(el.text.force_encoding("ASCII-8BIT"))
      end
    end

  end

  # Get rid of duplicate whitespace
  begin
    node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
  rescue => e
    # Same binary-encoding fallback as above for the final serialisation.
    raise e unless IS_RUBY19
    node.to_html.force_encoding("ASCII-8BIT").gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
  end
end

#score_node(elem) ⇒ `Object`

# File 'lib/pismo/readability.rb', line 189

# Builds the initial candidate record for an element.
#
# The starting score is the element's class/id weight plus a per-tag
# adjustment: div +5, blockquote +3, form -3, th -5, anything else 0.
#
# elem - element answering #name (and whatever class_weight needs).
#
# Returns a Hash with :content_score and :elem.
def score_node(elem)
  tag_adjustment = { "div" => 5, "blockquote" => 3, "form" => -3, "th" => -5 }
  content_score = class_weight(elem) + tag_adjustment.fetch(elem.name.downcase, 0)

  { :content_score => content_score, :elem => elem }
end

#score_paragraphs(min_text_length) ⇒ `Object`

# File 'lib/pismo/readability.rb', line 126

# Scores the parents and grandparents of every <p>/<td> in the document.
#
# Each paragraph with at least min_text_length characters earns 1 point,
# plus one per comma-separated segment, plus one per 100 characters (capped
# at 3). The parent receives the full amount and the grandparent half of it.
# Finally every candidate's score is scaled by (1 - link density).
#
# min_text_length - paragraphs shorter than this are ignored.
#
# Returns a Hash of node => { :content_score, :elem }.
def score_paragraphs(min_text_length)
  # Points earned from the text itself: comma segments plus a length bonus.
  text_points = lambda do |text|
    text.split(',').length + [(text.length / 100).to_i, 3].min
  end

  candidates = {}

  @html.css("p,td").each do |elem|
    parent_node = elem.parent
    grand_parent_node = parent_node.parent if parent_node.respond_to?(:parent)
    inner_text = elem.text

    # Paragraphs below the minimum length don't count at all.
    next if inner_text.length < min_text_length

    candidates[parent_node] ||= score_node(parent_node)
    candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node

    earned = begin
      text_points.call(inner_text)
    rescue => e
      # Retry with the text re-tagged as binary when IS_RUBY19 is set.
      raise e unless IS_RUBY19
      inner_text.force_encoding('ASCII-8BIT')
      text_points.call(inner_text)
    end
    content_score = 1 + earned

    candidates[parent_node][:content_score] += content_score
    candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node
  end

  # Scale the final scores by link density. Good content should have a
  # relatively small link density (5% or less) and be mostly unaffected.
  candidates.each_pair do |node, entry|
    entry[:content_score] *= 1 - get_link_density(node)
  end

  candidates
end

#select_best_candidate(candidates) ⇒ `Object`

# File 'lib/pismo/readability.rb', line 106

# Picks the highest-scoring candidate.
#
# candidates - Hash whose values are { :content_score, :elem } records.
#
# The five best-ranked candidates are reported through #debug. When there
# are no candidates at all, the document's first <body> is returned with a
# score of 0.
#
# Returns the winning candidate Hash.
def select_best_candidate(candidates)
  ranked = candidates.values.sort { |left, right| right[:content_score] <=> left[:content_score] }

  debug("Top 5 canidates:")
  ranked.first(5).each do |entry|
    debug("Candidate #{entry[:elem].name}##{entry[:elem][:id]}.#{entry[:elem][:class]} with score #{entry[:content_score]}")
  end

  ranked.first || { :elem => @html.css("body").first, :content_score => 0 }
end

#transform_misused_divs_into_paragraphs! ⇒ `Object`

# File 'lib/pismo/readability.rb', line 218

# Renames <div>s that hold no block-level markup to <p>.
#
# A <div> is renamed when its inner HTML contains none of the opening tags
# listed in REGEXES[:divToPElementsRe]; each rename is reported through
# #debug. Under IS_RUBY19 the inner HTML is matched as an ASCII-8BIT copy.
#
# Non-<div> elements are left untouched (an earlier idea of wrapping their
# bare text nodes in <p> tags is not active).
def transform_misused_divs_into_paragraphs!
  @html.css("*").each do |elem|
    next unless elem.name.downcase == "div"

    markup = elem.inner_html
    markup = markup.dup.force_encoding('ASCII-8BIT') if IS_RUBY19
    next if markup =~ REGEXES[:divToPElementsRe]

    debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p")
    elem.name = "p"
  end
end

Class: Readability::Document

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input, options = {}) ⇒ Document

Instance Attribute Details

#html ⇒ Object

#options ⇒ Object

Instance Method Details

#class_weight(e) ⇒ Object

#content(remove_unlikely_candidates = true) ⇒ Object

#debug(str) ⇒ Object

#get_article(candidates, best_candidate) ⇒ Object

#get_link_density(elem) ⇒ Object

#make_html ⇒ Object

#remove_unlikely_candidates! ⇒ Object

#sanitize(node, candidates, options = {}) ⇒ Object

#score_node(elem) ⇒ Object

#score_paragraphs(min_text_length) ⇒ Object

#select_best_candidate(candidates) ⇒ Object

#transform_misused_divs_into_paragraphs! ⇒ Object

#initialize(input, options = {}) ⇒ `Document`

#html ⇒ `Object`

#options ⇒ `Object`

#class_weight(e) ⇒ `Object`

#content(remove_unlikely_candidates = true) ⇒ `Object`

#debug(str) ⇒ `Object`

#get_article(candidates, best_candidate) ⇒ `Object`

#get_link_density(elem) ⇒ `Object`

#make_html ⇒ `Object`

#remove_unlikely_candidates! ⇒ `Object`

#sanitize(node, candidates, options = {}) ⇒ `Object`

#score_node(elem) ⇒ `Object`

#score_paragraphs(min_text_length) ⇒ `Object`

#select_best_candidate(candidates) ⇒ `Object`

#transform_misused_divs_into_paragraphs! ⇒ `Object`