Class: Pismo::Reader::Cluster

Inherits:

Base

Object
Base
Pismo::Reader::Cluster

show all

Defined in: lib/pismo/reader/cluster.rb

Constant Summary collapse

DEFAULTS = Default option parameters

# Default option parameters for the cluster reader. #analyze copies this hash
# and merges the instance's options over it, so any key can be overridden.
{
  :threshold => 100,                                       # score a block must exceed to be kept as content
  :min_length => 80,                                       # blocks whose cleaned text is shorter than this are skipped
  :decay_factor => 0.73,                                   # multiplier applied to the position factor after each scored block
  :continuous_factor => 1.62,                              # continuity divisor applied per block seen ( the larger, the harder to continue a cluster )
  :no_body_factor => 0.72,                                 # score multiplier applied once per waste expression found in a block
  :punctuation_weight => 10,                               # score added for each punctuation match
  :punctuations => /(\.[^A-Za-z0-9]|,[^0-9]|!|\?)/,        # counted as punctuation: '.' followed by a non-alphanumeric, ',' followed by a non-digit, '!' or '?'
  :waste_expressions => /Copyright|All Rights Reserved/i,  # keywords characteristic of footers and other non-body text
  :debug => false,                                         # if true, output block scoring information to stdout
}

Constants inherited from Base

Base::BAD_WORDS, Base::BLOCK_OUTPUT_ELEMENTS, Base::COULD_CONTAIN_FULL_CONTENT, Base::FATAL_WORDS, Base::GOOD_WORDS, Base::INLINE_OUTPUT_ELEMENTS, Base::META_WORDS, Base::NON_HEADER_ELEMENTS, Base::OK_ATTRIBUTES, Base::OK_CLEAN_ATTRIBUTES, Base::OK_ELEMENTS, Base::OUTPUT_ELEMENTS, Base::WONT_CONTAIN_FULL_CONTENT

Instance Attribute Summary

Attributes inherited from Base

#content_candidates, #doc, #options, #raw_content

Instance Method Summary collapse

#analyze ⇒ Object

Analyze the structure of the HTML document and score content blocks for likelihood of containing useful content.
#content_at(index) ⇒ Object

Methods inherited from Base

#build_doc, #content, #images, #initialize, #sentences, #strip

Constructor Details

This class inherits a constructor from Pismo::Reader::Base

Instance Method Details

#analyze ⇒ `Object`

Analyze the structure of the HTML document and score content blocks for likelihood of containing useful content

# File 'lib/pismo/reader/cluster.rb', line 45

# Analyze the structure of the HTML document and score content blocks for
# likelihood of containing useful content.
#
# The raw content is split on <div> tags. Every block with enough text is
# scored from its cleaned length and punctuation count; consecutive qualifying
# blocks are merged into one section, and sections are ranked by total score.
#
# Reads @raw_content and @options (merged over DEFAULTS; nil is treated as no
# overrides). Writes @sections and @content_candidates.
#
# @return [Array] the content candidates (results of Nokogiri::HTML), highest
#   score first; empty when no block scored above the threshold
def analyze
  # Hash#merge returns a new hash, so DEFAULTS itself is never modified
  opt = DEFAULTS.merge(@options || {})

  @sections = []
  factor = continuous = 1.0
  body = ''
  score = 0

  # The content is split into blocks of divs
  @raw_content.split(/<\/?(?:div)[^>]*>/).each do |block|
    block = block.delete("\n")

    # Ignore blocks that have no text
    next if has_only_tags?(block)

    # Each new block iterated over makes it less likely for it to belong
    # to the existing cluster
    continuous /= opt[:continuous_factor] if body.length > 0

    # Clean up and strip block of html tags for scoring
    clean = clean_block(block)
    next if clean.length < opt[:min_length]

    # c represents how probable it is for this block to be a content block
    punctuation_count = clean.scan(opt[:punctuations]).length
    c = (clean.length + punctuation_count * opt[:punctuation_weight]) * factor

    # The further down the document we go (i.e. the more blocks we score),
    # the less likely they are to be valid content blocks
    factor *= opt[:decay_factor]

    # The not body rate represents how likely this is to be a junk block;
    # it is counted on the raw block, not the cleaned text
    not_body_rate = block.scan(opt[:waste_expressions]).length

    # The block score is reduced once per waste expression found
    c *= (opt[:no_body_factor] ** not_body_rate) if not_body_rate > 0

    # c1 represents how probable it is for this block to belong to the
    # existing cluster rather than start a new one
    c1 = c * continuous

    puts "----- #{c}*#{continuous}=#{c1} #{clean.length} \n\n" if opt[:debug]

    if c1 > opt[:threshold]
      # Treat continuous blocks as one cluster
      body += block + "\n"
      score += c1
      continuous = opt[:continuous_factor]
    elsif c > opt[:threshold]
      # The current cluster ends here and this block starts a new one
      @sections << { :body => body, :score => score }
      body = block + "\n"
      score = c
      continuous = opt[:continuous_factor]
    end
    # Blocks that don't reach the threshold either way are dropped
  end

  # Add the cluster still being built. An empty string is truthy in Ruby, so
  # emptiness must be tested explicitly to avoid recording a blank section
  # when no block qualified.
  @sections << { :body => body, :score => score } unless body.empty?

  # Sort the sections by score, best first
  ranked_sections = @sections.sort_by { |section| section[:score] }.reverse

  # Convert to nokogiri representation for compatibility with the content method
  @content_candidates = ranked_sections.map { |section| Nokogiri::HTML(section[:body], nil, 'utf-8') }
end

#content_at(index) ⇒ `Object`



117
118
119

# File 'lib/pismo/reader/cluster.rb', line 117

# Look up a single content candidate by its position.
#
# @param index position passed straight to @content_candidates#[]
# @return the candidate stored at that position in @content_candidates
def content_at(index)
  candidates = @content_candidates
  candidates[index]
end