Module: Pismo::InternalAttributes

Included in:: Document

Defined in:: lib/pismo/internal_attributes.rb

Overview

Internal attributes are the different pieces of data we can extract from a document’s content.

Instance Method Summary collapse

#author(all = false) ⇒ Object

Returns the author of the page/content.
#authors ⇒ Object
#body ⇒ Object

Returns body text as determined by Arc90’s Readability algorithm.
#datetime ⇒ Object

Returns an estimate of when the page/content was created. As clients of this library should be doing HTTP retrieval themselves, they can fall back to the Last-Modified HTTP header if they so wish.
#description ⇒ Object

Returns the “description” of the page, usually comes from a meta tag.
#favicon ⇒ Object

Returns URL to the site’s favicon.
#feed(all = false) ⇒ Object

Returns URL(s) of Web feed(s).
#feeds ⇒ Object
#html_title ⇒ Object

HTML title.
#keywords(options = {}) ⇒ Object

Returns the “keywords” in the document (not the meta keywords - they’re next to useless now).
#lede(all = false) ⇒ Object

Returns the “lede(s)” or first paragraph(s) of the story/page.
#ledes ⇒ Object
#title(all = false) ⇒ Object

Returns the title of the page/content - attempts to strip site name, etc, if possible.
#titles ⇒ Object

Instance Method Details

#author(all = false) ⇒ `Object`

Returns the author of the page/content

# File 'lib/pismo/internal_attributes.rb', line 120

# Returns the author of the page/content.
#
# Hands a prioritised list of selectors to @doc.match and tidies up whatever
# comes back.
#
# all - forwarded unchanged as the second argument of @doc.match
#       (presumably asks for every match instead of just the first -
#       confirm against the Document implementation).
#
# Returns nil when nothing matched. A String result has any leading
# "by"/"posted by" removed and trailing junk cut off; an Array result has the
# prefix removed from each entry and duplicates dropped. Any other kind of
# result is returned untouched.
def author(all = false)
  author = @doc.match([
                      '.post-author .fn',
                      '.wire_author',
                      '.cnnByline b',
                      '.editorlink',
                      '.authors p',
                      ['meta[@name="author"]', lambda { |el| el.attr('content') }],     # Traditional meta tag style
                      ['meta[@name="Author"]', lambda { |el| el.attr('content') }],     # CNN style
                      ['meta[@name="AUTHOR"]', lambda { |el| el.attr('content') }],     # CNN style
                      '.byline a',                                                      # Ruby Inside style
                      '.byline',
                      '.post_subheader_left a',                                         # TechCrunch style
                      '.byl',                                                           # BBC News style
                      '.meta a',
                      '.articledata .author a',
                      '#owners a',                                                      # Google Code style
                      '.author a',
                      '.author',
                      '.auth a',
                      '.auth',
                      '.cT-storyDetails h5',                                            # smh.com.au - worth dropping maybe..
                      ['meta[@name="byl"]', lambda { |el| el.attr('content') }],
                      '.timestamp a',
                      '.fn a',
                      '.fn',
                      '.byline-author',
                      '.ArticleAuthor a',
                      '.blog_meta a',
                      'cite a',
                      'cite',
                      '.contributor_details h4 a'
                      ], all)

  return unless author

  # Matches a leading "By ", "Post by " or "Posted by " (any case)
  by_prefix = /^(post(ed)?\s)?by\W+/i

  case author
  when String
    # Only non-destructive calls are used here, so the string handed back by
    # @doc.match is never modified in place and may safely be frozen.
    cleaned = author.sub(by_prefix, '')
    # Everything except letters, digits, spaces and apostrophes becomes "|";
    # the first run of two or more "|" marks where the name ends.
    cleaned = cleaned.tr('^a-zA-Z 0-9\'', '|')
    cleaned = cleaned.split(/\|{2,}/).first.to_s
    cleaned.gsub(/\s+/, ' ').delete('|').strip
  when Array
    author.map { |a| a.sub(by_prefix, '') }.uniq
  else
    author
  end
end

#authors ⇒ `Object`



171
172
173

# File 'lib/pismo/internal_attributes.rb', line 171

# Convenience wrapper: calls #author with all = true and returns its result.
def authors
  author(true)
end

#body ⇒ `Object`

Returns body text as determined by Arc90’s Readability algorithm

# File 'lib/pismo/internal_attributes.rb', line 257

# Returns body text as determined by Arc90's Readability algorithm.
#
# The text is computed once and cached in @body. The wrapper-DIV clean-up is
# done as part of that single computation so that repeated calls always
# return the same text (stripping on every call would peel off one more
# nested <div> layer each time).
def body
  @body ||= begin
    extracted = Readability::Document.new(@doc.to_s).content.strip

    # HACK: Remove annoying DIV that readability leaves around
    extracted.sub(/\A<div>/, '').sub(%r{</div>\Z}, '')
  end
end

#datetime ⇒ `Object`

Returns an estimate of when the page/content was created. As clients of this library should be doing HTTP retrieval themselves, they can fall back to the Last-Modified HTTP header if they so wish. This method is just rough and based on content only.

# File 'lib/pismo/internal_attributes.rb', line 76

# Returns a rough estimate of when the page/content was created, based only
# on date-looking text found in the document's HTML.
#
# The patterns below are tried in order against @doc.to_html and the first
# hit wins. The hit is then tidied (weekday names, a leading "on", commas and
# ordinal suffixes removed) and handed to Chronic.parse.
#
# Returns nil when no pattern matches or the match is 4 characters or
# shorter; otherwise whatever Chronic.parse returns, falling back to the
# tidied String when Chronic.parse returns nil/false.
def datetime
  # TODO: Clean all this mess up

  mo = %r{(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)}i

  # Ordinal suffixes include "nd" so that e.g. "2nd March 2009" is picked up
  # as a whole instead of degrading to "March 2009".
  regexen = [
    /#{mo}\b\s+\d+\D{1,10}\d{4}/i,
    /(on\s+)?\d+\s+#{mo}\s+\D{1,10}\d+/i,
    /(on[^\d+]{1,10})\d+(th|st|nd|rd)?.{1,10}#{mo}\b[^\d]{1,10}\d+/i,
    /\b\d{4}\-\d{2}\-\d{2}\b/i,
    /\d+(th|st|nd|rd).{1,10}#{mo}\b[^\d]{1,10}\d+/i,
    /\d+\s+#{mo}\b[^\d]{1,10}\d+/i,
    /on\s+#{mo}\s+\d+/i,
    /#{mo}\s+\d+/i,
    /\d{4}[\.\/\-]\d{2}[\.\/\-]\d{2}/,
    /\d{2}[\.\/\-]\d{2}[\.\/\-]\d{4}/
  ]

  # Serialise the document once rather than once per pattern
  html = @doc.to_html

  found = nil
  regexen.each do |pattern|
    found = html[pattern]
    break if found
  end

  return unless found && found.length > 4

  # Clean up the string for use by Chronic
  cleaned = found.strip
  cleaned = cleaned.gsub(/(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)[^\w]*/i, '')
  cleaned = cleaned.gsub(/(mon|tues|tue|weds|wed|thurs|thur|thu|fri|sat|sun)[^\w]*/i, '')
  # The search patterns are case-insensitive, so "On 5th March" must lose its
  # "On" just like "on 5th March" does.
  cleaned = cleaned.sub(/\bon\s+/i, '')
  cleaned = cleaned.delete(',')
  cleaned = cleaned.sub(/(\d+)(th|st|nd|rd)/, '\1')

  Chronic.parse(cleaned) || cleaned
end

#description ⇒ `Object`

Returns the “description” of the page, usually comes from a meta tag

# File 'lib/pismo/internal_attributes.rb', line 177

# Returns the "description" of the page, usually taken from a meta tag.
#
# Tries the description meta tag in its three common capitalisations (reading
# the tag's content attribute), then an RDF description element, then any
# element with the "description" class, and returns what @doc.match yields.
def description
  content_of = lambda { |el| el.attr('content') }

  meta_queries = %w[description Description DESCRIPTION].map do |name|
    [%(meta[@name="#{name}"]), content_of]
  end

  fallback_queries = ['rdf:Description[@name="dc:description"]', '.description']

  @doc.match(meta_queries + fallback_queries)
end

#favicon ⇒ `Object`

Returns URL to the site’s favicon

# File 'lib/pismo/internal_attributes.rb', line 268

# Returns URL to the site's favicon.
#
# Looks at link tags with rel "fluid-icon" (a Fluid icon is preferred when
# present), "shortcut icon" and "icon", in that order, and reads the href.
# An href that does not start with "http" is resolved against @url when @url
# is set; otherwise the href is returned exactly as found.
def favicon
  href_of = lambda { |el| el.attr('href') }
  queries = ['fluid-icon', 'shortcut icon', 'icon'].map do |rel|
    [%(link[@rel="#{rel}"]), href_of]
  end

  icon = @doc.match(queries)
  return icon unless icon && icon !~ /^http/ && @url

  URI.join(@url, icon).to_s
end

#feed(all = false) ⇒ `Object`

Returns URL(s) of Web feed(s)

# File 'lib/pismo/internal_attributes.rb', line 280

# Returns URL(s) of Web feed(s).
#
# Reads the href of RSS link tags first, then Atom link tags.
#
# all - forwarded unchanged as the second argument of @doc.match.
#
# A String result that does not start with "http" is resolved against @url
# when @url is set. An Array result gets the same treatment per entry (in
# place) and then has duplicates removed. Anything else is returned as is.
def feed(all = false)
  href_of = lambda { |el| el.attr('href') }
  queries = %w[application/rss+xml application/atom+xml].map do |type|
    [%(link[@type="#{type}"]), href_of]
  end

  found = @doc.match(queries, all)

  case found
  when String
    found = URI.join(@url, found).to_s if found !~ /^http/ && @url
  when Array
    found.map! { |u| (u !~ /^http/ && @url) ? URI.join(@url, u).to_s : u }
    found.uniq!
  end

  found
end

#feeds ⇒ `Object`



301
302
303

# File 'lib/pismo/internal_attributes.rb', line 301

# Convenience wrapper: calls #feed with all = true and returns its result.
def feeds
  feed(true)
end

#html_title ⇒ `Object`

HTML title

# File 'lib/pismo/internal_attributes.rb', line 65

# HTML title: whatever @doc.match yields for the 'title' selector, or nil
# when that result is nil/false. The value is returned as found - site names
# are not stripped here.
def html_title
  # An earlier, disabled experiment split the title on " - ", " | " or " : "
  # and kept the longest piece to drop leading/trailing site names:
  #   title.split(/\s+(\-|\||\:)\s+/).sort_by { |i| i.length }.last.to_s.strip
  @doc.match('title') || nil
end

#keywords(options = {}) ⇒ `Object`

Returns the “keywords” in the document (not the meta keywords - they’re next to useless now)

# File 'lib/pismo/internal_attributes.rb', line 230

# Returns the "keywords" in the document (not the meta keywords - they're
# next to useless now).
#
# Words are counted across the body text and the description; a word that
# also occurs in the title scores 5 per occurrence instead of 1.
#
# options - Hash of overrides:
#           :stem_at           - stopwords longer than this are stemmed
#                                before comparison (default 20)
#           :word_length_limit - words longer than this are ignored
#                                (default 15)
#           :limit             - maximum number of keywords returned
#                                (default 20)
#
# Returns an Array of [word, score] pairs, highest score first.
def keywords(options = {})
  options = { :stem_at => 20, :word_length_limit => 15, :limit => 20 }.merge(options)

  # The title may be missing altogether; treat that as "no title words".
  # Downcased once here rather than once per word.
  title_text = title.to_s.downcase

  # Keep a space between body and description, otherwise the last word of
  # the body and the first word of the description fuse into one bogus word.
  content_to_use = "#{body} #{description}".downcase

  # Scrub out most HTML tags, sentence-ending dots and entities
  scrubbed = content_to_use.gsub(/\<[^\>]{1,100}\>/, '').gsub(/\.+\s+/, ' ').gsub(/\&\w+\;/, '')

  words = Hash.new(0)

  # old regex for safe keeping -- \b[a-z][a-z\+\.\'\+\#\-]*\b
  scrubbed.scan(/(\b|\s|\A)([a-z0-9][a-z0-9\+\.\'\+\#\-\/\\]*)(\b|\s|\Z)/i) do |_lead, word, _trail|
    next if word.length > options[:word_length_limit]
    # Drop possessives/contractions: "ruby's" counts as "ruby"
    word = word.gsub(/\'\w+/, '')
    words[word] += (title_text.include?(word) ? 5 : 1)
  end

  # Stem the stop words if necessary
  stop_words = Pismo.stopwords.map { |a| a.length > options[:stem_at] ? a.stem : a }

  # Stop words always go; one-off words are shed only while more than 80
  # candidates are still left.
  ranked = words.delete_if { |k1, v1| stop_words.include?(k1) || (v1 < 2 && words.size > 80) }
  ranked.sort_by { |_k2, v2| v2 }.reverse.first(options[:limit])
end

#lede(all = false) ⇒ `Object`

Returns the “lede(s)” or first paragraph(s) of the story/page

# File 'lib/pismo/internal_attributes.rb', line 188

# Returns the "lede(s)" or first paragraph(s) of the story/page.
#
# all - forwarded unchanged as the second argument of @doc.match.
#
# A String result is cut down to its first two sentences when two can be
# found (otherwise it is returned whole); an Array result gets the same
# treatment per entry, with duplicates dropped. When the selectors yield
# nothing usable, the first two sentences of #body are returned (nil when
# body is nil or has no two sentences).
def lede(all = false)
  two_sentences = /^(.*?\.\s){2}/m

  # A horrible, horrible way to pluck out lead paras from crappy Blogspot
  # blogs that don't use <p> tags: take the markup from the first few
  # characters of the element's text up to the first <br. The text is escaped
  # before being embedded in the pattern, because openers such as "[Upda" or
  # "C++ i" would otherwise be read as regexp syntax and raise or mis-match.
  up_to_first_br = lambda do |el|
    opening = Regexp.escape(el.inner_text[0..4].strip)
    el.inner_html[/(#{opening}.*?)\<br/, 1]
  end

  lede = @doc.match([
              '.post-text p',
              '#blogpost p',
              '.story-teaser',
              '.subhead',
              '//div[@class="entrytext"]//p[string-length()>10]',                      # Ruby Inside / Kubrick style
              'section p',
              '.entry .text p',
              '.entry-content p',
              '#wikicontent p',                                                        # Google Code style
              '//td[@class="storybody"]/p[string-length()>10]',                        # BBC News style
              '//div[@class="entry"]//p[string-length()>100]',
              ['.entry-content', up_to_first_br],
              ['.entry', up_to_first_br],
              '.entry',
              '#content p',
              '#article p',
              '.post-body',
              '.entry-content',
              '.body p',
              '.document_description_short p',    # Scribd
              '.single-post p',
              'p'
              ], all)

  case lede
  when String
    lede[two_sentences] || lede
  when Array
    lede.map { |l| l.to_s[two_sentences] || l }.uniq
  else
    body ? body[two_sentences] : nil
  end
end

#ledes ⇒ `Object`



225
226
227

# File 'lib/pismo/internal_attributes.rb', line 225

# Convenience wrapper: calls #lede with all = true and returns its result.
def ledes
  lede(true)
end

#title(all = false) ⇒ `Object`

Returns the title of the page/content - attempts to strip site name, etc, if possible

# File 'lib/pismo/internal_attributes.rb', line 5

# Returns the title of the page/content - attempts to strip site name, etc,
# if possible, by preferring in-page heading selectors over the HTML title.
#
# all - forwarded unchanged as the second argument of @doc.match. When true
#       the result is an Array: whatever the selectors yielded followed by
#       the HTML title, without duplicates. When false the result is the
#       selector match, or the HTML title if there was none.
def title(all = false)
  # TODO: Memoizations
  selectors = [
    '#pname a',                                                       # Google Code style
    '.entryheader h1',                                                # Ruby Inside/Kubrick
    '.entry-title a',                                               # Common Blogger/Blogspot rules
    '.post-title a',
    '.post_title a',
    '.posttitle a',
    '.post-header h1',
    '.entry-title',
    '.post-title',
    '.post h3 a',
    'a.datitle',          # Slashdot style
    '.posttitle',
    '.post_title',
    '.pageTitle',
    '#main h1.title',
    '.title h1',
    '.post h2',
    'h2.title',
    '.entry h2',                                                      # Common style
    '.boite_titre a',
    ['meta[@name="title"]', lambda { |el| el.attr('content') }],
    'h1.headermain',
    'h1.title',
    '.mxb h1',                                                        # BBC News
    '#content h1',
    '#content h2',
    '#content h3',
    'a[@rel="bookmark"]',
    '.products h2',
    '.caption h3',
    '#main h2',
    '#body h1',
    '#wrapper h1',
    '#page h1',
    '.asset-header h1',
    '#body_content h2'
  ]

  found = @doc.match(selectors, all)

  # If all else fails, go to the HTML title
  return found || html_title unless all

  from_selectors = found ? [*found] : []
  (from_selectors + [html_title]).uniq
end

#titles ⇒ `Object`



59
60
61

# File 'lib/pismo/internal_attributes.rb', line 59

# Convenience wrapper: calls #title with all = true and returns its result.
def titles
  title(true)
end

Module: Pismo::InternalAttributes

Overview

Instance Method Summary collapse

Instance Method Details

#author(all = false) ⇒ Object

#authors ⇒ Object

#body ⇒ Object

#datetime ⇒ Object

#description ⇒ Object

#favicon ⇒ Object

#feed(all = false) ⇒ Object

#feeds ⇒ Object

#html_title ⇒ Object

#keywords(options = {}) ⇒ Object

#lede(all = false) ⇒ Object

#ledes ⇒ Object

#title(all = false) ⇒ Object

#titles ⇒ Object