Class: Raev::Url

Inherits:

Object

Object
Raev::Url

show all

Defined in: lib/raev/url.rb

Instance Attribute Summary collapse

#doc ⇒ Object readonly

Returns the value of attribute doc.
#url ⇒ Object readonly

Returns the value of attribute url.

Instance Method Summary collapse

#author ⇒ Object
#base ⇒ Object
#bestRating ⇒ Object
#clean ⇒ Object
#feed ⇒ Object
#headline ⇒ Object
#initialize(url) ⇒ Url constructor

A new instance of Url.
#pubdate ⇒ Object
#ratingValue ⇒ Object
#resolved ⇒ Object
#resolved_and_clean ⇒ Object
#twitter ⇒ Object
#without_http ⇒ Object

Constructor Details

#initialize(url) ⇒ `Url`

Returns a new instance of Url.

# File 'lib/raev/url.rb', line 12

# Creates a wrapper around the given address.
#
# Only stores the argument; @doc and @linked_data start out as nil.
#
# @param url the value kept in @url
def initialize(url)
  @doc = @linked_data = nil
  @url = url
end

Instance Attribute Details

#doc ⇒ `Object` (readonly)

Returns the value of attribute doc.



9
10
11

# File 'lib/raev/url.rb', line 9

# Read-only accessor: returns the current value of @doc.
def doc
  @doc
end

#url ⇒ `Object` (readonly)

Returns the value of attribute url.



8
9
10

# File 'lib/raev/url.rb', line 8

# Read-only accessor: returns the current value of @url.
def url
  @url
end

Instance Method Details

#author ⇒ `Object`

# File 'lib/raev/url.rb', line 153

# Extracts the author's name from the page.
#
# The content attribute of <meta name="author"> wins when present and is
# returned as-is. Otherwise the first element matching one of the byline
# selectors below is used, provided its text is at most four words long; that
# text is passed through Sanitize.clean, stripped and cut to 256 characters.
#
# @return [String] the author, or "" when nothing suitable is found
def author
  meta_tag = document.search('meta[name="author"]').first
  return meta_tag.attribute("content").value if meta_tag && meta_tag.attribute("content")

  byline_selector = [
    '.author-info .name',
    '.author-top a',
    '.yt-user-info a',
    'a[rel~="author"]',
    'a[itemprop~="author"]',
    '.author h3 a',
    '.author',
    '.posted-by a',
    '.entryAuthor a',
    'a.names',
    'a.byline-author',
    '.byline a',
    '.author.vcard a',
    'p.info a',
    '.author-name',
    '.upcased',
    'a[rel~="nofollow"]'
  ].join(", ")

  byline = document.search(byline_selector).first
  return "" unless byline

  # Longer matches are treated as "not a name" and ignored.
  return "" if byline.content.split.size > 4

  Sanitize.clean(byline.content).strip[0..255]
end

#base ⇒ `Object`

# File 'lib/raev/url.rb', line 18

# Returns the host part of @url with a leading "www." removed.
#
# The host is taken as the third "/"-separated segment, so the URL is
# expected to look like "scheme://host/...". Only a "www." prefix is removed;
# the same text elsewhere in the host (e.g. "awww.example.com") is left
# alone.
#
# @return [String, nil] the host, or nil when @url is nil or has no
#   "scheme://host" shape
def base
  return nil if @url.nil?

  host = @url.split('/')[2]
  host && host.sub(/\Awww\./, '')
end

#bestRating ⇒ `Object`

# File 'lib/raev/url.rb', line 211

# Reads the content attribute of the first element marked
# itemprop="bestRating" and converts it with to_f.
#
# @return [Float, nil] nil when there is no such element or it has no
#   content attribute
def bestRating
  rating_node = document.search('*[itemprop="bestRating"]').first
  return nil unless rating_node

  content_attr = rating_node.attribute("content")
  return nil unless content_attr

  raw = content_attr.value
  raw ? raw.to_f : nil
end

#clean ⇒ `Object`

# File 'lib/raev/url.rb', line 24

# Returns @url with its utm_* tracking parameters removed.
#
# Only the utm_ parameters themselves are dropped: other query parameters
# and a trailing #fragment are kept, even when they follow a utm_ parameter.
# When the utm_ parameters are the last thing in the URL the result is the
# URL cut off just before them.
#
# @return [String, nil] the cleaned URL; @url itself (possibly nil) when
#   there is nothing to remove
def clean
  return @url if @url.nil? || @url !~ /[?&]utm_/

  # utm_ parameters that follow another parameter.
  cleaned = @url.gsub(/&utm_[^&#]*/, '')
  # A leading utm_ parameter followed by parameters we keep.
  cleaned = cleaned.sub(/\?utm_[^&#]*&/, '?')
  # A leading utm_ parameter that was the only one left: drop the "?" too.
  cleaned.sub(/\?utm_[^&#]*/, '')
end

#feed ⇒ `Object`

# File 'lib/raev/url.rb', line 67

# Finds the page's feed URL.
#
# Looks for <link type="application/rss+xml" rel="alternate"> first, then for
# an anchor matched by the custom match_href("http://feeds.") selector.
# A root-relative href ("/feed") is resolved against the scheme and host of
# @url rather than appended to the full page URL, and a protocol-relative
# href ("//host/feed") gets @url's scheme.
#
# @return [String, nil] the feed URL, or nil when none is found
def feed
  feed_url = nil

  node = document.css('link[type="application/rss+xml"][rel="alternate"]')

  if node.first
    feed_url = node.first["href"]
  else
    node = document.css('a:match_href("http://feeds.")', Raev::Parser.new)
    feed_url = node.first["href"] if node.first
  end

  if feed_url && feed_url[0, 1] == "/"
    page_url = @url.to_s

    if feed_url[0, 2] == "//"
      # Protocol-relative: only the scheme is missing.
      scheme = page_url[/\A[a-z][a-z0-9+.\-]*:/i] || "http:"
      feed_url = scheme + feed_url
    else
      # Root-relative: keep "scheme://host" and drop the page's own path.
      # Falls back to the whole page URL when it has no such prefix.
      origin = page_url[%r{\A[a-z][a-z0-9+.\-]*://[^/?#]+}i] || page_url
      feed_url = origin + feed_url
    end
  end

  feed_url
end

#headline ⇒ `Object`

# File 'lib/raev/url.rb', line 89

# Determines the page's headline, trying these sources in order:
#
# 1. linked_data["headline"], passed through Sanitize.clean and returned
#    immediately;
# 2. the data-text attribute of the first .twitter-share-button element;
# 3. the content of an og:title / twitter:title meta tag in <head> (when
#    several match, the last one wins);
# 4. the text of the first "#article h1", a[rel="bookmark"] or
#    h2[itemprop="name"] element.
#
# For sources 2-4, runs of spaces are collapsed to a single space.
#
# @return [String, nil] nil when no source yields a title
def headline
  return Sanitize.clean(linked_data["headline"]) if linked_data && linked_data["headline"]

  share_button = document.css(".twitter-share-button").first
  title = share_button && share_button['data-text'] ? share_button['data-text'] : nil

  if title.nil?
    document.css("head meta").each do |meta|
      title = meta['content'] if ['og:title', 'twitter:title'].include?(meta['property'])
    end
  end

  if title.nil?
    heading = document.css("#article h1, a[rel=\"bookmark\"], h2[itemprop=\"name\"]").first
    title = heading.content if heading
  end

  title.gsub!(/ +/, ' ') unless title.nil?
  title
end

#pubdate ⇒ `Object`

# File 'lib/raev/url.rb', line 127

# Determines the publication date, trying these sources in order:
#
# 1. linked_data["datePublished"], parsed with Date.parse;
# 2. a YYYY/M/D sequence inside @url, used only when it is a real calendar
#    date; otherwise the lookup continues with the next source;
# 3. the content attribute of <meta itemprop="datePublished"> or
#    <meta name="pub_date">, parsed with Date.parse;
# 4. the text of the first .entryDate / .entrydate element, stripped of
#    punctuation and handed to Chronic.parse.
#
# @return [Date, Object, nil] a Date for sources 1-3, whatever Chronic.parse
#   returns for source 4, or nil when no source applies
def pubdate
  if linked_data && linked_data["datePublished"]
    return Date.parse(linked_data["datePublished"])
  end

  year, month, day = @url.to_s.match(/[0-9]{4}\/[0-9]{1,2}\/[0-9]{1,2}/).to_s.split("/").map(&:to_i)

  # Numeric path segments such as /1234/56/78/ look like a date but are not
  # one; skip them instead of letting Date.new raise.
  return Date.new(year, month, day) if day && Date.valid_date?(year, month, day)

  meta_tag = document.search("meta[itemprop='datePublished'], meta[name='pub_date']").first
  meta_content = meta_tag && meta_tag.attribute("content")
  return Date.parse(meta_content.value) if meta_content

  entry = document.search(".entryDate, .entrydate").first
  return Chronic.parse(entry.content.gsub(/[^a-zA-Z0-9\s]/, "").strip) if entry

  nil
end

#ratingValue ⇒ `Object`

# File 'lib/raev/url.rb', line 193

# Reads the rating from the first element marked itemprop="ratingValue".
#
# The element's content attribute is used when present, otherwise its text
# content; the result is converted with to_f.
#
# @return [Float, nil] nil when there is no such element
def ratingValue
  rating_node = document.search('*[itemprop="ratingValue"]').first
  return nil unless rating_node

  content_attr = rating_node.attribute("content")
  raw = content_attr ? content_attr.value : rating_node.content

  raw ? raw.to_f : nil
end

#resolved ⇒ `Object`

# File 'lib/raev/url.rb', line 35

# Resolves @url through RedirectFollower(@url, 5).
#
# Best effort: when RedirectFollower raises a StandardError, a message is
# printed with puts and the original @url is returned instead.
#
# @return the result of RedirectFollower, or @url (possibly nil) when @url is
#   nil or resolution failed
def resolved
  return @url if @url.nil?

  RedirectFollower(@url, 5)
rescue => error
  puts "Could not resolve #{@url}. #{error.class}: #{error.message}"
  @url
end

#resolved_and_clean ⇒ `Object`

# File 'lib/raev/url.rb', line 47

# Wraps the result of #resolved in a new Url and returns that Url's #clean.
def resolved_and_clean
  Url.new(self.resolved).clean
end

#twitter ⇒ `Object`

# File 'lib/raev/url.rb', line 56

# Returns the last "/"-separated segment of the href of the first anchor
# matched by the custom match_href("twitter.com") selector.
#
# @return [String, nil] nil when no such anchor exists
def twitter
  link = document.css('a:match_href("twitter.com")', Raev::Parser.new).first
  return nil unless link

  link["href"].split('/').last
end

#without_http ⇒ `Object`



52
53
54

# File 'lib/raev/url.rb', line 52

# Returns @url without a leading "http://".
#
# Only a prefix is removed; an "http://" appearing later in the URL (for
# example inside a path or query string) is left in place.
# NOTE(review): "https://" URLs are returned unchanged, as before — confirm
# whether callers expect that scheme to be stripped as well.
#
# @return [String, nil] nil when @url is nil
def without_http
  return nil if @url.nil?

  @url.sub(/\Ahttp:\/\//, "")
end

Class: Raev::Url

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ Url

Instance Attribute Details

#doc ⇒ Object (readonly)

#url ⇒ Object (readonly)

Instance Method Details

#author ⇒ Object

#base ⇒ Object

#bestRating ⇒ Object

#clean ⇒ Object

#feed ⇒ Object

#headline ⇒ Object

#pubdate ⇒ Object

#ratingValue ⇒ Object

#resolved ⇒ Object

#resolved_and_clean ⇒ Object

#twitter ⇒ Object

#without_http ⇒ Object

#initialize(url) ⇒ `Url`

#doc ⇒ `Object` (readonly)

#url ⇒ `Object` (readonly)

#author ⇒ `Object`

#base ⇒ `Object`

#bestRating ⇒ `Object`

#clean ⇒ `Object`

#feed ⇒ `Object`

#headline ⇒ `Object`

#pubdate ⇒ `Object`

#ratingValue ⇒ `Object`

#resolved ⇒ `Object`

#resolved_and_clean ⇒ `Object`

#twitter ⇒ `Object`

#without_http ⇒ `Object`