Module: UrlScrubber

.find_identity_from_url(url) ⇒ `Object`

# File 'lib/url_scrubber.rb', line 124

# Returns the last "/"-separated segment of a URL after scrubbing it.
#
# @param url [String, nil] raw URL to inspect
# @return [String, nil] the trailing path segment of the scrubbed URL, or nil
#   when +url+ is not present or scrubbing produces no URL
def self.find_identity_from_url(url)
  if url.present?
    cleaned = UrlScrubber.scrub(url)
    cleaned.split("/").last if cleaned
  end
end

.find_linkedin_identity_from_url(url) ⇒ `Object`

# File 'lib/url_scrubber.rb', line 131

# Extracts the LinkedIn identifier (company slug, profile slug, public-profile
# name or group id) from a URL.
#
# Recognised shapes, checked in this order against the scrubbed URL:
#   * company URLs (per linkedin_company_url?)  -> last path segment
#   * http://linkedin.com/in/...                -> last path segment
#   * http://linkedin.com/pub/<name>/...        -> first segment after "pub/"
#   * linkedin.com/groups/...                   -> last path segment
#   * linkedin.com/groups?gid=<id>              -> everything after "gid="
#
# @param url [String, nil] raw URL to inspect
# @return [String, nil] the identifier, or nil when +url+ is nil, cannot be
#   scrubbed, or matches none of the shapes above
def self.find_linkedin_identity_from_url(url)
  return nil if url.nil?

  scrubbed_url = scrub(url)
  # scrub returns nil when the input contains no http(s) URL. Bail out once
  # here so that no branch below can call a String method on nil (previously
  # only the first three branches were guarded and the "groups" branches
  # raised NoMethodError).
  return nil unless scrubbed_url

  if linkedin_company_url?(scrubbed_url) || scrubbed_url.include?('http://linkedin.com/in/')
    scrubbed_url.split("/").last
  elsif scrubbed_url.include?('http://linkedin.com/pub/')
    # String#partition always returns three strings, so only emptiness
    # needs checking for the part after the separator.
    remainder = scrubbed_url.partition('linkedin.com/pub/')[2]
    remainder.empty? ? nil : drop_url_ampersand!(remainder.split('/').first)
  elsif scrubbed_url.include?('linkedin.com/groups/')
    scrubbed_url.split("/").last
  elsif scrubbed_url.include?('linkedin.com/groups?gid=')
    drop_url_ampersand!(scrubbed_url.partition('linkedin.com/groups?gid=')[2])
  end
end

.ideal_form?(url) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/url_scrubber.rb', line 70

# Tells whether a URL, once scrubbed, is already in the canonical shape this
# module expects for its service (as reported by service_of).
#
# @param url [String, nil] raw URL to check
# @return [Boolean] false when the URL cannot be scrubbed; otherwise whether
#   the scrubbed URL matches the service-specific pattern. URLs whose service
#   has no pattern here are always considered ideal.
def self.ideal_form?(url)
  url = scrub(url)
  return false unless url

  case service_of(url)
  when :vkontakte
    !!url.match(%r{^http://vk\.com/[\w_]+$})
  when :weibo
    !!url.match(%r{^http://weibo\.com/[\w_-]+$})
  when :youtube
    !!url.match(%r{^http://youtube\.com/[\w_-]+$})
  when :twitter
    !!url.match(%r{^http://twitter\.com/[\w_]+$})
  when :facebook
    # The "?" in "profile.php?id=" must be escaped. Unescaped it is a
    # quantifier that makes the preceding "p" optional, so the literal
    # "profile.php?id=123" form could never match.
    !!url.match(%r{^http://facebook\.com/(profile\.php\?id=\d+|[\w_\.-]+)$}) ||
      !!url.match(%r{^http://facebook\.com/groups/[\w_\.-]+$})
  when :linkedin
    !!url.match(%r{^http://linkedin\.com/pub/[\w-]+/[\w]+/[\w]+/[\w]+$}) ||
      !!url.match(%r{^http://linkedin\.com/in/[\w_-]+$}) ||
      !!url.match(%r{^http://linkedin\.com/(company/[\w_-]+|profile/view\?id=\d+)$}) ||
      !!url.match(%r{^http://linkedin\.com/(groups\?gid=[0-9]+)$}) ||
      !!url.match(%r{^http://linkedin\.com/(groups/[\w_-]+)$})
  when :google
    !!url.match(%r{^http://plus\.google\.com/(\+[\w_-]+|\d+)$}) ||
      !!url.match(%r{^http://plus\.google\.com/communities/\d+$})
  when :slideshare
    !!url.match(%r{^http://slideshare\.net/[\w_-]+$})
  when :flickr
    !!url.match(%r{^http://flickr\.com/[\w_\@-]+$}) ||
      !!url.match(%r{^http://flickr\.com/groups/[\w_\@\.-]+$})
  when :pinterest
    !!url.match(%r{^http://pinterest\.com/[\w_-]+$})
  when :yelp
    !!url.match(%r{^http://yelp\.com/[\w_-]+$})
  when :vimeo
    # A single path segment is ideal unless the URL ends in "/<digits>";
    # group URLs are accepted separately.
    (!!url.match(%r{^http://vimeo\.com/[\w_-]+$}) && !url.match(%r{/\d+$})) ||
      !!url.match(%r{^http://vimeo\.com/groups/[\w_\.-]+$})
  when :instagram
    !!url.match(%r{^http://instagram\.com/[\w_]+$})
  when :tumblr
    # Must be a bare "<name>.tumblr.com" host and must not contain "://www.".
    !!url.match(%r{^http://[\w_]+\.tumblr\.com$}) && !url.index("://www.")
  else
    true
  end
end

.linkedin_company_url?(url) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/url_scrubber.rb', line 110

# Checks whether a URL, once scrubbed, contains the LinkedIn company prefix.
#
# @param url [String, nil] raw URL to check
# @return [Boolean] true when the scrubbed URL includes
#   "http://linkedin.com/company/"; false otherwise, including when scrubbing
#   yields no URL
def self.linkedin_company_url?(url)
  cleaned = scrub(url)
  cleaned ? cleaned.include?('http://linkedin.com/company/') : false
end

.linkedin_personal_url?(url) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/url_scrubber.rb', line 117

# Checks whether a URL, once scrubbed, contains one of the LinkedIn personal
# profile prefixes ("/in/" or "/pub/").
#
# @param url [String, nil] raw URL to check
# @return [Boolean] true when the scrubbed URL includes either prefix; false
#   otherwise, including when scrubbing yields no URL
def self.linkedin_personal_url?(url)
  cleaned = scrub(url)
  return false unless cleaned

  ['http://linkedin.com/in/', 'http://linkedin.com/pub/'].any? do |prefix|
    cleaned.include?(prefix)
  end
end

.maps_to_public_url(url) ⇒ `Object`

# File 'lib/url_scrubber.rb', line 163

# Maps a "business" subdomain URL to its public counterpart.
#
#   http://business.facebook.com/... -> http://facebook.com/...
#   http://business.google.com/...   -> http://plus.google.com/...
#
# The host is taken from the raw +url+; the substitution is applied to the
# scrubbed form of it.
#
# @param url [String, nil] raw URL to map
# @return [String, nil] the public URL, or nil when +url+ is blank, cannot be
#   scrubbed, or is not one of the business hosts above
# @raise [URI::InvalidURIError] presumably propagated from URI.parse for
#   input that is still unparseable after escaping (not rescued here)
def self.maps_to_public_url(url)
  return nil if url.blank?

  scrubbed = scrub(url)
  # scrub returns nil when no http(s) URL is found; without this guard the
  # substitutions below would be called on nil.
  return nil unless scrubbed

  # URI.escape was removed in Ruby 3.0; the default parser's #escape is the
  # same percent-encoding routine and is available on older Rubies too.
  parsed = URI.parse(URI::DEFAULT_PARSER.escape(url))
  host = Domainatrix.parse(parsed.host)
  return nil unless host.subdomain == "business"

  case host.domain
  when "facebook"
    scrubbed.sub("http://business.facebook.com", "http://facebook.com")
  when "google"
    scrubbed.sub("http://business.google.com", "http://plus.google.com")
  end
end

.scrub(url) ⇒ `Object`

# File 'lib/url_scrubber.rb', line 9

# Normalises a URL into the canonical form used by the rest of this module.
#
# Steps, in order: extract the first http(s)-like URL from the input, force
# the scheme to "http", strip trailing slashes and semicolons, drop "#!/",
# decode the first "%27", then hand off to the module's helpers
# (downcase_domain, remove_subdomain!, remove_html_tags!, special_cases,
# drop_anchor!) and finally strip trailing commas and slashes.
#
# @param url [String, nil] raw URL, possibly embedded in other text
# @return [String, nil] the input itself when it is blank, an "app://" URL or
#   a TikTok business-manager URL; nil when no http(s) URL is found; otherwise
#   the scrubbed URL
def self.scrub(url)
  return url if url.blank?
  return url if /^app:\/\//.match(url)  # Do not scrub app-only URLs
  # Don't scrub TikTok business manager URLs (quick fix until a different
  # solution is implemented), e.g.
  # https://business.tiktok.com/manage/overview?org_id=6974497704617492482
  # The dots are escaped so they only match literal dots in the host name.
  return url if /^https?:\/\/(www\.)?business\.tiktok\.com\/manage\//.match(url)

  url = url.clone # don't modify the original argument

  # Accepts the common scheme typos "htp" and "htps" as well as http/https.
  m = url.match(/(htt?ps?:\/\/\S+)/i)
  return nil unless m

  url = m[1]
  # Repair the "htp" typo first so that "htps" becomes "https" and is then
  # normalised to "http" by the next line. In the opposite order "htps://"
  # ended up as "https://", which the rest of the module does not expect.
  url.sub!(/^htp/i, 'http')
  url.sub!(/^https/i, 'http')
  url.sub!(/\/+$/, '')
  url.sub!(/;+$/, '')
  url.sub!('#!/', '')
  url.sub!('%27', '\'')
  url = downcase_domain(url)
  remove_subdomain!(url)
  remove_html_tags!(url)
  # CHANGED we depend on the special case methods to decide if and when to drop the query string part of the URL
  url = drop_anchor!(special_cases(url))
  url.sub!(/,+$/, "")    # remove one or more trailing commas at the end of the URL
  url.gsub!(/\/+$/, '') # remove any trailing slashes (/) in the resulting URL
  url
end

.service_of(url) ⇒ `Object`

# File 'lib/url_scrubber.rb', line 37

# Identifies which known service a URL belongs to.
#
# The URL is parsed with Domainatrix. The registered domain is checked first;
# if it is not one of the known domains, the full host is checked for
# plus.google.com.
#
# @param url [String] URL to classify
# @return [Symbol] one of :facebook, :flickr, :instagram, :linkedin,
#   :pinterest, :slideshare, :tumblr, :twitter, :vimeo, :vkontakte, :weibo,
#   :yelp, :youtube, :google, or :other when nothing matches or the parsed
#   host is not present
def self.service_of(url)
  parts = Domainatrix.parse(url)

  unless parts.host.present?
    Rails.logger.debug "No Domain Match"
    return :other
  end

  by_domain =
    case parts.domain
    when 'facebook', 'fb' then :facebook
    when 'flickr'         then :flickr
    when 'instagram'      then :instagram
    when 'linkedin'       then :linkedin
    when 'pinterest'      then :pinterest
    when 'slideshare'     then :slideshare
    when 'tumblr'         then :tumblr
    when 'twitter'        then :twitter
    when 'vimeo'          then :vimeo
    when 'vk'             then :vkontakte
    when 'weibo'          then :weibo
    when 'yelp'           then :yelp
    when 'youtube'        then :youtube
    end
  return by_domain if by_domain

  case parts.host
  when /\bplus\.google\.com$/ then :google
  else :other
  end
end

.valid_url?(url) ⇒ `Boolean`

Requirements:

must have http/https scheme
no “@” anywhere in the passed-in URL string
valid URI as determined by `URI.parse` (the implementation shown below uses Ruby's standard `URI` library)

Returns:

(Boolean)

# File 'lib/url_scrubber.rb', line 154

# Checks that a string is an acceptable http(s) URL.
#
# Requirements:
#   * the scheme is http or https
#   * the string contains no "@" anywhere
#   * the string, once percent-escaped, parses with URI.parse
#
# @param url [String, nil] candidate URL
# @return [Boolean] true when all requirements hold; false otherwise,
#   including for non-String input and for unparseable URIs
def self.valid_url?(url)
  # Non-String input (e.g. nil) cannot be escaped or parsed.
  return false unless url.is_a?(String)

  # URI.escape was removed in Ruby 3.0; the default parser's #escape is the
  # same percent-encoding routine and is available on older Rubies too.
  parsed = URI.parse(URI::DEFAULT_PARSER.escape(url))
  %w(http https).include?(parsed.scheme) && !url.include?("@")
rescue URI::InvalidURIError
  false
end

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.find_identity_from_url(url) ⇒ `Object`

.find_linkedin_identity_from_url(url) ⇒ `Object`

.ideal_form?(url) ⇒ `Boolean`

.linkedin_company_url?(url) ⇒ `Boolean`

.linkedin_personal_url?(url) ⇒ `Boolean`

.maps_to_public_url(url) ⇒ `Object`

.scrub(url) ⇒ `Object`

.service_of(url) ⇒ `Object`

.valid_url?(url) ⇒ `Boolean`

Module: UrlScrubber

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.find_identity_from_url(url) ⇒ Object

.find_linkedin_identity_from_url(url) ⇒ Object

.ideal_form?(url) ⇒ Boolean

.linkedin_company_url?(url) ⇒ Boolean

.linkedin_personal_url?(url) ⇒ Boolean

.maps_to_public_url(url) ⇒ Object

.scrub(url) ⇒ Object

.service_of(url) ⇒ Object

.valid_url?(url) ⇒ Boolean

.find_identity_from_url(url) ⇒ `Object`

.find_linkedin_identity_from_url(url) ⇒ `Object`

.ideal_form?(url) ⇒ `Boolean`

.linkedin_company_url?(url) ⇒ `Boolean`

.linkedin_personal_url?(url) ⇒ `Boolean`

.maps_to_public_url(url) ⇒ `Object`

.scrub(url) ⇒ `Object`

.service_of(url) ⇒ `Object`

.valid_url?(url) ⇒ `Boolean`