Module: UrlScrubber

Defined in:
lib/url_scrubber.rb,
lib/url_scrubber/version.rb

Constant Summary collapse

VERSION =
"0.8.23"

Class Method Summary collapse

Class Method Details

.find_identity_from_url(url) ⇒ Object



124
125
126
127
128
# File 'lib/url_scrubber.rb', line 124

# Extracts the trailing path segment of a URL after scrubbing it.
#
# @param url [String, nil] raw URL (or text containing one)
# @return [String, nil] whatever follows the last "/" of the scrubbed URL;
#   nil when +url+ is not present or UrlScrubber.scrub yields nothing
def self.find_identity_from_url(url)
  if url.present?
    scrubbed = UrlScrubber.scrub(url)
    # A falsy scrub result means there is nothing to split.
    scrubbed.split("/").last if scrubbed
  end
end

.find_linkedin_identity_from_url(url) ⇒ Object



131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# File 'lib/url_scrubber.rb', line 131

# Extracts the LinkedIn identifier (company slug, profile slug, group slug or
# group id) from a URL.
#
# The URL is scrubbed first and then matched, in order, against these forms:
#   * company URL (per linkedin_company_url?)  -> last path segment
#   * http://linkedin.com/in/<slug>            -> last path segment
#   * http://linkedin.com/pub/<slug>/...       -> first segment after "pub/"
#   * linkedin.com/groups/<slug>               -> last path segment
#   * linkedin.com/groups?gid=<id>             -> text after "gid="
# The "pub" and "gid" results are passed through drop_url_ampersand!
# (defined elsewhere; presumably trims a trailing "&..." part -- confirm).
#
# @param url [String, nil] raw URL
# @return [String, nil] the identifier, or nil when +url+ is nil, cannot be
#   scrubbed, or matches none of the forms above
def self.find_linkedin_identity_from_url(url)
  return nil if url.nil?

  scrubbed_url = scrub(url)
  # scrub returns nil when no URL can be found in the input. The group
  # branches previously called include? on that nil and raised NoMethodError.
  return nil unless scrubbed_url

  if linkedin_company_url?(scrubbed_url)
    scrubbed_url.split("/").last
  elsif scrubbed_url.include?('http://linkedin.com/in/')
    scrubbed_url.split("/").last
  elsif scrubbed_url.include?('http://linkedin.com/pub/')
    after_pub = scrubbed_url.partition('linkedin.com/pub/').last
    after_pub.empty? ? nil : drop_url_ampersand!(after_pub.split('/').first)
  elsif scrubbed_url.include?('linkedin.com/groups/')
    scrubbed_url.split("/").last
  elsif scrubbed_url.include?('linkedin.com/groups?gid=')
    drop_url_ampersand!(scrubbed_url.partition('linkedin.com/groups?gid=').last)
  end
end

.ideal_form?(url) ⇒ Boolean

Returns:

  • (Boolean)


70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/url_scrubber.rb', line 70

# Reports whether +url+, once scrubbed, is already in the canonical
# ("ideal") shape for the social service it belongs to.
#
# The service is determined by service_of; each known service has one or more
# anchored patterns the scrubbed URL must match. URLs for services without a
# dedicated rule are always considered ideal.
#
# @param url [String, nil] raw URL
# @return [Boolean] false when the URL cannot be scrubbed or does not match
#   its service's canonical pattern(s); true otherwise
def self.ideal_form?(url)
  url = scrub(url)
  return false unless url

  case service_of(url)
  when :vkontakte
    !!url.match(%r{^http://vk\.com/[\w_]+$})
  when :weibo
    !!url.match(%r{^http://weibo\.com/[\w_-]+$})
  when :youtube
    !!url.match(%r{^http://youtube\.com/[\w_-]+$})
  when :twitter
    !!url.match(%r{^http://twitter\.com/[\w_]+$})
  when :facebook
    # The "?" before "id=" must be escaped. Unescaped it made the preceding
    # "p" optional, so "profile.php?id=123" could never match.
    !!url.match(%r{^http://facebook\.com/(profile\.php\?id=\d+|[\w_\.-]+)$}) ||
      !!url.match(%r{^http://facebook\.com/groups/[\w_\.-]+$})
  when :linkedin
    !!url.match(%r{^http://linkedin\.com/pub/[\w-]+/[\w]+/[\w]+/[\w]+$}) ||
      !!url.match(%r{^http://linkedin\.com/in/[\w_-]+$}) ||
      !!url.match(%r{^http://linkedin\.com/(company/[\w_-]+|profile/view\?id=\d+)$}) ||
      !!url.match(%r{^http://linkedin\.com/(groups\?gid=[0-9]+)$}) ||
      !!url.match(%r{^http://linkedin\.com/(groups/[\w_-]+)$})
  when :google
    !!url.match(%r{^http://plus\.google\.com/(\+[\w_-]+|\d+)$}) ||
      !!url.match(%r{^http://plus\.google\.com/communities/\d+$})
  when :slideshare
    !!url.match(%r{^http://slideshare\.net/[\w_-]+$})
  when :flickr
    !!url.match(%r{^http://flickr\.com/[\w_\@-]+$}) ||
      !!url.match(%r{^http://flickr\.com/groups/[\w_\@\.-]+$})
  when :pinterest
    !!url.match(%r{^http://pinterest\.com/[\w_-]+$})
  when :yelp
    !!url.match(%r{^http://yelp\.com/[\w_-]+$})
  when :vimeo
    # A single path segment counts unless the URL ends in "/<digits>".
    (!!url.match(%r{^http://vimeo\.com/[\w_-]+$}) && !url.match(%r{/\d+$})) ||
      !!url.match(%r{^http://vimeo\.com/groups/[\w_\.-]+$})
  when :instagram
    !!url.match(%r{^http://instagram\.com/[\w_]+$})
  when :tumblr
    # Must be a bare "<name>.tumblr.com" host with no "www." prefix.
    !!url.match(%r{^http://[\w_]+\.tumblr\.com$}) && !url.index("://www.")
  else
    true
  end
end

.linkedin_company_url?(url) ⇒ Boolean

Returns:

  • (Boolean)


110
111
112
113
114
# File 'lib/url_scrubber.rb', line 110

# Tells whether the scrubbed form of +url+ points at a LinkedIn company page.
#
# @param url [String, nil] raw URL
# @return [Boolean] true when the scrubbed URL contains
#   "http://linkedin.com/company/"; false otherwise, including when scrubbing
#   yields nothing
def self.linkedin_company_url?(url)
  scrubbed = scrub(url)
  scrubbed ? scrubbed.include?('http://linkedin.com/company/') : false
end

.linkedin_personal_url?(url) ⇒ Boolean

Returns:

  • (Boolean)


117
118
119
120
121
# File 'lib/url_scrubber.rb', line 117

# Tells whether the scrubbed form of +url+ points at a personal LinkedIn
# profile, in either the "/in/" or the "/pub/" form.
#
# @param url [String, nil] raw URL
# @return [Boolean] false when scrubbing yields nothing or neither profile
#   prefix occurs in the scrubbed URL
def self.linkedin_personal_url?(url)
  scrubbed = scrub(url)
  return false if !scrubbed

  profile_prefixes = ['http://linkedin.com/in/', 'http://linkedin.com/pub/']
  profile_prefixes.any? { |prefix| scrubbed.include?(prefix) }
end

.maps_to_public_url(url) ⇒ Object



163
164
165
166
167
168
169
170
171
172
173
174
175
# File 'lib/url_scrubber.rb', line 163

# Maps a "business." management URL to its public counterpart.
#
#   business.facebook.com/... -> http://facebook.com/...
#   business.google.com/...   -> http://plus.google.com/...
#
# The host is taken from the original +url+ (percent-escaped, then parsed);
# the rewrite itself is applied to the scrubbed URL.
#
# @param url [String, nil] raw URL
# @return [String, nil] the public URL, or nil when +url+ cannot be scrubbed,
#   has no host, or is not a recognised business URL
# @raise [URI::InvalidURIError] when the escaped URL still cannot be parsed
def self.maps_to_public_url(url)
  scrubbed = scrub(url)
  # Nothing to rewrite when scrubbing yields nil or an empty string.
  return nil if scrubbed.nil? || scrubbed.empty?

  # URI.escape was removed in Ruby 3.0. RFC2396_Parser#escape is the same
  # implementation it delegated to.
  parsed = URI.parse(URI::RFC2396_Parser.new.escape(url))
  return nil unless parsed.host

  host = Domainatrix.parse(parsed.host)
  return nil unless host.subdomain == "business"

  case host.domain
  when "facebook"
    scrubbed.sub("http://business.facebook.com", "http://facebook.com")
  when "google"
    scrubbed.sub("http://business.google.com", "http://plus.google.com")
  end
end

.scrub(url) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/url_scrubber.rb', line 9

# Normalises a URL into the canonical form used by the rest of this module.
#
# Blank input, "app://" URLs and TikTok business-manager URLs are returned
# untouched. Otherwise the first http(s)-looking URL found in the string is
# extracted, its scheme is forced to "http", trailing noise is stripped, and
# the result is passed through several helper methods defined elsewhere.
#
# @param url [String, nil] raw URL, possibly embedded in surrounding text
# @return [String, nil] the scrubbed URL; the input itself for the
#   pass-through cases above; nil when no URL can be found in the input
def self.scrub(url)
  # Blank input is handed back as-is (nil stays nil, "" stays "").
  return url if url.blank?
  return url if /^app:\/\//.match(url)  # Do not scrub app-only URLs
  return url if /^https?:\/\/(www.)?business.tiktok\.com\/manage\//.match(url) # Don't scrub tik tok business manager urls, quick fix until we can implement a different solution, https://business.tiktok.com/manage/overview?org_id=6974497704617492482

  url = url.clone # don't modify the original argument

  # Pull out the first run of non-whitespace that starts with an http-like
  # scheme. The pattern also tolerates the misspellings "htp" and "htps".
  m = url.match(/(htt?ps?:\/\/\S+)/i)
  return nil unless m

  url = m[1]
  # Force the scheme to plain http.
  url.sub!(/^https/i, 'http')
  # Repair the "htp" misspelling.
  # NOTE(review): "htps://" becomes "https://" here, after the https -> http
  # rewrite above has already run, so it is not downgraded -- confirm intent.
  url.sub!(/^htp/i, 'http')
  # Strip trailing slashes, then trailing semicolons.
  url.sub!(/\/+$/, '')
  url.sub!(/;+$/, '')
  # Remove the first "#!/" marker and decode the first "%27" to an apostrophe
  # (String#sub! only replaces the first occurrence).
  url.sub!('#!/', '')
  url.sub!('%27', '\'')
  # The helpers below are defined elsewhere; judging by their names they
  # lower-case the host, strip a subdomain and strip HTML tags -- confirm
  # against their definitions.
  url = downcase_domain(url)
  remove_subdomain!(url)
  remove_html_tags!(url)
  # CHANGED we depend on the special case methods to decide if and when to drop the query string part of the URL
  url = drop_anchor!(special_cases(url))
  url.sub!(/,+$/, "")    # remove one or more trailing commas at the end of the URL
  url.gsub!(/\/+$/, '') # remove any trailing slashes (/) in the resulting URL
  return url
end

.service_of(url) ⇒ Object



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/url_scrubber.rb', line 37

# Identifies which social service a URL belongs to.
#
# The URL is parsed with Domainatrix. The registered domain is looked up in
# a fixed table first; failing that, a host ending in "plus.google.com" maps
# to :google. Anything else, including a URL with no host, is :other.
#
# @param url [String] URL to classify
# @return [Symbol] one of :facebook, :flickr, :instagram, :linkedin,
#   :pinterest, :slideshare, :tumblr, :twitter, :vimeo, :vkontakte, :weibo,
#   :yelp, :youtube, :google or :other
def self.service_of(url)
  parts = Domainatrix.parse(url)

  unless parts.host.present?
    Rails.logger.debug "No Domain Match"
    return :other
  end

  services_by_domain = {
    'facebook'   => :facebook,
    'fb'         => :facebook,
    'flickr'     => :flickr,
    'instagram'  => :instagram,
    'linkedin'   => :linkedin,
    'pinterest'  => :pinterest,
    'slideshare' => :slideshare,
    'tumblr'     => :tumblr,
    'twitter'    => :twitter,
    'vimeo'      => :vimeo,
    'vk'         => :vkontakte,
    'weibo'      => :weibo,
    'yelp'       => :yelp,
    'youtube'    => :youtube
  }

  matched = services_by_domain[parts.domain]
  return matched if matched

  # Google is recognised by full host rather than by registered domain.
  return :google if parts.host =~ /\bplus\.google\.com$/

  :other
end

.valid_url?(url) ⇒ Boolean

Requirements:

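  1. must have an http or https scheme

  2. must not contain “@” anywhere in the passed-in URL string

  3. must be a valid URI as determined by URI.parse (the implementation shown here uses Ruby's standard URI library, not Addressable::URI)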

Returns:

  • (Boolean)


154
155
156
157
158
159
160
# File 'lib/url_scrubber.rb', line 154

# Checks that +url+ is an acceptable web URL.
#
# A URL is accepted when it is a String, contains no "@" anywhere, parses
# with the standard URI library after percent-escaping, and has an http or
# https scheme.
#
# @param url [String, nil] candidate URL
# @return [Boolean] true when every requirement holds; false otherwise,
#   including for nil or non-String input and unparseable URLs
def self.valid_url?(url)
  # nil and other non-String input used to raise inside the escape call.
  return false unless url.is_a?(String)
  return false if url.include?("@")

  # URI.escape was removed in Ruby 3.0. RFC2396_Parser#escape is the same
  # implementation it delegated to.
  parsed = URI.parse(URI::RFC2396_Parser.new.escape(url))
  %w(http https).include?(parsed.scheme)
rescue URI::InvalidURIError
  false
end