Module: UrlScrubber
- Defined in:
- lib/url_scrubber.rb,
lib/url_scrubber/version.rb
Constant Summary collapse
- VERSION =
"0.8.23"
Class Method Summary collapse
- .find_identity_from_url(url) ⇒ Object
- .find_linkedin_identity_from_url(url) ⇒ Object
- .ideal_form?(url) ⇒ Boolean
- .linkedin_company_url?(url) ⇒ Boolean
- .linkedin_personal_url?(url) ⇒ Boolean
- .maps_to_public_url(url) ⇒ Object
- .scrub(url) ⇒ Object
- .service_of(url) ⇒ Object
-
.valid_url?(url) ⇒ Boolean
Requirements: must have an http/https scheme, contain no “@”, and parse as a valid URI.
Class Method Details
.find_identity_from_url(url) ⇒ Object
124 125 126 127 128 |
# File 'lib/url_scrubber.rb', line 124
# Returns the last "/"-separated segment of the scrubbed form of +url+.
#
# @param url [String, nil] raw URL text
# @return [String, nil] the final segment of the scrubbed URL, or nil when
#   +url+ is not present or scrubbing yields nothing
def self.find_identity_from_url(url)
  return nil unless url.present?

  scrubbed = UrlScrubber.scrub(url)
  return nil unless scrubbed

  scrubbed.split("/").last
end
.find_linkedin_identity_from_url(url) ⇒ Object
131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
# File 'lib/url_scrubber.rb', line 131
# Extracts the LinkedIn identifier (company, profile or group) from +url+.
#
# The URL is scrubbed first. Depending on which LinkedIn path the scrubbed
# URL contains, the identifier is either the last "/"-separated segment
# (company, /in/, /groups/), the first segment after "/pub/", or the text
# following "groups?gid=". The last two are passed through
# drop_url_ampersand! (defined elsewhere in this module).
#
# @param url [String, nil] raw URL text
# @return [String, nil] the identifier, or nil when +url+ is nil, cannot be
#   scrubbed, or is not one of the recognised LinkedIn URL shapes
def self.find_linkedin_identity_from_url(url)
  return nil if url.nil?

  scrubbed_url = scrub(url)
  # scrub returns nil when it finds no URL; bail out once here so that no
  # branch below has to repeat the nil check (the group branches used to
  # raise NoMethodError in that case).
  return nil unless scrubbed_url

  if linkedin_company_url?(scrubbed_url) ||
     scrubbed_url.include?('http://linkedin.com/in/')
    scrubbed_url.split("/").last
  elsif scrubbed_url.include?('http://linkedin.com/pub/')
    # String#partition always yields a String tail, possibly empty.
    pub_tail = scrubbed_url.partition('linkedin.com/pub/')[2]
    pub_tail.empty? ? nil : drop_url_ampersand!(pub_tail.split('/').first)
  elsif scrubbed_url.include?('linkedin.com/groups/')
    scrubbed_url.split("/").last
  elsif scrubbed_url.include?('linkedin.com/groups?gid=')
    drop_url_ampersand!(scrubbed_url.partition('linkedin.com/groups?gid=')[2])
  end
end
.ideal_form?(url) ⇒ Boolean
70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
# File 'lib/url_scrubber.rb', line 70
# Reports whether +url+, once scrubbed, already has the canonical shape
# expected for the service it belongs to (as reported by service_of).
#
# URLs whose service has no pattern below are always considered ideal.
#
# @param url [String, nil] raw URL text
# @return [Boolean] false when scrubbing yields nothing or the scrubbed URL
#   matches none of its service's patterns; true otherwise
def self.ideal_form?(url)
  url = scrub(url)
  return false unless url

  case service_of(url)
  when :vkontakte
    !!url.match(%r{^http://vk\.com/[\w_]+$})
  when :weibo
    !!url.match(%r{^http://weibo\.com/[\w_-]+$})
  when :youtube
    !!url.match(%r{^http://youtube\.com/[\w_-]+$})
  when :twitter
    !!url.match(%r{^http://twitter\.com/[\w_]+$})
  when :facebook
    # The "?" must be escaped: unescaped it made the preceding "p" optional,
    # so real profile.php?id=NNN URLs were never recognised.
    !!url.match(%r{^http://facebook\.com/(profile\.php\?id=\d+|[\w_\.-]+)$}) ||
      !!url.match(%r{^http://facebook\.com/groups/[\w_\.-]+$})
  when :linkedin
    !!url.match(%r{^http://linkedin\.com/pub/[\w-]+/[\w]+/[\w]+/[\w]+$}) ||
      !!url.match(%r{^http://linkedin\.com/in/[\w_-]+$}) ||
      !!url.match(%r{^http://linkedin\.com/(company/[\w_-]+|profile/view\?id=\d+)$}) ||
      !!url.match(%r{^http://linkedin\.com/(groups\?gid=[0-9]+)$}) ||
      !!url.match(%r{^http://linkedin\.com/(groups/[\w_-]+)$})
  when :google
    !!url.match(%r{^http://plus\.google\.com/(\+[\w_-]+|\d+)$}) ||
      !!url.match(%r{^http://plus\.google\.com/communities/\d+$})
  when :slideshare
    !!url.match(%r{^http://slideshare\.net/[\w_-]+$})
  when :flickr
    !!url.match(%r{^http://flickr\.com/[\w_\@-]+$}) ||
      !!url.match(%r{^http://flickr\.com/groups/[\w_\@\.-]+$})
  when :pinterest
    !!url.match(%r{^http://pinterest\.com/[\w_-]+$})
  when :yelp
    !!url.match(%r{^http://yelp\.com/[\w_-]+$})
  when :vimeo
    # A single all-digit path segment is rejected by the first alternative.
    (!!url.match(%r{^http://vimeo\.com/[\w_-]+$}) && !url.match(%r{/\d+$})) ||
      !!url.match(%r{^http://vimeo\.com/groups/[\w_\.-]+$})
  when :instagram
    !!url.match(%r{^http://instagram\.com/[\w_]+$})
  when :tumblr
    # Must be a bare <name>.tumblr.com host and must not contain "://www.".
    !!url.match(%r{^http://[\w_]+\.tumblr\.com$}) && !url.index("://www.")
  else
    true
  end
end
.linkedin_company_url?(url) ⇒ Boolean
110 111 112 113 114 |
# File 'lib/url_scrubber.rb', line 110
# Tells whether the scrubbed form of +url+ contains the LinkedIn company
# path prefix.
#
# @param url [String, nil] raw URL text
# @return [Boolean] false when scrubbing yields nothing
def self.linkedin_company_url?(url)
  scrubbed = scrub(url)
  scrubbed ? scrubbed.include?('http://linkedin.com/company/') : false
end
.linkedin_personal_url?(url) ⇒ Boolean
117 118 119 120 121 |
# File 'lib/url_scrubber.rb', line 117
# Tells whether the scrubbed form of +url+ contains one of the LinkedIn
# personal-profile path prefixes (/in/ or /pub/).
#
# @param url [String, nil] raw URL text
# @return [Boolean] false when scrubbing yields nothing
def self.linkedin_personal_url?(url)
  scrubbed = scrub(url)
  return false unless scrubbed

  ['http://linkedin.com/in/', 'http://linkedin.com/pub/'].any? do |prefix|
    scrubbed.include?(prefix)
  end
end
.maps_to_public_url(url) ⇒ Object
163 164 165 166 167 168 169 170 171 172 173 174 175 |
# File 'lib/url_scrubber.rb', line 163
# Maps a "business." Facebook or Google URL to its public counterpart.
#
# The host of the raw +url+ decides whether a mapping applies; the
# replacement itself is performed on the scrubbed URL.
#
# @param url [String, nil] raw URL text
# @return [String, nil] the scrubbed URL with the business host swapped for
#   the public one, or nil when +url+ cannot be scrubbed or is not a
#   business.facebook.com / business.google.com URL
# @raise [URI::InvalidURIError] when the escaped URL still cannot be parsed
def self.maps_to_public_url(url)
  scrubbed = scrub(url)
  # Nothing to rewrite; this also covers nil input, which used to raise
  # NoMethodError further down.
  return nil unless scrubbed

  # URI.escape was removed in Ruby 3.0; the default parser still provides the
  # same percent-escaping.
  parsed = URI.parse(URI::DEFAULT_PARSER.escape(url))
  host = Domainatrix.parse(parsed.host)
  return nil unless host.subdomain == "business"

  case host.domain
  when "facebook"
    scrubbed.sub("http://business.facebook.com", "http://facebook.com")
  when "google"
    scrubbed.sub("http://business.google.com", "http://plus.google.com")
  end
end
.scrub(url) ⇒ Object
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/url_scrubber.rb', line 9
# Extracts the first http(s)-looking URL from +url+ and normalizes it.
#
# Blank input, app:// URLs and TikTok business-manager URLs are returned
# untouched. Otherwise the scheme is forced to "http", trailing slashes,
# semicolons and commas are stripped, the first "#!/" is removed and the
# first "%27" becomes an apostrophe, and the result is passed through
# downcase_domain, remove_subdomain!, special_cases and drop_anchor!
# (all defined elsewhere in this module).
#
# @param url [String, nil] raw text that may contain a URL
# @return [String, nil] the scrubbed URL, the input itself for the
#   pass-through cases above, or nil when no URL can be found
def self.scrub(url)
  return url if url.blank?
  return url if /^app:\/\//.match(url) # Do not scrub app-only URLs
  # Don't scrub TikTok business manager URLs (quick fix until a different
  # solution is implemented), e.g.
  # https://business.tiktok.com/manage/overview?org_id=6974497704617492482
  # The dots are escaped so that look-alike hosts are not exempted too.
  return url if /^https?:\/\/(www\.)?business\.tiktok\.com\/manage\//.match(url)

  m = url.match(/(htt?ps?:\/\/\S+)/i)
  return nil unless m

  # The capture is a fresh String, so the in-place edits below never touch
  # the caller's argument.
  url = m[1]

  # Repair a misspelled "htp" first, so that "htps" becomes "https" and is
  # then downgraded to "http" like every other https URL.
  url.sub!(/^htp/i, 'http')
  url.sub!(/^https/i, 'http')
  url.sub!(/\/+$/, '')
  url.sub!(/;+$/, '')
  url.sub!('#!/', '')
  url.sub!('%27', '\'')

  url = downcase_domain(url)
  remove_subdomain!(url)
  # The query string is deliberately left alone here: the special case
  # methods decide if and when to drop it.
  url = drop_anchor!(special_cases(url))

  url.sub!(/,+$/, "")   # remove one or more trailing commas at the end of the URL
  url.gsub!(/\/+$/, '') # remove any trailing slashes (/) in the resulting URL
  url
end
.service_of(url) ⇒ Object
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
# File 'lib/url_scrubber.rb', line 37
# Identifies which known service +url+ belongs to.
#
# The URL is parsed with Domainatrix. The registered domain is looked up
# first, then the full host is checked for plus.google.com. When the parsed
# host is not present, "No Domain Match" is logged at debug level.
#
# @param url [String] URL to classify
# @return [Symbol] the service name, or :other when nothing matches
def self.service_of(url)
  parts = Domainatrix.parse(url)

  unless parts.host.present?
    Rails.logger.debug "No Domain Match"
    return :other
  end

  services_by_domain = {
    'facebook'   => :facebook,
    'fb'         => :facebook,
    'flickr'     => :flickr,
    'instagram'  => :instagram,
    'linkedin'   => :linkedin,
    'pinterest'  => :pinterest,
    'slideshare' => :slideshare,
    'tumblr'     => :tumblr,
    'twitter'    => :twitter,
    'vimeo'      => :vimeo,
    'vk'         => :vkontakte,
    'weibo'      => :weibo,
    'yelp'       => :yelp,
    'youtube'    => :youtube
  }

  service = services_by_domain[parts.domain]
  return service if service
  return :google if parts.host =~ /\bplus\.google\.com$/

  :other
end
.valid_url?(url) ⇒ Boolean
Requirements:
-
must have http/https scheme
-
no “@” anywhere in the passed-in URL string
-
valid URI as determined by URI.parse (Ruby standard library)
154 155 156 157 158 159 160 |
# File 'lib/url_scrubber.rb', line 154
# Checks that +url+ is an acceptable web URL.
#
# Requirements:
# - must have an http or https scheme
# - must not contain "@" anywhere in the string
# - must parse with URI.parse after percent-escaping
#
# @param url [String, nil] candidate URL
# @return [Boolean] true only when every requirement holds; false for nil or
#   any other non-String input
def self.valid_url?(url)
  return false unless url.is_a?(String)

  # URI.escape was removed in Ruby 3.0; the default parser still provides the
  # same percent-escaping.
  parsed = URI.parse(URI::DEFAULT_PARSER.escape(url))
  %w[http https].include?(parsed.scheme) && !url.include?("@")
rescue URI::InvalidURIError
  false
end