Class: UrlNormalizer

Inherits:
Object
  • Object
show all
Defined in:
lib/url_normalizer.rb,
lib/url_normalizer/proxy.rb,
lib/url_normalizer/version.rb,
lib/url_normalizer/linkedin.rb,
lib/url_normalizer/newyorker.rb,
lib/url_normalizer/new_york_times.rb,
lib/url_normalizer/think_progress.rb

Direct Known Subclasses

LinkedIn, NewYorkTimes, Newyorker, Proxy, ThinkProgress

Defined Under Namespace

Classes: LinkedIn, NewYorkTimes, Newyorker, Proxy, ThinkProgress

Constant Summary collapse

VERSION =
"0.0.2"
@@normalizer_for =
Hash.new(UrlNormalizer)

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(uri) ⇒ UrlNormalizer

Returns a new instance of UrlNormalizer.



20
21
22
# File 'lib/url_normalizer.rb', line 20

# Stores the URI object that the instance-level #normalize will operate on.
#
# @param uri the parsed URI to keep in @uri
def initialize(uri)
  @uri = uri
end

Class Method Details

.normalize(url) ⇒ Object



12
13
14
15
16
17
18
# File 'lib/url_normalizer.rb', line 12

# Normalizes +url+ with the normalizer class registered for its host.
#
# Removes the trailing "#..." fragment (unless it begins with "#!"), parses
# the remainder with Addressable::URI, looks up the class stored in
# @@normalizer_for under the parsed host, and returns the result of that
# class's instance-level #normalize.
#
# The fragment is stripped with the non-destructive String#sub, so the
# caller's string is never modified and frozen strings are accepted.
#
# @param url [String] the URL to normalize
# @return [Object] whatever the selected normalizer's #normalize returns
def self.normalize(url)
  # Work on a copy: String#sub! would mutate the argument in place.
  without_fragment = url.sub(/#(?!\!)[^#]*$/, '')

  uri = Addressable::URI.parse(without_fragment)

  @@normalizer_for[uri.host].new(uri).normalize
end

.normalize_for(domain) ⇒ Object



8
9
10
# File 'lib/url_normalizer.rb', line 8

# Registers the receiving class in @@normalizer_for under +domain+.
#
# @param domain the key to register under
# @return the receiving class
def self.normalize_for(domain)
  @@normalizer_for.store(domain, self)
end

Instance Method Details

#build_query(params) ⇒ Object



64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/url_normalizer.rb', line 64

# Serializes parameter names and their value lists into a query string.
#
# Each name and value is passed through #encode_component. A name with an
# empty value list is emitted bare (no "="); otherwise one "name=value"
# pair is emitted per value. Pairs are joined with "&".
#
# @param params enumerable of (name, values) pairs
# @return [String] the assembled query string
def build_query(params)
  pairs = params.flat_map do |name, values|
    key = encode_component(name)
    next [key.to_s] if values.empty?

    values.map { |value| "#{key}=#{encode_component(value)}" }
  end

  pairs.join("&")
end

#clean_query(query) ⇒ Object



42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/url_normalizer.rb', line 42

def clean_query query
  return unless query

  uri_params = CGI.parse(query)

  if forbidden_uri_params
    forbidden_params = forbidden_uri_params.map(&:to_s)
    uri_params.reject! {|k,v| forbidden_params.include? k}
  end

  if whitelisted_uri_params
    allowed_params = whitelisted_uri_params.map(&:to_s)
    uri_params.select! {|k,v| allowed_params.include? k}
  end

  build_query(uri_params)
end

#encode_component(component) ⇒ Object



60
61
62
# File 'lib/url_normalizer.rb', line 60

# Encodes a single query component by delegating to
# Addressable::URI.encode_component.
#
# @param component the name or value to encode
# @return the result of Addressable::URI.encode_component
def encode_component(component)
  Addressable::URI.encode_component(component)
end

#forbidden_uri_params ⇒ Object



34
35
36
# File 'lib/url_normalizer.rb', line 34

# Names of the query parameters that #clean_query removes.
#
# @return [Array<Symbol>] the forbidden parameter names
def forbidden_uri_params
  %i[utm_source utm_content utm_medium utm_campaign]
end

#normalize ⇒ Object



24
25
26
27
28
29
30
31
32
# File 'lib/url_normalizer.rb', line 24

# Normalizes the stored URI and returns it as a string.
#
# Replaces the URI's query with the output of #clean_query, calls
# normalize! on the URI, and strips a trailing "?" from the string form.
# The object held in @uri is modified in place.
#
# @return [String] the normalized URL
def normalize
  @uri.query = clean_query(@uri.query)
  @uri.normalize!

  @uri.to_s.sub(/\?$/, '')
end

#whitelisted_uri_params ⇒ Object



38
39
40
# File 'lib/url_normalizer.rb', line 38

# Names of the only query parameters #clean_query should keep.
#
# @return [nil] nil here, which makes #clean_query skip whitelist filtering
def whitelisted_uri_params
  # Intentionally empty: an empty method body evaluates to nil.
end