Module: PostRank::URI

Defined in:
lib/postrank-uri.rb,
lib/postrank-uri/version.rb

Constant Summary

C14N =
{}
URIREGEX =
{}
VERSION =
"1.1"

Class Method Summary

Class Method Details

.c14n(uri, opts = {}) ⇒ Object



163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# File 'lib/postrank-uri.rb', line 163

# Canonicalizes a URI.
#
# The input is parsed, passed through `embedded`, and then:
#   * query parameters named in C14N[:global] are dropped;
#   * query parameters named in a C14N[:hosts] entry whose pattern matches
#     the host are dropped;
#   * on twitter.com / mobile.twitter.com a "#!..." fragment is moved into
#     the path;
#   * on tumblr.com hosts, a post path has its trailing segment removed.
#
# Returns the (mutated) URI object produced by parse/embedded.
def c14n(uri, opts = {})
  u = embedded(parse(uri, opts))

  params = u.query_values(Array)
  if params
    params.delete_if { |key, _value| C14N[:global].include?(key) }
    params.delete_if do |key, _value|
      C14N[:hosts].find { |pattern, names| u.host =~ pattern && names.include?(key) }
    end
  end
  # Assigned even when nil so the query is rewritten from the filtered list.
  u.query_values = params

  on_twitter = u.host =~ /^(mobile\.)?twitter\.com$/
  hashbang = on_twitter && u.fragment && u.fragment.match(/^!(.*)/)
  if hashbang
    u.fragment = nil
    u.path = hashbang[1]
  end

  on_tumblr_post = u.host =~ /tumblr\.com$/ && u.path =~ /\/post\/\d+\//
  u.path = u.path.gsub(/[^\/]+$/, '') if on_tumblr_post

  u
end

.clean(uri, opts = {}) ⇒ Object



145
146
147
148
# File 'lib/postrank-uri.rb', line 145

# Runs the full cleaning pipeline on a URI: unescape, then c14n, then
# normalize.
#
# Returns the normalized object itself when opts[:raw] is truthy, otherwise
# its string form. The same opts hash is forwarded to c14n.
def clean(uri, opts = {})
  canonical = c14n(unescape(uri), opts)
  cleaned = normalize(canonical)
  return cleaned if opts[:raw]

  cleaned.to_s
end

.embedded(uri) ⇒ Object



185
186
187
188
189
190
191
192
193
194
195
196
# File 'lib/postrank-uri.rb', line 185

# Unwraps a URI that is carried as a query parameter of a known redirect or
# share link:
#   * news.google.com/news/url and xfruits.com carry it in "url";
#   * myspace.com "PostTo" paths carry it in "u".
#
# When such a parameter is present, the embedded URI is cleaned (raw form)
# and returned; otherwise the original uri is returned unchanged.
#
# query_values is nil when the URI has no query string, so it is defaulted
# to an empty hash before lookup instead of raising NoMethodError.
def embedded(uri)
  target =
    if (uri.host == 'news.google.com' && uri.path == '/news/url') ||
       uri.host == 'xfruits.com'
      (uri.query_values || {})['url']
    elsif uri.host =~ /myspace\.com/ && uri.path =~ /PostTo/
      (uri.query_values || {})['u']
    end

  target ? clean(target, :raw => true) : uri
end

.escape(uri) ⇒ Object



127
128
129
130
131
# File 'lib/postrank-uri.rb', line 127

# Percent-encodes every run of characters captured by URIREGEX[:escape],
# then encodes any remaining spaces as %20.
#
# Each *byte* of a captured run becomes one upper-case %XX triplet. The
# unpack template is sized by bytesize rather than character count so that
# multibyte (e.g. UTF-8) characters are encoded in full.
def escape(uri)
  uri.gsub(URIREGEX[:escape]) do
    chunk = Regexp.last_match(1)
    '%' + chunk.unpack('H2' * chunk.bytesize).join('%').upcase
  end.gsub(' ', '%20')
end

.extract(text) ⇒ Object



97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/postrank-uri.rb', line 97

# Pulls URLs out of free text.
#
# Scans text for URIREGEX[:valid_url] matches, keeps only those whose
# domain capture PublicSuffix accepts, and returns each kept URL in cleaned
# string form. A nil/false text yields an empty array.
def extract(text)
  return [] unless text

  found = []
  text.to_s.scan(URIREGEX[:valid_url]) do |_all, _before, candidate, _protocol, domain, _path, _query|
    # Skip matches whose domain is not a recognized public suffix.
    next unless PublicSuffix.valid?(domain, default_rule: nil)

    found << clean(candidate).to_s
  end

  found.compact
end

.extract_href(text, host = nil) ⇒ Object



111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# File 'lib/postrank-uri.rb', line 111

# Collects [url, link text] pairs for every <a> element in an HTML string.
#
# Each href is cleaned in raw form with opts[:host] set to host; only
# absolute results are kept. Any error raised while handling an anchor
# causes that anchor to be skipped (deliberate best-effort).
def extract_href(text, host = nil)
  Nokogiri.HTML(text).search('a').each_with_object([]) do |anchor, links|
    begin
      link = clean(anchor.attr('href'), :raw => true, :host => host)
      links.push [link.to_s, anchor.text] if link.absolute?
    rescue
      next
    end
  end
end

.hash(uri, opts = {}) ⇒ Object



150
151
152
# File 'lib/postrank-uri.rb', line 150

# Returns the MD5 hex digest of a URI string.
#
# The URI is run through `clean` first only when opts[:clean] is exactly
# true; any other value digests the input as given.
def hash(uri, opts = {})
  subject = uri
  subject = clean(uri) if opts[:clean] == true
  Digest::MD5.hexdigest(subject)
end

.normalize(uri, opts = {}) ⇒ Object



154
155
156
157
158
159
160
161
# File 'lib/postrank-uri.rb', line 154

# Normalizes a parsed URI in place and returns it:
#   * matches of URIREGEX[:double_slash_outside_scheme] in the path become
#     a single "/";
#   * one trailing "/" is removed unless the path is a single character;
#   * an empty query string is cleared to nil;
#   * the fragment is always dropped.
def normalize(uri, opts = {})
  parsed = parse(uri, opts)

  parsed.path = parsed.path.gsub(URIREGEX[:double_slash_outside_scheme], '/')
  # Re-read the path after assignment, in case the setter rewrites it.
  parsed.path = parsed.path.chomp('/') unless parsed.path.size == 1

  blank_query = parsed.query && parsed.query.empty?
  parsed.query = nil if blank_query

  parsed.fragment = nil
  parsed
end

.parse(uri, opts = {}) ⇒ Object



198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# File 'lib/postrank-uri.rb', line 198

# Parses a string into a normalized Addressable::URI, repairing inputs that
# lack a scheme and/or host. An Addressable::URI argument is returned as-is.
#
# For a parsed URI with no host (and a scheme other than javascript, mailto
# or xmpp):
#   * if a scheme was detected, the input is re-parsed with "http://"
#     prepended (e.g. "host:port/..." is misread as scheme:path);
#   * else if opts[:host] is given, it becomes the host;
#   * else the first path segment is promoted to host when it matches
#     URIREGEX[:valid_domain].
# A URI that ends up with a host but no scheme defaults to http.
#
# The scheme whitelist is anchored as a whole, so only the exact schemes
# javascript, mailto and xmpp are exempt. Without the group, the anchors
# would bind to the first and last alternatives only.
def parse(uri, opts = {})
  return uri if uri.is_a? Addressable::URI

  uri = Addressable::URI.parse(uri)

  if !uri.host && uri.scheme !~ /\A(?:javascript|mailto|xmpp)\z/
    if uri.scheme
      # With no host and scheme yes, the parser exploded
      return parse("http://#{uri}", opts)
    end

    if opts[:host]
      uri.host = opts[:host]
    else
      parts = uri.path.to_s.split(/[\/:]/)
      if parts.first =~ URIREGEX[:valid_domain]
        host = parts.shift
        uri.path = '/' + parts.join('/')
        uri.host = host
      end
    end
  end

  uri.scheme = 'http' if uri.host && !uri.scheme
  uri.normalize!
end

.unescape(uri) ⇒ Object



133
134
135
136
137
138
139
140
141
142
143
# File 'lib/postrank-uri.rb', line 133

# Decodes percent-escapes in a URI and returns the result as a string.
#
# "+" in the query is first turned into a space. Then every match of
# URIREGEX[:unescape] is replaced by its decoded bytes, except matches that
# also match URIREGEX[:reserved_characters], which are left encoded.
def unescape(uri)
  parsed = parse(uri)
  parsed.query = parsed.query.tr('+', ' ') if parsed.query

  parsed.to_s.gsub(URIREGEX[:unescape]) do |sequence|
    # Leave reserved characters escaped.
    next sequence if sequence.match(URIREGEX[:reserved_characters])

    [sequence.delete('%')].pack('H*')
  end
end

.valid?(uri) ⇒ Boolean

Returns:

  • (Boolean)


225
226
227
228
229
230
231
232
233
234
235
236
237
238
# File 'lib/postrank-uri.rb', line 225

# Reports whether a URI is usable: it must be non-nil, clean to a URI that
# has a host, and that host (converted to Unicode via IDNA) must be accepted
# by PublicSuffix.
#
# Returns false for nil input or a host-less URI; otherwise returns the
# result of PublicSuffix.valid?.
def valid?(uri)
  return false if uri.nil?

  host = clean(uri, :raw => true).host
  return false unless host

  PublicSuffix.valid?(Addressable::IDNA.to_unicode(host), default_rule: nil)
end