Module: PostRank::URI
- Defined in:
- lib/postrank-uri.rb,
lib/postrank-uri/version.rb
Constant Summary collapse
- C14N =
{}
- URIREGEX =
{}
- VERSION =
"1.1"
Class Method Summary collapse
- .c14n(uri, opts = {}) ⇒ Object
- .clean(uri, opts = {}) ⇒ Object
- .embedded(uri) ⇒ Object
- .escape(uri) ⇒ Object
- .extract(text) ⇒ Object
- .extract_href(text, host = nil) ⇒ Object
- .hash(uri, opts = {}) ⇒ Object
- .normalize(uri, opts = {}) ⇒ Object
- .parse(uri, opts = {}) ⇒ Object
- .unescape(uri) ⇒ Object
- .valid?(uri) ⇒ Boolean
Class Method Details
.c14n(uri, opts = {}) ⇒ Object
163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
# File 'lib/postrank-uri.rb', line 163
#
# Canonicalizes a URI: unwraps an embedded target URL, drops the query
# parameters listed in C14N, and rewrites two host-specific URL shapes.
#
# @param uri [String, Addressable::URI] the URI to canonicalize
# @param opts [Hash] options forwarded to parse
# @return [Object] the canonicalized URI object (whatever parse/embedded yield)
def c14n(uri, opts = {})
  u = parse(uri, opts)
  # Wrapper links are replaced by the URL they carry.
  u = embedded(u)

  if q = u.query_values(Array)
    # Parameters stripped for every host.
    q.delete_if { |k, _v| C14N[:global].include?(k) }
    # Parameters stripped only for hosts matching the rule's pattern.
    q.delete_if do |k, _v|
      C14N[:hosts].find { |pattern, params| u.host =~ pattern && params.include?(k) }
    end
  end
  u.query_values = q

  # A fragment starting with "!" on twitter.com / mobile.twitter.com is moved
  # into the path and the fragment is cleared.
  if u.host =~ /^(mobile\.)?twitter\.com$/ && u.fragment && (hashbang = u.fragment.match(/^!(.*)/))
    u.fragment = nil
    u.path = hashbang[1]
  end

  # For tumblr.com paths containing /post/<digits>/, the trailing run of
  # non-slash characters is removed.
  if u.host =~ /tumblr\.com$/ && u.path =~ /\/post\/\d+\//
    u.path = u.path.gsub(/[^\/]+$/, '')
  end

  u
end
.clean(uri, opts = {}) ⇒ Object
145 146 147 148 |
# File 'lib/postrank-uri.rb', line 145
#
# Runs a URI through unescape, c14n and normalize, in that order.
#
# @param uri [String, Addressable::URI] the URI to clean
# @param opts [Hash] forwarded to c14n; a truthy :raw returns the URI object
#   instead of its string form
# @return [String, Object] the cleaned URI as a string, or the URI object
#   when opts[:raw] is truthy
def clean(uri, opts = {})
  decoded = unescape(uri)
  canonical = c14n(decoded, opts)
  cleaned = normalize(canonical)
  return cleaned if opts[:raw]

  cleaned.to_s
end
.embedded(uri) ⇒ Object
185 186 187 188 189 190 191 192 193 194 195 196 |
# File 'lib/postrank-uri.rb', line 185
#
# Unwraps a redirect/share URL into the target URL it carries in its query
# string. Recognized wrappers:
#   * news.google.com with path /news/url, and any xfruits.com URL -> "url"
#   * hosts matching myspace.com with a path matching PostTo        -> "u"
#
# @param uri [#host, #path, #query_values] a parsed URI object
# @return [Object] the cleaned target (clean(..., :raw => true)) when one is
#   found; otherwise the given uri unchanged
def embedded(uri)
  target =
    if (uri.host == 'news.google.com' && uri.path == '/news/url') || uri.host == 'xfruits.com'
      # query_values is nil when the URI has no query string.
      (uri.query_values || {})['url']
    elsif uri.host =~ /myspace\.com/ && uri.path =~ /PostTo/
      (uri.query_values || {})['u']
    end

  target ? clean(target, :raw => true) : uri
end
.escape(uri) ⇒ Object
127 128 129 130 131 |
# File 'lib/postrank-uri.rb', line 127
#
# Percent-encodes every run of characters captured by URIREGEX[:escape], then
# encodes any remaining spaces as %20.
#
# @param uri [String] the text to escape
# @return [String] the escaped text, with uppercase hex digits
def escape(uri)
  uri.gsub(URIREGEX[:escape]) do
    raw = Regexp.last_match(1)
    # One 'H2' directive per *byte*: counting characters instead would
    # truncate multibyte (e.g. UTF-8) characters to their first byte.
    '%' + raw.unpack('H2' * raw.bytesize).join('%').upcase
  end.gsub(' ', '%20')
end
.extract(text) ⇒ Object
97 98 99 100 101 102 103 104 105 106 107 108 109 |
# File 'lib/postrank-uri.rb', line 97
#
# Collects the URLs found in a piece of text, each one cleaned and returned
# as a string. A match is kept only when PublicSuffix accepts its domain.
#
# @param text [#to_s, nil] the text to scan; nil/false yields an empty list
# @return [Array<String>] cleaned URLs in order of appearance
def extract(text)
  return [] unless text

  matches = text.to_s.scan(URIREGEX[:valid_url])
  matches.each_with_object([]) do |(_all, _before, url, _protocol, domain, _path, _query), found|
    # Skip matches whose domain is not a recognized public suffix.
    next unless PublicSuffix.valid?(domain, default_rule: nil)

    found << clean(url).to_s
  end
end
.extract_href(text, host = nil) ⇒ Object
111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
# File 'lib/postrank-uri.rb', line 111
#
# Collects [url, link text] pairs for the <a> elements of an HTML document.
# Each href is cleaned (with :host passed through to clean); links whose
# cleaned URL is not absolute, or whose processing raises, are skipped.
#
# @param text [String] HTML to parse with Nokogiri
# @param host [String, nil] passed to clean as the :host option
# @return [Array<Array(String, String)>] pairs of cleaned URL and anchor text
def extract_href(text, host = nil)
  anchors = Nokogiri.HTML(text).search('a')

  anchors.each_with_object([]) do |anchor, links|
    begin
      href = clean(anchor.attr('href'), :raw => true, :host => host)
      links.push([href.to_s, anchor.text]) if href.absolute?
    rescue
      # Best effort: a link that cannot be processed is ignored.
      next
    end
  end
end
.hash(uri, opts = {}) ⇒ Object
150 151 152 |
# File 'lib/postrank-uri.rb', line 150
#
# MD5 hex digest of a URI.
#
# @param uri [String] the URI to hash
# @param opts [Hash] when opts[:clean] is exactly true, the URI is passed
#   through clean before hashing
# @return [String] the hexadecimal MD5 digest
def hash(uri, opts = {})
  payload = uri
  payload = clean(uri) if opts[:clean] == true
  Digest::MD5.hexdigest(payload)
end
.normalize(uri, opts = {}) ⇒ Object
154 155 156 157 158 159 160 161 |
# File 'lib/postrank-uri.rb', line 154
#
# Tidies a parsed URI: replaces matches of
# URIREGEX[:double_slash_outside_scheme] in the path with "/", strips one
# trailing slash unless the path is a single character, clears an empty
# query, and removes the fragment.
#
# @param uri [String, Addressable::URI] the URI to normalize
# @param opts [Hash] options forwarded to parse
# @return [Object] the URI object returned by parse, modified in place
def normalize(uri, opts = {})
  parse(uri, opts).tap do |parsed|
    parsed.path = parsed.path.gsub(URIREGEX[:double_slash_outside_scheme], '/')
    parsed.path = parsed.path.chomp('/') unless parsed.path.size == 1
    parsed.query = nil if parsed.query && parsed.query.empty?
    parsed.fragment = nil
  end
end
.parse(uri, opts = {}) ⇒ Object
198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 |
# File 'lib/postrank-uri.rb', line 198
#
# Parses a URI with Addressable, repairing inputs that lack a host or scheme.
#
# @param uri [String, Addressable::URI] the URI to parse; an existing
#   Addressable::URI is returned untouched
# @param opts [Hash] :host supplies a host for host-less input
# @return [Addressable::URI] the result of normalize! on the parsed URI
def parse(uri, opts = {})
  return uri if uri.is_a? Addressable::URI

  uri = Addressable::URI.parse(uri)

  # javascript:, mailto: and xmpp: URIs legitimately have no host. The
  # alternation is grouped so the anchors apply to every scheme name, not
  # just the first and last alternatives.
  if !uri.host && uri.scheme !~ /\A(?:javascript|mailto|xmpp)\z/
    if uri.scheme
      # With no host and scheme yes, the parser exploded
      return parse("http://#{uri}", opts)
    end

    if opts[:host]
      uri.host = opts[:host]
    else
      # Treat the first path segment as the host when it looks like a domain.
      parts = uri.path.to_s.split(/[\/:]/)
      if parts.first =~ URIREGEX[:valid_domain]
        host = parts.shift
        uri.path = '/' + parts.join('/')
        uri.host = host
      end
    end
  end

  uri.scheme = 'http' if uri.host && !uri.scheme
  uri.normalize!
end
.unescape(uri) ⇒ Object
133 134 135 136 137 138 139 140 141 142 143 |
# File 'lib/postrank-uri.rb', line 133
#
# Decodes percent-encoded sequences in a URI. "+" in the query is first
# turned into a space; sequences matching URIREGEX[:reserved_characters]
# are left encoded.
#
# @param uri [String, Addressable::URI] the URI to unescape
# @return [String] the unescaped URI string
def unescape(uri)
  parsed = parse(uri)
  query = parsed.query
  parsed.query = query.tr('+', ' ') if query

  parsed.to_s.gsub(URIREGEX[:unescape]) do |token|
    # Reserved characters stay in their encoded form.
    next token if token.match(URIREGEX[:reserved_characters])

    [token.delete('%')].pack('H*')
  end
end
.valid?(uri) ⇒ Boolean
225 226 227 228 229 230 231 232 233 234 235 236 237 238 |
# File 'lib/postrank-uri.rb', line 225
#
# URI is only valid if it is not nil, parses cleanly as a URI,
# and the domain has a recognized, valid TLD component.
#
# @param uri [String, Addressable::URI, nil] the URI to check
# @return [Boolean] false for nil, for input Addressable rejects as an
#   invalid URI, and for URIs without a host; otherwise the result of
#   PublicSuffix.valid? on the Unicode form of the host
def valid?(uri)
  return false if uri.nil?

  host = clean(uri, :raw => true).host
  return false unless host

  PublicSuffix.valid?(Addressable::IDNA.to_unicode(host), default_rule: nil)
rescue Addressable::URI::InvalidURIError
  # Input that does not parse cleanly is invalid, not exceptional.
  false
end