Class: FinalDestination

Inherits:
Object
  • Object
show all
Defined in:
lib/final_destination.rb,
lib/final_destination/ssrf_detector.rb,
lib/final_destination/faraday_adapter.rb

Overview

Determine the final endpoint for a Web URI, following redirects

Defined Under Namespace

Modules: SSRFDetector Classes: FaradayAdapter, HTTP, Resolver, SSRFError, UrlEncodingError

Constant Summary collapse

MAX_REQUEST_TIME_SECONDS =
10
MAX_REQUEST_SIZE_BYTES =

1024 * 1024 * 5

5_242_880
DEFAULT_USER_AGENT =
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, opts = nil) ⇒ FinalDestination

Returns a new instance of FinalDestination.



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/final_destination.rb', line 44

# Builds a resolver for +url+.
#
# @param url [String, nil] address to resolve; when nil no URI is parsed and
#   @uri stays unset
# @param opts [Hash, nil] optional settings. Keys read here:
#   :force_get_hosts, :preserve_fragment_url_hosts, :force_custom_user_agent_hosts,
#   :default_user_agent, :max_redirects (default 5),
#   :initial_https_redirect_ignore_limit, :include_port_in_host_header,
#   :ignore_redirects, :follow_canonical, :http_verb, :verbose, :timeout,
#   :validate_uri (default true), :stop_at_blocked_pages, :headers
# @raise [UrlEncodingError] propagated from normalized_url when the URL cannot
#   be encoded
def initialize(url, opts = nil)
  @url = url
  @uri = uri(normalized_url) if @url

  # Work on a private copy: the :max_redirects default is written back into
  # @opts below, and that write must neither leak into a caller-owned hash nor
  # blow up with FrozenError when the caller passes a frozen one.
  @opts = opts ? opts.dup : {}
  @force_get_hosts = @opts[:force_get_hosts] || []
  @preserve_fragment_url_hosts = @opts[:preserve_fragment_url_hosts] || []
  @force_custom_user_agent_hosts = @opts[:force_custom_user_agent_hosts] || []
  @default_user_agent = @opts[:default_user_agent] || DEFAULT_USER_AGENT
  @opts[:max_redirects] ||= 5
  @https_redirect_ignore_limit = @opts[:initial_https_redirect_ignore_limit]
  @include_port_in_host_header = @opts[:include_port_in_host_header] || false

  # @limit is the remaining redirect budget; @max_redirects keeps the starting value.
  @max_redirects = @opts[:max_redirects]
  @limit = @max_redirects

  # Hostnames for which resolution stops immediately: this site's own base URL
  # plus any caller-supplied :ignore_redirects. Only collected when redirects
  # are allowed at all.
  @ignored = []
  if @limit > 0
    ignore_redirects = [Discourse.base_url_no_prefix]

    ignore_redirects.concat(@opts[:ignore_redirects]) if @opts[:ignore_redirects]

    ignore_redirects.each do |ignore_redirect|
      ignore_redirect = uri(ignore_redirect)
      @ignored << ignore_redirect.hostname if ignore_redirect.present? && ignore_redirect.hostname
    end
  end

  @status = :ready
  @follow_canonical = @opts[:follow_canonical]
  # An explicit :http_verb wins; otherwise GET is chosen for canonical-following
  # or force-GET hosts and HEAD for everything else (see #http_verb).
  @http_verb = @opts[:http_verb] || http_verb(@force_get_hosts, @follow_canonical)
  @cookie = nil
  @limited_ips = []
  @verbose = @opts[:verbose] || false
  @timeout = @opts[:timeout] || nil
  @preserve_fragment_url = @preserve_fragment_url_hosts.any? { |host| hostname_matches?(host) }
  # fetch (not ||) so that an explicit `validate_uri: false` is honoured.
  @validate_uri = @opts.fetch(:validate_uri) { true }
  @user_agent =
    (
      if @force_custom_user_agent_hosts.any? { |host| hostname_matches?(host) }
        Onebox::Helpers.user_agent
      else
        @default_user_agent
      end
    )
  @stop_at_blocked_pages = @opts[:stop_at_blocked_pages]
  @extra_headers = @opts[:headers]
end

Instance Attribute Details

#content_typeObject (readonly)

Returns the value of attribute content_type.



42
43
44
# File 'lib/final_destination.rb', line 42

# Content-Type header recorded by #resolve for a 200 response; nil until one is stored.
def content_type = @content_type

#cookieObject (readonly)

Returns the value of attribute cookie.



42
43
44
# File 'lib/final_destination.rb', line 42

# Cookie string held in @cookie: nil after construction, later filled by
# #resolve with the "name=value" pairs taken from the response cookies.
def cookie = @cookie

#ignoredObject (readonly)

Returns the value of attribute ignored.



42
43
44
# File 'lib/final_destination.rb', line 42

# Hostnames collected by the constructor for which #resolve stops immediately.
def ignored = @ignored

#statusObject (readonly)

Returns the value of attribute status.



42
43
44
# File 'lib/final_destination.rb', line 42

# Current resolution state symbol: :ready after construction, then whatever
# #resolve last stored (:resolved, :invalid_address, :too_many_redirects,
# :blocked_page or :failure).
def status = @status

#status_codeObject (readonly)

Returns the value of attribute status_code.



42
43
44
# File 'lib/final_destination.rb', line 42

# HTTP status stored by #resolve when it ends in :failure; nil otherwise.
def status_code = @status_code

Class Method Details

.cache_https_domain(domain) ⇒ Object



25
26
27
28
# File 'lib/final_destination.rb', line 25

# Remembers for one day that +domain+ is served over HTTPS by storing "1"
# under the domain's HTTPS key on the un-namespaced Redis connection.
#
# @param domain [String] hostname to flag
def self.cache_https_domain(domain)
  redis = Discourse.redis.without_namespace
  redis.setex(redis_https_key(domain), 1.day.to_i, "1")
end

.clear_https_cache!(domain) ⇒ Object



20
21
22
23
# File 'lib/final_destination.rb', line 20

# Drops the cached "this domain is HTTPS" flag for +domain+ from the
# un-namespaced Redis connection.
#
# @param domain [String] hostname whose flag should be removed
def self.clear_https_cache!(domain)
  redis = Discourse.redis.without_namespace
  redis.del(redis_https_key(domain))
end

.connection_timeoutObject



93
94
95
# File 'lib/final_destination.rb', line 93

# Fallback timeout used by #timeout when no :timeout option was supplied.
# The unit is whatever the HTTP clients' timeout options expect (seconds for
# Excon and Net::HTTP-style clients).
def self.connection_timeout = 20

.is_https_domain?(domain) ⇒ Boolean

Returns:

  • (Boolean)


30
31
32
33
# File 'lib/final_destination.rb', line 30

# Tells whether +domain+ currently carries the cached HTTPS flag.
#
# @param domain [String] hostname to look up
# @return [Boolean] true when a non-blank value is stored under the domain's key
def self.is_https_domain?(domain)
  cached_flag = Discourse.redis.without_namespace.get(redis_https_key(domain))
  cached_flag.present?
end

.redis_https_key(domain) ⇒ Object



35
36
37
# File 'lib/final_destination.rb', line 35

# Redis key under which the HTTPS flag for +domain+ is stored.
#
# @param domain [#to_s] hostname
# @return [String] "HTTPS_DOMAIN_" followed by the domain
def self.redis_https_key(domain) = "HTTPS_DOMAIN_#{domain}"

.resolve(url, opts = nil) ⇒ Object



97
98
99
# File 'lib/final_destination.rb', line 97

# Convenience wrapper: builds an instance for +url+ / +opts+ and returns the
# result of its #resolve call.
#
# @param url [String, nil] address to resolve
# @param opts [Hash, nil] options passed straight to the constructor
def self.resolve(url, opts = nil)
  destination = new(url, opts)
  destination.resolve
end

Instance Method Details

#get(redirects = @limit, extra_headers: {}, except_headers: [], &blk) ⇒ Object

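This is a newer interface for simply fetching up to N bytes of a response while still applying all of the internal logic (HTTPS upgrade, URI validation, blocked-domain checks and redirect handling).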



159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# File 'lib/final_destination.rb', line 159

# Fetches the current URI through safe_get, following redirects recursively,
# and hands the caller's block to safe_get on every hop. The HTTPS upgrade,
# URI validation and blocked-domain checks are re-applied on each hop.
#
# @param redirects [Integer] remaining redirect budget (defaults to @limit)
# @param extra_headers [Hash, nil] NOTE(review): accepted and passed along on
#   recursion, but never read in this method body, so the "Cookie" header
#   assembled below does not visibly reach safe_get — confirm whether that is
#   intended
# @param except_headers [Array<String>] header names forwarded to safe_get
#   (presumably dropped from the outgoing request; safe_get is not shown here)
# @yield forwarded untouched to safe_get
# @return [String, nil] the final URI as a string when safe_get reports :ok;
#   nil when validation fails, the domain is blocked, safe_get returns nothing,
#   a redirect carries no location, the redirect budget is exhausted, the
#   redirect target cannot be parsed, or any other result is reported
# @raise [RuntimeError] "Must specify block" when called without a block
def get(redirects = @limit, extra_headers: {}, except_headers: [], &blk)
  raise "Must specify block" unless block_given?

  # Domains already cached as HTTPS are upgraded up front. The URI is rebuilt
  # from its string form after the scheme change (presumably so the object
  # becomes a URI::HTTPS with the matching default port).
  if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname)
    @uri.scheme = "https"
    @uri = URI(@uri.to_s)
  end

  return if !validate_uri
  return if @stop_at_blocked_pages && blocked_domain?(@uri)

  # safe_get returns a result marker (:redirect / :ok are the ones handled
  # below) plus an object exposing #set_cookie and #location.
  result, headers_subset = safe_get(@uri, except_headers:, &blk)
  return if !result

  cookie = headers_subset.set_cookie
  location = headers_subset.location

  if result == :redirect
    return if !location

    old_uri = @uri
    # Path-only redirects are resolved against the current scheme and host.
    # Note that the current port is not carried over.
    location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
    @uri = uri(location)

    # When enabled, the very first hop is not charged against the budget if it
    # is merely the http -> https variant of the same URI (one time only).
    if @uri && redirects == @max_redirects && @https_redirect_ignore_limit &&
         same_uri_but_https?(old_uri, @uri)
      redirects += 1
      @https_redirect_ignore_limit = false
    end

    return if redirects == 0

    # https redirect, so just cache that whole new domain is https
    if old_uri.port == 80 && @uri&.port == 443 && (URI::HTTPS === @uri)
      FinalDestination.cache_https_domain(@uri.hostname)
    end

    # uri(location) yielded nothing usable (presumably an unparsable Location).
    return if !@uri

    extra = nil
    extra = { "Cookie" => cookie } if cookie

    # Most HTTP Clients strip the Authorization header on redirects as the client could be redirecting to a untrusted
    # party. Not stripping the Authorization header on redirect can also lead to problems where the
    # redirected party does not expect a Authorization header and thus rejects the request.
    except_headers = ["Authorization"]

    get(redirects - 1, extra_headers: extra, except_headers:, &blk)
  elsif result == :ok
    @uri.to_s
  else
    nil
  end
end

#hostnameObject



449
450
451
# File 'lib/final_destination.rb', line 449

# Hostname of the URI currently being resolved.
# Raises NoMethodError when no URI has been set (@uri is nil).
def hostname = @uri.hostname

#hostname_matches?(url) ⇒ Boolean

Returns:

  • (Boolean)


453
454
455
456
457
458
459
460
461
462
463
464
465
466
# File 'lib/final_destination.rb', line 453

# Tells whether the hostname of the current URI is covered by the host given
# in +url+.
#
# The host in +url+ is either an exact hostname or a wildcard whose first label
# is "*" (for example "https://*.example.com"). A wildcard covers the bare
# domain and every subdomain of it; a lone "*" covers every host.
#
# @param url [String] URL carrying the hostname or wildcard pattern
# @return [Boolean, nil] result of the comparison, or nil when either the
#   current URI or +url+ has no hostname
def hostname_matches?(url)
  pattern_host = uri(url)&.hostname
  current_host = @uri&.hostname
  return unless current_host.present? && pattern_host.present?

  labels = pattern_host.split(".")
  return current_host == pattern_host unless labels.first == "*"

  suffix = labels[1..-1].join(".")
  return true if suffix.empty?

  # Anchor the wildcard on a label boundary. A bare end_with?(suffix) would
  # also accept unrelated registrations such as "evilexample.com" for
  # "*.example.com", which matters because this check gates allow-lists.
  current_host == suffix || current_host.end_with?(".#{suffix}")
end

#hostname_matches_s3_endpoint?(allowed_internal_host) ⇒ Boolean

Returns:

  • (Boolean)


443
444
445
446
447
# File 'lib/final_destination.rb', line 443

# Tells whether the current URI points at the configured S3 endpoint through
# the given allowed internal host.
#
# All three must hold: the current hostname matches +allowed_internal_host+,
# the current port equals the S3 endpoint's port, and the current hostname ends
# with the S3 endpoint's hostname.
#
# @param allowed_internal_host [String] one host entry (no scheme)
# @return [Boolean, nil] falsy as soon as one condition fails
def hostname_matches_s3_endpoint?(allowed_internal_host)
  endpoint = URI(SiteSetting.s3_endpoint)
  host_allowed = hostname_matches?("http://#{allowed_internal_host}")

  host_allowed && @uri.port == endpoint.port && @uri.hostname.end_with?(endpoint.hostname)
end

#http_port_ok?Boolean

Returns:

  • (Boolean)


432
433
434
435
436
437
438
439
440
441
# File 'lib/final_destination.rb', line 432

# Decides whether a plain-http URI may be fetched on its current port.
#
# Port 80 is always fine. Any other port is accepted only when an S3 endpoint
# is configured and the URI matches it through one of the hosts listed in
# SiteSetting.allowed_internal_hosts (entries separated by "|" or newlines).
#
# @return [Boolean]
def http_port_ok?
  return true if @uri.port == 80

  # The safe-navigation chain yields nil when the setting itself is nil; fall
  # back to an empty list so the emptiness check below cannot hit nil.
  allowed_internal_hosts =
    SiteSetting.allowed_internal_hosts&.split(/[|\n]/)&.filter_map { |aih| aih.strip.presence } ||
      []
  return false if allowed_internal_hosts.empty? || SiteSetting.s3_endpoint.blank?

  allowed_internal_hosts.any? { |aih| hostname_matches_s3_endpoint?(aih) }
end

#http_verb(force_get_hosts, follow_canonical) ⇒ Object



101
102
103
104
105
106
107
# File 'lib/final_destination.rb', line 101

# Picks the HTTP verb for the initial request.
#
# @param force_get_hosts [Array<String>] host patterns that must be fetched with GET
# @param follow_canonical [Boolean, nil] whether the canonical URL is going to
#   be followed
# @return [Symbol] :get when following canonicals or when the current host
#   matches one of +force_get_hosts+; :head otherwise
def http_verb(force_get_hosts, follow_canonical)
  needs_get = follow_canonical || force_get_hosts.any? { |host| hostname_matches?(host) }
  needs_get ? :get : :head
end

#log(log_level, message) ⇒ Object



474
475
476
477
478
479
480
481
482
# File 'lib/final_destination.rb', line 474

# Writes +message+ to the Rails logger, prefixed with the current multisite
# database name. Nothing is logged unless verbose mode is on, and 404 results
# are never logged.
#
# @param log_level [Symbol] logger method to invoke (e.g. :warn)
# @param message [String] text to log
def log(log_level, message)
  return if !@verbose || @status_code == 404

  logger = Rails.logger
  logger.public_send(log_level, "#{RailsMultisite::ConnectionManagement.current_db}: #{message}")
end

#normalized_urlObject



468
469
470
471
472
# File 'lib/final_destination.rb', line 468

# Returns @url run through UrlHelper.normalized_encode.
#
# @raise [UrlEncodingError] carrying the original message when the encoder
#   rejects the URL with an ArgumentError
def normalized_url
  begin
    UrlHelper.normalized_encode(@url)
  rescue ArgumentError => encoding_error
    raise UrlEncodingError, encoding_error.message
  end
end

#redirected?Boolean

Returns:

  • (Boolean)


113
114
115
# File 'lib/final_destination.rb', line 113

# True once part of the redirect budget has been used, i.e. the remaining
# budget is below the configured maximum.
def redirected? = @limit < @max_redirects

#request_headersObject



117
118
119
120
121
122
123
124
125
126
127
128
129
# File 'lib/final_destination.rb', line 117

# Builds the header hash for an outgoing request.
#
# Starts from User-Agent / Accept / Accept-Language / Host, then merges the
# caller-supplied extra headers (which may override those defaults) and
# finally sets Cookie when one has been captured.
#
# @return [Hash{String => String}] a fresh hash on every call
def request_headers
  port_suffix = @include_port_in_host_header ? ":#{@uri.port}" : ""

  headers = {
    "User-Agent" => @user_agent,
    "Accept" => "*/*",
    "Accept-Language" => "*",
    "Host" => @uri.hostname + port_suffix,
  }

  headers.merge!(@extra_headers) if @extra_headers
  headers["Cookie"] = @cookie if @cookie
  headers
end

#resolveObject



214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
# File 'lib/final_destination.rb', line 214

# Resolves the current URI to its final destination, following redirects by
# calling itself recursively, and records the outcome in @status.
#
# @status values written here: :too_many_redirects, :invalid_address,
# :resolved, :blocked_page and :failure (@status_code then holds the last HTTP
# status). On an Excon timeout the method logs and returns nil without
# touching @status.
#
# @return [Object, nil] the current @uri when resolution succeeds; nil otherwise
def resolve
  # Domains already cached as HTTPS are upgraded up front. The URI is rebuilt
  # from its string form after the scheme change (presumably so the object
  # becomes a URI::HTTPS with the matching default port).
  if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname)
    @uri.scheme = "https"
    @uri = URI(@uri.to_s)
  end

  # @limit is decremented once per redirect further down.
  if @limit < 0
    @status = :too_many_redirects
    log(:warn, "FinalDestination could not resolve URL (too many redirects): #{@uri}")
    return
  end

  unless validate_uri
    @status = :invalid_address
    log(:warn, "FinalDestination could not resolve URL (invalid URI): #{@uri}")
    return
  end

  # Ignored hosts count as resolved without any request being made.
  # String#match? treats each entry as a regular expression pattern, so this
  # is a pattern match rather than a strict equality check.
  @ignored.each do |host|
    if @uri&.hostname&.match?(host)
      @status = :resolved
      return @uri
    end
  end

  # A body already cached by Oneboxer means there is no need to request again.
  if Oneboxer.cached_response_body_exists?(@uri.to_s)
    @status = :resolved
    return @uri
  end

  headers = request_headers
  middlewares = Excon.defaults[:middlewares].dup
  middlewares << Excon::Middleware::Decompress if @http_verb == :get

  # Streaming guard: accumulates the body chunk by chunk and aborts once the
  # size or elapsed-time budget is exceeded. The time check only runs when a
  # chunk arrives, and despite the "connect timeout" wording it measures total
  # time since the request started.
  request_start_time = Time.now
  response_body = +""
  request_validator =
    lambda do |chunk, _remaining_bytes, _total_bytes|
      response_body << chunk
      if response_body.bytesize > MAX_REQUEST_SIZE_BYTES
        raise Excon::Errors::ExpectationFailed.new("response size too big: #{@uri}")
      end
      if Time.now - request_start_time > MAX_REQUEST_TIME_SECONDS
        raise Excon::Errors::ExpectationFailed.new("connect timeout reached: #{@uri}")
      end
    end

  # This technique will only use the first resolved IP
  # TODO: Can we standardise this by using FinalDestination::HTTP?
  begin
    resolved_ip = SSRFDetector.lookup_and_filter_ips(@uri.hostname).first
  rescue SSRFDetector::DisallowedIpError, SocketError, Timeout::Error
    @status = :invalid_address
    return
  end
  # Connect to the vetted IP directly; the original hostname still travels in
  # the Host header and is used for TLS peer verification below.
  request_uri = @uri.dup
  request_uri.hostname = resolved_ip unless Rails.env.test? # WebMock doesn't understand the IP-based requests

  # Entries from request_headers win over the bare "Host" default on key clashes.
  response =
    Excon.public_send(
      @http_verb,
      request_uri.to_s,
      read_timeout: timeout,
      connect_timeout: timeout,
      headers: { "Host" => @uri.hostname }.merge(headers),
      middlewares: middlewares,
      response_block: request_validator,
      ssl_verify_peer_host: @uri.hostname,
    )

  if @stop_at_blocked_pages
    if blocked_domain?(@uri) || response.headers["Discourse-No-Onebox"] == "1"
      @status = :blocked_page
      return
    end
  end

  location = nil
  response_headers = nil
  response_status = response.status.to_i

  case response.status
  when 200
    # Cache body of successful `get` requests
    if @http_verb == :get
      if Oneboxer.cache_response_body?(@uri)
        Oneboxer.cache_response_body(@uri.to_s, response_body)
      end
    end

    # Follow the canonical URL at most once: the flag is cleared before
    # recursing and the verb is recomputed for the new target.
    if @follow_canonical
      next_url = fetch_canonical_url(response_body)

      if next_url.to_s.present? && next_url != @uri
        @follow_canonical = false
        @uri = next_url
        @http_verb = http_verb(@force_get_hosts, @follow_canonical)

        return resolve
      end
    end

    @content_type = response.headers["Content-Type"] if response.headers.has_key?("Content-Type")
    @status = :resolved
    return @uri
  when 103, 400, 405, 406, 409, 500, 501
    # For these statuses the request is retried as a headers-only GET, and that
    # second response replaces the first one's status and headers.
    # NOTE(review): small_get returns [nil, nil] if its request block never
    # ran; small_headers[...] below would then raise — confirm that is unreachable.
    response_status, small_headers = small_get(request_headers)

    if @stop_at_blocked_pages
      # this may seem weird, but the #to_hash method of the response object
      # of ruby's net/http lib returns a hash where each value is an array.
      # small_headers here is like that so our no onebox header value is an
      # array if it's set. Also the hash keys are always lower-cased.
      dont_onebox = small_headers["discourse-no-onebox"]&.join("") == "1"
      if dont_onebox || blocked_domain?(@uri)
        @status = :blocked_page
        return
      end
    end

    if response_status == 200
      @status = :resolved
      return @uri
    end

    response_headers = {}
    if cookie_val = small_headers["set-cookie"]
      response_headers[:cookies] = cookie_val
    end

    if location_val = small_headers["location"]
      response_headers[:location] = location_val.join
    end
  end

  # No retry happened: take cookies and location from the first response.
  unless response_headers
    response_headers = {
      cookies: response.data[:cookies] || response.headers[:"set-cookie"],
      location: response.headers[:location],
    }
  end

  # A location is only honoured for 3xx statuses.
  location = response_headers[:location] if (300..399).include?(response_status)

  # Keep only the leading "name=value" part of every cookie for the next request.
  if cookies = response_headers[:cookies]
    @cookie = Array.wrap(cookies).map { |c| c.split(";").first.strip }.join("; ")
  end

  if location
    # Same-host redirects to a login or session path are treated as the end of
    # the road: the current URI is reported as resolved.
    # NOTE(review): other call sites guard the result of uri(...) against nil;
    # if it can be nil here, redirect_uri.host raises — confirm.
    redirect_uri = uri(location)
    if @uri.host == redirect_uri.host &&
         (redirect_uri.path =~ %r{/login} || redirect_uri.path =~ %r{/session})
      @status = :resolved
      return @uri
    end

    old_uri = @uri
    location = "#{location}##{@uri.fragment}" if @preserve_fragment_url && @uri.fragment.present?
    # Path-only redirects are resolved against the current scheme and host.
    location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
    @uri = uri(location)

    # When enabled, the very first hop is not charged against the budget if it
    # is merely the http -> https variant of the same URI (one time only).
    if @uri && @limit == @max_redirects && @https_redirect_ignore_limit &&
         same_uri_but_https?(old_uri, @uri)
      @limit += 1
      @https_redirect_ignore_limit = false
    end
    @limit -= 1

    # https redirect, so just cache that whole new domain is https
    if old_uri.port == 80 && @uri&.port == 443 && (URI::HTTPS === @uri)
      FinalDestination.cache_https_domain(@uri.hostname)
    end
    return resolve
  end

  # This is weird; raising an exception seems like it would be better.
  @status = :failure
  @status_code = response.status

  log(:warn, "FinalDestination could not resolve URL (status #{response.status}): #{@uri}")
  nil
rescue Excon::Errors::Timeout
  # Timeouts are logged and reported as nil; @status keeps its previous value.
  log(:warn, "FinalDestination could not resolve URL (timeout): #{@uri}")
  nil
end

#skip_validations?Boolean

Returns:

  • (Boolean)


400
401
402
# File 'lib/final_destination.rb', line 400

# True when URI validation has been switched off via the :validate_uri option.
def skip_validations? = !@validate_uri

#small_get(request_headers) ⇒ Object



131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# File 'lib/final_destination.rb', line 131

# Issues a GET for the current URI through FinalDestination::HTTP and captures
# only the status code and response headers, abandoning the request before the
# body is read.
#
# @param request_headers [Hash] headers to send
# @return [Array] two elements: the Integer status code and the header hash
#   from the response's #to_hash; both are nil if the request block never ran
def small_get(request_headers)
  status_code = nil
  response_headers = nil

  catch(:done) do
    FinalDestination::HTTP.start(
      @uri.host,
      @uri.port,
      use_ssl: @uri.is_a?(URI::HTTPS),
      open_timeout: timeout,
    ) do |connection|
      connection.read_timeout = timeout
      connection.request_get(@uri.request_uri, request_headers) do |response|
        status_code = response.code.to_i
        response_headers = response.to_hash

        # see: https://bugs.ruby-lang.org/issues/15624
        # Letting this block return normally would make the client read the
        # whole body, so unwind out of the request right here instead.
        throw :done
      end
    end
  end

  [status_code, response_headers]
end

#timeoutObject



109
110
111
# File 'lib/final_destination.rb', line 109

# Timeout for outgoing requests: the :timeout option when one was given,
# otherwise the class-wide FinalDestination.connection_timeout.
def timeout
  return @timeout if @timeout

  FinalDestination.connection_timeout
end

#validate_uriObject



404
405
406
# File 'lib/final_destination.rb', line 404

# Truthy when the current URI may be requested: either validation is switched
# off, or the URI passes validate_uri_format.
def validate_uri
  if (skipped = skip_validations?)
    skipped
  else
    validate_uri_format
  end
end

#validate_uri_formatObject



408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
# File 'lib/final_destination.rb', line 408

# Checks that the current URI is something this class is willing to request.
#
# Requirements: a URI with a host; scheme http or https; https only on port
# 443; http only on a port accepted by http_port_ok?; and a hostname that is
# not an IP address literal.
#
# @return [Boolean]
def validate_uri_format
  return false if !@uri || !@uri.host

  case @uri.scheme
  when "http"
    # In some cases (like local/test environments) we may want to allow http
    # URLs to be used for internal hosts, but only when the host is explicitly
    # used for SiteSetting.s3_endpoint. This allows for local S3 providers
    # like minio.
    #
    # In all other cases http calls are only allowed on port 80.
    return false unless http_port_ok?
  when "https"
    return false unless @uri.port == 443
  else
    return false
  end

  # Disallow IP based crawling: a hostname that parses as an IP address is
  # rejected, anything IPAddr refuses is accepted.
  begin
    IPAddr.new(@uri.hostname)
    false
  rescue StandardError
    true
  end
end