Class: SiteDiff::UriWrapper

Inherits:
Object
  • Object
show all
Defined in:
lib/sitediff/uriwrapper.rb

Overview

SiteDiff URI Wrapper.

Defined Under Namespace

Classes: ReadResult

Constant Summary collapse

DEFAULT_CURL_OPTS =

TODO: Move these CURL OPTS to Config.DEFAULT_CONFIG.

{
  # Don't hang on servers that don't exist: give up connecting after this
  # timeout (presumably seconds, as in libcurl -- confirm).
  connecttimeout: 3,
  # Follow HTTP redirects (code 301 and 302).
  followlocation: true,
  # Default request headers; identifies SiteDiff via the User-Agent.
  headers: {
    'User-Agent' => 'Sitediff - https://github.com/evolvingweb/sitediff'
  },
  # Always accept SSL certs: both peer and host verification are disabled.
  # NOTE(review): this also accepts invalid or spoofed certificates --
  # confirm that is acceptable for every site this is pointed at.
  ssl_verifypeer: false,
  ssl_verifyhost: 0
}.freeze

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(uri, curl_opts = DEFAULT_CURL_OPTS, debug: true, referrer: '') ⇒ UriWrapper

Creates a UriWrapper.



51
52
53
54
55
56
57
58
# File 'lib/sitediff/uriwrapper.rb', line 51

# Creates a UriWrapper.
#
# @param uri [String, #scheme] a URI string, or an already-parsed URI object
#   (anything responding to +scheme+ is used as-is, not copied).
# @param curl_opts [Hash] request options kept for later use.
# @param debug [Boolean] kept in @debug.
# @param referrer [String] kept in @referrer.
def initialize(uri, curl_opts = DEFAULT_CURL_OPTS, debug: true, referrer: '')
  @curl_opts = curl_opts
  @debug = debug
  @referrer = referrer
  @uri = if uri.respond_to?(:scheme)
           uri
         else
           Addressable::URI.parse(uri)
         end
  # Local URIs never keep their trailing '/'s (the path is edited in place).
  @uri.path.gsub!(%r{/*$}, '') if local?
end

Class Method Details

.canonicalize(path) ⇒ Object

Canonicalize a path.

Parameters:

  • path (String)

    A base relative path. Example: /foo/bar



198
199
200
201
202
203
# File 'lib/sitediff/uriwrapper.rb', line 198

# Canonicalize a path.
#
# The front page ("/") is returned untouched. Any other path loses one
# trailing slash, and a path that ends up empty is treated as the front page.
#
# @param path [String] a base relative path. Example: /foo/bar
# @return [String] the canonical path.
def self.canonicalize(path)
  return path if path == '/'

  trimmed = path.chomp('/')
  return '/' if trimmed.empty?

  trimmed
end

Instance Method Details

#+(other) ⇒ Object

Returns a new wrapper for this URI with other appended (a "/" is inserted when the URI is local or its path is empty). FIXME: this is not used anymore.



89
90
91
92
93
94
# File 'lib/sitediff/uriwrapper.rb', line 89

# Builds a new wrapper (of the same class) for this URI with +other+
# appended to its string form.
#
# 'path' for SiteDiff includes (parts of) path, query, and fragment.
# A '/' is inserted between the two parts when this URI is local or has an
# empty path; otherwise they are joined directly.
#
# @param other [String] the text to append.
# @return [Object] a new instance of this object's class.
def +(other)
  separator = local? || @uri.path.empty? ? '/' : ''
  combined = @uri.to_s + separator + other
  self.class.new(combined)
end

#charset_encoding(http_headers) ⇒ Object

Returns the encoding of an HTTP response from its headers, or nil if not specified.



106
107
108
109
110
111
# File 'lib/sitediff/uriwrapper.rb', line 106

# Returns the charset named in the Content-Type header of an HTTP response,
# or nil if none is specified.
#
# The parameter name is matched case-insensitively and the value may be
# wrapped in double quotes. An empty value ("charset=") counts as
# "not specified" rather than being returned as an empty string.
#
# @param http_headers [#[], nil] response headers, looked up by
#   'Content-Type'; nil is treated as "no headers".
# @return [String, nil] the charset exactly as written in the header.
def charset_encoding(http_headers)
  content_type = http_headers && http_headers['Content-Type']
  return nil unless content_type

  md = /;\s*charset="?([-\w]+)/i.match(content_type)
  md[1] if md
end

#local? ⇒ Boolean

Is this a local filesystem path?

Returns:

  • (Boolean)


83
84
85
# File 'lib/sitediff/uriwrapper.rb', line 83

# Is this a local filesystem path?
#
# A URI is considered local when it has no scheme.
#
# @return [Boolean]
def local?
  scheme = @uri.scheme
  scheme.nil?
end

#password ⇒ Object

Returns the “password” part of the URI.



68
69
70
# File 'lib/sitediff/uriwrapper.rb', line 68

# Returns the "password" part of the wrapped URI.
#
# @return [Object] whatever the wrapped URI reports as its password.
def password
  wrapped = @uri
  wrapped.password
end

#queue(hydra, &handler) ⇒ Object

Queue reading this URL, with a completion handler to run after.

The handler should be callable with a single ReadResult argument.

This method may choose not to queue the request at all, but simply execute right away.



185
186
187
188
189
190
191
# File 'lib/sitediff/uriwrapper.rb', line 185

# Queue reading this URL, with a completion handler to run after.
#
# Local URIs are not queued at all: the file is read right away and the
# handler is invoked before this method returns. Anything else is turned
# into a request and handed to +hydra+.
#
# @param hydra [#queue] receives the request for non-local URIs.
# @param handler [Proc] completion handler, passed on to the reader.
def queue(hydra, &handler)
  return read_file(&handler) if local?

  request = typhoeus_request(&handler)
  hydra.queue(request)
end

#read_file ⇒ Object

Reads a file and yields to the completion handler, see .queue()



98
99
100
101
102
# File 'lib/sitediff/uriwrapper.rb', line 98

# Reads the file named by @uri and yields the outcome to the completion
# handler, see #queue.
#
# Yields a ReadResult built from the file contents (read as UTF-8), or
# ReadResult.error(message) when the path is missing, not accessible, or
# a directory.
#
# Only the read itself is guarded. Exceptions raised by the handler
# propagate to the caller instead of being mistaken for a read failure,
# which would otherwise invoke the handler a second time.
#
# @yieldparam result [ReadResult]
# @return [Object] whatever the handler returns.
def read_file
  result =
    begin
      ReadResult.new(File.read(@uri.to_s, encoding: 'UTF-8'))
    rescue Errno::ENOENT, Errno::ENOTDIR, Errno::EACCES, Errno::EISDIR => e
      ReadResult.error(e.message)
    end
  yield result
end

#to_s ⇒ Object

Converts the URI to a string.



74
75
76
77
78
79
# File 'lib/sitediff/uriwrapper.rb', line 74

# Converts the URI to a string.
#
# Works on a copy of the wrapped URI whose user and password are cleared
# first, so the result carries no credentials and @uri is left untouched.
#
# @return [String]
def to_s
  @uri.dup.tap do |sanitized|
    sanitized.user = nil
    sanitized.password = nil
  end.to_s
end

#typhoeus_request ⇒ Object

Returns a Typhoeus::Request to fetch @uri

Completion callbacks of the request wrap the given handler which is assumed to accept a single ReadResult argument.



117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# File 'lib/sitediff/uriwrapper.rb', line 117

# Returns a Typhoeus::Request to fetch @uri.
#
# Completion callbacks of the request wrap the given handler (the block
# passed to this method), which is assumed to accept a single ReadResult
# argument.
#
# On success the handler receives ReadResult.new(body, encoding), where
# encoding comes from the response's Content-Type header (nil if absent).
# If coercing the body to that encoding or the handler itself raises, the
# error is re-raised when @debug is set; otherwise the handler is called
# again with a ReadResult.error describing the problem.
#
# On failure the handler receives a ReadResult.error carrying a message
# and the response code.
#
# @return [Typhoeus::Request] the prepared (not yet queued) request.
def typhoeus_request
  params = @curl_opts.dup
  # Allow basic auth
  params[:userpwd] = "#{@uri.user}:#{@uri.password}" if @uri.user

  # The request URL comes from to_s, which omits the credentials; they are
  # passed separately through :userpwd above.
  req = Typhoeus::Request.new(to_s, params)

  req.on_success do |resp|
    body = resp.body
    encoding = charset_encoding(resp.headers)
    begin
      # Typhoeus does not respect HTTP headers when setting the encoding of
      # resp.body; coerce if possible. This is done inside the guarded
      # region because an unknown charset name in the header makes
      # force_encoding raise ArgumentError, which must honour @debug like
      # any other parsing problem.
      body.force_encoding(encoding) if encoding
      yield ReadResult.new(body, encoding)
    rescue ArgumentError => e
      raise if @debug

      yield ReadResult.error(
        "Parsing error for #{@uri}: #{e.message}  From page: #{@referrer}"
      )
    rescue StandardError => e
      raise if @debug

      yield ReadResult.error(
        "Unknown parsing error for #{@uri}: #{e.message}  From page: #{@referrer}"
      )
    end
  end

  req.on_failure do |resp|
    if resp&.status_message
      # The server answered, but with an error status.
      yield ReadResult.error(
        "HTTP error when loading #{@uri} : [#{resp.response_code}] #{resp.status_message}  From: #{@referrer}",
        resp.response_code
      )
    elsif (msg = resp.options[:return_code])
      # No HTTP status; report the return code recorded in the response options.
      yield ReadResult.error(
        "Connection error when loading #{@uri} : [#{resp.options[:return_code]}] #{msg}  From: #{@referrer}",
        resp.response_code
      )
    else
      yield ReadResult.error(
        "Unknown error when loading #{@uri} : [#{resp.response_code}] #{resp.status_message} From: #{@referrer}",
        resp.response_code
      )
    end
  end

  req
end

#user ⇒ Object

Returns the “user” part of the URI.



62
63
64
# File 'lib/sitediff/uriwrapper.rb', line 62

# Returns the "user" part of the wrapped URI.
#
# @return [Object] whatever the wrapped URI reports as its user.
def user
  wrapped = @uri
  wrapped.user
end