Module: Onebox::Helpers

Defined in:
lib/onebox/helpers.rb

Defined Under Namespace

Classes: DownloadTooLarge

Constant Summary collapse

# Domains checked against the requested URL's hostname by `fetch_html_doc`; on a
# match the page's canonical link is not followed. Frozen so the shared list
# cannot be modified at runtime.
IGNORE_CANONICAL_DOMAINS =
%w[www.instagram.com medium.com youtube.com].freeze

Class Method Summary collapse

Class Method Details

.audio_placeholder_html ⇒ Object



289
290
291
# File 'lib/onebox/helpers.rb', line 289

# Returns the static placeholder markup: a `onebox-placeholder-container` div
# wrapping a span with the `placeholder-icon audio` classes.
#
# @return [String]
def self.audio_placeholder_html
  %q(<div class='onebox-placeholder-container'><span class='placeholder-icon audio'></span></div>)
end

.clean(html) ⇒ Object



12
13
14
# File 'lib/onebox/helpers.rb', line 12

# Replaces every `<...>` tag in +html+ with a single space and then removes all
# newline characters.
#
# @param html [String] markup to flatten
# @return [String] the text with tags replaced by spaces and newlines removed
def self.clean(html)
  without_tags = html.gsub(/<[^>]+>/, " ")
  without_tags.delete("\n")
end

.click_to_scroll_div(width = 690, height = 400) ⇒ Object



192
193
194
# File 'lib/onebox/helpers.rb', line 192

# Builds a transparent, relatively positioned div of the given size that is
# shifted down by its own height and pulled back up with a negative top margin.
# Its inline `onClick` handler sets the div's own `pointerEvents` style to
# 'none'.
#
# @param width [Integer] width in pixels (interpolated as-is)
# @param height [Integer] height in pixels (interpolated as-is)
# @return [String] the HTML for the div
def self.click_to_scroll_div(width = 690, height = 400)
  declarations = [
    "background:transparent",
    "position:relative",
    "width:#{width}px",
    "height:#{height}px",
    "top:#{height}px",
    "margin-top:-#{height}px",
  ]

  %(<div style="#{declarations.join(";")};" onClick="style.pointerEvents='none'"></div>)
end

.fetch_content_length(location) ⇒ Object



157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# File 'lib/onebox/helpers.rb', line 157

# Sends a HEAD request to +location+ and reports the content length announced
# by the server.
#
# @param location [String] URL to probe; parsed with `URI()`
# @return [Object, nil] `response.content_length` (presumably an Integer) when
#   the status code is 200 and a length is present, otherwise nil
def self.fetch_content_length(location)
  uri = URI(location)

  # Assemble the request target explicitly: joining path and query with "?"
  # unconditionally would send "/path?" when there is no query string and a
  # bare "?" when the path is empty.
  path = uri.path.presence || "/"
  request_target = uri.query ? "#{path}?#{uri.query}" : path

  FinalDestination::HTTP.start(
    uri.host,
    uri.port,
    open_timeout: Onebox.options.connect_timeout,
    use_ssl: uri.is_a?(URI::HTTPS),
  ) do |http|
    http.read_timeout = Onebox.options.timeout
    if uri.is_a?(URI::HTTPS)
      # NOTE(review): these are assigned inside the `start` block, i.e. after the
      # session was opened with `use_ssl:` above - confirm they still take effect.
      http.use_ssl = true
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end

    http.request_head(request_target) do |response|
      # Returning from inside the block ends the method with the first response.
      return response.code.to_i == 200 ? response.content_length.presence : nil
    end
  end
end

.fetch_html_doc(url, headers = nil, body_cacher = nil) ⇒ Object

Fetches the HTML response body for a URL.

Note that the size of the response body is capped at `Onebox.options.max_download_kb`. When the limit has been reached, this method will return the response body that has been downloaded up to the limit.



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/onebox/helpers.rb', line 20

# Fetches +url+ and parses the body with Nokogiri. Unless told otherwise, the
# page's `<link rel="canonical">` target is fetched and parsed instead when it
# points at a different host/path.
#
# The canonical link is ignored when the page carries
# `<meta property="og:ignore_canonical" content="true">` or when the URL's
# hostname contains one of IGNORE_CANONICAL_DOMAINS.
#
# Downloading is best-effort: any StandardError raised while fetching is
# swallowed and treated as an empty response, and bodies larger than the
# configured limit are truncated rather than rejected.
#
# @param url [String] page to fetch
# @param headers [Hash, nil] request headers forwarded to `fetch_response`
# @param body_cacher [Object, nil] optional cache forwarded to `fetch_response`
# @return [Object] the document produced by `Nokogiri.HTML`
def self.fetch_html_doc(url, headers = nil, body_cacher = nil)
  # Shared by the initial and the canonical download.
  fetch_body =
    lambda do |target|
      begin
        fetch_response(target, headers:, body_cacher:, raise_error_when_response_too_large: false)
      rescue StandardError
        nil
      end
    end

  doc = Nokogiri.HTML(fetch_body.call(url))
  uri = Addressable::URI.parse(url)

  ignore_canonical_tag = doc.at('meta[property="og:ignore_canonical"]')
  # Plain substring comparison: `String#match?` would compile each domain as a
  # regular expression, letting "." match any character. `to_s` keeps a URL
  # without a hostname from raising here.
  hostname = uri.hostname.to_s
  should_ignore_canonical = IGNORE_CANONICAL_DOMAINS.any? { |domain| hostname.include?(domain) }

  tag_opts_out = ignore_canonical_tag && ignore_canonical_tag["content"].to_s == "true"
  return doc if tag_opts_out || should_ignore_canonical

  # prefer canonical link
  canonical_link = doc.at('//link[@rel="canonical"]/@href')
  canonical_uri =
    begin
      Addressable::URI.parse(canonical_link)
    rescue Addressable::URI::InvalidURIError
      # A malformed canonical href must not discard the page already downloaded.
      nil
    end
  return doc unless canonical_link && canonical_uri
  return doc if "#{canonical_uri.host}#{canonical_uri.path}" == "#{uri.host}#{uri.path}"

  uri =
    FinalDestination.new(
      canonical_link,
      Oneboxer.get_final_destination_options(canonical_link),
    ).resolve
  return doc unless uri.present?

  # Keep the original document when the canonical page cannot be downloaded.
  canonical_body = fetch_body.call(uri.to_s)
  canonical_body ? Nokogiri.HTML(canonical_body) : doc
end

.fetch_response(location, redirect_limit: 5, domain: nil, headers: nil, body_cacher: nil, raise_error_when_response_too_large: true, allow_cross_domain_cookies: false) ⇒ Object

Raises:

  • (Net::HTTPError)


71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# File 'lib/onebox/helpers.rb', line 71

# Performs a GET request for +location+, following redirects, and returns the
# response body as a String.
#
# @param location [String] URL to fetch; when it has no host it is joined onto +domain+
# @param redirect_limit [Integer] remaining redirects allowed, capped at
#   `Onebox.options.redirect_limit`
# @param domain [String, nil] base ("scheme://host") used to resolve a host-less +location+
# @param headers [Hash, nil] request headers; a User-Agent is added when missing.
#   The hash passed in is not modified.
# @param body_cacher [Object, nil] used only when it responds to
#   `fetch_cached_response_body`; a present cached body is returned without any
#   request, and a downloaded body is stored when `cache_response_body?` allows it
# @param raise_error_when_response_too_large [Boolean] when the body exceeds
#   `Onebox.options.max_download_kb` kilobytes: raise DownloadTooLarge (true) or
#   stop reading and return what was downloaded so far (false)
# @param allow_cross_domain_cookies [Boolean] when true, received cookies are
#   added to the headers and all headers are sent along on redirects; when false
#   only the received cookies are sent to the redirect target
# @return [String] the (possibly truncated or cached) response body
# @raise [Net::HTTPError] when the redirect limit is exhausted
# @raise [DownloadTooLarge] when the body is too large and raising was requested
# @raise [Timeout::Error] when reading the body takes longer than `Onebox.options.timeout`
def self.fetch_response(
  location,
  redirect_limit: 5,
  domain: nil,
  headers: nil,
  body_cacher: nil,
  raise_error_when_response_too_large: true,
  allow_cross_domain_cookies: false
)
  redirect_limit = Onebox.options.redirect_limit if redirect_limit >
    Onebox.options.redirect_limit

  raise Net::HTTPError.new("HTTP redirect too deep", location) if redirect_limit == 0

  uri = Addressable::URI.parse(location)
  uri = Addressable::URI.join(domain, uri) if !uri.host

  use_body_cacher = body_cacher && body_cacher.respond_to?("fetch_cached_response_body")
  if use_body_cacher
    response_body = body_cacher.fetch_cached_response_body(uri.to_s)

    return response_body if response_body.present?
  end

  result = StringIO.new
  FinalDestination::HTTP.start(
    uri.host,
    uri.port,
    open_timeout: Onebox.options.connect_timeout,
    use_ssl: uri.normalized_scheme == "https",
  ) do |http|
    http.read_timeout = Onebox.options.timeout
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE # Work around path building bugs

    # Work on a copy so the User-Agent / Cookie entries added below do not leak
    # into the hash owned by the caller.
    headers = headers ? headers.dup : {}

    headers["User-Agent"] ||= user_agent if user_agent

    request = Net::HTTP::Get.new(uri.request_uri, headers)
    start_time = Time.now

    size_bytes = Onebox.options.max_download_kb * 1024
    http.request(request) do |response|
      if cookie = response.get_fields("set-cookie")
        headers["Cookie"] = cookie.join("; ") if allow_cross_domain_cookies
        # HACK: If this breaks again in the future, use HTTP::CookieJar from gem 'http-cookie'
        # See test: it "does not send cookies to the wrong domain"
        redir_header = { "Cookie" => cookie.join("; ") }
      end

      redir_header = nil unless redir_header.is_a? Hash

      code = response.code.to_i
      unless code == 200
        # Anything that is neither a success nor a redirect is raised as an error.
        response.error! if [301, 302, 303, 307, 308].exclude?(code)

        # Follow the redirect. The size-limit behaviour requested by the caller
        # is forwarded so a redirected download is treated like a direct one.
        # NOTE(review): body_cacher is not forwarded, so redirected responses are
        # never cached - confirm that this is intended.
        return(
          fetch_response(
            response["location"],
            redirect_limit: redirect_limit - 1,
            domain: "#{uri.scheme}://#{uri.host}",
            headers: allow_cross_domain_cookies ? headers : redir_header,
            raise_error_when_response_too_large: raise_error_when_response_too_large,
            allow_cross_domain_cookies: allow_cross_domain_cookies,
          )
        )
      end

      response.read_body do |chunk|
        result.write(chunk)

        if result.size > size_bytes
          raise DownloadTooLarge.new if raise_error_when_response_too_large
          # Stop reading; whatever was written so far is returned below.
          break
        end

        raise Timeout::Error.new if (Time.now - start_time) > Onebox.options.timeout
      end

      if use_body_cacher && body_cacher.cache_response_body?(uri)
        body_cacher.cache_response_body(uri.to_s, result.string)
      end

      return result.string
    end
  end
end

.generic_placeholder_html ⇒ Object



297
298
299
# File 'lib/onebox/helpers.rb', line 297

# Returns the static placeholder markup: a `onebox-placeholder-container` div
# wrapping a span with the `placeholder-icon generic` classes.
#
# @return [String]
def self.generic_placeholder_html
  %q(<div class='onebox-placeholder-container'><span class='placeholder-icon generic'></span></div>)
end

.get(meta, attr) ⇒ Object



201
202
203
# File 'lib/onebox/helpers.rb', line 201

# Looks up +attr+ in +meta+ and returns the value passed through `sanitize`.
#
# @param meta [#[], nil] container indexed with +attr+
# @param attr [Object] key to read from +meta+
# @return [Object, nil] the sanitized value, or nil when +meta+ is falsy or the
#   entry is not `present?`
def self.get(meta, attr)
  return nil unless meta
  return nil unless meta[attr].present?

  sanitize(meta[attr])
end

.get_absolute_image_url(src, url) ⇒ Object



225
226
227
228
229
230
231
# File 'lib/onebox/helpers.rb', line 225

# Resolves +src+ against the page +url+ and returns the result as a String.
# When either value cannot be handled by URI (ArgumentError, URI::BadURIError
# or URI::InvalidURIError), +src+ is returned unchanged.
#
# @param src [String] image reference, possibly relative
# @param url [String] URL the reference is resolved against
# @return [String] the merged URL, or +src+ on failure
def self.get_absolute_image_url(src, url)
  base = URI.parse(url)
  base.merge(src).to_s
rescue ArgumentError, URI::BadURIError, URI::InvalidURIError
  src
end

.image_placeholder_html ⇒ Object



281
282
283
# File 'lib/onebox/helpers.rb', line 281

# Returns the static placeholder markup: a `onebox-placeholder-container` div
# wrapping a span with the `placeholder-icon image` classes.
#
# @return [String]
def self.image_placeholder_html
  %q(<div class='onebox-placeholder-container'><span class='placeholder-icon image'></span></div>)
end

.map_placeholder_html ⇒ Object



293
294
295
# File 'lib/onebox/helpers.rb', line 293

# Returns the static placeholder markup: a `onebox-placeholder-container` div
# wrapping a span with the `placeholder-icon map` classes.
#
# @return [String]
def self.map_placeholder_html
  %q(<div class='onebox-placeholder-container'><span class='placeholder-icon map'></span></div>)
end

.normalize_url_for_output(url) ⇒ Object



210
211
212
213
214
215
216
217
218
219
220
221
222
223
# File 'lib/onebox/helpers.rb', line 210

# Prepares +url+ for output: spaces become "%20", single and double quotes
# become "&apos;" / "&quot;", and every character outside the allowed set is
# dropped. The input is expected to be properly encoded already.
#
# @param url [String, nil] URL to clean up; the argument itself is not modified
# @return [String] the cleaned URL, or "" when +url+ is falsy or the cleaned
#   URL has no host
def self.normalize_url_for_output(url)
  return "" unless url

  # expect properly encoded url, remove any unsafe chars
  cleaned =
    url
      .gsub(" ", "%20")
      .gsub("'", "&apos;")
      .gsub('"', "&quot;")
      .gsub(/[^\w\-`.~:\/?#\[\]@!$&'\(\)*+,;=%\p{M}’]/, "")

  Addressable::URI.parse(cleaned).host ? cleaned : ""
end

.pretty_filesize(size) ⇒ Object



178
179
180
181
182
183
184
185
186
187
188
189
190
# File 'lib/onebox/helpers.rb', line 178

# Formats a byte count with a binary (1024-based) unit. Values below 2048 are
# printed unchanged with "B"; larger values are divided down and printed with
# two decimals using the largest unit for which the scaled value is at least 2,
# up to "EB".
#
# @param size [Numeric] number of bytes
# @return [String] e.g. "512 B", "2.00 KB", "3.50 MB"
def self.pretty_filesize(size)
  units = %w[B KB MB GB TB PB EB]
  base = 1024

  return "#{size} #{units.first}" if size < 2 * base

  bytes = size.to_f
  largest = units.size - 1
  # First unit whose next step up would bring the value below 2; otherwise the
  # largest unit available.
  exponent = (1...largest).find { |power| bytes < 2 * (base**(power + 1)) } || largest

  "#{"%.2f" % (bytes / (base**exponent))} #{units[exponent]}"
end

.sanitize(value, length = 50) ⇒ Object



205
206
207
208
# File 'lib/onebox/helpers.rb', line 205

# Runs +value+ through `Sanitize.fragment` and strips surrounding whitespace.
#
# @param value [String, nil] text to sanitize
# @param length [Integer] accepted but not used by this method
# @return [String, nil] the sanitized text, or nil when +value+ is blank
def self.sanitize(value, length = 50)
  value.blank? ? nil : Sanitize.fragment(value).strip
end

.truncate(string, length = 50) ⇒ Object



196
197
198
199
# File 'lib/onebox/helpers.rb', line 196

# Shortens +string+ to roughly +length+ characters and appends "...". The cut
# is made at the last space found at or before index +length+; when there is no
# such space the string is cut at exactly +length+ characters.
#
# @param string [String, nil] text to shorten
# @param length [Integer] maximum size before truncation applies
# @return [String, nil] +string+ itself when it is nil or not longer than
#   +length+, otherwise the shortened text followed by "..."
def self.truncate(string, length = 50)
  return string if string.nil?
  return string unless string.size > length

  cut_at = string.rindex(" ", length) || length
  string[0...cut_at] + "..."
end

.uri_encode(url) ⇒ Object

Percent-encodes a URI string per RFC3986 - tools.ietf.org/html/rfc3986



240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
# File 'lib/onebox/helpers.rb', line 240

# Percent-encodes a URI string component by component (scheme, authority, path,
# query, fragment) and reassembles it. "%" is part of the allowed set for path,
# query and fragment.
#
# @param url [String, nil] URI to encode
# @return [String] the encoded URI, or "" when +url+ is falsy
def self.uri_encode(url)
  return "" unless url

  parsed = Addressable::URI.parse(url)
  char_classes = Addressable::URI::CharacterClasses
  encode = ->(component, allowed) { Addressable::URI.encode_component(component, allowed) }

  # Query and fragment use hand-written character sets rather than the
  # Addressable character classes.
  query_chars = "a-zA-Z0-9\\-\\.\\_\\~\\$\\&\\*\\,\\=\\:\\@\\?\\%"
  fragment_chars = "a-zA-Z0-9\\-\\.\\_\\~\\!\\$\\&\\'\\(\\)\\*\\+\\,\\;\\=\\:\\/\\?\\%"

  Addressable::URI.new(
    scheme: encode.call(parsed.scheme, char_classes::SCHEME),
    authority: encode.call(parsed.authority, char_classes::AUTHORITY),
    path: encode.call(parsed.path, char_classes::PATH + "\\%"),
    query: encode.call(parsed.query, query_chars),
    fragment: encode.call(parsed.fragment, fragment_chars),
  ).to_s
end

.uri_unencode(url) ⇒ Object



277
278
279
# File 'lib/onebox/helpers.rb', line 277

# Thin wrapper that passes +url+ to `Addressable::URI.unencode` and returns its
# result.
#
# @param url [String] value handed to Addressable unchanged
# @return [Object] whatever `Addressable::URI.unencode` returns
def self.uri_unencode(url)
  Addressable::URI.unencode(url)
end

.user_agent ⇒ Object



233
234
235
236
237
# File 'lib/onebox/helpers.rb', line 233

# Builds the User-Agent string: the `onebox_user_agent` site setting when it is
# present, otherwise `Onebox.options.user_agent`, followed by " v" and the
# Discourse version string.
#
# @return [String]
def self.user_agent
  base = SiteSetting.onebox_user_agent.presence || Onebox.options.user_agent
  "#{base} v#{Discourse::VERSION::STRING}"
end

.video_placeholder_html ⇒ Object



285
286
287
# File 'lib/onebox/helpers.rb', line 285

# Returns the static placeholder markup: a `onebox-placeholder-container` div
# wrapping a span with the `placeholder-icon video` classes.
#
# @return [String]
def self.video_placeholder_html
  %q(<div class='onebox-placeholder-container'><span class='placeholder-icon video'></span></div>)
end