Class: Webcache::DiskCache

Inherits:
Object
  • Object
show all
Defined in:
lib/webget/webcache.rb

Instance Method Summary collapse

Instance Method Details

#cached?(url) ⇒ Boolean Also known as: exist?

Returns:

  • (Boolean)


162
163
164
165
# File 'lib/webget/webcache.rb', line 162

# Checks whether a body for +url+ has already been stored on disk.
#
# The cache file location is the path computed by #url_to_path,
# looked up below Webcache.root.
#
# @param url [String] url to look up
# @return [Boolean] true if the body file exists
def cached?( url )
  File.exist?( "#{Webcache.root}/#{url_to_path( url )}" )
end

#read(url) ⇒ Object



169
170
171
172
# File 'lib/webget/webcache.rb', line 169

# Returns the cached body for +url+ as utf-8 text.
#
# @param url [String] url of the cached page
# @return [String] the complete file content (external encoding utf-8)
# @raise [Errno::ENOENT] if no body file exists for the url
def read( url )
  cache_file = "#{Webcache.root}/#{url_to_path( url )}"
  File.read( cache_file, encoding: 'utf-8' )
end

#read_csv(url) ⇒ Object



181
182
183
184
185
186
# File 'lib/webget/webcache.rb', line 181

# Reads the cached body for +url+ as utf-8 text and
# hands it to CsvHash.parse.
#
# @param url [String] url of the cached csv document
# @return [Object] whatever CsvHash.parse returns for the text
def read_csv( url )
  cache_file = "#{Webcache.root}/#{url_to_path( url )}"
  CsvHash.parse( File.read( cache_file, encoding: 'utf-8' ) )
end

#read_json(url) ⇒ Object



174
175
176
177
178
179
# File 'lib/webget/webcache.rb', line 174

# Reads the cached body for +url+ as utf-8 text and
# parses it with JSON.parse.
#
# @param url [String] url of the cached json document
# @return [Object] the parsed json data
def read_json( url )
  cache_file = "#{Webcache.root}/#{url_to_path( url )}"
  JSON.parse( File.read( cache_file, encoding: 'utf-8' ) )
end

#read_meta(url) ⇒ Object



189
190
191
192
193
194
195
# File 'lib/webget/webcache.rb', line 189

# Reads the meta file stored next to the cached body for +url+
# (that is, the body path plus ".meta.txt") as utf-8 text and
# hands it to Headers.parse.
#
# @param url [String] url of the cached page
# @return [Object] whatever Headers.parse returns for the text
def read_meta( url )
  meta_file = "#{Webcache.root}/#{url_to_path( url )}.meta.txt"
  Headers.parse( File.read( meta_file, encoding: 'utf-8' ) )
end

#record(url, response, path: nil, encoding: 'UTF-8', format: 'html') ⇒ Object

todo/check: add more aliases (e.g. save, put, etc.) - why? why not?

rename to record_html - why? why not?


200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# File 'lib/webget/webcache.rb', line 200

# Stores a response in the disk cache: the body goes into the file
# computed by #url_to_path (below Webcache.root) and the response
# headers go into a companion "<body file>.meta.txt" file
# (one "key: value" line per header).
#
# @param url [String] url the response was fetched from
# @param response [#text, #json, #headers] response object; must respond to
#   json (for format 'json'), text( encoding: ) (all other formats) and headers
# @param path [String, nil] custom (file) path passed on to #url_to_path
# @param encoding [String] encoding passed on to response.text
# @param format [String] 'json' stores the pretty printed json data;
#   any other value ('csv', 'html', 'txt', ...) stores the response text
#   with "\r\n" newlines converted to "\n"
# @return [Object] the result of iterating response.headers
def record( url, response,
            path: nil,
            encoding: 'UTF-8',
            format: 'html' )

  body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
  meta_path = "#{body_path}.meta.txt"

  ## note: build the complete body BEFORE opening (and truncating) the file;
  ##   otherwise an error in response.json or response.text
  ##   leaves an empty body file behind that counts as cached
  ##
  ## todo/check: verify content-type - why? why not?
  ## note - for now response.text always assume (converted) to utf8!!!!!!!!!
  ##
  ## fix: newlines - always use "unix" style - why? why not?
  ## fix:  use :newline => :universal option? translates to universal "\n"
  body = if format == 'json'
           JSON.pretty_generate( response.json )
         else   ## csv, html or txt
           response.text( encoding: encoding ).gsub( "\r\n", "\n" )
         end

  ## make sure path exists
  FileUtils.mkdir_p( File.dirname( body_path ) )

  puts "[cache] saving #{body_path}..."

  File.open( body_path, 'w:utf-8' ) {|f| f.write( body ) }

  File.open( meta_path, 'w:utf-8' ) do |f|
    ## todo/check:
    ##  do headers also need to converted (like text) if encoding is NOT utf-8 ???
    response.headers.each do |key, value|  # iterate all response headers
      f.write( "#{key}: #{value}" )
      f.write( "\n" )
    end
  end
end

#url_to_id(str) ⇒ Object

note: uses the file path as the id for DiskCache (may be different for DbCache/SqlCache?)

use file:// instead of disk:// - why? why not?


246
# File 'lib/webget/webcache.rb', line 246

# Builds the cache id for a url: the "disk://" prefix followed by
# the path computed by #url_to_path.
#
# @param str [String] url
# @return [String] id of the form "disk://<path>"
def url_to_id( str )
  cache_path = url_to_path( str )
  "disk://#{cache_path}"
end

#url_to_path(str, path: nil) ⇒ Object

helpers



250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
# File 'lib/webget/webcache.rb', line 250

# Maps a url to a relative (cache) file path of the form
# "<downcased host>/<request path>".
#
# The scheme (e.g. http/https) and the port are ignored. A number of
# host-specific "prettify" rules shorten the request path and/or
# add a file extension (see the rules in the method body).
#
# @param str [String] absolute url
# @param path [String, nil] custom (file) path to use instead of the
#   url's request path; note that the host-specific rules are applied to it too
# @return [String] relative file path
# @raise [ArgumentError] if the url has no host, or if a host with the
#   "trailing slash to .html" rule gets a request path not ending with '/'
# @raise [URI::InvalidURIError] if the url cannot be parsed
def url_to_path( str, path: nil )
  ## map url to file path
  uri  = URI.parse( str )
  host = uri.host
  if host.nil? || host.empty?
    raise ArgumentError, "expected url with host; got: >#{str}<"
  end

  ## note: ignore scheme (e.g. http/https)
  ##         and  port  (e.g. 80, 8080, etc.) for now
  ##    always downcase for now (internet domain is case insensitive)
  host_dir = host.downcase

  ## use "custom" (file)path for cache storage if passed in; otherwise use
  ##   "/this/is/everything?query=params" with the leading slash cut off
  ##
  ## NOTE(review): the request path is used as is - '..' segments are NOT
  ##   removed and might point outside of the cache root; confirm that
  ##   only trusted urls get passed in
  req_path = path || uri.request_uri[1..-1]

  ### todo/fix - move rules downstream to user - why? why not?

  ### special "prettify" rule for weltfussball and friends
  ##   /eng-league-one-2019-2020/  => /eng-league-one-2019-2020.html
  html_dir_hosts = ['uefa.com',
                    'kicker.de',
                    'kicker.at',
                    'weltfussball.de',
                    'worldfootball.net']

  if html_dir_hosts.any? {|domain| host_dir.include?( domain ) }
    unless req_path.end_with?( '/' )
      ## note: raise (instead of exit) - lets the caller decide how to handle the error
      raise ArgumentError, "expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
    end
    req_path = "#{req_path[0..-2]}.html"
  elsif host_dir.include?( 'tipp3.at' )
    req_path = req_path.sub( '.jsp', '' )  # shorten - cut off .jsp extension

    ##   change ? to -I-
    ##   change = to ~
    ##   Example:
    ##   sportwetten/classicresults.jsp?oddsetProgramID=888
    ##     =>
    ##   sportwetten/classicresults-I-oddsetProgramID~888
    req_path = req_path.gsub( '?', '-I-' )
                       .gsub( '=', '~')

    req_path = "#{req_path}.html"
  elsif host_dir.include?( 'fbref.com' )
    ## note: anchor to the start - otherwise en/ gets cut out of
    ##         the middle of e.g. de/women/...
    req_path = req_path.sub( %r{\Aen/}, '' )  # shorten - cut off leading en/
    req_path = "#{req_path}.html"             # auto-add html extension
  elsif host_dir.include?( 'football-data.co.uk' )
    req_path = req_path.sub( %r{\Ammz4281/}, '' )  # shorten - cut off leading mmz4281/
    req_path = req_path.sub( %r{\Anew/}, '' )      # shorten - cut off leading new/
  elsif host_dir.include?( 'football-data.org' )
    ##  req_path = req_path.sub( 'v2/', '' )  # shorten - cut off v2/

    ## flatten - make a file path - for auto-save
    ##   change ? to -I-
    ##   change / to ~~
    ##   change = to ~
    req_path = req_path.gsub( '?', '-I-' )
                       .gsub( '/', '~~' )
                       .gsub( '=', '~')

    req_path = "#{req_path}.json"
  elsif host_dir.include?( 'api.cryptokitties.co' )
    ## for now always auto-add .json extensions e.g.
    ##     kitties/1   => kitties/1.json
    ##     cattributes => cattributes.json
    req_path = "#{req_path}.json"
  else
    ## no special rule
  end

  "#{host_dir}/#{req_path}"
end