Class: Webcache::DiskCache

Inherits:

Object

Object
Webcache::DiskCache

show all

Defined in: lib/webget/webcache.rb

Instance Method Summary collapse

#cached?(url) ⇒ Boolean (also: #exist?)
#read(url) ⇒ Object
#read_csv(url) ⇒ Object
#read_json(url) ⇒ Object
#read_meta(url) ⇒ Object
#record(url, response, path: nil, encoding: 'UTF-8', format: 'html') ⇒ Object

add more save / put / etc. aliases - why? why not?
#url_to_id(str) ⇒ Object

note: use file path as id for DiskCache (is different for DbCache/SqlCache?); use file:// instead of disk:// - why? why not?
#url_to_path(str, path: nil) ⇒ Object

helpers.

Instance Method Details

#cached?(url) ⇒ `Boolean` Also known as: exist?

Returns:

(Boolean)

# File 'lib/webget/webcache.rb', line 162

# Checks if a body file for +url+ is present in the disk cache.
#
# The file location is "<Webcache.root>/<url_to_path( url )>".
#
# Returns true if that file exists, false otherwise.
def cached?( url )
  cache_root = Webcache.root
  File.exist?( "#{cache_root}/#{url_to_path( url )}" )
end

#read(url) ⇒ `Object`

# File 'lib/webget/webcache.rb', line 169

# Reads the cached body for +url+.
#
# The file "<Webcache.root>/<url_to_path( url )>" is read in one go
# using the utf-8 (external) encoding.
#
# Returns the file content as a string.
def read( url )
  cache_file = "#{Webcache.root}/#{url_to_path( url )}"
  File.read( cache_file, mode: 'r:utf-8' )
end

#read_csv(url) ⇒ `Object`

# File 'lib/webget/webcache.rb', line 181

# Reads the cached body for +url+ and parses it as csv.
#
# The file "<Webcache.root>/<url_to_path( url )>" is read as utf-8 text
# and handed to CsvHash.parse.
#
# Returns whatever CsvHash.parse returns for the text.
def read_csv( url )
  cache_file = "#{Webcache.root}/#{url_to_path( url )}"
  CsvHash.parse( File.read( cache_file, mode: 'r:utf-8' ) )
end

#read_json(url) ⇒ `Object`

# File 'lib/webget/webcache.rb', line 174

# Reads the cached body for +url+ and parses it as json.
#
# The file "<Webcache.root>/<url_to_path( url )>" is read as utf-8 text
# and handed to JSON.parse.
#
# Returns the parsed json data.
def read_json( url )
  cache_file = "#{Webcache.root}/#{url_to_path( url )}"
  JSON.parse( File.read( cache_file, mode: 'r:utf-8' ) )
end

#read_meta(url) ⇒ `Object`

# File 'lib/webget/webcache.rb', line 189

# Reads the cached meta data (response headers) for +url+.
#
# The meta file sits next to the body file and has the same path
# with ".meta.txt" appended. It is read as utf-8 text and handed
# to Headers.parse.
#
# Returns whatever Headers.parse returns for the text.
def read_meta( url )
  meta_file = "#{Webcache.root}/#{url_to_path( url )}.meta.txt"
  Headers.parse( File.read( meta_file, mode: 'r:utf-8' ) )
end

#record(url, response, path: nil, encoding: 'UTF-8', format: 'html') ⇒ `Object`

add more save / put / etc. aliases - why? why not?

rename to record_html - why? why not?

# File 'lib/webget/webcache.rb', line 200

# Saves a response (body plus headers) to the disk cache.
#
# The body is written to "<Webcache.root>/<url_to_path( url, path: path )>"
# and the response headers to a sibling file with ".meta.txt" appended,
# one "key: value" line per header.
#
# url       - url of the response; mapped to the cache file path
# response  - must respond to #json (used for format 'json'),
#             #text( encoding: ) (used for all other formats) and #headers
# path:     - optional custom (file) path; passed on to url_to_path
# encoding: - passed on to response.text
# format:   - 'json' stores the pretty-printed response.json;
#             anything else (csv, html, txt) stores response.text
#             with "\r\n" newlines converted to "\n"
#
# Returns the result of the headers iteration (response.headers.each).
def record( url, response,
            path: nil,
            encoding: 'UTF-8',
            format: 'html' )

  body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
  meta_path = "#{body_path}.meta.txt"

  ## build the body BEFORE touching the disk -
  ##   if response.json / response.text raises, no empty or truncated
  ##   body file is left behind (that cached? would report as cached)
  ##
  ## todo/check: verify content-type - why? why not?
  ## note - for now response.text always assume (converted) to utf8!!!!!!!!!
  ##
  ## fix: newlines - always use "unix" style - why? why not?
  ## fix:  use :newline => :universal option? translates to universal "\n"
  body = if format == 'json'
           JSON.pretty_generate( response.json )
         else   ## csv, html or txt
           response.text( encoding: encoding ).gsub( "\r\n", "\n" )
         end

  ## make sure path exists
  FileUtils.mkdir_p( File.dirname( body_path ) )

  puts "[cache] saving #{body_path}..."

  File.open( body_path, 'w:utf-8' ) { |f| f.write( body ) }

  File.open( meta_path, 'w:utf-8' ) do |f|
    ## todo/check:
    ##  do headers also need to converted (like text) if encoding is NOT utf-8 ???
    response.headers.each do |key, value|  # iterate all response headers
      f.write( "#{key}: #{value}\n" )
    end
  end
end

#url_to_id(str) ⇒ `Object`

note: use file path as id for DiskCache (is different for DbCache/SqlCache?)

use file:// instead of disk:// - why? why not?

# File 'lib/webget/webcache.rb', line 246

# Returns the cache id for a url -
#   the (relative) cache file path from url_to_path with a disk:// prefix.
def url_to_id( str )
  "disk://#{url_to_path( str )}"
end

#url_to_path(str, path: nil) ⇒ `Object`

helpers

# File 'lib/webget/webcache.rb', line 250

# Maps a url to a (relative) file path for cache storage.
#
# str   - url string; parsed with URI.parse
# path: - optional custom (file) path; used instead of the
#         request uri of the url if passed in
#
# Returns "<downcased host>/<request path>" with the host-specific
# rewrite rules below applied to the request path.
#
# Note: for hosts that require a request path ending with '/'
#   (uefa, kicker, weltfussball / worldfootball) any other request path
#   prints an error message and terminates the process (exit 1).
def url_to_path( str, path: nil )
  uri = URI.parse( str )

  ## note: ignore scheme (e.g. http/https)
  ##         and  port  (e.g. 80, 8080, etc.) for now
  ##    always downcase for now (internet domain is case insensitive)
  host_dir = uri.host.downcase

  ## use "custom" (file)path for cache storage if passed in; otherwise
  ##   "/this/is/everything?query=params" with the leading slash cut off
  req_path = path || uri.request_uri[1..-1]

  ## true if the host name contains any of the passed in domains
  on_host = ->( *domains ) { domains.any? { |domain| host_dir.include?( domain ) } }

  ### todo/fix - move rules downstream to user - why? why not?

  if on_host.call( 'uefa.com', 'kicker.de', 'kicker.at',
                   'weltfussball.de', 'worldfootball.net' )
    ### special "prettify" rule
    ##   /eng-league-one-2019-2020/  => /eng-league-one-2019-2020.html
    unless req_path.end_with?( '/' )
      puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
      exit 1
    end
    req_path = "#{req_path[0..-2]}.html"
  elsif on_host.call( 'tipp3.at' )
    ##   cut off .jsp extension
    ##   change ? to -I-
    ##   change = to ~
    ##   Example:
    ##   sportwetten/classicresults.jsp?oddsetProgramID=888
    ##     =>
    ##   sportwetten/classicresults-I-oddsetProgramID~888
    req_path = req_path.sub( '.jsp', '' )
                       .gsub( '?', '-I-' )
                       .gsub( '=', '~' )

    req_path = "#{req_path}.html"
  elsif on_host.call( 'fbref.com' )
    ## shorten - cut off leading en/ only
    ##   (anchored - an "en/" further down the path, e.g. in women/, stays)
    req_path = req_path.sub( %r{\Aen/}, '' )
    req_path = "#{req_path}.html"             # auto-add html extension
  elsif on_host.call( 'football-data.co.uk' )
    ## shorten - cut off leading mmz4281/ and leading new/ only (anchored)
    req_path = req_path.sub( %r{\Ammz4281/}, '' )
                       .sub( %r{\Anew/}, '' )
  elsif on_host.call( 'football-data.org' )
    ##  req_path = req_path.sub( 'v2/', '' )  # shorten - cut off v2/

    ## flatten - make a file path - for auto-save
    ##   change ? to -I-
    ##   change / to ~~
    ##   change = to ~
    req_path = req_path.gsub( '?', '-I-' )
                       .gsub( '/', '~~' )
                       .gsub( '=', '~' )

    req_path = "#{req_path}.json"
  elsif on_host.call( 'api.cryptokitties.co' )
    ## for now always auto-add .json extensions e.g.
    ##     kitties/1   => kitties/1.json
    ##     cattributes => cattributes.json
    req_path = "#{req_path}.json"
  end
  ## else - no special rule

  "#{host_dir}/#{req_path}"
end

Class: Webcache::DiskCache

Instance Method Summary collapse

Instance Method Details

#cached?(url) ⇒ Boolean Also known as: exist?

#read(url) ⇒ Object

#read_csv(url) ⇒ Object

#read_json(url) ⇒ Object

#read_meta(url) ⇒ Object

#record(url, response, path: nil, encoding: 'UTF-8', format: 'html') ⇒ Object

#url_to_id(str) ⇒ Object

#url_to_path(str, path: nil) ⇒ Object

#cached?(url) ⇒ `Boolean` Also known as: exist?

#read(url) ⇒ `Object`

#read_csv(url) ⇒ `Object`

#read_json(url) ⇒ `Object`

#read_meta(url) ⇒ `Object`

#record(url, response, path: nil, encoding: 'UTF-8', format: 'html') ⇒ `Object`

#url_to_id(str) ⇒ `Object`

#url_to_path(str, path: nil) ⇒ `Object`