Class: OpenC3::BucketUtilities

Inherits:
Object
  • Object
show all
Defined in:
lib/openc3/utilities/bucket_utilities.rb

Constant Summary collapse

FILE_TIMESTAMP_FORMAT =
"%Y%m%d%H%M%S%N"
DIRECTORY_TIMESTAMP_FORMAT =
"%Y%m%d"

Class Method Summary collapse

Class Method Details

.bucket_load(*args, scope: $openc3_scope) ⇒ Object

Raises:

  • (LoadError)


33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/openc3/utilities/bucket_utilities.rb', line 33

# Loads and evaluates a TARGET script stored in the bucket, mimicking
# require/load semantics for bucket-hosted files.
#
# @param args [Array] First element is the relative TARGET path to load
# @param scope [String] Scope name; falls back to OPENC3_SCOPE env var, then 'DEFAULT'
# @return [true] on a successful load (matching Kernel#require/#load)
# @raise [LoadError] if the path is not a relative TARGET path or the file is missing
def self.bucket_load(*args, scope: $openc3_scope)
  scope ||= ENV['OPENC3_SCOPE']
  scope ||= 'DEFAULT'
  path = args[0]

  # Only support TARGET files: reject absolute paths and any path whose
  # first segment is not already fully uppercase (i.e. not a TARGET name)
  segments = path.split('/')
  if path.start_with?('/') || segments[0].to_s.upcase != segments[0]
    raise LoadError, "only relative TARGET files are allowed -- #{path}"
  end
  path += '.rb' if File.extname(path).empty?

  # Retrieve the text of the script from S3
  text = TargetFile.body(scope, path)
  raise LoadError, "Bucket file #{path} not found for scope #{scope}" unless text

  # Execute the script directly without instrumentation because we are doing require/load
  Object.class_eval(text, path, 1)

  # Successful load/require returns true
  true
end

.compress_file(filename, chunk_size = 50_000_000) ⇒ Object



131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# File 'lib/openc3/utilities/bucket_utilities.rb', line 131

# Gzip-compresses a file on disk, streaming it in chunks so very large
# files never have to be held fully in memory.
#
# @param filename [String] Path of the file to compress
# @param chunk_size [Integer] Bytes read per iteration (defaults to 50 MB)
# @return [String] Path of the compressed file ("#{filename}.gz")
def self.compress_file(filename, chunk_size = 50_000_000)
  # NOTE: restored from garbled extraction — the compressed file is the
  # original filename with a .gz suffix
  zipped = "#{filename}.gz"

  Zlib::GzipWriter.open(zipped) do |gz|
    # Preserve the original mtime and name in the gzip header
    gz.mtime = File.mtime(filename)
    gz.orig_name = filename
    File.open(filename, 'rb') do |file|
      while chunk = file.read(chunk_size) do
        gz.write(chunk)
      end
    end
  end

  return zipped
end

.directory_in_time_range(directory, start_time, end_time) ⇒ Object



171
172
173
174
175
176
177
178
179
180
# File 'lib/openc3/utilities/bucket_utilities.rb', line 171

# Returns whether a date-named (DIRECTORY_TIMESTAMP_FORMAT) directory's
# one-day span intersects the given time range. A nil start_time or
# end_time means that side of the range is unbounded.
def self.directory_in_time_range(directory, start_time, end_time)
  dir_start = DateTime.strptime(File.basename(directory), DIRECTORY_TIMESTAMP_FORMAT).to_time
  dir_end = dir_start + Time::SEC_PER_DAY
  starts_before_dir_ends = start_time.nil? || start_time < dir_end
  ends_after_dir_starts = end_time.nil? || end_time >= dir_start
  starts_before_dir_ends && ends_after_dir_starts
end

.file_in_time_range(bucket_path, start_time, end_time, overlap:) ⇒ Object



194
195
196
197
198
199
200
201
202
203
204
205
206
# File 'lib/openc3/utilities/bucket_utilities.rb', line 194

# Returns whether the time span encoded in a bucket file's name matches
# the query range. With overlap: true a partial intersection counts;
# otherwise the file span must lie entirely within [start_time, end_time].
# nil start_time/end_time leaves that side unbounded.
def self.file_in_time_range(bucket_path, start_time, end_time, overlap:)
  file_start, file_end = get_file_times(bucket_path)
  if overlap
    (start_time.nil? || start_time <= file_end) &&
      (end_time.nil? || end_time >= file_start)
  else
    (start_time.nil? || start_time <= file_start) &&
      (end_time.nil? || end_time >= file_end)
  end
end

.files_between_time(bucket, prefix, start_time, end_time, file_suffix: nil, overlap: false, max_request: 1000, max_total: 100_000) ⇒ Object

Parameters:

  • bucket (String)

    Name of the bucket to list

  • prefix (String)

    Prefix to filter all files by

  • start_time (Time|nil)

    Ruby time to find files after. nil means no start (first file on).

  • end_time (Time|nil)

    Ruby time to find files before. nil means no end (up to last file).

  • overlap (Boolean) (defaults to: false)

    Whether to include files which overlap the start and end time if true, file can be partially in the time range if false, file must be completely in the time range

  • max_request (Integer) (defaults to: 1000)

    How many files to request in each API call

  • max_total (Integer) (defaults to: 100_000)

    Total number of files before stopping API requests



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/openc3/utilities/bucket_utilities.rb', line 65

# Lists bucket files under prefix whose encoded times fall in the range.
#
# @param bucket [String] Name of the bucket to list
# @param prefix [String] Prefix to filter all files by
# @param start_time [Time|nil] Find files after this time; nil means from the first file on
# @param end_time [Time|nil] Find files before this time; nil means up to the last file
# @param file_suffix [String|nil] Only include files ending with this suffix
# @param overlap [Boolean] If true, files partially in the range are included
# @param max_request [Integer] How many files to request in each API call
# @param max_total [Integer] Total number of files before stopping API requests
# @return [Array<String>] Matching file keys, oldest directories first
def self.files_between_time(bucket, prefix, start_time, end_time, file_suffix: nil,
                            overlap: false, max_request: 1000, max_total: 100_000)
  client = Bucket.getClient()

  # The bucket won't exist at the very beginning, so return nothing
  return [] unless client.exist?(bucket)

  directories = client.list_files(bucket: bucket, path: prefix, only_directories: true)
  filter_directories_to_time_range(directories, start_time, end_time).flat_map do |directory|
    directory_files = client.list_objects(bucket: bucket, prefix: "#{prefix}/#{directory}",
                                          max_request: max_request, max_total: max_total)
    filter_files_to_time_range(directory_files, start_time, end_time,
                               file_suffix: file_suffix, overlap: overlap)
  end
end

.filter_directories_to_time_range(directories, start_time, end_time) ⇒ Object

Private methods



163
164
165
166
167
168
169
# File 'lib/openc3/utilities/bucket_utilities.rb', line 163

# Keeps only the directories whose date-named span intersects the range.
# (Private helper for files_between_time.)
def self.filter_directories_to_time_range(directories, start_time, end_time)
  directories.select do |directory|
    directory_in_time_range(directory, start_time, end_time)
  end
end

.filter_files_to_time_range(files, start_time, end_time, file_suffix: nil, overlap: false) ⇒ Object



182
183
184
185
186
187
188
189
190
191
192
# File 'lib/openc3/utilities/bucket_utilities.rb', line 182

# Keeps only the file keys whose encoded time span matches the range,
# optionally restricted to a filename suffix.
# (Private helper for files_between_time.)
def self.filter_files_to_time_range(files, start_time, end_time, file_suffix: nil, overlap: false)
  files.each_with_object([]) do |file, matched|
    key = file.key.to_s
    next if file_suffix && !key.end_with?(file_suffix)
    matched << key if file_in_time_range(key, start_time, end_time, overlap: overlap)
  end
end

.get_cache_control(filename) ⇒ Object



123
124
125
126
127
128
129
# File 'lib/openc3/utilities/bucket_utilities.rb', line 123

# Determines the Cache-Control header value for a file.
# Files whose names embed a semantic version (e.g. -1.2.3.) or a 20-char
# hex content hash are immutable per name and may be cached (nil);
# everything else gets 'no-store'.
def self.get_cache_control(filename)
  versioned = /(-|_|\.)\d+(-|_|\.)\d+(-|_|\.)\d+\./.match?(filename)
  content_hashed = /\.[a-f0-9]{20}\./.match?(filename)
  return nil if versioned || content_hashed

  'no-store'
end

.get_file_times(bucket_path) ⇒ Object



208
209
210
211
212
213
214
# File 'lib/openc3/utilities/bucket_utilities.rb', line 208

# Parses the start and end timestamps encoded in a bucket log filename,
# which is formatted as START__END__... using FILE_TIMESTAMP_FORMAT.
# @return [Array(Time, Time)] the file's start and end times
def self.get_file_times(bucket_path)
  start_stamp, end_stamp, _ = File.basename(bucket_path).split("__")
  [DateTime.strptime(start_stamp, FILE_TIMESTAMP_FORMAT).to_time,
   DateTime.strptime(end_stamp, FILE_TIMESTAMP_FORMAT).to_time]
end

.move_log_file_to_bucket(filename, bucket_key, metadata: {}) ⇒ Object



85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/openc3/utilities/bucket_utilities.rb', line 85

# Uploads a local log file to the logs bucket in a background thread.
# Non-.txt files are gzip-compressed first (and bucket_key gets a .gz
# suffix). Local files are deleted after a successful upload.
#
# @param filename [String] Local path of the log file to move
# @param bucket_key [String] Destination object key within OPENC3_LOGS_BUCKET
# @param metadata [Hash] Optional object metadata stored with the upload
# @return [Thread] The background thread performing the upload
def self.move_log_file_to_bucket(filename, bucket_key, metadata: {})
  Thread.new do
    client = Bucket.getClient()

    orig_filename = nil
    if File.extname(filename) != '.txt'
      orig_filename = filename
      filename = compress_file(filename)
      bucket_key += '.gz'
    end

    retry_count = 0
    begin
      # We want to open this as a file and pass that to put_object to allow
      # this to work with really large files. Otherwise the entire file has
      # to be held in memory!
      File.open(filename, 'rb') do |file|
        # NOTE: restored from garbled extraction — pass the metadata keyword through
        client.put_object(bucket: ENV['OPENC3_LOGS_BUCKET'], key: bucket_key, body: file, metadata: metadata)
      end
    rescue => err
      # Try to upload file three times
      retry_count += 1
      raise err if retry_count >= 3
      # NOTE: restored from garbled extraction — log which file failed
      Logger.warn("Error saving log file to bucket - retry #{retry_count}: #{filename}\n#{err.formatted}")
      sleep(1)
      retry
    end

    Logger.debug "wrote #{ENV['OPENC3_LOGS_BUCKET']}/#{bucket_key}"
    ReducerModel.add_file(bucket_key) # Record the new file for data reduction

    File.delete(orig_filename) if orig_filename
    File.delete(filename)
  rescue => err
    Logger.error("Error saving log file to bucket: #{filename}\n#{err.formatted}")
  end
end

.uncompress_file(filename, chunk_size = 50_000_000) ⇒ Object



147
148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/openc3/utilities/bucket_utilities.rb', line 147

# Decompresses a .gz file, streaming it in chunks so large files never
# have to be held fully in memory.
#
# @param filename [String] Path of the .gz file to decompress
# @param chunk_size [Integer] Bytes read per iteration (defaults to 50 MB)
# @return [String] Path of the decompressed file (filename without '.gz')
def self.uncompress_file(filename, chunk_size = 50_000_000)
  destination = filename[0..-4] # Drop .gz

  File.open(destination, 'wb') do |out|
    Zlib::GzipReader.open(filename) do |gz|
      chunk = gz.read(chunk_size)
      until chunk.nil?
        out.write(chunk)
        chunk = gz.read(chunk_size)
      end
    end
  end

  destination
end