Class: EasyML::Data::Dataset

Inherits:
Object
  • Object
show all
Includes:
Utils, Logging, GlueGun::DSL
Defined in:
lib/easy_ml/data/dataset.rb,
lib/easy_ml/data/dataset/splits.rb,
lib/easy_ml/data/dataset/splitters.rb,
lib/easy_ml/data/dataset/splits/split.rb,
lib/easy_ml/data/dataset/splits/file_split.rb,
lib/easy_ml/data/dataset/splits/in_memory_split.rb

Defined Under Namespace

Modules: Splits, Splitters

Instance Method Summary collapse

Methods included from Utils

#append_to_csv, #expand_dir, #null_check

Methods included from Logging

included, #log_info, #log_message, #log_verbose, #log_warning

Instance Method Details

#check_nulls(data_type = :processed) ⇒ Object



261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
# File 'lib/easy_ml/data/dataset.rb', line 261

# Reports, per column, the percentage of null values found in each split.
#
# Reads the :train, :test and :valid segments from either +raw+ or
# +processed+. For every frame yielded by +read+ it accumulates the null
# counts reported by +null_check+ and the row count (+df.height+) per column.
#
# @param data_type [Symbol] +:raw+ reads from +raw+; any other value reads
#   from +processed+
# @return [Hash, nil] +{ column => { segment => percentage } }+ with
#   percentages rounded to one decimal, restricted to columns that have a
#   non-zero percentage in at least one segment; +nil+ when nothing remains
def check_nulls(data_type = :processed)
  # The source does not depend on the segment, so resolve it once.
  data_source = data_type == :raw ? raw : processed

  result = %i[train test valid].each_with_object({}) do |segment, acc|
    counts_by_column = {}

    data_source.read(segment) do |df|
      df_nulls = null_check(df)
      df.columns.each do |column|
        counts = (counts_by_column[column] ||= { null_count: 0, total_count: 0 })
        counts[:null_count] += df_nulls[column][:null_count] if df_nulls && df_nulls[column]
        counts[:total_count] += df.height
      end
    end

    counts_by_column.each do |column, counts|
      total = counts[:total_count]
      # A segment that yields columns but no rows would divide 0.0 by 0 and
      # produce NaN, which is never zero? and so would defeat the filter
      # below. Treat "no rows" as "no nulls".
      percentage = total.zero? ? 0.0 : (counts[:null_count].to_f / total * 100).round(1)
      (acc[column] ||= {})[segment] = percentage
    end
  end

  # Remove columns that have no nulls across all segments
  result.reject! { |_, percentages| percentages.values.all?(&:zero?) }

  result.empty? ? nil : result
end

#cleanup ⇒ Object



256
257
258
259
# File 'lib/easy_ml/data/dataset.rb', line 256

# Cleans up both data splits.
#
# Calls +cleanup+ on +raw+ first, then on +processed+.
#
# @return [Object] whatever +processed.cleanup+ returns
def cleanup
  %i[raw processed].map { |split_name| send(split_name).cleanup }.last
end

#data(split_ys: false, all_columns: false) ⇒ Object



238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# File 'lib/easy_ml/data/dataset.rb', line 238

# Loads the train, valid and test splits (in that order) and concatenates
# them with +Polars.concat+.
#
# @param split_ys [Boolean] when truthy, each split is loaded with
#   +split_ys: true+ and destructured into a pair; the first and second
#   elements are concatenated separately
# @param all_columns [Boolean] forwarded unchanged to each split loader
# @return [Object, Array] the concatenated result, or a two-element array
#   +[xs, ys]+ when +split_ys+ is truthy
def data(split_ys: false, all_columns: false)
  unless split_ys
    frames = [
      train(split_ys: false, all_columns: all_columns),
      valid(split_ys: false, all_columns: all_columns),
      test(split_ys: false, all_columns: all_columns)
    ]
    return Polars.concat(frames)
  end

  pairs = [
    train(split_ys: true, all_columns: all_columns),
    valid(split_ys: true, all_columns: all_columns),
    test(split_ys: true, all_columns: all_columns)
  ]

  features = Polars.concat(pairs.map { |x, _y| x })
  targets = Polars.concat(pairs.map { |_x, y| y })
  [features, targets]
end

#decode_labels(ys, col: nil) ⇒ Object



294
295
296
# File 'lib/easy_ml/data/dataset.rb', line 294

# Decodes labels by delegating to +preprocessor.decode_labels+.
#
# @param ys [Object] values forwarded unchanged to the preprocessor
# @param col [Object, nil] column to decode; +target+ is used when nil
# @return [Object] whatever +preprocessor.decode_labels+ returns
def decode_labels(ys, col: nil)
  decoder = preprocessor
  label_column = col
  label_column = target if label_column.nil?
  decoder.decode_labels(ys, col: label_column)
end

#normalize(df = nil) ⇒ Object



215
216
217
218
219
# File 'lib/easy_ml/data/dataset.rb', line 215

# Runs a frame through the normalization pipeline: +drop_nulls+, then
# +apply_transforms+, then +preprocessor.postprocess+.
#
# @param df [Object, nil] frame to normalize; passed as-is to +drop_nulls+
# @return [Object] whatever +preprocessor.postprocess+ returns
def normalize(df = nil)
  without_nulls = drop_nulls(df)
  transformed = apply_transforms(without_nulls)
  preprocessor.postprocess(transformed)
end

#polars_args=(args) ⇒ Object



66
67
68
69
70
71
72
73
# File 'lib/easy_ml/data/dataset.rb', line 66

# Normalizes the keys of the given options before delegating to the
# inherited writer.
#
# All keys are deeply symbolized; the value stored under +:dtypes+ then has
# its own keys converted back to strings.
#
# @param args [Hash] options hash (must respond to +deep_symbolize_keys+)
# @return [Object] whatever the inherited writer returns
def polars_args=(args)
  normalized = args.deep_symbolize_keys.each_with_object({}) do |(key, value), memo|
    memo[key] = key == :dtypes ? value.stringify_keys : value
  end
  super(normalized)
end

#processed? ⇒ Boolean

Returns:

  • (Boolean)


290
291
292
# File 'lib/easy_ml/data/dataset.rb', line 290

# Tells whether the dataset counts as processed, defined here as the
# negation of +should_split?+.
#
# @return [Boolean] false when +should_split?+ is truthy, true otherwise
def processed?
  return false if should_split?

  true
end

#refresh! ⇒ Object



207
208
209
210
211
212
213
# File 'lib/easy_ml/data/dataset.rb', line 207

# Runs the full refresh sequence, in order: +refresh_datasource+,
# +split_data+, +fit+, +normalize_all+, +alert_nulls+.
#
# @return [Object] whatever the final step (+alert_nulls+) returns
def refresh!
  pipeline = %i[refresh_datasource split_data fit normalize_all alert_nulls]
  pipeline.map { |step| send(step) }.last
end

#root_dir=(value) ⇒ Object



57
58
59
# File 'lib/easy_ml/data/dataset.rb', line 57

# Stores the root directory with a "data" component appended, as a String,
# via the inherited writer.
#
# NOTE(review): +Pathname#append+ is not part of Ruby's standard library;
# it is presumably a project-provided extension — confirm its semantics.
#
# @param value [String, Pathname] base directory handed to +Pathname.new+
# @return [Object] whatever the inherited writer returns
def root_dir=(value)
  data_dir = Pathname.new(value).append("data")
  super(data_dir.to_s)
end

#test(split_ys: false, all_columns: false, &block) ⇒ Object



234
235
236
# File 'lib/easy_ml/data/dataset.rb', line 234

# Loads the +:test+ segment through +load_data+.
#
# @param split_ys [Boolean] forwarded to +load_data+
# @param all_columns [Boolean] forwarded to +load_data+
# @param block [Proc] optional block forwarded to +load_data+
# @return [Object] whatever +load_data+ returns
def test(split_ys: false, all_columns: false, &block)
  options = { split_ys: split_ys, all_columns: all_columns }
  load_data(:test, **options, &block)
end

#today=(value) ⇒ Object



47
48
49
# File 'lib/easy_ml/data/dataset.rb', line 47

# Converts the given value to the +UTC+ zone, reduces it to a date, and
# hands the result to the inherited writer.
#
# @param value [Object] anything responding to +in_time_zone+
# @return [Object] whatever the inherited writer returns
def today=(value)
  utc_time = value.in_time_zone(UTC)
  super(utc_time.to_date)
end

#train(split_ys: false, all_columns: false, &block) ⇒ Object



226
227
228
# File 'lib/easy_ml/data/dataset.rb', line 226

# Loads the +:train+ segment through +load_data+.
#
# @param split_ys [Boolean] forwarded to +load_data+
# @param all_columns [Boolean] forwarded to +load_data+
# @param block [Proc] optional block forwarded to +load_data+
# @return [Object] whatever +load_data+ returns
def train(split_ys: false, all_columns: false, &block)
  options = { split_ys: split_ys, all_columns: all_columns }
  load_data(:train, **options, &block)
end

#transforms_are_transforms ⇒ Object



77
78
79
80
81
# File 'lib/easy_ml/data/dataset.rb', line 77

# Validation hook: records an error on +:transforms+ when a transforms
# object is set but does not respond to +transform+.
#
# @return [Object, nil] the result of +errors.add+ when the check fails,
#   otherwise nil
def transforms_are_transforms
  invalid = !transforms.nil? && !transforms.respond_to?(:transform)
  errors.add(:transforms, "Must respond to transform, try including EasyML::Data::Transforms") if invalid
end

#valid(split_ys: false, all_columns: false, &block) ⇒ Object



230
231
232
# File 'lib/easy_ml/data/dataset.rb', line 230

# Loads the +:valid+ segment through +load_data+.
#
# @param split_ys [Boolean] forwarded to +load_data+
# @param all_columns [Boolean] forwarded to +load_data+
# @param block [Proc] optional block forwarded to +load_data+
# @return [Object] whatever +load_data+ returns
def valid(split_ys: false, all_columns: false, &block)
  options = { split_ys: split_ys, all_columns: all_columns }
  load_data(:valid, **options, &block)
end