Class: EasyML::Data::Dataset
- Inherits:
-
Object
- Object
- EasyML::Data::Dataset
show all
- Includes:
- Utils, Logging, GlueGun::DSL
- Defined in:
- lib/easy_ml/data/dataset.rb,
lib/easy_ml/data/dataset/splits.rb,
lib/easy_ml/data/dataset/splitters.rb,
lib/easy_ml/data/dataset/splits/split.rb,
lib/easy_ml/data/dataset/splits/file_split.rb,
lib/easy_ml/data/dataset/splits/in_memory_split.rb
Defined Under Namespace
Modules: Splits, Splitters
Instance Method Summary
collapse
Methods included from Utils
#append_to_csv, #expand_dir, #null_check
Methods included from Logging
included, #log_info, #log_message, #log_verbose, #log_warning
Instance Method Details
#check_nulls(data_type = :processed) ⇒ Object
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
|
# File 'lib/easy_ml/data/dataset.rb', line 261
# Reports, per column, the percentage of null values found in each of the
# train/test/valid segments.
#
# Null counts come from `null_check(df)`, which is used here as a hash-like
# object keyed by column name whose entries expose `:null_count` (or a falsy
# value when there is nothing to report) -- NOTE(review): confirm against the
# Utils implementation.
#
# @param data_type [Symbol] `:raw` reads from `raw`; any other value reads
#   from `processed`
# @return [Hash, nil] `{ column => { segment => percent } }` with percentages
#   rounded to one decimal place, limited to columns with a non-zero
#   percentage in at least one segment; nil when no column qualifies
def check_nulls(data_type = :processed)
  # The source does not depend on the segment, so resolve it once.
  data_source = data_type == :raw ? raw : processed

  result = %i[train test valid].each_with_object({}) do |segment, acc|
    tallies = {}

    # `read` may yield several frames for one segment; accumulate across them.
    data_source.read(segment) do |df|
      df_nulls = null_check(df)
      df.columns.each do |column|
        counts = (tallies[column] ||= { null_count: 0, total_count: 0 })
        column_nulls = df_nulls && df_nulls[column]
        counts[:null_count] += column_nulls[:null_count] if column_nulls
        counts[:total_count] += df.height
      end
    end

    tallies.each do |column, counts|
      total = counts[:total_count]
      # A segment whose frames are all empty has no rows to divide by.
      # Dividing anyway yields NaN, which is not `zero?` and would therefore
      # survive the filter below and be reported as a column with nulls.
      percentage = total.zero? ? 0.0 : (counts[:null_count].to_f / total * 100).round(1)
      acc[column] ||= {}
      acc[column][segment] = percentage
    end
  end

  # Keep only columns that actually contain nulls somewhere.
  result.reject! { |_, v| v.values.all?(&:zero?) }
  result.empty? ? nil : result
end
|
#cleanup ⇒ Object
256
257
258
259
|
# File 'lib/easy_ml/data/dataset.rb', line 256
# Invokes `cleanup` on `raw` first and then on `processed`.
#
# @return [Object] whatever `processed.cleanup` returns
def cleanup
  raw_side = raw
  raw_side.cleanup

  processed_side = processed
  processed_side.cleanup
end
|
#data(split_ys: false, all_columns: false) ⇒ Object
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
|
# File 'lib/easy_ml/data/dataset.rb', line 238
# Loads the train, valid and test segments (in that order) and joins them
# with `Polars.concat`.
#
# @param split_ys [Boolean] when truthy, every segment is loaded with
#   `split_ys: true` and unpacked as an `[x, y]` pair
# @param all_columns [Boolean] forwarded unchanged to each segment loader
# @return [Object, Array] the concatenated segments, or `[xs, ys]` (the
#   concatenated first elements and the concatenated second elements) when
#   `split_ys` is truthy
def data(split_ys: false, all_columns: false)
  # The loaders always receive a literal true/false, whatever truthy or
  # falsy value the caller supplied.
  loader_options = { split_ys: split_ys ? true : false, all_columns: all_columns }
  segments = [train(**loader_options), valid(**loader_options), test(**loader_options)]
  return Polars.concat(segments) unless split_ys

  features = segments.map { |x, _y| x }
  targets = segments.map { |_x, y| y }
  [Polars.concat(features), Polars.concat(targets)]
end
|
#decode_labels(ys, col: nil) ⇒ Object
294
295
296
|
# File 'lib/easy_ml/data/dataset.rb', line 294
# Delegates label decoding to `preprocessor.decode_labels`.
#
# @param ys [Object] values to decode; passed through untouched
# @param col [Object, nil] column to decode against; `target` is used only
#   when this is nil (a `false` value is forwarded as given)
# @return [Object] whatever `preprocessor.decode_labels` returns
def decode_labels(ys, col: nil)
  decoder = preprocessor
  column = col
  column = target if column.nil?
  decoder.decode_labels(ys, col: column)
end
|
#normalize(df = nil) ⇒ Object
215
216
217
218
219
|
# File 'lib/easy_ml/data/dataset.rb', line 215
# Runs a frame through three steps in order: `drop_nulls`, then
# `apply_transforms`, then `preprocessor.postprocess`.
#
# @param df [Object, nil] handed to `drop_nulls` as-is; defaults to nil
#   (NOTE(review): how `drop_nulls` treats nil is not visible here)
# @return [Object] the result of `preprocessor.postprocess`
def normalize(df = nil)
  without_nulls = drop_nulls(df)
  transformed = apply_transforms(without_nulls)
  preprocessor.postprocess(transformed)
end
|
#polars_args=(args) ⇒ Object
66
67
68
69
70
71
72
73
|
# File 'lib/easy_ml/data/dataset.rb', line 66
# Normalizes the key types of the Polars options before handing them to the
# parent writer via `super`.
#
# All keys are symbolized with `deep_symbolize_keys`; the value under
# `:dtypes` then has its own keys turned back into strings with
# `stringify_keys`. (Both helpers are presumably ActiveSupport's Hash
# extensions -- they are not Ruby core.)
#
# @param args [Hash] options hash
# @return [Object] whatever the parent writer returns
def polars_args=(args)
  normalized = args.deep_symbolize_keys.each_with_object({}) do |(key, value), memo|
    memo[key] = key == :dtypes ? value.stringify_keys : value
  end
  super(normalized)
end
|
#processed? ⇒ Boolean
290
291
292
|
# File 'lib/easy_ml/data/dataset.rb', line 290
# Tells whether the dataset counts as processed, defined here as the
# negation of `should_split?`.
#
# @return [Boolean] false when `should_split?` is truthy, true otherwise
def processed?
  return false if should_split?

  true
end
|
#refresh! ⇒ Object
207
208
209
210
211
212
213
|
# File 'lib/easy_ml/data/dataset.rb', line 207
# Runs the full refresh sequence. Steps execute strictly in this order:
# `refresh_datasource`, `split_data`, `fit`, `normalize_all`, `alert_nulls`.
#
# @return [Object] whatever `alert_nulls` returns
def refresh!
  %i[refresh_datasource split_data fit normalize_all].each { |step| send(step) }
  alert_nulls
end
|
#root_dir=(value) ⇒ Object
57
58
59
|
# File 'lib/easy_ml/data/dataset.rb', line 57
# Stores the root directory with a "data" component applied, as a String,
# by passing it to the parent writer via `super`.
#
# NOTE(review): `Pathname#append` is not part of Ruby's standard library;
# presumably a project core extension -- confirm its exact semantics before
# replacing it with `join`/`+`.
#
# @param value [String, Pathname] base directory
# @return [Object] whatever the parent writer returns
def root_dir=(value)
  data_dir = Pathname.new(value).append("data")
  super(data_dir.to_s)
end
|
#test(split_ys: false, all_columns: false, &block) ⇒ Object
234
235
236
|
# File 'lib/easy_ml/data/dataset.rb', line 234
# Loads the `:test` segment through `load_data`.
#
# @param split_ys [Boolean] forwarded to `load_data`
# @param all_columns [Boolean] forwarded to `load_data`
# @yield optional block, forwarded to `load_data`
# @return [Object] whatever `load_data` returns
def test(split_ys: false, all_columns: false, &block)
  options = { split_ys: split_ys, all_columns: all_columns }
  load_data(:test, **options, &block)
end
|
#today=(value) ⇒ Object
47
48
49
|
# File 'lib/easy_ml/data/dataset.rb', line 47
# Stores "today" as a date: the value is passed through
# `in_time_zone(UTC)`, reduced with `to_date`, and handed to the parent
# writer via `super`.
#
# @param value [#in_time_zone] time-like object (`in_time_zone` is
#   presumably ActiveSupport's zone conversion -- not Ruby core)
# @return [Object] whatever the parent writer returns
def today=(value)
  utc_value = value.in_time_zone(UTC)
  super(utc_value.to_date)
end
|
#train(split_ys: false, all_columns: false, &block) ⇒ Object
226
227
228
|
# File 'lib/easy_ml/data/dataset.rb', line 226
# Loads the `:train` segment through `load_data`.
#
# @param split_ys [Boolean] forwarded to `load_data`
# @param all_columns [Boolean] forwarded to `load_data`
# @yield optional block, forwarded to `load_data`
# @return [Object] whatever `load_data` returns
def train(split_ys: false, all_columns: false, &block)
  options = { split_ys: split_ys, all_columns: all_columns }
  load_data(:train, **options, &block)
end
|
#transforms_are_transforms ⇒ Object
77
78
79
80
81
|
# File 'lib/easy_ml/data/dataset.rb', line 77
# Validation hook: records an error on `:transforms` when `transforms` is
# set (non-nil) but does not respond to `transform`.
#
# @return [Object, nil] the result of `errors.add` when an error is
#   recorded, nil otherwise
def transforms_are_transforms
  lacks_interface = !transforms.nil? && !transforms.respond_to?(:transform)
  errors.add(:transforms, "Must respond to transform, try including EasyML::Data::Transforms") if lacks_interface
end
|
#valid(split_ys: false, all_columns: false, &block) ⇒ Object
230
231
232
|
# File 'lib/easy_ml/data/dataset.rb', line 230
# Loads the `:valid` segment through `load_data`.
#
# @param split_ys [Boolean] forwarded to `load_data`
# @param all_columns [Boolean] forwarded to `load_data`
# @yield optional block, forwarded to `load_data`
# @return [Object] whatever `load_data` returns
def valid(split_ys: false, all_columns: false, &block)
  options = { split_ys: split_ys, all_columns: all_columns }
  load_data(:valid, **options, &block)
end
|