Class: EasyML::Data::Preprocessor
- Inherits:
-
Object
- Object
- EasyML::Data::Preprocessor
show all
- Includes:
- Utils, GlueGun::DSL
- Defined in:
- lib/easy_ml/data/preprocessor.rb,
lib/easy_ml/data/preprocessor/utils.rb,
lib/easy_ml/data/preprocessor/simple_imputer.rb
Defined Under Namespace
Modules: Utils
Classes: SimpleImputer
Constant Summary
collapse
- CATEGORICAL_COMMON_MIN =
50
- PREPROCESSING_ORDER =
%w[clip mean median constant categorical one_hot ffill custom fill_date add_datepart]
Instance Method Summary
collapse
Methods included from Utils
#standardize_config
Instance Method Details
#decode_labels(values, col: nil) ⇒ Object
97
98
99
100
101
102
103
104
105
106
107
108
109
|
# File 'lib/easy_ml/data/preprocessor.rb', line 97
# Translates encoded categorical values for +col+ back into their labels.
#
# The label decoder is read from the statistics of the column's
# "categorical" imputer. One extra code (largest known code + 1) is mapped
# to the label "other". Lookups are done on the string form of each value,
# so integer and string codes decode alike; unknown codes decode to nil.
#
# @param values [Enumerable] encoded values to decode
# @param col [String, nil] column whose categorical decoder should be used
# @return [Array] decoded labels, in the same order as +values+
def decode_labels(values, col: nil)
  imputers = initialize_imputers(preprocessing_steps[:training], dumb: true)
  imputer = imputers.dig(col, "categorical")
  decoder = imputer.statistics.dig(:categorical, :label_decoder)

  # Work on a string-keyed copy: the decoder belongs to the imputer's
  # statistics, and modifying it in place would add another "other" entry
  # on every call if that hash is shared between calls.
  labels = decoder.transform_keys(&:to_s)
  other_value = labels.keys.map(&:to_i).max + 1
  labels[other_value.to_s] = "other"

  values.map { |value| labels[value.to_s] }
end
|
#delete ⇒ Object
81
82
83
84
85
|
# File 'lib/easy_ml/data/preprocessor.rb', line 81
# Recursively removes the directory referenced by @directory.
# Does nothing (and returns nil) when that path is not an existing directory.
def delete
  FileUtils.rm_rf(@directory) if File.directory?(@directory)
end
|
#fit(df) ⇒ Object
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
# File 'lib/easy_ml/data/preprocessor.rb', line 23
# Fits every configured imputer against +df+.
#
# Imputers are built from the :training steps combined with the :inference
# steps (when present). For each column, strategies are visited in the
# order given by +sorted_strategies+. Column names are matched against the
# DataFrame case-insensitively. Columns that are configured but absent from
# +df+ are skipped, with a warning when verbose.
#
# @param df [Object, nil] DataFrame-like object responding to +columns+,
#   +[]+ and +[]=+; nothing happens when nil
# @return [Object, nil] nil on the early exits, otherwise the imputers
#   collection that was iterated
def fit(df)
  return if df.nil?
  return if preprocessing_steps.keys.none?

  puts "Preprocessing..." if verbose
  # NOTE(review): merge! folds the :inference steps into the :training hash
  # in place, so later reads of preprocessing_steps[:training] also see
  # them. Kept as-is; confirm this is intended before switching to merge.
  imputers = initialize_imputers(
    preprocessing_steps[:training].merge!(preprocessing_steps[:inference] || {})
  )

  did_cleanup = false
  imputers.each do |col, col_imputers|
    sorted_strategies(col_imputers).each do |strategy|
      imputer = col_imputers[strategy]

      # cleanup is invoked exactly once, on the first imputer visited.
      unless did_cleanup
        imputer.cleanup
        did_cleanup = true
      end

      if df.columns.map(&:downcase).include?(col.downcase)
        actual_col = df.columns.find { |c| c.downcase == imputer.attribute.downcase }
        imputer.fit(df[actual_col], df)
        # "clip" is the only strategy applied to df during fitting, so any
        # strategy visited afterwards for this column reads clipped values.
        df[actual_col] = imputer.transform(df[actual_col]) if strategy == "clip"
      elsif verbose
        puts "Warning: Column '#{col}' not found in DataFrame during fit process."
      end
    end
  end
end
|
#is_fit? ⇒ Boolean
77
78
79
|
# File 'lib/easy_ml/data/preprocessor.rb', line 77
# Reports whether any strategy of any column has non-blank statistics.
#
# @return [Boolean] true as soon as one strategy's statistics are present
def is_fit?
  statistics.any? do |_col, col_stats|
    col_stats.any? { |_strategy, strategy_stats| strategy_stats.present? }
  end
end
|
#move(to) ⇒ Object
87
88
89
90
91
92
93
94
95
|
# File 'lib/easy_ml/data/preprocessor.rb', line 87
# Renames the last component of +directory+ to +to+, moves the directory on
# disk, and points @directory at the new location.
#
# Only the trailing path component is replaced. The previous gsub-based
# rewrite substituted every occurrence of that component's name anywhere in
# the path (e.g. "test_runs/test" became "production_runs/production") and
# interpreted the name as a regular expression.
#
# @param to [String] new name for the last path component
# @return [String] the new directory path
def move(to)
  old_dir = directory
  current_env = old_dir.split("/")[-1]
  # Anchor at the end of the string and keep any trailing slashes as they were.
  trailing_component = %r{#{Regexp.escape(current_env)}(/*)\z}
  new_dir = old_dir.sub(trailing_component) { "#{to}#{Regexp.last_match(1)}" }
  puts "Moving #{old_dir} to #{new_dir}"
  FileUtils.mv(old_dir, new_dir)
  @directory = new_dir
end
|
#postprocess(df, inference: false) ⇒ Object
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
# File 'lib/easy_ml/data/preprocessor.rb', line 53
# Applies the configured transformations to +df+.
#
# With +inference: true+ the :inference steps are merged over a copy of the
# :training steps; otherwise only the :training steps are used. When no
# steps are configured at all, +df+ is returned untouched.
#
# Both progress messages are gated on the +verbose+ reader. The completion
# message used to check @verbose directly, which can disagree with the
# reader used for the start message.
#
# @param df [Object] DataFrame-like object handed to +apply_transformations+
# @param inference [Boolean] whether to include the :inference steps
# @return [Object] the result of +apply_transformations+, or +df+ itself
#   when there is nothing to apply
def postprocess(df, inference: false)
  puts "Postprocessing..." if verbose
  return df if preprocessing_steps.keys.none?

  steps = if inference
            preprocessing_steps[:training].merge(preprocessing_steps[:inference] || {})
          else
            preprocessing_steps[:training]
          end

  df = apply_transformations(df, steps)
  puts "Postprocessing complete." if verbose
  df
end
|
#preprocessing_steps=(preprocessing_steps) ⇒ Object
19
20
21
|
# File 'lib/easy_ml/data/preprocessor.rb', line 19
# Assigns the preprocessing configuration after passing it through
# +standardize_config+ and converting the result with
# +with_indifferent_access+; the converted value is handed to the
# inherited writer.
#
# @param preprocessing_steps [Object] raw configuration accepted by
#   +standardize_config+
def preprocessing_steps=(preprocessing_steps)
  normalized = standardize_config(preprocessing_steps).with_indifferent_access
  super(normalized)
end
|
#statistics ⇒ Object
69
70
71
72
73
74
75
|
# File 'lib/easy_ml/data/preprocessor.rb', line 69
# Collects the statistics of every imputer built from the :training steps.
#
# @return [Hash] column => { strategy => that imputer's statistics }
def statistics
  by_column = {}
  initialize_imputers(preprocessing_steps[:training]).each do |col, strategies|
    by_strategy = {}
    strategies.each do |strategy, imputer|
      by_strategy[strategy] = imputer.statistics
    end
    by_column[col] = by_strategy
  end
  by_column
end
|