Class: EasyML::Data::PreprocessingSteps::Preprocessor
- Inherits:
-
Object
- Object
- EasyML::Data::PreprocessingSteps::Preprocessor
- Includes:
- Utils
- Defined in:
- lib/easy_ml/data/preprocessor/preprocessor.rb
Constant Summary collapse
- CATEGORICAL_COMMON_MIN =
50
- PREPROCESSING_ORDER =
%w[clip mean median constant categorical one_hot ffill custom fill_date add_datepart]
Instance Attribute Summary collapse
-
#directory ⇒ Object
Returns the value of attribute directory.
-
#environment ⇒ Object
Returns the value of attribute environment.
-
#imputers ⇒ Object
Returns the value of attribute imputers.
-
#preprocessing_steps ⇒ Object
Returns the value of attribute preprocessing_steps.
-
#verbose ⇒ Object
Returns the value of attribute verbose.
Instance Method Summary collapse
- #delete ⇒ Object
- #fit(df) ⇒ Object
-
#initialize(directory: nil, preprocessing_steps: {}, verbose: false, environment: "development") ⇒ Preprocessor
constructor
A new instance of Preprocessor.
- #is_fit? ⇒ Boolean
- #move(to) ⇒ Object
- #postprocess(df, inference: false) ⇒ Object
- #statistics ⇒ Object
Constructor Details
#initialize(directory: nil, preprocessing_steps: {}, verbose: false, environment: "development") ⇒ Preprocessor
Returns a new instance of Preprocessor.
16 17 18 19 20 21 |
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 16
# Builds a preprocessor from keyword options.
#
# @param directory [Object] stored as-is in +@directory+ (defaults to nil)
# @param preprocessing_steps [Hash] raw step configuration; it is passed
#   through +standardize_config+ and then +with_indifferent_access+
# @param verbose [Boolean] stored in +@verbose+; gates the progress output
#   of other methods
# @param environment [String] stored as-is in +@environment+
def initialize(directory: nil, preprocessing_steps: {}, verbose: false, environment: "development")
  @directory = directory
  normalized_steps = standardize_config(preprocessing_steps)
  @preprocessing_steps = normalized_steps.with_indifferent_access
  @verbose = verbose
  @environment = environment
end
Instance Attribute Details
#directory ⇒ Object
Returns the value of attribute directory.
14 15 16 |
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 14 def directory @directory end |
#environment ⇒ Object
Returns the value of attribute environment.
14 15 16 |
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 14 def environment @environment end |
#imputers ⇒ Object
Returns the value of attribute imputers.
14 15 16 |
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 14 def imputers @imputers end |
#preprocessing_steps ⇒ Object
Returns the value of attribute preprocessing_steps.
14 15 16 |
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 14 def preprocessing_steps @preprocessing_steps end |
#verbose ⇒ Object
Returns the value of attribute verbose.
14 15 16 |
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 14 def verbose @verbose end |
Instance Method Details
#delete ⇒ Object
81 82 83 84 85 |
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 81
# Recursively removes the preprocessor's directory from disk.
#
# Does nothing when no directory is configured or when the configured path
# is not an existing directory.
#
# @return [Object, nil] the result of +FileUtils.rm_rf+, or nil when there
#   was nothing to delete
def delete
  # The constructor defaults directory to nil, and File.directory?(nil)
  # raises TypeError, so check for nil before touching the filesystem.
  return unless @directory && File.directory?(@directory)

  FileUtils.rm_rf(@directory)
end
#fit(df) ⇒ Object
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 23
# Fits the configured imputers against the columns of +df+.
#
# Imputers are built from the training steps merged with the inference
# steps (inference entries win on key collisions). Columns are matched
# case-insensitively. Only the "clip" strategy also transforms the column
# in place while fitting.
#
# NOTE(review): the imputers are held in a local variable and are not stored
# in +@imputers+ — confirm that is intended.
#
# @param df [Object, nil] data frame-like object responding to +columns+,
#   +[]+ and +[]=+; nil makes this a no-op
# @return [Object, nil] the imputer collection that was iterated, or nil
#   when there is nothing to fit
def fit(df)
  return if df.nil?
  return if preprocessing_steps.keys.none?

  puts "Preprocessing..." if verbose
  # Non-mutating merge: merge! would permanently fold the inference steps
  # into preprocessing_steps[:training], so later calls that read only the
  # training steps would silently pick up the inference steps too.
  fit_config = preprocessing_steps[:training].merge(preprocessing_steps[:inference] || {})
  imputers = initialize_imputers(fit_config)

  did_cleanup = false
  imputers.each do |col, col_imputers|
    sorted_strategies(col_imputers).each do |strategy|
      imputer = col_imputers[strategy]
      # cleanup is invoked once per fit, on the first imputer visited.
      unless did_cleanup
        imputer.cleanup
        did_cleanup = true
      end

      if df.columns.map(&:downcase).include?(col.downcase)
        actual_col = df.columns.find { |c| c.downcase == imputer.attribute.downcase }
        imputer.fit(df[actual_col], df)
        if strategy == "clip" # This is the only one to transform during fit
          df[actual_col] = imputer.transform(df[actual_col])
        end
      elsif @verbose
        puts "Warning: Column '#{col}' not found in DataFrame during fit process."
      end
    end
  end
end
#is_fit? ⇒ Boolean
77 78 79 |
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 77
# Reports whether any imputer has recorded statistics.
#
# @return [Boolean] true when at least one strategy of at least one column
#   in +statistics+ has a +present?+ value
def is_fit?
  statistics.each_value.any? do |col_stats|
    col_stats.each_value.any?(&:present?)
  end
end
#move(to) ⇒ Object
87 88 89 90 91 92 93 94 95 |
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 87
# Moves the preprocessor's directory to a sibling directory named +to+ and
# updates +@directory+ to the new location.
#
# Only the final path component is replaced. Earlier components that
# happen to contain the same text are left untouched.
#
# @param to [String] new name for the last path component
# @return [String] the new directory path
def move(to)
  old_dir = directory
  # Swap the last segment structurally rather than with a regex gsub, which
  # would rewrite every occurrence of the segment name in the path and would
  # treat the name as an unescaped pattern.
  new_dir = File.join(File.dirname(old_dir), to)
  puts "Moving #{old_dir} to #{new_dir}"
  FileUtils.mv(old_dir, new_dir)
  @directory = new_dir
end
#postprocess(df, inference: false) ⇒ Object
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 53
# Applies the configured transformations to +df+.
#
# @param df [Object] data to transform; returned untouched when no
#   preprocessing steps are configured
# @param inference [Boolean] when true, the inference steps are merged over
#   the training steps; otherwise only the training steps are used
# @return [Object] the result of +apply_transformations+, or +df+ itself
#   when there are no steps
def postprocess(df, inference: false)
  puts "Postprocessing..." if verbose
  return df if preprocessing_steps.keys.none?

  training_steps = preprocessing_steps[:training]
  steps = inference ? training_steps.merge(preprocessing_steps[:inference] || {}) : training_steps

  transformed = apply_transformations(df, steps)
  puts "Postprocessing complete." if @verbose
  transformed
end
#statistics ⇒ Object
69 70 71 72 73 74 75 |
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 69
# Collects the statistics of every imputer built from the training steps.
#
# @return [Hash] column => { strategy => imputer.statistics }
def statistics
  column_imputers = initialize_imputers(preprocessing_steps[:training])

  column_imputers.map do |col, strategies|
    per_strategy = strategies.map { |strategy, imputer| [strategy, imputer.statistics] }.to_h
    [col, per_strategy]
  end.to_h
end