Class: EasyML::Data::PreprocessingSteps::Preprocessor

Inherits:
Object
  • Object
show all
Includes:
Utils
Defined in:
lib/easy_ml/data/preprocessor/preprocessor.rb

Constant Summary collapse

# NOTE(review): presumably the minimum occurrence count for a categorical value
# to count as "common"; it is not referenced in this section -- confirm.
CATEGORICAL_COMMON_MIN = 50
# Strategy names in a fixed order (presumably what sorted_strategies sorts by --
# confirm). Frozen so the shared array cannot be mutated at runtime.
PREPROCESSING_ORDER = %w[clip mean median constant categorical one_hot ffill custom fill_date add_datepart].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(directory: nil, preprocessing_steps: {}, verbose: false, environment: "development") ⇒ Preprocessor

Returns a new instance of Preprocessor.



16
17
18
19
20
21
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 16

# Builds a preprocessor from its configuration.
#
# @param directory [String, nil] path stored as-is in @directory
# @param preprocessing_steps [Hash] raw step configuration; it is passed
#   through standardize_config and then with_indifferent_access before
#   being stored
# @param verbose [Boolean] stored as-is in @verbose
# @param environment [String] stored as-is in @environment
def initialize(directory: nil, preprocessing_steps: {}, verbose: false, environment: "development")
  @directory = directory

  standardized_steps = standardize_config(preprocessing_steps)
  @preprocessing_steps = standardized_steps.with_indifferent_access

  @verbose = verbose
  @environment = environment
end

Instance Attribute Details

#directory ⇒ Object

Returns the value of attribute directory.



14
15
16
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 14

# Returns the directory path stored by the constructor (nil unless one was
# given). #move reassigns it after relocating the directory on disk.
#
# @return [String, nil]
def directory
  @directory
end

#environment ⇒ Object

Returns the value of attribute environment.



14
15
16
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 14

# Returns the environment value passed to the constructor, which defaults
# to "development".
#
# @return [String]
def environment
  @environment
end

#imputers ⇒ Object

Returns the value of attribute imputers.



14
15
16
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 14

# Returns the value of @imputers.
#
# NOTE(review): nothing in this section assigns @imputers (#fit keeps the
# imputers it builds in a local variable), so this may be nil unless it is
# set elsewhere -- confirm.
#
# @return [Object, nil]
def imputers
  @imputers
end

#preprocessing_steps ⇒ Object

Returns the value of attribute preprocessing_steps.



14
15
16
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 14

# Returns the step configuration as stored by the constructor, i.e. after it
# has been passed through standardize_config and with_indifferent_access.
# Other methods of this class read its :training and :inference entries.
#
# @return [Object] the indifferent-access configuration
def preprocessing_steps
  @preprocessing_steps
end

#verbose ⇒ Object

Returns the value of attribute verbose.



14
15
16
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 14

# Returns the verbose flag passed to the constructor (false by default).
# When it is truthy, #fit and #postprocess print progress messages.
#
# @return [Boolean]
def verbose
  @verbose
end

Instance Method Details

#delete ⇒ Object



81
82
83
84
85
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 81

# Removes the preprocessor's directory tree from disk.
#
# Does nothing when no directory is configured (the constructor default is
# nil) or when the configured path is not an existing directory.
#
# @return [Object, nil] whatever FileUtils.rm_rf returns, or nil when
#   nothing was removed
def delete
  # File.directory?(nil) raises TypeError, so rule out the nil default first.
  return unless @directory && File.directory?(@directory)

  FileUtils.rm_rf(@directory)
end

#fit(df) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 23

# Fits every configured imputer against its matching column of +df+.
#
# The :training steps and the optional :inference steps are combined and
# handed to initialize_imputers; each column's imputers are then visited in
# the order given by sorted_strategies. Columns are matched to DataFrame
# columns case-insensitively. The "clip" strategy is additionally applied
# to +df+ in place while fitting.
#
# @param df [Object, nil] DataFrame-like object responding to #columns,
#   #[] and #[]=
# @return [Object, nil] the collection returned by initialize_imputers, or
#   nil when +df+ is nil or no steps are configured
def fit(df)
  return if df.nil?
  return if preprocessing_steps.keys.none?

  puts "Preprocessing..." if verbose
  # Non-destructive merge: merge! would permanently fold the :inference steps
  # into the :training config that #postprocess and #statistics read later.
  fit_steps = preprocessing_steps[:training].merge(preprocessing_steps[:inference] || {})
  imputers_by_column = initialize_imputers(fit_steps)

  did_cleanup = false
  imputers_by_column.each do |col, column_imputers|
    sorted_strategies(column_imputers).each do |strategy|
      imputer = column_imputers[strategy]
      # cleanup is invoked exactly once, on the first imputer visited.
      unless did_cleanup
        imputer.cleanup
        did_cleanup = true
      end
      if df.columns.any? { |c| c.downcase == col.downcase }
        actual_col = df.columns.find { |c| c.downcase == imputer.attribute.downcase }
        imputer.fit(df[actual_col], df)
        if strategy == "clip" # This is the only one to transform during fit
          df[actual_col] = imputer.transform(df[actual_col])
        end
      elsif @verbose
        puts "Warning: Column '#{col}' not found in DataFrame during fit process."
      end
    end
  end
end

#is_fit? ⇒ Boolean

Returns:

  • (Boolean)


77
78
79
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 77

# Reports whether any statistic has been recorded.
#
# @return [Boolean] true when at least one strategy entry of at least one
#   column in #statistics is present?
def is_fit?
  statistics.each_value.any? do |col_stats|
    col_stats.each_value.any?(&:present?)
  end
end

#move(to) ⇒ Object



87
88
89
90
91
92
93
94
95
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 87

# Renames the preprocessor directory so that its final path component
# becomes +to+, then records the new path in @directory.
#
# @param to [String] replacement for the last component of the current path
# @return [String] the new directory path
def move(to)
  old_dir = directory
  current_env = old_dir.split("/").last
  # Replace only the trailing component, matched literally. A gsub with
  # Regexp.new(current_env) would also rewrite every earlier occurrence of
  # that name in the path and would read regexp metacharacters in it as
  # pattern syntax. The block form keeps backslashes in +to+ from being
  # interpreted as back-references.
  new_dir = old_dir.sub(%r{#{Regexp.escape(current_env)}(?=/*\z)}) { to }

  puts "Moving #{old_dir} to #{new_dir}"
  FileUtils.mv(old_dir, new_dir)
  @directory = new_dir
end

#postprocess(df, inference: false) ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 53

# Applies the configured transformations to +df+.
#
# Only the :training steps are used by default. With +inference+ set, the
# :inference steps (if any) are merged on top of a copy of them.
#
# @param df [Object] DataFrame-like object handed to apply_transformations
# @param inference [Boolean] whether to include the :inference steps
# @return [Object] the result of apply_transformations, or +df+ itself when
#   no steps are configured
def postprocess(df, inference: false)
  puts "Postprocessing..." if verbose
  return df if preprocessing_steps.keys.none?

  steps = preprocessing_steps[:training]
  steps = steps.merge(preprocessing_steps[:inference] || {}) if inference

  transformed = apply_transformations(df, steps)
  puts "Postprocessing complete." if @verbose
  transformed
end

#statistics ⇒ Object



69
70
71
72
73
74
75
# File 'lib/easy_ml/data/preprocessor/preprocessor.rb', line 69

# Collects the statistics of every imputer built from the :training steps.
#
# @return [Hash] column => { strategy => that imputer's #statistics }
def statistics
  imputers_by_column = initialize_imputers(preprocessing_steps[:training])

  imputers_by_column.map do |col, strategies|
    per_strategy = strategies.map { |strategy, imputer| [strategy, imputer.statistics] }
    [col, per_strategy.to_h]
  end.to_h
end