Class: EasyML::Data::Preprocessor

Inherits:
Object
  • Object
show all
Includes:
Utils, GlueGun::DSL
Defined in:
lib/easy_ml/data/preprocessor.rb,
lib/easy_ml/data/preprocessor/utils.rb,
lib/easy_ml/data/preprocessor/simple_imputer.rb

Defined Under Namespace

Modules: Utils Classes: SimpleImputer

Constant Summary collapse

CATEGORICAL_COMMON_MIN =
50
PREPROCESSING_ORDER =
%w[clip mean median constant categorical one_hot ffill custom fill_date add_datepart]

Instance Method Summary collapse

Methods included from Utils

#standardize_config

Instance Method Details

#decode_labels(values, col: nil) ⇒ Object



97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/easy_ml/data/preprocessor.rb', line 97

# Translates encoded categorical values for +col+ back into their labels.
#
# The label decoder is read from the statistics of the column's "categorical"
# imputer. One extra code (the largest known code plus one) is mapped to "other".
#
# @param values [#map] encoded values; each one is looked up by its +to_s+ form
# @param col [String, nil] key used to locate the column's imputers
# @return [Object] the result of +values.map+ (an Array when +values+ is an Array);
#   codes that are absent from the decoder map to +nil+
def decode_labels(values, col: nil)
  imputers = initialize_imputers(preprocessing_steps[:training], dumb: true)
  imputer = imputers.dig(col, "categorical")
  label_decoder = imputer.statistics.dig(:categorical, :label_decoder)

  # Build a string-keyed copy instead of editing the statistics hash in place:
  # mutating it would leave an extra "other" entry behind on every call that
  # receives the same hash again.
  decoder = label_decoder.each_with_object({}) do |(code, label), copy|
    copy[code.to_s] = label
  end
  other_code = decoder.keys.map(&:to_i).max + 1
  decoder[other_code.to_s] = "other"

  values.map { |value| decoder[value.to_s] }
end

#delete ⇒ Object



81
82
83
84
85
# File 'lib/easy_ml/data/preprocessor.rb', line 81

# Removes the directory referenced by @directory, including its contents.
# Does nothing (and returns nil) when that path is not an existing directory.
def delete
  FileUtils.rm_rf(@directory) if File.directory?(@directory)
end

#fit(df) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/easy_ml/data/preprocessor.rb', line 23

# Fits every configured imputer against +df+.
#
# Training and inference steps are combined for fitting. Imputers are visited
# column by column, in the order given by +sorted_strategies+. +cleanup+ is
# invoked once, on the first imputer visited. Column names are matched
# case-insensitively.
#
# @param df [Object] data frame responding to +columns+, +[]+ and +[]=+; nothing is
#   done when it is +nil+
# @return [Object, nil] +nil+ on the early exits, otherwise the imputers collection
def fit(df)
  return if df.nil?
  return if preprocessing_steps.keys.none?

  puts "Preprocessing..." if verbose
  # Non-destructive merge: merge! would permanently fold the inference steps into
  # preprocessing_steps[:training], leaking them into later training-only calls.
  steps = preprocessing_steps[:training].merge(preprocessing_steps[:inference] || {})
  imputers = initialize_imputers(steps)

  did_cleanup = false
  imputers.each do |col, column_imputers|
    sorted_strategies(column_imputers).each do |strategy|
      imputer = column_imputers[strategy]
      unless did_cleanup
        imputer.cleanup
        did_cleanup = true
      end
      if df.columns.map(&:downcase).include?(col.downcase)
        actual_col = df.columns.find { |c| c.downcase == imputer.attribute.downcase }
        imputer.fit(df[actual_col], df)
        if strategy == "clip" # This is the only one to transform during fit
          df[actual_col] = imputer.transform(df[actual_col])
        end
      elsif verbose
        puts "Warning: Column '#{col}' not found in DataFrame during fit process."
      end
    end
  end
end

#is_fit? ⇒ Boolean

Returns:

  • (Boolean)


77
78
79
# File 'lib/easy_ml/data/preprocessor.rb', line 77

# Reports whether any strategy of any column has non-blank statistics.
#
# @return [Boolean]
def is_fit?
  statistics.each_value.any? do |stats_by_strategy|
    stats_by_strategy.each_value.any?(&:present?)
  end
end

#move(to) ⇒ Object



87
88
89
90
91
92
93
94
95
# File 'lib/easy_ml/data/preprocessor.rb', line 87

# Renames the final segment of +directory+ to +to+ and moves the directory there.
#
# Only the last path segment is replaced. A regex substitution over the whole
# path would also rewrite earlier segments that happen to contain the same
# text, and would misread segments containing regex metacharacters.
#
# @param to [String] new name for the last path segment
# @return [String] the new directory path, which is also stored in @directory
def move(to)
  old_dir = directory
  new_dir = File.join(File.dirname(old_dir), to)

  puts "Moving #{old_dir} to #{new_dir}"
  FileUtils.mv(old_dir, new_dir)
  @directory = new_dir
end

#postprocess(df, inference: false) ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/easy_ml/data/preprocessor.rb', line 53

# Applies the configured transformations to +df+.
#
# With +inference: true+ the inference steps are merged over the training steps
# (without modifying either); otherwise only the training steps are applied.
#
# @param df [Object] data frame handed to +apply_transformations+
# @param inference [Boolean] whether inference steps should be included
# @return [Object] +df+ unchanged when no steps are configured, otherwise the
#   result of +apply_transformations+
def postprocess(df, inference: false)
  puts "Postprocessing..." if verbose
  return df if preprocessing_steps.keys.none?

  training_steps = preprocessing_steps[:training]
  steps = inference ? training_steps.merge(preprocessing_steps[:inference] || {}) : training_steps

  df = apply_transformations(df, steps)

  # Use the same reader as the opening message so both messages follow one flag.
  puts "Postprocessing complete." if verbose
  df
end

#preprocessing_steps=(preprocessing_steps) ⇒ Object



19
20
21
# File 'lib/easy_ml/data/preprocessor.rb', line 19

# Writer for the preprocessing configuration: runs the given value through
# +standardize_config+, converts the result with +with_indifferent_access+ and
# passes it on to the inherited writer.
def preprocessing_steps=(preprocessing_steps)
  standardized = standardize_config(preprocessing_steps)
  super(standardized.with_indifferent_access)
end

#statistics ⇒ Object



69
70
71
72
73
74
75
# File 'lib/easy_ml/data/preprocessor.rb', line 69

# Collects the statistics of every imputer built from the training steps.
#
# @return [Hash] column => { strategy => imputer.statistics }
def statistics
  stats_by_column = {}
  initialize_imputers(preprocessing_steps[:training]).each do |col, strategies|
    stats_by_strategy = {}
    strategies.each do |strategy, imputer|
      stats_by_strategy[strategy] = imputer.statistics
    end
    stats_by_column[col] = stats_by_strategy
  end
  stats_by_column
end