Class: EasyML::Data::Preprocessor::SimpleImputer
- Inherits: Object
- Inheritance chain: Object → EasyML::Data::Preprocessor::SimpleImputer
- Defined in: lib/easy_ml/data/preprocessor/simple_imputer.rb
Instance Attribute Summary
-
#attribute ⇒ Object
Returns the value of attribute attribute.
-
#options ⇒ Object
Returns the value of attribute options.
-
#path ⇒ Object
Returns the value of attribute path.
-
#statistics ⇒ Object
readonly
Returns the value of attribute statistics.
-
#strategy ⇒ Object
Returns the value of attribute strategy.
Instance Method Summary
- #apply_defaults ⇒ Object
- #cleanup ⇒ Object
- #deep_symbolize_keys! ⇒ Object
- #file_path ⇒ Object
- #fit(x, df = nil) ⇒ Object
- #fit_custom(x) ⇒ Object
-
#initialize(strategy: "mean", path: nil, attribute: nil, options: {}, &block) ⇒ SimpleImputer
constructor
A new instance of SimpleImputer.
- #load ⇒ Object
- #save ⇒ Object
- #should_transform_categorical?(val) ⇒ Boolean
- #should_transform_custom?(x) ⇒ Boolean
- #transform(x) ⇒ Object
- #transform_categorical(val) ⇒ Object
- #transform_custom(x) ⇒ Object
- #transform_polars(x) ⇒ Object
- #transform_today(_val) ⇒ Object
Constructor Details
#initialize(strategy: "mean", path: nil, attribute: nil, options: {}, &block) ⇒ SimpleImputer
Returns a new instance of SimpleImputer.
12 13 14 15 16 17 18 19 20 21 22 23 24 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 12

# Builds an imputer, applies the strategy defaults, then tries to load
# previously saved statistics.
#
# NOTE(review): `load` goes through `file_path`, which raises unless both
# `attribute` and `path` are present, so constructing with the default
# `path: nil` / `attribute: nil` looks like it raises -- confirm intended.
#
# @param strategy [#to_sym] imputation strategy; stored as a Symbol
# @param path [Object, nil] joined with "statistics.json" by `file_path`
# @param attribute [Object, nil] key under which statistics are stored
# @param options [Hash, nil] strategy options; nil becomes an empty Hash.
#   The Hash is stored as-is and `apply_defaults` writes default entries into it.
# @yield optional block, evaluated with `instance_eval` on the new instance
def initialize(strategy: "mean", path: nil, attribute: nil, options: {}, &block)
  @strategy = strategy.to_sym
  @path = path
  @attribute = attribute
  @options = options || {}

  apply_defaults
  load
  # `load` leaves @statistics untouched when nothing was read from disk.
  @statistics ||= {}
  deep_symbolize_keys!
  return unless block_given?

  instance_eval(&block)
end
Instance Attribute Details
#attribute ⇒ Object
Returns the value of attribute attribute.
10 11 12 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 10

# Reader for the `attribute` attribute.
#
# @return [Object] the current value of @attribute
def attribute
  @attribute
end
#options ⇒ Object
Returns the value of attribute options.
10 11 12 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 10

# Reader for the `options` attribute.
#
# @return [Object] the current value of @options
def options
  @options
end
#path ⇒ Object
Returns the value of attribute path.
10 11 12 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 10

# Reader for the `path` attribute.
#
# @return [Object] the current value of @path
def path
  @path
end
#statistics ⇒ Object (readonly)
Returns the value of attribute statistics.
9 10 11 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 9

# Read-only accessor for the `statistics` attribute.
#
# @return [Object] the current value of @statistics
def statistics
  @statistics
end
#strategy ⇒ Object
Returns the value of attribute strategy.
10 11 12 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 10

# Reader for the `strategy` attribute.
#
# @return [Object] the current value of @strategy
def strategy
  @strategy
end
Instance Method Details
#apply_defaults ⇒ Object
30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 30

# Fills in option defaults without overwriting entries that already hold a
# truthy value (`||=`).
#
# - always: `:date_column` => "CREATED_DATE"
# - `:categorical` strategy: `:categorical_min` => 25
# - `:custom` strategy: `:fit` and `:transform` => a lambda returning its argument
def apply_defaults
  @options[:date_column] ||= "CREATED_DATE"

  case strategy
  when :categorical
    @options[:categorical_min] ||= 25
  when :custom
    # Named `identity` so the local does not shadow Kernel#itself.
    identity = ->(col) { col }
    @options[:fit] ||= identity
    @options[:transform] ||= identity
  end
end
#cleanup ⇒ Object
134 135 136 137 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 134

# Resets the in-memory statistics to an empty Hash and deletes the
# statistics file if it exists.
#
# `file_path` may raise (see its own guard); that happens after
# @statistics has already been reset.
def cleanup
  @statistics = {}

  stats_file = file_path
  FileUtils.rm(stats_file) if File.exist?(stats_file)
end
#deep_symbolize_keys! ⇒ Object
26 27 28 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 26

# Replaces @statistics with the result of calling `deep_symbolize_keys` on it.
# `deep_symbolize_keys` is not defined in this view; presumably the
# ActiveSupport Hash extension -- confirm.
#
# @return [Object] the new value of @statistics
def deep_symbolize_keys!
  @statistics = @statistics.deep_symbolize_keys
end
#file_path ⇒ Object
128 129 130 131 132 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 128

# Location of the statistics file: "statistics.json" inside `path`.
#
# @return [String] `path` joined with "statistics.json"
# @raise [RuntimeError] unless both `attribute` and `path.to_s` are `present?`
def file_path
  raise "Need both attribute and path to save/load statistics" unless attribute.present? && path.to_s.present?

  File.join(path, "statistics.json")
end
#fit(x, df = nil) ⇒ Object
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 42

# Computes the statistics for the configured strategy, stores them under
# `@statistics[attribute][@strategy]` together with the input's dtype, and
# persists them via `save`.
#
# NOTE(review): `fit_custom` returns its argument unchanged, so for the
# :custom strategy `merge!` is called on whatever `validate_input` returned
# -- confirm that object responds to `merge!`.
#
# @param x [Object] input column; must respond to `dtype` after `validate_input`
# @param df [Object, nil] only forwarded to `fit_ffill`
# @return [self]
# @raise [ArgumentError] when @strategy is not one of the handled strategies
def fit(x, df = nil)
  x = validate_input(x)

  fit_values = case @strategy
               when :mean then fit_mean(x)
               when :median then fit_median(x)
               when :ffill then fit_ffill(x, df)
               when :most_frequent then fit_most_frequent(x)
               when :categorical then fit_categorical(x)
               when :constant then fit_constant(x)
               when :clip, :today, :one_hot then fit_no_op(x)
               when :custom then fit_custom(x)
               else
                 raise ArgumentError, "Invalid strategy: #{@strategy}"
               end || {}

  @statistics[attribute] ||= {}
  # merge! mutates the hash returned by the fit_* helper.
  @statistics[attribute][@strategy] = fit_values.merge!(original_dtype: x.dtype)
  save
  self
end
#fit_custom(x) ⇒ Object
189 190 191 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 189

# Fit step for the :custom strategy: hands the input back untouched.
#
# @param x [Object]
# @return [Object] the same object that was passed in
def fit_custom(x)
  x
end
#load ⇒ Object
155 156 157 158 159 160 161 162 163 164 165 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 155

# Reads the statistics file, if present, and replaces @statistics with the
# deserialized entry for this attribute. Does nothing when the file or the
# attribute entry is missing.
#
# @return [Object, nil] the symbolized statistics, or nil when nothing was loaded
def load
  stats_file = file_path
  return unless File.exist?(stats_file)

  all_statistics = JSON.parse(File.read(stats_file))
  # JSON.parse yields String keys, so a Symbol attribute must be looked up
  # by its String form or it would never match.
  attribute_stats = all_statistics[@attribute.to_s]
  return unless attribute_stats

  @statistics = deserialize_statistics(attribute_stats)
  deep_symbolize_keys!
end
#save ⇒ Object
139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 139

# Writes this attribute's statistics for the current strategy into the
# shared statistics file, keeping whatever other entries the file already
# holds. Creates the containing directory when needed.
#
# @return [Integer] number of bytes written
def save
  stats_file = file_path
  FileUtils.mkdir_p(File.dirname(stats_file))

  existing = File.exist?(stats_file) ? JSON.parse(File.read(stats_file)) : {}
  all_statistics = existing.deep_symbolize_keys

  deep_symbolize_keys!
  serialized = serialize_statistics(@statistics)

  # The parsed file is symbolized above, so index it with Symbol keys.
  # Indexing with a String attribute would add a second, duplicate entry
  # next to the existing one and lose its other strategies on re-read.
  attribute_key = attribute.to_sym
  strategy_key = @strategy.to_sym
  all_statistics[attribute_key] ||= {}
  all_statistics[attribute_key][strategy_key] = serialized[attribute_key][strategy_key]

  File.write(stats_file, JSON.pretty_generate(all_statistics))
end
#should_transform_categorical?(val) ⇒ Boolean
167 168 169 170 171 172 173 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 167

# Tells whether `val` is outside the set of categories seen often enough.
# Categories come from `@statistics[:categorical][:value]` (category => count);
# a category is allowed when its count is at least `options[:categorical_min]`
# (25 when unset).
#
# `val` is compared as given against the stringified category names, so a
# non-String value is always reported as needing a transform.
#
# @param val [Object]
# @return [Boolean] true when `val` is not one of the allowed category names
def should_transform_categorical?(val)
  values = @statistics.dig(:categorical, :value) || {}
  min_ct = options[:categorical_min] || 25
  allowed_names = values.select { |_v, c| c >= min_ct }.keys.map(&:to_s)

  !allowed_names.include?(val)
end
#should_transform_custom?(x) ⇒ Boolean
193 194 195 196 197 198 199 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 193

# Decides whether a value goes through the custom transform. Uses the
# caller-supplied `options[:should_transform]` callable when that key is
# present; otherwise defers to `should_transform_default?`.
#
# @param x [Object]
# @return [Object] result of the predicate that was used
def should_transform_custom?(x)
  return options[:should_transform].call(x) if options.key?(:should_transform)

  should_transform_default?(x)
end
#transform(x) ⇒ Object
76 77 78 79 80 81 82 83 84 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 76

# Applies the fitted imputation to `x`. Runs `check_is_fitted` first, then
# dispatches on the input type: Polars::Series goes to `transform_polars`,
# anything else to `transform_dense`.
#
# @param x [Object]
# @return [Object] whatever the chosen transform returns
def transform(x)
  check_is_fitted

  x.is_a?(Polars::Series) ? transform_polars(x) : transform_dense(x)
end
#transform_categorical(val) ⇒ Object
175 176 177 178 179 180 181 182 183 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 175

# Maps a value to its category label, collapsing nil and infrequent or
# unseen categories into "other". A category is kept when its count in
# `@statistics[:categorical][:value]` is at least
# `options[:categorical_min]` (25 when unset).
#
# @param val [Object, nil]
# @return [String] `val.to_s` when allowed, otherwise "other"
def transform_categorical(val)
  return "other" if val.nil?

  values = @statistics.dig(:categorical, :value) || {}
  min_ct = options[:categorical_min] || 25
  allowed_values = values.select { |_v, c| c >= min_ct }.keys.map(&:to_s)

  label = val.to_s
  allowed_values.include?(label) ? label : "other"
end
#transform_custom(x) ⇒ Object
201 202 203 204 205 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 201

# Runs the caller-supplied `options[:transform]` callable on `x`.
#
# @param x [Object]
# @return [Object] result of the transform callable
# @raise [RuntimeError] "Transform required" when options has no :transform key
def transform_custom(x)
  raise "Transform required" unless options.key?(:transform)

  options[:transform].call(x)
end
#transform_polars(x) ⇒ Object
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 86

# Applies the configured strategy to a series-like object and casts the
# result back to the dtype recorded at fit time, when one is stored under
# `@statistics[@strategy][:original_dtype]`.
#
# `x` must respond to the calls used by the selected branch
# (`fill_null`, `null_count`, `len`, `clip`, `apply`, `cast`).
#
# NOTE(review): the clip bounds are read with String keys ("min"/"max")
# while every other option here uses Symbol keys -- confirm against callers.
#
# @param x [Object] presumably a Polars::Series (see `transform`)
# @return [Object] the transformed series
# @raise [ArgumentError] for strategies without a branch here (e.g. :one_hot)
def transform_polars(x)
  result = case @strategy
           when :mean, :median, :ffill, :most_frequent, :constant
             x.fill_null(@statistics[@strategy][:value])
           when :clip
             lower = options["min"] || 0
             upper = options["max"] || 1_000_000_000_000
             # An all-null series is returned untouched.
             if x.null_count != x.len
               x.clip(lower, upper)
             else
               x
             end
           when :categorical
             allowed_values = @statistics.dig(:categorical, :value).select do |_k, v|
               v >= options[:categorical_min]
             end.keys.map(&:to_s)
             if x.null_count == x.len
               x.fill_null(transform_categorical(nil))
             else
               x.apply do |val|
                 allowed_values.include?(val) ? val : transform_categorical(val)
               end
             end
           when :today
             x.fill_null(transform_today(nil))
           when :custom
             if x.null_count == x.len
               x.fill_null(transform_custom(nil))
             else
               x.apply do |val|
                 should_transform_custom?(val) ? transform_custom(val) : val
               end
             end
           else
             raise ArgumentError, "Unsupported strategy for Polars::Series: #{@strategy}"
           end

  # Cast the result back to the original dtype
  original_dtype = @statistics.dig(@strategy, :original_dtype)
  original_dtype ? result.cast(original_dtype) : result
end
#transform_today(_val) ⇒ Object
185 186 187 |
# File 'lib/easy_ml/data/preprocessor/simple_imputer.rb', line 185

# Replacement value for the :today strategy: the beginning of the current
# day according to `EST`. The argument is ignored.
#
# NOTE(review): `EST` is defined outside this view; presumably a time-zone
# object whose `now` returns a time responding to `beginning_of_day` -- confirm.
#
# @param _val [Object] unused
# @return [Object] `EST.now.beginning_of_day`
def transform_today(_val)
  EST.now.beginning_of_day
end