Class: EasyML::Data::Dataset::Splits::FileSplit

Inherits:
Split
  • Object
show all
Includes:
Utils, GlueGun::DSL
Defined in:
lib/easy_ml/data/dataset/splits/file_split.rb

Instance Method Summary collapse

Methods included from Utils

#append_to_csv, #expand_dir, #null_check

Methods inherited from Split

#test, #train, #valid

Constructor Details

#initialize(options) ⇒ FileSplit

Returns a new instance of FileSplit.



18
19
20
21
# File 'lib/easy_ml/data/dataset/splits/file_split.rb', line 18

# Builds the split through the parent initializer, then makes sure the
# backing directory (+dir+) is present on disk.
#
# @param options forwarded unchanged to the superclass initializer
def initialize(options)
  super(options)
  FileUtils.mkdir_p(dir)
end

Instance Method Details

#cleanup ⇒ Object



84
85
86
87
# File 'lib/easy_ml/data/dataset/splits/file_split.rb', line 84

# Wipes everything stored under +dir+ and recreates it as an empty
# directory, so the split can be written again from scratch.
def cleanup
  target = dir
  FileUtils.rm_rf(target)   # remove the directory tree, ignoring a missing path
  FileUtils.mkdir_p(target) # leave an empty directory behind
end

#read(segment, split_ys: false, target: nil, drop_cols: [], &block) ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/easy_ml/data/dataset/splits/file_split.rb', line 45

# Reads the data stored for +segment+.
#
# With a block, every file of the segment is loaded in turn, optionally
# sampled, stripped of +drop_cols+, and handed to the block through
# +process_block_with_split_ys+ / +process_block_without_split_ys+; the
# accumulated result of those helpers is returned.
#
# Without a block, all files are combined into a single dataframe which is
# sampled, stripped of +drop_cols+ and passed to +split_features_targets+.
# When the segment has no files, +nil+ is returned (or +[nil, nil]+ when
# +split_ys+ is set).
#
# @param segment the segment whose files should be read
# @param split_ys [Boolean] whether features and targets are returned separately
# @param target forwarded to +split_features_targets+
# @param drop_cols [Array] columns to drop; columns absent from a dataframe are ignored
# @return [Object] see above
def read(segment, split_ys: false, target: nil, drop_cols: [], &block)
  files = files_for_segment(segment)

  if block_given?
    result = nil
    progress_bar = nil
    if verbose
      # The row total is only needed for the progress bar, so the extra pass
      # over every file is skipped when not running verbosely.
      total_rows = files.sum { |file| df(file).shape[0] }
      progress_bar = create_progress_bar(segment, total_rows)
    end

    files.each do |file|
      frame = df(file)
      # Progress is tracked in unsampled rows, the same unit as total_rows.
      rows_read = frame.shape[0]
      frame = sample_data(frame) if sample < 1.0

      # Intersect per file: narrowing drop_cols itself would stop a column
      # missing from an earlier file from being dropped in later files.
      present_drop_cols = drop_cols & frame.columns
      frame = frame.drop(present_drop_cols) unless present_drop_cols.empty?

      if split_ys
        xs, ys = split_features_targets(frame, true, target)
        result = process_block_with_split_ys(block, result, xs, ys)
      else
        result = process_block_without_split_ys(block, result, frame)
      end

      progress_bar.progress += rows_read if progress_bar
    end
    progress_bar&.finish
    result
  elsif files.empty?
    split_ys ? [nil, nil] : nil
  else
    combined_df = combine_dataframes(files)
    combined_df = sample_data(combined_df) if sample < 1.0
    present_drop_cols = drop_cols & combined_df.columns
    combined_df = combined_df.drop(present_drop_cols) unless present_drop_cols.empty?
    split_features_targets(combined_df, split_ys, target)
  end
end

#save(segment, df) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/easy_ml/data/dataset/splits/file_split.rb', line 23

# Appends +df+ to the CSV files of +segment+, filling the current file up to
# +max_rows_per_file+ rows and rolling over to a new file (from
# +new_file_path_for_segment+) for whatever does not fit.
#
# @param segment the segment to write to; its directory is created under +dir+
# @param df the dataframe to persist (must respond to +shape+ and +slice+)
# @raise [ArgumentError] if data has to be rolled into a new file while
#   +max_rows_per_file+ is not positive (the loop could never make progress)
# @return [nil]
def save(segment, df)
  segment_dir = File.join(dir, segment.to_s)
  FileUtils.mkdir_p(segment_dir)

  current_file = current_file_for_segment(segment)
  # `df(...)` with an argument calls the reader method, not the local parameter.
  existing_rows = current_file && File.exist?(current_file) ? df(current_file).shape[0] : 0
  # Clamp at zero: a file that already holds more than max_rows_per_file rows
  # must not produce a negative slice length/offset below.
  remaining_rows = [max_rows_per_file - existing_rows, 0].max

  while df.shape[0] > 0
    if df.shape[0] <= remaining_rows
      append_to_csv(df, current_file)
      break
    end

    # Top up the current file only if it still has room.
    if remaining_rows.positive?
      append_to_csv(df.slice(0, remaining_rows), current_file)
      df = df.slice(remaining_rows, df.shape[0] - remaining_rows)
    end

    # Without a positive capacity no new file could ever take rows, and the
    # loop would spin forever.
    raise ArgumentError, "max_rows_per_file must be positive" unless max_rows_per_file.positive?

    current_file = new_file_path_for_segment(segment)
    remaining_rows = max_rows_per_file
  end
end

#split_at ⇒ Object



89
90
91
92
93
# File 'lib/easy_ml/data/dataset/splits/file_split.rb', line 89

# Reports when the split was last written, taken as the most recent
# modification time among +output_files+.
#
# @return [Time, nil] the newest mtime, or nil when there are no output files
def split_at
  output_files.empty? ? nil : output_files.map(&File.method(:mtime)).max
end