Class: EasyML::Data::Datasource::FileDatasource

Inherits:
EasyML::Data::Datasource show all
Includes:
GlueGun::DSL
Defined in:
lib/easy_ml/data/datasource/file_datasource.rb

Instance Method Summary collapse

Instance Method Details

#data ⇒ Object



30
31
32
33
34
35
36
37
# File 'lib/easy_ml/data/datasource/file_datasource.rb', line 30

# Loads every CSV listed by +files+ and stacks them into a single frame.
#
# Each path is parsed with +Polars.read_csv+ (forwarding +polars_args+ as
# keyword arguments) and appended beneath the frames read so far via
# +vstack+, in the order +files+ returns them.
#
# @return [Polars::DataFrame, nil] the stacked frame, or +nil+ when +files+
#   is empty
def data
  files.reduce(nil) do |stacked, path|
    frame = Polars.read_csv(path, **polars_args)
    # The first file seeds the accumulator; later files are appended to it.
    stacked.nil? ? frame : stacked.vstack(frame)
  end
end

#files ⇒ Object



18
19
20
# File 'lib/easy_ml/data/datasource/file_datasource.rb', line 18

# Lists the CSV files that back this datasource.
#
# Searches +root_dir+ and all of its subdirectories for paths ending in
# ".csv".
#
# @return [Array<String>] matching paths, sorted in ascending string order
def files
  csv_pattern = File.join(root_dir, "**/*.csv")
  Dir[csv_pattern].sort
end

#in_batches(of: 10_000) ⇒ Object



11
12
13
14
15
16
# File 'lib/easy_ml/data/datasource/file_datasource.rb', line 11

# Yields the parsed contents of each CSV file, one whole file per batch.
#
# Files are visited in the order returned by +files+; each is read in full
# with +Polars.read_csv+, forwarding +polars_args+ as keyword arguments.
#
# NOTE(review): +of+ is accepted but never used in this method, so every
# batch is an entire file regardless of how many rows it holds. Presumably
# the keyword exists for signature parity with other datasources; confirm
# before relying on it to bound batch size.
#
# @param of [Integer] requested batch size (currently ignored)
# @yieldparam df [Polars::DataFrame] the frame read from one file
# @return [Array<String>, Enumerator] the value of +files.each+ when a block
#   is given; otherwise an Enumerator over the per-file frames
def in_batches(of: 10_000)
  # Without a block, return an enumerator instead of raising LocalJumpError
  # after the first file has already been read from disk.
  return enum_for(:in_batches, of: of) unless block_given?

  files.each do |file|
    yield Polars.read_csv(file, **polars_args)
  end
end

#last_updated_at ⇒ Object



22
23
24
# File 'lib/easy_ml/data/datasource/file_datasource.rb', line 22

# Reports when the datasource's files were most recently modified.
#
# Collects the modification time of every path in +files+ and picks the
# latest one.
#
# @return [Time, nil] the newest modification time, or +nil+ when +files+
#   is empty
def last_updated_at
  modified_times = files.map(&File.method(:mtime))
  modified_times.max
end

#refresh! ⇒ Object



26
27
28
# File 'lib/easy_ml/data/datasource/file_datasource.rb', line 26

# Intentionally does nothing: a directory-based datasource needs no
# refresh step.
#
# @return [nil]
def refresh!
  nil
end