Class: OpenTox::Parser::Spreadsheets

Inherits:
Object
  • Object
show all
Defined in:
lib/parser.rb

Overview

Parser for getting spreadsheet data into a dataset

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Spreadsheets

Returns a new instance of Spreadsheets.



288
289
290
291
292
293
294
295
296
297
298
# File 'lib/parser.rb', line 288

# Creates a parser with empty state: no parsed data, no features, no
# recorded problems, and a class-value threshold of 3.
def initialize
  # Containers for parsed content.
  @data, @features, @feature_types = [], [], {}

  # Problem logs filled while parsing (each gets its own array).
  @format_errors, @id_errors, @activity_errors = [], [], []
  @duplicates = {}

  # Columns with more distinct values than this are flagged in the
  # regression_features array by the load_* methods.
  @max_class_values = 3
end

Instance Attribute Details

#dataset ⇒ Object

Returns the value of attribute dataset.



286
287
288
# File 'lib/parser.rb', line 286

# Reader for the dataset attribute: returns whatever is currently stored
# in @dataset (nil if nothing has been assigned yet).
def dataset
  @dataset
end

Instance Method Details

#detect_new_values(row, value_maps) ⇒ Object



300
301
302
303
304
305
306
307
308
# File 'lib/parser.rb', line 300

# Records, per column, which values occur in +row+.
#
# The first cell of the row is skipped; every remaining cell at position i
# is registered in value_maps[i]. A value's counter is 0 the first time it is
# seen and grows by one for each further occurrence, so the number of keys in
# value_maps[i] is the number of distinct values seen in that column.
#
# The caller's +row+ is left untouched (the first cell is skipped on a copy
# rather than shifted off the original array).
#
# @param row [Array] one data row; its first cell is ignored
# @param value_maps [Array<Hash>] per-column tallies, updated in place
# @return [Array<Hash>] the updated +value_maps+
def detect_new_values(row, value_maps)
  row.drop(1).each_with_index do |value, i|
    value_maps[i] = Hash.new if value_maps[i].nil?
    tally = value_maps[i]
    tally[value] = tally[value].nil? ? 0 : tally[value] + 1
  end
  value_maps
end

#load_csv(csv, drop_missing = false, all_numeric = false) ⇒ OpenTox::Dataset

Load CSV string (format specification: toxcreate.org/help)

Parameters:

  • csv (String)

    CSV representation of the dataset

  • drop_missing (Boolean) (defaults to: false)

    Whether rows with any missing value should be dropped (rows in which every value after the first column is missing are dropped regardless)

  • all_numeric (Boolean) (defaults to: false)

    Whether all features should be treated as numeric

  • del_nominal (Boolean)

    All nominal features will be removed (note: the current load_csv signature does not accept a del_nominal argument)

Returns:



356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
# File 'lib/parser.rb', line 356

# Loads a CSV string into the dataset.
#
# The first line is the header row; characters / . \ ( ) { } [ ] in header
# names are replaced by "_" before the headers are passed to add_features.
# The data lines are then read twice:
#
# 1. to tally the distinct values of every column (see detect_new_values) and
#    flag columns with more than @max_class_values distinct values (or all
#    columns when +all_numeric+ is set) in the regression_features array;
# 2. to hand each row to add_values, skipping rows as described below.
#
# A row containing empty cells is reported in @format_errors. It is skipped
# when +drop_missing+ is set, or when every cell except the first is empty.
# The decision is made per row: an all-empty row no longer switches on
# dropping for the rows that follow it.
#
# @param csv [String] CSV representation of the dataset
# @param drop_missing [Boolean] skip every row that has an empty cell
# @param all_numeric [Boolean] flag every column as a regression feature
# @return [Object] the value of @dataset after loading
# @raise [RuntimeError] if a data row's cell count differs from the header's
def load_csv(csv, drop_missing=false, all_numeric=false)
  input = csv.split("\n")
  headers = split_row(input.shift)
  headers.collect! {|header| header.to_s.gsub(/[\/.\\\(\)\{\}\[\]]/,"_")}
  add_features(headers)

  # Pass 1: collect the distinct values of each column.
  value_maps = Array.new
  input.each { |line| value_maps = detect_new_values(split_row(line), value_maps) }

  # Classify the columns once, from the complete tallies.
  regression_features = value_maps.collect { |vm|
    (vm.size > @max_class_values || all_numeric) ? true : false
  }

  # Pass 2: validate each row and add it unless it has to be skipped.
  input.each_with_index { |line, i|
    row = split_row(line)
    raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size

    missing = row.count("")
    skip = false
    if missing > 0
      @format_errors << "Row #{i} has #{missing} missing values"
      # Rows with nothing but the first cell filled in are always skipped.
      skip = drop_missing || (missing == row.size-1)
    end

    if skip
      @format_errors << "Row #{i} not added"
    else
      add_values(row, regression_features)
    end
  }
  warnings
  @dataset
end

#load_spreadsheet(book, drop_missing = false) ⇒ OpenTox::Dataset

Load Spreadsheet book (created with roo gem roo.rubyforge.org/, excel format specification: toxcreate.org/help)

Parameters:

  • book (Excel)

    Excel workbook object (created with roo gem)

Returns:



313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
# File 'lib/parser.rb', line 313

# Loads a spreadsheet workbook into the dataset.
#
# Sheet 0 is selected, row 1 is taken as the header row and passed to
# add_features, and rows 2..book.last_row are read twice:
#
# 1. to tally the distinct values of every column (see detect_new_values) and
#    flag columns with more than @max_class_values distinct values in the
#    regression_features array;
# 2. to hand each row to add_values, skipping rows as described below.
#
# A row containing empty-string cells is reported in @format_errors. It is
# skipped when +drop_missing+ is set, or when every cell except the first is
# empty. The decision is made per row: an all-empty row no longer switches on
# dropping for the rows that follow it.
#
# NOTE(review): only "" counts as a missing cell here; confirm whether the
# workbook object returns nil for blank cells, which this check would not see.
#
# @param book [#default_sheet=, #row, #last_row] workbook object
# @param drop_missing [Boolean] skip every row that has an empty cell
# @return [Object] the value of @dataset after loading
# @raise [RuntimeError] if a data row's cell count differs from the header's
def load_spreadsheet(book, drop_missing=false)
  book.default_sheet = 0
  headers = book.row(1)
  add_features headers

  # Pass 1: collect the distinct values of each column.
  value_maps = Array.new
  2.upto(book.last_row) { |i| value_maps = detect_new_values(book.row(i), value_maps) }

  # Classify the columns once, from the complete tallies: more than
  # @max_class_values distinct values marks a regression feature.
  regression_features = value_maps.collect { |vm| vm.size > @max_class_values }

  # Pass 2: validate each row and add it unless it has to be skipped.
  2.upto(book.last_row) { |i|
    row = book.row(i)
    raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size

    missing = row.count("")
    skip = false
    if missing > 0
      @format_errors << "Row #{i} has #{missing} missing values"
      # Rows with nothing but the first cell filled in are always skipped.
      skip = drop_missing || (missing == row.size-1)
    end

    if skip
      @format_errors << "Row #{i} not added"
    else
      add_values(row, regression_features)
    end
  }
  warnings
  @dataset
end