Class: OpenTox::Parser::Spreadsheets
- Inherits: Object
- Inheritance hierarchy: Object → OpenTox::Parser::Spreadsheets
- Defined in:
- lib/parser.rb
Overview
Parser for getting spreadsheet data into a dataset
Instance Attribute Summary collapse
-
#dataset ⇒ Object
Returns the value of attribute dataset.
Instance Method Summary collapse
- #detect_new_values(row, value_maps) ⇒ Object
-
#initialize ⇒ Spreadsheets
constructor
A new instance of Spreadsheets.
-
#load_csv(csv, drop_missing = false, all_numeric = false) ⇒ OpenTox::Dataset
Load CSV string (format specification: toxcreate.org/help).
-
#load_spreadsheet(book, drop_missing = false) ⇒ OpenTox::Dataset
Load a spreadsheet book (created with the roo gem, roo.rubyforge.org; Excel format specification: toxcreate.org/help).
Constructor Details
#initialize ⇒ Spreadsheets
Returns a new instance of Spreadsheets.
288 289 290 291 292 293 294 295 296 297 298 |
# File 'lib/parser.rb', line 288
#
# Build a parser with empty state: no data rows, no features, no feature
# types, no recorded errors and no duplicates.
def initialize
  @data, @features = Array.new, Array.new
  @feature_types = Hash.new
  @format_errors, @id_errors, @activity_errors = Array.new, Array.new, Array.new
  @duplicates = Hash.new
  # load_csv / load_spreadsheet flag a column as a regression feature once it
  # holds more distinct values than this.
  @max_class_values = 3
end
Instance Attribute Details
#dataset ⇒ Object
Returns the value of attribute dataset.
286 287 288 |
# File 'lib/parser.rb', line 286 def dataset @dataset end |
Instance Method Details
#detect_new_values(row, value_maps) ⇒ Object
300 301 302 303 304 305 306 307 308 |
# File 'lib/parser.rb', line 300
#
# Record the values found in every column of +row+ except the first one.
#
# value_maps[i] is a Hash keyed by the values seen so far in column i + 1.
# The number stored for a value is its occurrence count minus one (the first
# sighting is stored as 0); the visible callers only look at each Hash's size,
# i.e. the number of distinct values per column.
#
# @param row [Array] one parsed row; it is read but no longer modified
# @param value_maps [Array<Hash>] tallies accumulated so far, updated in place
# @return [Array<Hash>] the same +value_maps+ object
def detect_new_values(row, value_maps)
  # Skip the leading column without shifting it off the caller's array, so the
  # row stays intact if the caller (or the object that produced it) reuses it.
  row.drop(1).each_with_index do |value, i|
    counts = (value_maps[i] ||= {})
    counts[value] = counts[value].nil? ? 0 : counts[value] + 1
  end
  value_maps
end
#load_csv(csv, drop_missing = false, all_numeric = false) ⇒ OpenTox::Dataset
Load CSV string (format specification: toxcreate.org/help)
356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 |
# File 'lib/parser.rb', line 356
#
# Load CSV string (format specification: toxcreate.org/help).
#
# The first line is treated as the header row and registered via add_features.
# The remaining lines are read twice: once to collect the distinct values of
# each column (through detect_new_values), and once to validate each row and
# pass it to add_values.
#
# @param csv [String] CSV text with rows separated by "\n"
# @param drop_missing [Boolean] when true, rows containing an empty field are
#   reported in @format_errors and not added
# @param all_numeric [Boolean] when true, every column is flagged as a
#   regression feature regardless of how many distinct values it holds
# @return [OpenTox::Dataset] the @dataset instance variable
# @raise [RuntimeError] if a data row has a different number of fields than
#   the header row
def load_csv(csv, drop_missing=false, all_numeric=false)
  input = csv.split("\n")
  headers = split_row(input.shift)
  # Replace / . \ ( ) { } [ ] in header names with underscores.
  headers.collect! { |header| header.to_s.gsub(/[\/.\\\(\)\{\}\[\]]/, "_") }
  add_features(headers)

  # First pass: tally the distinct values of every column after the first.
  value_maps = Array.new
  input.each { |line| value_maps = detect_new_values(split_row(line), value_maps) }

  # A column with more than @max_class_values distinct values is a regression
  # feature. Computed once, after all rows have been tallied.
  regression_features = value_maps.collect do |vm|
    ((vm.size > @max_class_values) || all_numeric) ? true : false
  end

  # Second pass: validate each row and add it to the dataset.
  input.each_with_index do |line, i|
    row = split_row(line)
    raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size

    missing = row.count("")
    if missing > 0
      @format_errors << "Row #{i} has #{missing} missing values"
      # A row with exactly one non-empty field is always skipped. This is
      # decided per row, so it no longer switches drop_missing on for all the
      # rows that follow.
      if drop_missing || missing == row.size - 1
        @format_errors << "Row #{i} not added"
        next
      end
    end
    add_values(row, regression_features)
  end

  warnings
  @dataset
end
#load_spreadsheet(book, drop_missing = false) ⇒ OpenTox::Dataset
Load a spreadsheet book (created with the roo gem, roo.rubyforge.org; Excel format specification: toxcreate.org/help)
313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 |
# File 'lib/parser.rb', line 313
#
# Load Spreadsheet book (created with roo gem roo.rubyforge.org/, excel format
# specification: toxcreate.org/help).
#
# Row 1 of the sheet is treated as the header row and registered via
# add_features. Rows 2..last_row are read twice: once to collect the distinct
# values of each column (through detect_new_values), and once to validate each
# row and pass it to add_values.
#
# @param book [Object] spreadsheet object responding to default_sheet=, row(i)
#   and last_row
# @param drop_missing [Boolean] when true, rows containing an empty ("") cell
#   are reported in @format_errors and not added
# @return [OpenTox::Dataset] the @dataset instance variable
# @raise [RuntimeError] if a row has a different number of cells than the
#   header row
def load_spreadsheet(book, drop_missing=false)
  book.default_sheet = 0
  headers = book.row(1)
  add_features headers

  # First pass: tally the distinct values of every column after the first.
  value_maps = Array.new
  2.upto(book.last_row) { |i| value_maps = detect_new_values(book.row(i), value_maps) }

  # A column with more than @max_class_values distinct values is a regression
  # feature. Computed once, after all rows have been tallied.
  regression_features = value_maps.collect { |vm| vm.size > @max_class_values }

  # Second pass: validate each row and add it to the dataset.
  2.upto(book.last_row) do |i|
    row = book.row(i)
    raise "Entry has size #{row.size}, different from headers (#{headers.size})" if row.size != headers.size

    # NOTE(review): only "" counts as a missing cell here; confirm whether
    # book.row yields "" or nil for empty cells.
    missing = row.count("")
    if missing > 0
      @format_errors << "Row #{i} has #{missing} missing values"
      # A row with exactly one non-empty cell is always skipped. This is
      # decided per row, so it no longer switches drop_missing on for all the
      # rows that follow.
      if drop_missing || missing == row.size - 1
        @format_errors << "Row #{i} not added"
        next
      end
    end
    add_values(row, regression_features)
  end

  warnings
  @dataset
end