Module: FileIO
- Included in:
- FSelector::Base
- Defined in:
- lib/fselector/fileio.rb
Overview
class labels and features are treated as symbols
read and write various file formats, the internal data structure looks like:
data = {
:c1 => [ # class c1
{:f1=>1, :f2=>2} # sample 2
],
:c2 => [ # class c2
{:f1=>1, :f3=>3}, # sample 1
{:f2=>2} # sample 3
]
}
where :c1 and :c2 are class labels; :f1, :f2, and :f3 are features
Instance Method Summary collapse
-
#data_from_csv(fname = :stdin, csv_opts = {})
read from csv.
-
#data_from_libsvm(fname = :stdin)
read from libsvm.
-
#data_from_random(nsample = 100, nclass = 2, nfeature = 10, ncategory = 2, allow_mv = true)
read from random data (read only, for test purpose).
-
#data_from_url(url, format, *args)
read data from url.
-
#data_from_weka(fname = :stdin, quote_char = '"')
read from WEKA ARFF file.
-
#data_to_csv(fname = :stdout)
write to csv.
-
#data_to_libsvm(fname = :stdout)
write to libsvm.
-
#data_to_weka(fname = :stdout, format = nil)
write to WEKA ARFF file.
Instance Method Details
#data_from_csv(fname = :stdin, csv_opts = {})
missing values are allowed
read from csv
if no csv_opts supplied, we assume the CSV file in the following format:
first row contains feature names with last column being class name
feat_name1,feat_name2,...,class_name
second row contains feature types with last column being class type
feat_type1,feat_type2,...,class_type
and the remaing rows containing feature values with last column being class label
feat_value1,feat_value2,...,class_label
...
allowed feature types (case-insensitive) are:
INTEGER, REAL, NUMERIC, CONTINUOUS, DOUBLE, FLOAT, STRING, NOMINAL, CATEGORICAL
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 |
# File 'lib/fselector/fileio.rb', line 190 def data_from_csv(fname=:stdin, csv_opts = {}) ifs = get_ifs(fname) csv_opts = { # default options, new opts will override old ones :feature_name_row => 1, # first row contains feature names :feature_type_row => 2, # second row contains feature types :class_label_column => 0, # last column contains class labels :feature_name2type => {} # user-supplied feature name-type pairs }.merge(csv_opts) if csv_opts and csv_opts.class == Hash feature_name_row = csv_opts[:feature_name_row] feature_type_row = csv_opts[:feature_type_row] class_label_column = csv_opts[:class_label_column] feature_name2type = csv_opts[:feature_name2type] # user-supplied feature name-type pairs, this is useful # when file contains no specific rows for feture names and types if feature_name2type and not feature_name2type.empty? features = feature_name2type.keys.collect { |n| n.to_sym } types = feature_name2type.values.collect { |t| t.downcase.to_sym } # disable name and type rows feature_name_row, feature_type_row = nil, nil end data = {} ifs.each_line do |ln| next if ln.blank? cells = ln.chomp.split(/,/) # feature names if ifs.lineno == feature_name_row class_name = cells.delete_at(class_label_column-1) # simply useless features = cells.to_sym # feature types elsif ifs.lineno == feature_type_row class_type = cells.delete_at(class_label_column-1) # simply useless # store feature type as lower-case symbol types = cells.collect { |t| t.downcase.to_sym } else # data rows if class_label_column <= cells.size and features.size+1 == cells.size and types.size+1 == cells.size class_label = cells.delete_at(class_label_column-1).to_sym data[class_label] ||= [] # feature values fvs = cells else abort "[#{__FILE__}@#{__LINE__}]: \n"+ " invalid csv format!" end fs = {} fvs.each_with_index do |v, i| next if v.empty? # missing value type = types[i] if type == :integer v = v.to_i elsif [:real, :numeric, :continuous, :float, :double].include? type v = v.to_f elsif [:string, :nominal, :categorical].include? type # else abort "[#{__FILE__}@#{__LINE__}]: \n"+ " invalid feature type '#{type}', must be one of the following: \n"+ " integer, real, numeric, float, double, continuous, categorical, string, nominal" end fs[features[i]] = v end data[class_label] << fs end end # close file ifs.close if not ifs == $stdin set_data(data) set_features(features) # feature name-type pairs features.each_with_index do |f, i| set_feature_type(f, types[i]) end end |
#data_from_libsvm(fname = :stdin)
read from libsvm
file has the following format
+1 2:1 4:1 ...
-1 3:1 4:1 ...
....
87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
# File 'lib/fselector/fileio.rb', line 87 def data_from_libsvm(fname=:stdin) ifs = get_ifs(fname) data = {} ifs.each_line do |ln| label, *features = ln.chomp.split(/\s+/) label = label.to_sym data[label] ||= [] feats = {} features.each do |fv| f, v = fv.split(/:/) feats[f.to_sym] = v.to_f end data[label] << feats end # close file ifs.close if not ifs == $stdin set_data(data) # feature name-type pairs each_feature do |f| set_feature_type(f, :numeric) end end |
#data_from_random(nsample = 100, nclass = 2, nfeature = 10, ncategory = 2, allow_mv = true)
read from random data (read only, for test purpose)
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
# File 'lib/fselector/fileio.rb', line 40 def data_from_random(nsample=100, nclass=2, nfeature=10, ncategory=2, allow_mv=true) data = {} nsample.times do k = "c#{rand(nclass)+1}".to_sym data[k] ||= [] feats = {} fs = (1..nfeature).to_a if allow_mv (rand(nfeature)).times do v = fs[rand(fs.size)] fs.delete(v) end end fs.sort.each do |i| f = "f#{i}".to_sym if ncategory == 1 feats[f] = 1 elsif ncategory > 1 feats[f] = rand(ncategory)+1 else feats[f] = rand.round(3) # round to 3-digit precision end end data[k] << feats end set_data(data) end |
#data_from_url(url, format, *args)
read data from url
504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 |
# File 'lib/fselector/fileio.rb', line 504 def data_from_url(url, format, *args) format = format.downcase.to_sym if not [:libsvm, :csv, :weka].include? format abort "[#{__FILE__}@#{__LINE__}]: \n"+ " only CSV, LibSVM and Weka file formats are supported!" end uri = URI.parse(URI.encode(url)) data_src = StringIO.new(uri.read) if format == :csv data_from_csv(data_src, *args) elsif format == :libsvm data_from_libsvm(data_src) else # weka data_from_weka(data_src, *args) end end |
#data_from_weka(fname = :stdin, quote_char = '"')
it's ok if string containes spaces quoted by quote_char
read from WEKA ARFF file
320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 |
# File 'lib/fselector/fileio.rb', line 320 def data_from_weka(fname=:stdin, quote_char='"') ifs = get_ifs(fname) relation, features, classes, types, comments = '', [], [], [], [] has_class, has_data = false, false data = {} ifs.each_line do |ln| next if ln.blank? # blank lines ln = ln.chomp # comment line if ln.comment?('%') comments << ln # relation elsif ln =~ /^@RELATION/i tmp, relation = ln.split_me(/\s+/, quote_char) # class attribute elsif ln =~ /^@ATTRIBUTE\s+class\s+{(.+)}/i has_class = true classes = $1.strip.split_me(/,\s*/, quote_char).to_sym classes.each { |k| data[k] = [] } # feature attribute (nominal) elsif ln =~ /^@ATTRIBUTE\s+(\S+)\s+{(.+)}/i f = $1.to_sym features << f #$2.split_me(/,\s*/, quote_char) # feature nominal values types << :nominal # feature attribute (categorical, integer, real, continuous, numeric, float, double, string, date) elsif ln =~ /^@ATTRIBUTE/i tmp, v1, v2 = ln.split_me(/\s+/, quote_char) f = v1.to_sym features << f # store feture type as lower-case symbol types << v2.downcase.to_sym # data header elsif ln =~ /^@DATA/i has_data = true # data elsif has_data and has_class # read data section if ln =~ /^{(.+)}$/ # sparse ARFF feats = $1.split_me(/,\s*/, quote_char) label = feats.pop.split_me(/\s+/, quote_char)[1] label = label.to_sym fs = {} # indices of feature with zero value zero_fi = (0...features.size).to_a feats.each do |fi_fv| fi, fv = fi_fv.split_me(/\s+/, quote_char) fi = fi.to_i add_feature_weka(fs, features[fi], fv, types[fi]) zero_fi.delete(fi) end # feature with zero value zero_fi.each do |zi| add_feature_weka(fs, features[zi], 0, types[zi]) end data[label] << fs else # regular ARFF feats = ln.split_me(/,\s*/, quote_char) label = feats.pop.to_sym fs = {} feats.each_with_index do |fv, i| add_feature_weka(fs, features[i], fv, types[i]) end data[label] << fs if label end else next end end # close file ifs.close if not ifs == $stdin set_data(data) set_classes(classes) set_features(features) # feature name-type pairs features.each_with_index do |f, i| set_feature_type(f, types[i]) end set_opt(:relation, relation) set_opt(:comments, comments) if not comments.empty? end |
#data_to_csv(fname = :stdout)
write to csv
ouput CSV file has the same format as the default input for data_from_csv()
287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 |
# File 'lib/fselector/fileio.rb', line 287 def data_to_csv(fname=:stdout) ofs = get_ofs(fname) # feature names ofs.puts get_features.join(',') # feature types ofs.puts get_features.collect { |f| get_feature_type(f) || :string }.join(',') each_sample do |k, s| each_feature do |f| if s.has_key? f ofs.print "#{s[f]}," else ofs.print "," end end ofs.puts "#{k}" end # close file ofs.close if not ofs == $stdout end |
#data_to_libsvm(fname = :stdout)
write to libsvm
124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
# File 'lib/fselector/fileio.rb', line 124 def data_to_libsvm(fname=:stdout) ofs = get_ofs(fname) # convert class label to integer type k2idx = {} get_classes.each_with_index do |k, i| k2idx[k] = i+1 end # convert feature to integer type f2idx = {} get_features.each_with_index do |f, i| f2idx[f] = i+1 end each_sample do |k, s| ofs.print "#{k2idx[k]} " s.keys.sort { |x, y| f2idx[x] <=> f2idx[y] }.each do |f| if not s[f].is_a? Numeric abort "[#{__FILE__}@#{__LINE__}]: \n"+ " LibSVM format only supports the following feature type: \n"+ " integer, real, numeric, float, double, continuous" else ofs.print " #{f2idx[f]}:#{s[f]}" if not s[f].zero? # implicit mode end end ofs.puts end # close file ofs.close if not ofs == $stdout end |
#data_to_weka(fname = :stdout, format = nil)
write to WEKA ARFF file
424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 |
# File 'lib/fselector/fileio.rb', line 424 def data_to_weka(fname=:stdout, format=nil) ofs = get_ofs(fname) # comments comments = get_opt(:comments) if comments ofs.puts comments.join("\n") ofs.puts end # relation relation = get_opt(:relation) if relation ofs.puts "@RELATION #{relation}" else ofs.puts "@RELATION data_gen_by_FSelector" end ofs.puts # feature attribute each_feature do |f| ofs.print "@ATTRIBUTE #{f} " type = get_feature_type(f) if type if type == :nominal ofs.puts "{#{get_feature_values(f).uniq.sort.join(',')}}" else ofs.puts type end else # treat all other feature types as string ofs.puts :string end end # class attribute ofs.puts "@ATTRIBUTE class {#{get_classes.join(',')}}" ofs.puts # data header ofs.puts "@DATA" each_sample do |k, s| if format == :sparse # sparse ARFF ofs.print "{" get_features.each_with_index do |f, i| if s.has_key? f ofs.print "#{i} #{s[f]}," if not s[f].zero? else # missing value ofs.print "#{i} ?," end end ofs.print "#{get_features.size} #{k}" ofs.puts "}" else # regular ARFF each_feature do |f| if s.has_key? f ofs.print "#{s[f]}," else # missing value ofs.print "?," end end ofs.puts "#{k}" end end # close file ofs.close if not ofs == $stdout end |