Class: CTioga2::Data::Dataset

Inherits:
Object
  • Object
show all
Includes:
Log
Defined in:
lib/ctioga2/data/dataset.rb

Overview

This is the central class of the data manipulation in ctioga. It is a series of ‘Y’ DataColumn indexed on a unique ‘X’ DataColumn. This can be used to represent multiple XY data sets, but also XYZ and even more complex data. The actual meaning of the various ‘Y’ columns is left to the user.

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Log

context, debug, error, fatal, #format_exception, #identify, info, init_logger, log_to, logger, set_level, #spawn, warn

Constructor Details

#initialize(name, columns) ⇒ Dataset

Creates a new Dataset object with the given data columns (Dvector or DataColumn). #x is the first one



48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/ctioga2/data/dataset.rb', line 48

# Builds a Dataset called +name+ from +columns+.
#
# Every element of +columns+ that is a Dobjects::Dvector is replaced,
# inside the given array itself, by a DataColumn wrapping it. The
# first column then becomes the X column and all the following ones
# the Y columns.
def initialize(name, columns)
  columns.each_with_index do |col, idx|
    next unless col.is_a?(Dobjects::Dvector)
    columns[idx] = DataColumn.new(col)
  end

  @name = name
  @x = columns[0]
  @ys = columns[1..-1]

  # Memoized result of #indexed_table; nothing computed yet.
  @indexed_dtable = nil
end

Instance Attribute Details

#nameObject

The name of the Dataset, such as one that could be used in a legend (like for the --auto-legend option of ctioga).



42
43
44
# File 'lib/ctioga2/data/dataset.rb', line 42

# Reader for the name given to this Dataset.
def name
  return @name
end

#xObject

The X DataColumn



35
36
37
# File 'lib/ctioga2/data/dataset.rb', line 35

# Reader for the X column of this Dataset.
def x
  return @x
end

#ysObject

All Y DataColumn (an Array of DataColumn)



38
39
40
# File 'lib/ctioga2/data/dataset.rb', line 38

# Reader for the Array holding all the Y columns of this Dataset.
def ys
  return @ys
end

Class Method Details

.create(name, number) ⇒ Object

Creates a new Dataset with the given name, made of number empty columns.



63
64
65
66
67
68
69
# File 'lib/ctioga2/data/dataset.rb', line 63

# Creates an object of this class called +name+, made of +number+
# columns, each one built from a fresh, empty Dobjects::Dvector.
def self.create(name, number)
  empty_columns = number.times.map { Dobjects::Dvector.new }
  new(name, empty_columns)
end

.dataset_from_spec(name, spec) ⇒ Object

Creates a new Dataset from a specification. This function parses a specification in the form of:

  • a:b:c+

  • spec=a:spec2=b+

It yields each piece of unprocessed text, not necessarily in the order in which they were read, and expects a Dvector as the return value of the block.

It then builds a suitable Dataset object with these values, and returns it.

It is strongly recommended to use this function for reimplementations of Backends::Backend#query_dataset.



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/ctioga2/data/dataset.rb', line 84

# Builds a Dataset called +name+ from the textual specification
# +spec+.
#
# +spec+ is cut into elements with String#split_at_toplevel on ':'.
# An element of the form <tt>x...=text</tt>, <tt>yN...=text</tt> or
# <tt>z...=text</tt> (the part between the column name and '=' must
# match DataColumn::ColumnSpecsRE) targets an explicit column: x is
# column 0, y is column 1, yN is column N and z is column 2. Any
# other element targets the column at its own position in +spec+.
#
# The block is called with the text of each element and its return
# value is stored in the per-column hash later handed to
# DataColumn.from_hash.
def self.dataset_from_spec(name, spec)
  specs = []
  spec.split_at_toplevel(/:/).each_with_index do |text, position|
    named = /^(x|y\d*|z)(#{DataColumn::ColumnSpecsRE})=(.*)/i.match(text)
    if named
      which = named[1].downcase
      mod = (named[2] && named[2].downcase) || "value"
      slot = case which
             when /x/ then 0
             when /y(\d+)?/ then ($1 ? $1.to_i : 1)
             when /z/ then 2
             end
      specs[slot] ||= {}
      specs[slot][mod] = yield(named[3])
    else
      specs[position] = {"value" => yield(text)}
    end
  end

  columns = specs.map { |hash| DataColumn.from_hash(hash) }
  Dataset.new(name, columns)
end

Instance Method Details

#<<(dataset) ⇒ Object

Concatenates another Dataset to this one



178
179
180
181
182
183
184
185
186
# File 'lib/ctioga2/data/dataset.rb', line 178

# Appends the columns of +dataset+ to the matching columns of this
# Dataset: X goes to X, and the i-th Y goes to the i-th Y.
#
# Raises a RuntimeError when the two datasets do not report the same
# #size.
def <<(dataset)
  unless dataset.size == self.size
    raise "Can't concatenate datasets that don't have the same number of columns: #{self.size} vs #{dataset.size}"
  end

  @x << dataset.x
  @ys.size.times { |k| @ys[k] << dataset.ys[k] }
end

#apply_formulas(formula) ⇒ Object

Applies formulas to values. Formulas are like text-backend specifications: “:”-separated specs of the target columns. Returns a new Dataset, named after this one with a “_mod” suffix.



308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
# File 'lib/ctioga2/data/dataset.rb', line 308

# Computes a new Dataset from ':'-separated formulas.
#
# Each formula is parsed with Utils::parse_formula and evaluated with
# Dobjects::Dvector.compute_formula over these inputs, by position:
# 0 is the row index, 1 is X, and 2 onwards are the Y columns. The
# names usable in a formula are x, y, z and y1, y2...
#
# Every formula gives one DataColumn of the result, whose name is the
# name of this Dataset followed by "_mod".
def apply_formulas(formula)
  inputs = [Dobjects::Dvector.new(@x.size) { |row| row }, @x.values]
  inputs.concat(@ys.map { |col| col.values })

  # Formula names mapped to positions in +inputs+: 'y' and 'y1' are
  # the same column, and so are 'z' and 'y2'.
  heads = { 'x' => 1, 'y' => 2, 'z' => 3 }
  1.upto(@ys.size) { |n| heads["y#{n}"] = n + 1 }

  result = []
  formula.split(/:/).each do |f|
    fm = Utils::parse_formula(f, nil, heads)
    debug {
      "Using formula #{fm} for column spec: #{f} (##{result.size})"
    }
    vector = Dobjects::Dvector.compute_formula(fm, inputs)
    result << DataColumn.new(vector)
  end
  Dataset.new(name + "_mod", result)
end

#average_duplicates!Object

Average all the non-X values of successive data points that have the same X values. It is a naive version that also averages the error columns.



267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
# File 'lib/ctioga2/data/dataset.rb', line 267

# Replaces each run of successive points sharing the same X value by
# a single point whose entries are the arithmetic means over the run.
#
# All the vectors returned by all_vectors are averaged alike, so that
# error vectors get (naively) averaged too. The scan relies on
# all_vectors handing back the very vectors that back the columns,
# the X one included -- presumably the case; confirm against
# all_vectors -- since the loop bound follows the shrinking X vector.
#
# Returns nil.
def average_duplicates!
  last_x = nil
  last_x_first_idx = 0
  xv = @x.values
  i = 0
  vectors = all_vectors
  while i < xv.size
    x = xv[i]
    at_last_point = (i == (xv.size - 1))
    if (last_x == x) && !at_last_point
      # Still inside a run of identical X values: keep scanning.
    else
      if last_x_first_idx < (i - 1) || ((last_x == x) && at_last_point)
        # The run stops on the current point only when that point
        # belongs to it; a final point with a different X must stay
        # out of the average.
        e = (last_x == x) ? i : i - 1

        ## \todo In real, to do this properly, one would
        # have to write a proper function in DataColumn that
        # does averaging over certain indices possibly more
        # cleverly than the current way to do.
        for v in vectors
          subv = v[last_x_first_idx..e]
          ave = subv.sum/subv.size
          v.slice!(last_x_first_idx+1, e - last_x_first_idx)
          v[last_x_first_idx] = ave
        end
        # The vectors just lost (e - last_x_first_idx) entries:
        # move the cursor back onto the point it was examining.
        i -= e - last_x_first_idx
      end
      last_x = x
      last_x_first_idx = i
    end
    i += 1
  end

end

#column_namesObject

Returns an array with Column names.



147
148
149
150
151
152
153
# File 'lib/ctioga2/data/dataset.rb', line 147

# Collects the names of all the columns: those the X column reports
# for the base name "x", followed by those each Y column reports for
# "y1", "y2", ... in order.
def column_names
  @ys.each_with_index.inject(@x.column_names("x")) do |names, (col, k)|
    names + col.column_names("y#{k + 1}")
  end
end

#each_values(with_errors = false, expand_nil = true) ⇒ Object

Iterates over all the values of the Dataset. Values of optional arguments are those of DataColumn::values_at.



157
158
159
160
161
162
163
164
165
# File 'lib/ctioga2/data/dataset.rb', line 157

# Walks over the rows of the Dataset. For each row, the block gets
# the row index followed by what values_at returns for the X column
# and then for every Y column, all laid out flat.
#
# +with_errors+ and +expand_nil+ are handed over as they are to the
# values_at method of the columns.
def each_values(with_errors = false, expand_nil = true)
  @x.size.times do |row|
    fields = @ys.inject(@x.values_at(row, with_errors, expand_nil)) do |acc, col|
      acc + col.values_at(row, with_errors, expand_nil)
    end
    yield row, *fields
  end
end

#has_xy_errors?Boolean

Returns true if X or Y columns have errors

Returns:

  • (Boolean)


128
129
130
# File 'lib/ctioga2/data/dataset.rb', line 128

# Tells whether the main Y column or the X column reports errors
# (the Y column is asked first).
def has_xy_errors?
  y.has_errors? || x.has_errors?
end

#index_on_cols(cols = [2]) ⇒ Object

Returns a hash of Datasets indexed on the values of the columns cols. Datasets contain the same number of columns.



416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
# File 'lib/ctioga2/data/dataset.rb', line 416

# Splits the Dataset into several ones according to the values found
# in the columns +cols+ (column numbers, 0 being X).
#
# Returns a Hash whose keys are the Arrays of values of those columns
# and whose values are Datasets (made with Dataset.create, with the
# name and #size of this one) holding the rows that have these
# values.
#
# +cols+ is left untouched.
def index_on_cols(cols = [2])
  # Rows come from each_values(true) and each column number is turned
  # into an offset by a factor of 3 -- presumably three entries per
  # column (the value and its two error bounds); confirm against
  # DataColumn#values_at.
  positions = cols.map { |col| col * 3 }

  datasets = {}
  each_values(true) do |_row, *values|
    signature = positions.map { |pos| values[pos] }
    datasets[signature] ||= Dataset.create(name, size)
    datasets[signature].push_values(*values)
  end
  datasets
end

#indexed_tableObject

TODO:

For performance, this will have to be turned into a real Dtable or Dvector class function. This function is just going to be bad ;-)

TODO:

The cache should be invalidated when the contents of the Dataset change (but that will be really hard!)

Returns an IndexedDTable representing the XYZ data. Information about errors is not included.



353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
# File 'lib/ctioga2/data/dataset.rb', line 353

# Gives the XYZ data as an IndexedDTable, X being the X column, Y the
# first Y column and Z the second one.
#
# The table has one column per distinct X value and one row per
# distinct Y value, both in sorted order. Cells for which the data
# has no point hold NaN.
#
# The result is kept in @indexed_dtable and handed back as it is on
# later calls.
def indexed_table
  return @indexed_dtable if @indexed_dtable

  x = @x.values.dup
  y = @ys[0].values.dup
  z = @ys[1].values.dup

  xvals = x.sort.uniq
  yvals = y.sort.uniq

  # Reverse lookups: coordinate value => rank among the sorted values.
  x_index = {}
  rank = -1
  xvals.each { |v| x_index[v] = (rank += 1) }

  y_index = {}
  rank = -1
  yvals.each { |v| y_index[v] = (rank += 1) }

  table = Dobjects::Dtable.new(xvals.size, yvals.size)
  # Start from a table full of NaN
  table.set(0.0/0.0)

  x.each_index do |k|
    # The table is addressed row (Y) first, then column (X).
    table[y_index[y[k]], x_index[x[k]]] = z[k]
  end

  @indexed_dtable = IndexedDTable.new(xvals, yvals, table)
  @indexed_dtable
end

#make_contour(level) ⇒ Object

TODO:

add algorithm

Returns a x,y Function



397
398
399
400
# File 'lib/ctioga2/data/dataset.rb', line 397

# Asks the table given by #indexed_table for the contour at +level+,
# passing it the 'ret' => 'func' option, and returns what it gives.
def make_contour(level)
  indexed_table.make_contour(level, {'ret' => 'func'})
end

#merge_datasets_in(datasets, columns = [0], precision = nil) ⇒ Object

TODO:

update column names.

TODO:

write provisions for column names, actually ;-)…

Merges one or more other data sets into this one; one or more columns are designated as “master” columns and their values must match in all datasets. Extra columns are simply appended, in the order in which the datasets are given

Comparisons between the values are made in arbitrary precision unless precision is given, in which case values only have to match to this given number of digits.



492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
# File 'lib/ctioga2/data/dataset.rb', line 492

# Merges the +datasets+ into this one, appending their columns to the
# Y columns of this Dataset.
#
# +columns+ lists the "master" columns (0 being X): a row of another
# dataset goes to the row of this one that has the same values in all
# master columns. Rows without a match are dropped, and rows of this
# Dataset that get no value keep whatever DataColumn.create puts in a
# new column.
#
# Values are compared exactly when +precision+ is nil. Otherwise they
# are compared after formatting with "%.<precision>g", ie they only
# have to agree to +precision+ significant digits.
#
# The master columns of the merged datasets are not copied; the other
# ones are appended in the order of +datasets+.
def merge_datasets_in(datasets, columns = [0], precision = nil)
  # How a value is turned into a comparison key.
  prec = if precision
           proc { |x| "%.#{precision}g" % x } # This does not need to be a Float
         else
           proc { |x| x }   # For exact comparisons
         end

  # Index of the rows of this dataset, keyed on the values of the
  # master columns.
  hash = {}
  each_values(false) do |i, *cols|
    signature = columns.map { |j| prec.call(cols[j]) }
    hash[signature] = i
  end

  # Highest indices go first, so that a removal does not shift the
  # columns that remain to be removed.
  remove_indices = columns.sort.reverse

  datasets.each do |set|
    old_columns = set.all_columns
    remove_indices.each { |i| old_columns.slice!(i) }

    # One new column, as long as this dataset, for each non-master
    # column of the merged one.
    new_columns = old_columns.map do |c|
      DataColumn.create(@x.size, c.has_errors?)
    end

    set.each_values(false) do |i, *cols|
      signature = columns.map { |j| prec.call(cols[j]) }
      idx = hash[signature]
      # Without a matching row here, the data point is lost.
      next unless idx
      old_columns.each_index do |j|
        new_columns[j].set_values_at(idx, *old_columns[j].values_at(i, true, true))
      end
    end
    @ys.concat(new_columns)
  end

end

#naive_smooth!(number) ⇒ Object

Smooths the data using a naive gaussian-like convolution (but not exactly). Not for use for reliable data filtering.



404
405
406
407
408
409
410
411
412
# File 'lib/ctioga2/data/dataset.rb', line 404

# Convolves every Y column, in place, with a kernel of +number+
# points whose i-th weight is Utils.cnk(number, i). The kernel is
# anchored at index number - number/2 - 1 (integer division).
def naive_smooth!(number)
  weights = Dobjects::Dvector.new(number) { |k| Utils.cnk(number, k) }
  center = number - number/2 - 1
  @ys.each { |col| col.convolve!(weights, center) }
end

#push_only_values(values) ⇒ Object

Almost the same thing as #push_values, but when you don’t care about the min/max things.



209
210
211
212
213
214
# File 'lib/ctioga2/data/dataset.rb', line 209

# Appends one row made of plain values: values[0] is pushed onto the
# X column and values[i+1] onto the i-th Y column, one argument each.
#
# Returns the number of Y columns.
def push_only_values(values)
  @x.push_values(values[0])
  @ys.each_with_index { |col, k| col.push_values(values[k + 1]) }
  @ys.size
end

#push_values(*values) ⇒ Object

Appends the given values (as yielded by each_values(true)) to the stack. Elements of values lying after the last DataColumn in the Dataset are simply ignored. Giving fewer values than there should be will give interesting results.



200
201
202
203
204
205
# File 'lib/ctioga2/data/dataset.rb', line 200

# Appends one row given as a flat list cut into groups of three: the
# first group is pushed onto the X column, and group i+1 onto the
# i-th Y column. A column whose group is missing gets push_values
# called without arguments; groups beyond the last column are not
# used.
#
# Returns the number of Y columns.
def push_values(*values)
  triplets = values.each_slice(3).to_a
  @x.push_values(*triplets[0])
  @ys.size.times do |k|
    @ys[k].push_values(*triplets[k + 1])
  end
end

#reglin(options = {}) ⇒ Object

TODO:

Have the possibility to elaborate on the regression side (in particular force b to 0)

Massive linear regressions over all X and Y values corresponding to a unique set of all the other Y2… Yn values.

Returns the [coeffs, lines]



442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
# File 'lib/ctioga2/data/dataset.rb', line 442

# Runs one linear regression of Y against X for each distinct
# combination of values of the columns 2 and above (the groups come
# from #index_on_cols).
#
# With a true options['linear'], the fit is y = a*x, a being the sum
# of x*y divided by the sum of x*x. Otherwise a and b come from
# Dobjects::Function#reglin and the line drawn is y = b + a*x.
#
# Returns [coeffs, lines], two Datasets: +coeffs+ gets one row per
# group (the key values, then a and b, b being 0 for the 'linear'
# fit), and +lines+ two rows per group (the key values, then the
# point of the fitted line at the smallest X and at the largest X).
def reglin(options = {})
  key_columns = (2..(self.size - 1)).to_a
  groups = index_on_cols(key_columns)

  coeffs = Dataset.create("coefficients", self.size)
  lines = Dataset.create("lines", self.size)

  groups.each do |key, subset|
    f = Dobjects::Function.new(subset.x.values, subset.y.values)
    if options['linear']  # Fit to y = a*x
      products = f.x.dup
      products.mul!(f.x)
      sxx = products.sum
      products.replace(f.x)
      products.mul!(f.y)
      sxy = products.sum
      slope = sxy/sxx
      coeffs.push_only_values(key + [slope, 0])
      lines.push_only_values(key + [f.x.min, slope * f.x.min])
      lines.push_only_values(key + [f.x.max, slope * f.x.max])
    else
      slope, offset = f.reglin
      coeffs.push_only_values(key + [slope, offset])
      lines.push_only_values(key + [f.x.min, offset + slope * f.x.min])
      lines.push_only_values(key + [f.x.max, offset + slope * f.x.max])
    end
  end

  [coeffs, lines]
end

#select!(&block) ⇒ Object

Modifies the dataset to only keep the data for which the block returns true. The block should take the following arguments, in order:

x, xmin, xmax, y, ymin, ymax, y1, y1min, y1max, z, zmin, zmax, y2, y2min, y2max, y3, y3min, y3max


223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/ctioga2/data/dataset.rb', line 223

# Keeps only the rows for which the block gives a true value; all the
# columns returned by all_columns are reindexed on the kept rows.
#
# For each row, the block is called with what values_at(row, true)
# gives for X, then for the first Y column twice in a row (it is
# both 'y' and 'y1'), then, when there is one, for the second Y
# column twice in a row ('z' and 'y2'), then once for each of the
# remaining Y columns.
def select!(&block)
  kept = (0...@x.size).select do |row|
    args = @x.values_at(row, true)
    args.concat(@ys[0].values_at(row, true) * 2)
    if @ys[1]
      args.concat(@ys[1].values_at(row, true) * 2)
      @ys[2..-1].each do |extra|
        args.concat(extra.values_at(row, true))
      end
    end
    block.call(*args)
  end
  all_columns.each { |col| col.reindex(kept) }
end

#select_formula!(formula) ⇒ Object

Same as #select!, but you give it a text formula instead of a block. It internally calls #select!, by the way ;-)…



245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
# File 'lib/ctioga2/data/dataset.rb', line 245

# Filters the rows like #select!, the condition being given as the
# text of a Ruby expression instead of a block.
#
# A block is built whose parameters are the names given by
# column_names(..., true) for 'x', 'y', 'y1', then 'z', 'y2' and
# 'y3', 'y4'... when there are at least two Y columns; this is the
# order in which #select! passes its arguments. The names are all
# asked to the X column, whatever the column they stand for.
#
# NOTE(review): +formula+ is run through eval, in the context of this
# method. It must never come from an untrusted source.
def select_formula!(formula)
  names = @x.column_names('x', true)
  # The first Y column is known under two names: y and y1
  names.concat(@x.column_names('y', true))
  names.concat(@x.column_names('y1', true))
  if @ys[1]
    # Likewise, the second Y column is both z and y2
    names.concat(@x.column_names('z', true))
    names.concat(@x.column_names('y2', true))
    i = 3
    for yvect in @ys[2..-1]
      names.concat(@x.column_names("y#{i}", true))
      i += 1
    end
  end
  # The formula becomes the body of a proc taking all the names as
  # parameters.
  block = eval("proc do |#{names.join(',')}|\n#{formula}\nend")
  select!(&block)
end

#sizeObject

The overall number of columns



168
169
170
# File 'lib/ctioga2/data/dataset.rb', line 168

# Counts the columns: all the Y columns, plus one for X.
def size
  return @ys.length + 1
end

#sort!Object

Sorts all columns according to X values



133
134
135
136
137
138
139
140
141
142
143
144
# File 'lib/ctioga2/data/dataset.rb', line 133

# Reorders the rows of all the columns so that the X values come out
# sorted.
#
# A vector holding 0, 1, 2... is paired with a copy of the X values
# in a Dobjects::Function; sorting that function turns the vector
# into the list of row indices in sorted-X order, on which every
# column given by all_columns is then reindexed.
def sort!
  order = Dobjects::Dvector.new(@x.values.size) { |k| k }
  Dobjects::Function.new(@x.values.dup, order).sort
  all_columns.each { |col| col.reindex(order) }
end

#trim!(nb) ⇒ Object

Trims all data columns. See DataColumn#trim!



190
191
192
193
194
# File 'lib/ctioga2/data/dataset.rb', line 190

# Calls trim!(nb) on every column given by all_columns.
def trim!(nb)
  all_columns.each { |col| col.trim!(nb) }
end

#yObject

The main Y column (ie, the first one)



118
119
120
# File 'lib/ctioga2/data/dataset.rb', line 118

# The main Y column, that is the first of the Y columns (nil when
# there is none).
def y
  return @ys.first
end

#zObject

The Z column, if applicable



123
124
125
# File 'lib/ctioga2/data/dataset.rb', line 123

# The Z column, that is the second of the Y columns (nil when there
# is none).
def z
  return @ys.at(1)
end

#z_columnsObject

The number of Z columns



173
174
175
# File 'lib/ctioga2/data/dataset.rb', line 173

# Counts the Z columns: all the Y columns but the first one.
def z_columns
  return @ys.length - 1
end