Class: CTioga2::Data::Dataset

Inherits:
Object
  • Object
show all
Includes:
Log
Defined in:
lib/ctioga2/data/dataset.rb

Overview

This is the central class of the data manipulation in ctioga. It is a series of ‘Y’ DataColumn objects indexed on a unique ‘X’ DataColumn. This can be used to represent multiple XY data sets, but also XYZ and even more complex data. The actual meaning of the various ‘Y’ columns is left to the user.

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Log

debug, error, fatal, #format_exception, #identify, info, init_logger, logger, set_level, #spawn, warn

Constructor Details

#initialize(name, columns) ⇒ Dataset

Creates a new Dataset object with the given data columns (Dvector or DataColumn). #x is the first one



51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/ctioga2/data/dataset.rb', line 51

# Creates a new Dataset called _name_ from _columns_, an Array whose
# elements are either Dobjects::Dvector or DataColumn objects. Every
# Dvector is wrapped into a DataColumn. The first column becomes #x,
# all the following ones become #ys.
#
# The _columns_ Array given by the caller is not modified.
def initialize(name, columns)
  # Convert into a fresh Array rather than overwriting the elements
  # of the Array owned by the caller.
  converted = columns.map do |col|
    col.is_a?(Dobjects::Dvector) ? DataColumn.new(col) : col
  end
  @x = converted[0]
  # Array#[] returns nil for a range starting past the end of the
  # Array (i.e. when no column at all was given); fall back to an
  # empty list so that @ys is always an Array.
  @ys = converted[1..-1] || []
  @name = name

  # Cache for the indexed dtable
  @indexed_dtable = nil
end

Instance Attribute Details

#nameObject

The name of the Dataset, such as one that could be used in a legend (like for the --auto-legend option of ctioga).



45
46
47
# File 'lib/ctioga2/data/dataset.rb', line 45

# Reader for the name of the Dataset (the @name instance variable).
def name
  return @name
end

#xObject

The X DataColumn



38
39
40
# File 'lib/ctioga2/data/dataset.rb', line 38

# Reader for the X column (the @x instance variable).
def x
  return @x
end

#ysObject

All Y DataColumn (an Array of DataColumn)



41
42
43
# File 'lib/ctioga2/data/dataset.rb', line 41

# Reader for the Array of Y columns (the @ys instance variable).
def ys
  return @ys
end

Class Method Details

.create(name, number) ⇒ Object

Creates a new Dataset called name, made of number empty columns (freshly created Dvector objects).



66
67
68
69
70
71
72
# File 'lib/ctioga2/data/dataset.rb', line 66

# Builds a new object through +new+, giving it _name_ and an Array of
# _number_ freshly created, empty Dobjects::Dvector.
def self.create(name, number)
  blank_columns = number.times.map { Dobjects::Dvector.new() }
  self.new(name, blank_columns)
end

.dataset_from_spec(name, spec) ⇒ Object

Creates a new Dataset from a specification. This function parses a specification in the form of:

  • a:b:c+

  • spec=a:spec2=b+

It yields each piece of unprocessed text, not necessarily in the order in which they were read, and expects a Dvector as the return value of the block.

It then builds a suitable Dataset object with these values, and returns it.

It is strongly recommended to use this function for reimplementations of Backends::Backend#query_dataset.



87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/ctioga2/data/dataset.rb', line 87

def self.dataset_from_spec(name, spec)
  specs = []
  i = 0
  for s in spec.split(/:/)
    if s =~ /^(x|y\d*|z)(#{DataColumn::ColumnSpecsRE})=(.*)/i
      which, mod, s = $1.downcase,($2 && $2.downcase) || "value",$3
      
      case which
      when /x/
        idx = 0
      when /y(\d+)?/
        if $1
          idx = $1.to_i
        else
          idx = 1
        end
      when /z/
        idx = 2
      end
      specs[idx] ||= {}
      specs[idx][mod] = yield s
    else
      specs[i] = {"value" =>  yield(s)}
    end
    i += 1
  end
  columns = []
  for s in specs
    columns << DataColumn.from_hash(s)
  end
  return Dataset.new(name, columns)
end

Instance Method Details

#<<(dataset) ⇒ Object

Concatenates another Dataset to this one



176
177
178
179
180
181
182
183
184
# File 'lib/ctioga2/data/dataset.rb', line 176

# Appends the contents of _dataset_ to this one, column by column:
# its x goes to @x and each of its ys goes to the Y column at the
# same position.
#
# Raises a RuntimeError if both datasets do not have the same #size.
# Returns the number of Y columns.
def <<(dataset)
  unless dataset.size == self.size
    raise "Can't concatenate datasets that don't have the same number of columns: #{self.size} vs #{dataset.size}"
  end
  @x << dataset.x
  @ys.each_with_index do |column, idx|
    column << dataset.ys[idx]
  end
  @ys.size
end

#apply_formulas(formula) ⇒ Object

Applies formulas to values. Formulas are like text-backend specification: “:”-separated specs of the target



306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
# File 'lib/ctioga2/data/dataset.rb', line 306

# Computes a new Dataset from _formula_, a colon-separated list of
# formulas, one for each column of the result.
#
# Each formula is parsed by Utils::parse_formula using a table of
# names: x is column 1, y and y1 column 2, z and y2 column 3, and yN
# column N+1 for every Y column. Column 0 is a vector holding the
# index of each point. Formulas are evaluated by
# Dobjects::Dvector.compute_formula.
#
# The returned Dataset is named after this one, with "_mod" appended.
def apply_formulas(formula)
  # Source vectors: index, then X, then every Y.
  columns = [Dobjects::Dvector.new(@x.size) { |idx| idx }, @x.values]
  @ys.each { |col| columns << col.values }

  # Names:
  heads = {'x' => 1, 'y' => 2, 'z' => 3}
  1.upto(@ys.size) { |n| heads["y#{n}"] = n + 1 }

  result = []
  formula.split(/:/).each do |f|
    fm = Utils.parse_formula(f, nil, heads)
    debug { "Using formula #{fm} for column spec: #{f} (##{result.size})" }
    computed = Dobjects::Dvector.compute_formula(fm, columns)
    result << DataColumn.new(computed)
  end
  Dataset.new(name + "_mod", result)
end

#average_duplicates!Object

Average all the non-X values of successive data points that have the same X values. It is a naive version that also averages the error columns.



265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
# File 'lib/ctioga2/data/dataset.rb', line 265

# Replaces every run of successive points sharing the same X value by
# a single point holding, for each vector returned by all_vectors,
# the average of the values over the run. It is a naive version: all
# the vectors are averaged the same way.
#
# NOTE(review): this relies on all_vectors returning the very vector
# objects used by the columns (including @x.values), so that
# shortening them here also shortens the X values being scanned --
# confirm against all_vectors.
def average_duplicates!
  last_x = nil
  last_x_first_idx = 0
  xv = @x.values
  i = 0
  vectors = all_vectors
  while i < xv.size
    x = xv[i]
    at_end = (i == (xv.size - 1))
    # While inside a run (same X as before) nothing has to be done,
    # unless this is the very last point, which closes the run.
    unless (last_x == x) && !at_end
      # End of the slice: the current point only belongs to the run
      # when it has the same X (which here implies it is the last
      # point). Otherwise the run stops just before it, even when the
      # current point is the last one.
      e = (last_x == x) ? i : i - 1

      if e > last_x_first_idx
        ## \todo In real, to do this properly, one would
        # have to write a proper function in DataColumn that
        # does averaging over certain indices possibly more
        # cleverly than the current way to do.
        vectors.each do |v|
          subv = v[last_x_first_idx..e]
          ave = subv.sum/subv.size
          v.slice!(last_x_first_idx+1, e - last_x_first_idx)
          v[last_x_first_idx] = ave
        end
        # Points were removed: step back by as many indices.
        i -= e - last_x_first_idx
      end
      last_x = x
      last_x_first_idx = i
    end
    i += 1
  end

end

#column_namesObject

Returns an array with Column names.



150
151
152
153
154
155
156
# File 'lib/ctioga2/data/dataset.rb', line 150

# Collects the names given by DataColumn#column_names for every
# column: first those of @x, asked with the base name "x", then those
# of each Y column, asked with "y1", "y2", and so on.
def column_names
  @ys.each_with_index.inject(@x.column_names("x")) do |names, (column, idx)|
    names + column.column_names("y#{idx + 1}")
  end
end

#each_values(with_errors = false, expand_nil = true) ⇒ Object

Iterates over all the values of the Dataset. Values of optional arguments are those of DataColumn::values_at.



160
161
162
163
164
165
166
167
168
# File 'lib/ctioga2/data/dataset.rb', line 160

# Yields, for each index of the X column, the index followed by the
# values of DataColumn#values_at for @x and then for every Y column,
# all flattened into a single argument list. _with_errors_ and
# _expand_nil_ are handed over to DataColumn#values_at unchanged.
def each_values(with_errors = false, expand_nil = true)
  @x.size.times do |idx|
    row = @ys.inject(@x.values_at(idx, with_errors, expand_nil)) do |acc, column|
      acc + column.values_at(idx, with_errors, expand_nil)
    end
    yield idx, *row
  end
end

#has_xy_errors?Boolean

Returns true if X or Y columns have errors

Returns:

  • (Boolean)


131
132
133
# File 'lib/ctioga2/data/dataset.rb', line 131

# True when the main Y column (#y) or the X column (#x) answers true
# to has_errors?. The Y column is asked first.
def has_xy_errors?
  y.has_errors? || x.has_errors?
end

#index_on_cols(cols = [2]) ⇒ Object

Returns a hash of Datasets indexed on the values of the columns cols. Datasets contain the same number of columns.



424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
# File 'lib/ctioga2/data/dataset.rb', line 424

# Splits the Dataset according to the values found in the columns
# whose numbers are listed in _cols_ (0 is X, 1 the first Y, ...).
#
# Returns a Hash whose keys are the Arrays of the values of those
# columns and whose values are Datasets (made with Dataset.create,
# same name and same number of columns as this one) holding all the
# points sharing these values.
#
# The _cols_ Array given by the caller is left untouched.
def index_on_cols(cols = [2])
  # each_values(true) yields three entries for each column, so that
  # the value of column number n sits at position 3*n.
  value_positions = cols.map { |col| col * 3 }

  datasets = {}
  self.each_values(true) do |_idx, *values|
    signature = value_positions.map { |pos| values[pos] }
    datasets[signature] ||= Dataset.create(name, self.size)
    datasets[signature].push_values(*values)
  end
  return datasets
end

#indexed_tableObject

TODO:

For performance, this will have to be turned into a real Dtable or Dvector class function. This function is just going to be bad ;-)

TODO:

The cache should be invalidated when the contents of the Dataset change (but that will be really hard!)

Returns an IndexedDTable representing the XYZ data. Information about errors is not included.



351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
# File 'lib/ctioga2/data/dataset.rb', line 351

# Returns an IndexedDTable built from the values of @x (X), of the
# first Y column (Y) and of the second Y column (Z). The result is
# stored in @indexed_dtable and handed back as is by later calls.
#
# The table has one column for each distinct X value and one row for
# each distinct Y value, both sorted; cells for which no point is
# found are left to NaN.
def indexed_table
  return @indexed_dtable if @indexed_dtable

  # We convert the index into three x,y and z arrays
  xs = @x.values.dup
  ys = @ys[0].values.dup
  zs = @ys[1].values.dup

  xvals = xs.sort.uniq
  yvals = ys.sort.uniq

  # Reverse hashes (value => position) to speed up the conversion
  positions_of = lambda do |vals|
    lookup = {}
    vals.each_index { |k| lookup[vals[k]] = k }
    lookup
  end
  x_index = positions_of.call(xvals)
  y_index = positions_of.call(yvals)

  table = Dobjects::Dtable.new(xvals.size, yvals.size)
  # We initialize all the values to NaN
  table.set(0.0/0.0)

  xs.each_index do |k|
    # Y first !
    table[y_index[ys[k]], x_index[xs[k]]] = zs[k]
  end
  @indexed_dtable = IndexedDTable.new(xvals, yvals, table)
end

#make_contour(level) ⇒ Object

TODO:

add algorithm

Returns a x,y Function



395
396
397
398
399
400
401
402
403
404
405
406
407
408
# File 'lib/ctioga2/data/dataset.rb', line 395

# Asks the #indexed_table for the contour at the given _level_ and
# returns it as a Dobjects::Function. A NaN point is inserted in both
# X and Y at every gap position reported with the contour.
def make_contour(level)
  xs, ys, gaps = *indexed_table.make_contour(level)

  # A gap located at the very end of the data is meaningless.
  nan = 0.0/0.0
  # Insert from the end so that earlier positions stay valid.
  (gaps - [xs.size]).sort.reverse.each do |pos|
    xs.insert(pos, nan)
    ys.insert(pos, nan)
  end
  Dobjects::Function.new(xs, ys)
end

#merge_datasets_in(datasets, columns = [0], precision = nil) ⇒ Object

TODO:

update column names.

TODO:

write provisions for column names, actually ;-)…

Merges one or more other data sets into this one; one or more columns are designated as “master” columns and their values must match in all datasets. Extra columns are simply appended, in the order in which the datasets are given

Comparisons between the values are made in arbitrary precision unless precision is given, in which case values only have to match to this given number of digits.



500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
# File 'lib/ctioga2/data/dataset.rb', line 500

# Merges the given _datasets_ into this one.
#
# _columns_ lists the numbers of the "master" columns (0 is X): a
# point of another dataset is attached to the point of this dataset
# that has the same values in all master columns. The other columns
# of each dataset are appended to the Y columns of this one, in the
# order of _datasets_. Points without counterpart in this dataset are
# dropped.
#
# Master values are compared exactly unless _precision_ is given, in
# which case they are compared after formatting with that number of
# significant digits ("%.<precision>g").
#
# Returns _datasets_.
def merge_datasets_in(datasets, columns = [0], precision = nil)
  # First thing, the data precision block. It uses the _precision_
  # argument (there is no @precision instance variable).
  prec = if precision
           proc { |x| "%.#{precision}g" % x } # x does not need to be a Float
         else
           proc { |x| x }   # For exact comparisons
         end

  # First, we build an index of the master columns of the first
  # dataset.
  hash = {}
  self.each_values(false) do |i, *cols|
    signature = columns.map { |j| prec.call(cols[j]) }
    hash[signature] = i
  end

  # Highest index first, so that removing a column does not shift
  # the ones still to be removed.
  remove_indices = columns.sort.reverse

  datasets.each do |set|
    old_columns = set.all_columns
    remove_indices.each { |i| old_columns.slice!(i) }

    # Now, we got rid of the master columns, we add the given
    # number of columns
    new_columns = old_columns.map do |c|
      DataColumn.create(@x.size, c.has_errors?)
    end

    set.each_values(false) do |i, *cols|
      signature = columns.map { |j| prec.call(cols[j]) }
      idx = hash[signature]
      # Without counterpart, the data point is lost
      next unless idx
      old_columns.each_index do |j|
        new_columns[j].set_values_at(idx, *old_columns[j].values_at(i, true, true))
      end
    end
    @ys.concat(new_columns)
  end
end

#naive_smooth!(number) ⇒ Object

Smooths the data using a naive gaussian-like convolution (but not exactly). Not for use for reliable data filtering.



412
413
414
415
416
417
418
419
420
# File 'lib/ctioga2/data/dataset.rb', line 412

# Convolves every Y column (DataColumn#convolve!) with a kernel of
# _number_ elements, the element of index k being
# Utils.cnk(number, k). The second argument given to convolve! is
# number - number/2 - 1.
def naive_smooth!(number)
  weights = Dobjects::Dvector.new(number) { |k| Utils.cnk(number, k) }
  center = number - number/2 - 1
  @ys.each { |column| column.convolve!(weights, center) }
end

#push_only_values(values) ⇒ Object

Almost the same thing as #push_values, but when you don’t care about the min/max things.



207
208
209
210
211
212
# File 'lib/ctioga2/data/dataset.rb', line 207

# Hands one element of _values_ to DataColumn#push_values for each
# column: the first one to @x, the following ones to the Y columns,
# in order. Returns the number of Y columns.
def push_only_values(values)
  ([@x] + @ys).each_with_index do |column, idx|
    column.push_values(values[idx])
  end
  @ys.size
end

#push_values(*values) ⇒ Object

Appends the given values (as yielded by each_values(true)) to the stack. Elements of values lying after the last DataColumn in the Dataset are simply ignored. Giving fewer values than there should be will give interesting results.



198
199
200
201
202
203
# File 'lib/ctioga2/data/dataset.rb', line 198

# Distributes _values_ three by three to DataColumn#push_values: the
# first three to @x, the next three to the first Y column, and so on.
# Returns the number of Y columns.
def push_values(*values)
  @x.push_values(*values.first(3))
  @ys.each_with_index do |column, idx|
    column.push_values(*values[3 * (idx + 1), 3])
  end
  @ys.size
end

#reglin(options = {}) ⇒ Object

TODO:

Have the possibility to elaborate on the regression side (in particular force b to 0)

Massive linear regressions over all X and Y values corresponding to a unique set of all the other Y2… Yn values.

Returns the [coeffs, lines]



450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
# File 'lib/ctioga2/data/dataset.rb', line 450

# Runs one linear regression of Y against X for each distinct set of
# values of the columns from number 2 on (the groups are made by
# index_on_cols).
#
# When options['linear'] is set, the fit is y = a*x, with a being
# sum(x*y)/sum(x*x) and 0 being recorded for b; otherwise a and b
# come from Dobjects::Function#reglin.
#
# Returns [coeffs, lines], two Datasets with the same number of
# columns as this one: coeffs gets one point (key values, a, b) for
# each group; lines gets two points for each group, the fitted line
# evaluated at the smallest and the largest X of the group.
def reglin(options = {})
  cols = (2..(self.size - 1)).to_a
  datasets = index_on_cols(cols)

  # Create two new datasets:
  # * one that collects the keys and a,b
  # * another that collects the keys and x1,y1, x2y2
  coeffs = Dataset.create("coefficients", self.size)
  lines = Dataset.create("lines", self.size)

  datasets.each do |key, subset|
    f = Dobjects::Function.new(subset.x.values, subset.y.values)
    if options['linear']  # Fit to y = a*x
      work = f.x.dup
      work.mul!(f.x)
      sxx = work.sum
      work.replace(f.x)
      work.mul!(f.y)
      sxy = work.sum
      slope = sxy/sxx
      coeffs.push_only_values(key + [slope, 0])
      [f.x.min, f.x.max].each do |bound|
        lines.push_only_values(key + [bound, slope * bound])
      end
    else
      slope, offset = f.reglin
      coeffs.push_only_values(key + [slope, offset])
      [f.x.min, f.x.max].each do |bound|
        lines.push_only_values(key + [bound, offset + slope * bound])
      end
    end
  end

  [coeffs, lines]
end

#select!(&block) ⇒ Object

Modifies the dataset to only keep the data for which the block returns true. The block should take the following arguments, in order:

x, xmin, xmax, y, ymin, ymax, y1, y1min, y1max,

_z_, _zmin_, _zmax_, _y2_, _y2min_, _y2max_, _y3_, _y3min_, _y3max_


221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
# File 'lib/ctioga2/data/dataset.rb', line 221

# Keeps only the points for which the block returns a true value.
#
# For each point, the block is called with the values of
# DataColumn#values_at(i, true) for @x, then twice those of the first
# Y column (for the y and y1 names), then, if there is a second Y
# column, twice those of that column (z and y2), then those of every
# remaining Y column. All the columns returned by all_columns are
# then reindexed on the indices of the points kept.
def select!(&block)
  kept = (0...@x.size).select do |idx|
    args = @x.values_at(idx, true)
    args.concat(@ys[0].values_at(idx, true) * 2)
    if @ys[1]
      args.concat(@ys[1].values_at(idx, true) * 2)
      @ys[2..-1].each { |column| args.concat(column.values_at(idx, true)) }
    end
    block.call(*args)
  end
  all_columns.each { |column| column.reindex(kept) }
end

#select_formula!(formula) ⇒ Object

Same as #select!, but you give it a text formula instead of a block. It internally calls #select!, by the way ;-)…



243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
# File 'lib/ctioga2/data/dataset.rb', line 243

# Same as #select!, with the condition given as the text of a Ruby
# expression instead of a block.
#
# The names usable in _formula_ are those returned by
# @x.column_names(base, true) for the bases x, y and y1, then z and
# y2 if there is a second Y column, then y3, y4... for every further
# Y column.
def select_formula!(formula)
  names = @x.column_names('x', true)
  bases = ['y', 'y1']
  if @ys[1]
    bases.push('z', 'y2')
    3.upto(@ys.size) { |k| bases << "y#{k}" }
  end
  bases.each { |base| names.concat(@x.column_names(base, true)) }
  # NOTE(review): formula is evaluated as Ruby code; it must never
  # come from an untrusted source.
  block = eval("proc do |#{names.join(',')}|\n#{formula}\nend")
  select!(&block)
end

#sizeObject

The overall number of columns



171
172
173
# File 'lib/ctioga2/data/dataset.rb', line 171

# Number of columns: all the Y columns, plus one for X.
def size
  @ys.length + 1
end

#sort!Object

Sorts all columns according to X values



136
137
138
139
140
141
142
143
144
145
146
147
# File 'lib/ctioga2/data/dataset.rb', line 136

# Reorders all the columns returned by all_columns so that the X
# values are sorted.
#
# A vector holding the indices of the points is sorted along with a
# copy of the X values by Dobjects::Function#sort; it is then given
# to DataColumn#reindex for each column.
def sort!
  xs = @x.values
  order = Dobjects::Dvector.new(xs.size) { |k| k }
  # This relies on Function#sort reordering the vectors it was given
  # in place: afterwards, order holds the indices that make the X
  # values sorted.
  Dobjects::Function.new(xs.dup, order).sort
  all_columns.each { |column| column.reindex(order) }
end

#trim!(nb) ⇒ Object

Trims all data columns. See DataColumn#trim!



188
189
190
191
192
# File 'lib/ctioga2/data/dataset.rb', line 188

# Calls DataColumn#trim! with _nb_ on every column returned by
# all_columns.
def trim!(nb)
  all_columns.each { |column| column.trim!(nb) }
end

#yObject

The main Y column (ie, the first one)



121
122
123
# File 'lib/ctioga2/data/dataset.rb', line 121

# The first of the Y columns (nil if there is none).
def y
  @ys.first
end

#zObject

The Z column, if applicable



126
127
128
# File 'lib/ctioga2/data/dataset.rb', line 126

# The second of the Y columns, used as Z (nil if there is none).
def z
  @ys[1]
end