Module: TSV

Defined in:
lib/rbbt/tsv.rb,
lib/rbbt/util/R.rb,
lib/rbbt/tsv/csv.rb,
lib/rbbt/tsv/melt.rb,
lib/rbbt/tsv/util.rb,
lib/rbbt/tsv/excel.rb,
lib/rbbt/tsv/index.rb,
lib/rbbt/tsv/attach.rb,
lib/rbbt/tsv/dumper.rb,
lib/rbbt/tsv/filter.rb,
lib/rbbt/tsv/matrix.rb,
lib/rbbt/tsv/parser.rb,
lib/rbbt/tsv/stream.rb,
lib/rbbt/tsv/marshal.rb,
lib/rbbt/tsv/accessor.rb,
lib/rbbt/tsv/change_id.rb,
lib/rbbt/tsv/manipulate.rb,
lib/rbbt/tsv/attach/util.rb,
lib/rbbt/tsv/field_index.rb,
lib/rbbt/tsv/serializers.rb,
lib/rbbt/association/item.rb,
lib/rbbt/tsv/parallel/through.rb,
lib/rbbt/tsv/parallel/traverse.rb

Defined Under Namespace

Modules: XLS, XLSX Classes: BinarySerializer, CleanSerializer, Dumper, FloatArraySerializer, FloatSerializer, IntegerArraySerializer, IntegerSerializer, Parser, StrictFloatArraySerializer, StrictIntegerArraySerializer, StringArraySerializer, StringDoubleArraySerializer, StringSerializer, TSVMarshalSerializer, TSVSerializer, Traverser

Constant Summary collapse

TSV_SERIALIZER =
YAML
SERIALIZED_NIL =
TSV_SERIALIZER.dump nil
KEY_PREFIX =

{{{ TSV ENTRIES and ENTRY_KEYS

"__tsv_hash_"
ENTRIES =
[]
ENTRY_KEYS =
Set.new
NIL_VALUE =
"NIL_VALUE"
# Lookup table from a short serializer name (Symbol) to the object used for it.
# Some names are aliases of each other: :single and :string share
# StringSerializer, :list and :flat share StringArraySerializer.
# :marshal maps to Ruby's own Marshal module rather than a TSV serializer class.
SERIALIZER_ALIAS =
{
  :integer => IntegerSerializer, 
  :float => FloatSerializer, 
  :integer_array => IntegerArraySerializer,
  :float_array => FloatArraySerializer,
  :strict_integer_array => StrictIntegerArraySerializer,
  :strict_float_array => StrictFloatArraySerializer,
  :marshal => Marshal,
  :single => StringSerializer,
  :string => StringSerializer,
  :list => StringArraySerializer,
  :flat => StringArraySerializer,
  :double => StringDoubleArraySerializer,
  :clean => CleanSerializer,
  :binary => BinarySerializer,
  :tsv => TSVSerializer,
  :marshal_tsv => TSVMarshalSerializer
}

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Class Attribute Details

.field_index_dirObject

Returns the value of attribute field_index_dir.



4
5
6
# File 'lib/rbbt/tsv/field_index.rb', line 4

# Reader for the field_index_dir attribute: returns the receiver's
# @field_index_dir (nil if it has never been assigned).
def field_index_dir
  @field_index_dir
end

.lock_dirObject

Returns the value of attribute lock_dir.



25
26
27
# File 'lib/rbbt/tsv.rb', line 25

# Reader for the lock_dir attribute: returns the receiver's @lock_dir
# (nil if it has never been assigned).
def lock_dir
  @lock_dir
end

.unnamedObject

Returns the value of attribute unnamed.



25
26
27
# File 'lib/rbbt/tsv.rb', line 25

# Reader for the class-level unnamed attribute: returns the receiver's
# @unnamed (nil if it has never been assigned).
def unnamed
  @unnamed
end

Instance Attribute Details

#entity_optionsObject

Returns the value of attribute entity_options.



11
12
13
# File 'lib/rbbt/tsv/accessor.rb', line 11

# Reader for the entity_options attribute: returns the receiver's
# @entity_options (nil if it has never been assigned).
def entity_options
  @entity_options
end

#entity_templatesObject

Returns the value of attribute entity_templates.



11
12
13
# File 'lib/rbbt/tsv/accessor.rb', line 11

# Reader for the entity_templates attribute: returns the receiver's
# @entity_templates (nil if it has never been assigned).
def entity_templates
  @entity_templates
end

#field_indicesObject

Returns the value of attribute field_indices.



10
11
12
# File 'lib/rbbt/tsv/field_index.rb', line 10

# Reader for the field_indices attribute: returns the receiver's
# @field_indices (nil if it has never been assigned).
def field_indices
  @field_indices
end

#monitorObject

Returns the value of attribute monitor.



5
6
7
# File 'lib/rbbt/tsv/manipulate.rb', line 5

# Reader for the monitor attribute: returns the receiver's @monitor
# (nil if it has never been assigned).
def monitor
  @monitor
end

#serializer_moduleObject

Returns the value of attribute serializer_module.



11
12
13
# File 'lib/rbbt/tsv/accessor.rb', line 11

# Reader for the serializer_module attribute: returns the receiver's
# @serializer_module (nil if it has never been assigned).
def serializer_module
  @serializer_module
end

#unnamedObject

Returns the value of attribute unnamed.



11
12
13
# File 'lib/rbbt/tsv/accessor.rb', line 11

# Reader for the instance-level unnamed attribute: returns the receiver's
# @unnamed (nil if it has never been assigned).
def unnamed
  @unnamed
end

Class Method Details

._clean_float(v) ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/rbbt/tsv/excel.rb', line 14

# Rewrites scientific notation so the exponent marker is an upper-case "E".
#
# * Float   -> converted with #to_s, then "e<exp>" becomes "E<exp>".
# * String  -> only rewritten when the whole value looks like a number in
#              exponent notation; any other string is returned as is.
# * others  -> returned unchanged.
#
# Float#to_s writes positive exponents with an explicit sign ("1.0e+20"),
# so the exponent pattern accepts an optional "+" as well as "-".
#
# @param v [Object] the cell value
# @return [Object] a String for Float input, otherwise v (possibly rewritten)
def self._clean_float(v)
  case v
  when Float
    v.to_s.sub(/e([+-]?\d+)$/, 'E\1')
  when String
    if v =~ /^-?[\d\.]+e([+-]?\d+)$/
      v.sub(/e([+-]?\d+)$/, 'E\1')
    else
      v
    end
  else
    v
  end
end

._excel_data(tsv, options = {}) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/rbbt/tsv/excel.rb', line 30

# Flattens a TSV into the pair [field names, rows] used by the Excel writers.
#
# Options consumed here (via Misc.process_options):
#   :name          use an object's #name (when it responds to it) instead of the object
#   :sep2          separator used to join array values into one cell (default ', ')
#   :unmerge       emit one row per zipped value instead of joining arrays
#   :sort_by       field passed to tsv.sort_by
#   :sort_by_cast  method sent to the (first) value being sorted on
#   :remove_links  strip a wrapping HTML tag from values with _remove_link
#
# @return [Array] [tsv.all_fields, rows]
def self._excel_data(tsv, options ={})
  options = Misc.add_defaults options, :sep2 => ', '
  name, sep2, unmerge, sort_by, sort_by_cast, remove_links =
    Misc.process_options options, :name, :sep2, :unmerge, :sort_by, :sort_by_cast, :remove_links

  data = if sort_by && sort_by_cast
           tsv.sort_by(sort_by) do |_key, value|
             (Array === value ? value.first : value).send(sort_by_cast)
           end
         elsif sort_by
           tsv.sort_by sort_by
         else
           tsv
         end

  # Picks what gets shown for an object: its name when requested and present.
  display = lambda do |obj|
    (name and obj.respond_to?(:name)) ? obj.name || obj : obj
  end

  rows = []
  data.through do |key, values|
    cells = [display.call(key)]

    values = [values] unless Array === values
    values.each do |value|
      shown = display.call(value)
      if Array === shown
        shown = shown.collect{|item| _remove_link(item)} if remove_links
        shown = shown.collect{|item| _clean_float(item)}
        cells << (unmerge ? shown : shown * sep2)
      else
        cells << (remove_links ? _remove_link(shown) : shown)
      end
    end

    rows << cells
  end

  if unmerge
    # Expand every row into one row per zipped set of values, repeating the key.
    rows = rows.flat_map do |row|
      header, *columns = row
      Misc.zip_fields(columns).collect{|zipped| [header] + zipped }
    end
  end

  [tsv.all_fields, rows]
end

._extended(data) ⇒ Object



133
134
135
136
137
138
139
140
# File 'lib/rbbt/tsv/accessor.rb', line 133

# Gives +data+ a per-object `writable` accessor unless it already responds to
# #write; objects that do respond to #write are left untouched.
#
# @param data [Object] the object being set up
# @return [Object, nil] whatever attr_accessor returns, or nil when nothing was added
def self._extended(data)
  return if data.respond_to? :write

  # Define the accessor on the singleton class so only this object gets it.
  data.singleton_class.send(:attr_accessor, :writable)
end


6
7
8
9
10
11
12
# File 'lib/rbbt/tsv/excel.rb', line 6

# Strips one wrapping HTML/XML element from a value and returns its inner
# text, e.g. "<a href='x'>Text</a>" -> "Text". Values with no such element are
# returned unchanged.
#
# Values that cannot be matched against a regexp (numbers and other objects on
# Ruby >= 3.2, where Object#=~ no longer exists) are returned as is instead of
# raising NoMethodError.
#
# @param value [Object] the cell value
# @return [Object] the inner text of the first matched element, or value itself
def self._remove_link(value)
  return value unless value.respond_to?(:=~)

  if value =~ /<([\w]+)[^>]*>(.*?)<\/\1>/
    Regexp.last_match(2)
  else
    value
  end
end

.abort_stream(file, exception = nil) ⇒ Object



89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/rbbt/tsv/util.rb', line 89

# Aborts whatever is behind +file+.
#
# * nil, Hash and Array inputs are ignored.
# * A Step (when that class is loaded) is given the exception if one was
#   passed; otherwise it is aborted unless already aborted or done.
# * Anything else is resolved with get_stream; the stream is aborted when it
#   supports #abort and tagged through AbortedStream.setup unless it already
#   carries an exception.
#
# @param file [Object] stream-like object, Step, or plain collection
# @param exception [Exception, nil] reason for aborting
def self.abort_stream(file, exception = nil)
  return if file.nil?

  if defined?(Step) && Step === file
    return file.exception(exception) if exception
    return file.abort unless file.aborted? || file.done?
    return nil
  end

  return if Hash === file || Array === file

  stream = get_stream(file)
  stream.abort(exception) if stream.respond_to? :abort
  already_flagged = stream.respond_to?(:exception) && stream.exception
  AbortedStream.setup(stream, exception) unless already_flagged
end

.build_traverse_index(files, options = {}) ⇒ Object



323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
# File 'lib/rbbt/tsv/attach/util.rb', line 323

# Builds an index that translates across +files+ by first locating a chain of
# shared fields (find_path) and then composing it (index_for_traversal).
#
# Defaults applied to options: :in_namespace => false, :persist_input => true.
#
# @param files [Array] the files to traverse, in order
# @return [Object, nil] the composed index, or nil when find_path finds no chain
def self.build_traverse_index(files, options = {})
  options       = Misc.add_defaults options, :in_namespace => false, :persist_input => true
  persist_input = options[:persist_input]

  path = find_path(files, options)
  return nil if path.nil?

  # Each path element is a pair whose first item is the field used at that hop.
  hops = path.collect{|step| step.first}
  Log.debug "Found Traversal: #{hops * " => "}"

  index_for_traversal path, persist_input
end

.change_key(tsv, format, options = {}, &block) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/rbbt/tsv/change_id.rb', line 6

# Returns +tsv+ re-keyed by the field +format+.
#
# When +format+ is already one of the fields the TSV is simply reordered.
# Otherwise the field is attached from the identifier file(s), the result is
# reordered around it, converted back to the original type and the original
# field names are restored.
#
# Options: :identifiers (defaults to tsv.identifiers; when nil and the TSV has
# a namespace, Organism.identifiers(namespace) is used). :persist_input is
# removed from the options but the attach calls always persist their input.
#
# @param tsv [TSV] source table
# @param format [String] name of the field that becomes the key
# @param block passed to to_list when the original type was :list
# @return [TSV] the re-keyed table
def self.change_key(tsv, format, options = {}, &block)
  options = Misc.add_defaults options, :persist => false, :identifiers => tsv.identifiers

  identifiers, _persist_input = Misc.process_options options, :identifiers, :persist_input

  identifiers = Organism.identifiers(tsv.namespace) if identifiers.nil? and tsv.namespace

  return tsv.reorder(format) if tsv.fields.include?(format)

  # Work on a copy so the temporary renaming below does not touch the caller's
  # values. The class must be on the left of ===; `v === String` compares the
  # value with the class object and is never true.
  new = {}
  tsv.each do |k,v|
    if String === v or Array === v
      new[k] = v.dup
    else
      new[k] = v
    end
  end
  orig_fields = tsv.fields
  tsv = tsv.annotate new
  # Temporary names keep the existing fields from clashing with the attached one.
  new.fields = new.fields.collect{|f| "TMP-" << f }

  orig_type = tsv.type
  tsv = tsv.to_double if orig_type != :double

  if Array === identifiers
    tsv = tsv.attach identifiers.first, :fields => [format], :persist_input => true, :identifiers => identifiers.last
  else
    tsv = tsv.attach identifiers, :fields => [format], :persist_input => true
  end

  # The attached field is last; make it the key and keep the rest as fields.
  tsv = tsv.reorder(format, tsv.fields[0..-2])

  tsv = tsv.to_flat  if orig_type == :flat

  tsv = tsv.to_list(&block)  if orig_type == :list

  tsv.fields = orig_fields

  tsv
end

.collapse_stream(input, options = {}, &block) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# File 'lib/rbbt/tsv/stream.rb', line 4

# Sorts a TSV stream and collapses it through Misc.collapse_stream, returning
# a TSV::Dumper whose stream is the collapsed output.
#
# Defaults applied to options: :sep => "\t", :header_hash => '#', :uniq => true
# (:uniq adds "-u" to the sort arguments).
#
# @param input [Object] anything TSV.get_stream accepts
# @param block forwarded to Misc.collapse_stream
# @return [TSV::Dumper]
def self.collapse_stream(input, options = {}, &block)
  options = Misc.add_defaults options, :sep => "\t", :header_hash => '#', :uniq => true

  raw_stream = TSV.get_stream input
  sort_args  = options[:uniq] ? "-u" : nil
  sorted     = Misc.sort_stream raw_stream, options[:header_hash], sort_args

  parser = TSV::Parser.new(sorted, options.dup)
  dumper = TSV::Dumper.new parser
  header = TSV.header_lines(parser.key_field, parser.fields, parser.options)

  # The dumper's own pipe ends are closed; its stream is replaced just below.
  dumper.close_in
  dumper.close_out
  dumper.stream = Misc.collapse_stream parser.stream, parser.first_line, parser.sep, header, &block
  dumper
end

.csv(obj, options = {}) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/rbbt/tsv/csv.rb', line 4

# Loads CSV content into a TSV.
#
# +obj+ may be a Path, a String (remote location, filename, or the CSV text
# itself) or any other object CSV.new accepts.
#
# Options handled here (everything else is forwarded to CSV as keywords):
#   :headers    first row holds the key and field names (default true)
#   :type       TSV type of the result (default :list)
#   :cast       method sent to every value
#   :merge      accepted and removed; not forwarded to CSV
#   :key_field, :fields  reorder the result around these fields
#
# @return [TSV]
def self.csv(obj, options = {})
  options = Misc.add_defaults IndiferentHash.setup(options.dup), :headers => true, :type => :list
  noheaders = ! options[:headers]

  type = options.delete :type
  cast = options.delete :cast
  options.delete :merge
  key_field = options.delete :key_field
  fields = options.delete :fields

  if key_field || fields
    # Reordering needs a :double table; the requested type is restored at the end.
    orig_type = type
    type = :double
  end

  # Header handling is done below, so CSV itself must not consume the first row.
  options[:headers] = false

  # Options are always passed as keywords: a positional Hash is rejected by
  # CSV.read on Ruby 3.
  csv = case obj
        when Path
          CSV.read(obj.find.open, **options)
        when String
          if Open.remote?(obj)
            CSV.read(Open.open(obj), **options)
          elsif Misc.is_filename?(obj)
            CSV.read(obj, **options)
          else
            CSV.new(obj, **options)
          end
        else
          CSV.new(obj, **options)
        end

  tsv = if noheaders
          TSV.setup({}, :key_field => nil, :fields => nil, :type => type)
        else
          key, *csv_fields = csv.shift
          TSV.setup({}, :key_field => key, :fields => csv_fields, :type => type)
        end

  csv.each_with_index do |row,i|
    if noheaders
      # Without headers every column is a value and the key is synthetic.
      key, values = ["row-#{i}", row]
    else
      key, *values = row
    end

    values = values.collect{|v| v.send cast } if cast

    case type
    when :double, :flat
      tsv.zip_new(key, values)
    when :single
      tsv[key] = values.first
    when :list
      tsv[key] = values
    end
  end

  if key_field || fields
    tsv = tsv.reorder(key_field, fields, :zipped => true, :merge => true)
    if tsv.type != orig_type
      tsv = case orig_type
            when :list
              tsv.to_list
            when :single
              tsv.to_single
            when :flat
              tsv.to_flat
            else
              # Unknown target type: keep the reordered table rather than returning nil.
              tsv
            end
    end
  end

  tsv
end

.entry(*entries) ⇒ Object



159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# File 'lib/rbbt/tsv/accessor.rb', line 159

# Declares TSV "entries": for every name it registers the name in ENTRIES, its
# storage key (KEY_PREFIX + name) in ENTRY_KEYS, and defines a reader and a
# writer on TSV.
#
# The reader returns the cached instance variable when present; otherwise it
# loads the value stored under the entry key and caches it. The writer caches
# the value and stores its dumped form under the entry key (Path values are
# stored as plain strings).
#
# @param entries [Array<#to_s>] entry names
# @return [Array<String>] the entry names as strings
def self.entry(*entries)
  entries = entries.collect(&:to_s)
  ENTRIES.concat entries

  entries.each do |entry|
    key      = KEY_PREFIX + entry
    var_name = "@#{entry}".to_sym
    ENTRY_KEYS << key

    TSV.send(:define_method, entry) do
      if instance_variable_defined?(var_name)
        instance_variable_get(var_name)
      else
        loaded = load_entry_value(self.send(:[], key, :entry_key))
        instance_variable_set(var_name, loaded)
        loaded
      end
    end

    TSV.send(:define_method, "#{entry}=") do |value|
      instance_variable_set(var_name, value)
      stored = Path === value ? value.to_s : value
      self.send(:[]=, key, dump_entry_value(stored), :entry_key)
      stored
    end
  end
end

.excel(filename, options = {}) ⇒ Object



290
291
292
293
294
295
296
# File 'lib/rbbt/tsv/excel.rb', line 290

# Dispatches on the file extension: names ending in ".xlsx" go to xlsx,
# everything else goes to xls.
#
# @param filename [String]
# @param options [Hash] forwarded untouched
def self.excel(filename, options = {})
  return xlsx(filename, options) if filename =~ /\.xlsx$/

  xls(filename, options)
end

.excel2tsv(filename, options = {}) ⇒ Object



298
299
300
# File 'lib/rbbt/tsv/excel.rb', line 298

# Alias-style wrapper: forwards both arguments to excel and returns its result.
def self.excel2tsv(filename, options ={})
  excel(filename, options)
end

.field_match_counts(file, values, options = {}) ⇒ Object



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/rbbt/tsv/util.rb', line 37

# Counts, per field of +file+, how many of +values+ appear in that field.
#
# A flat "value<TAB>field" listing of the file is built (and persisted through
# Persist.persist with prefix "Field_Matches" by default); the values are then
# matched against it with grep and the hits are tallied per field with
# sort/uniq.
#
# @param file [TSV, Object] a TSV or something TSV.open accepts
# @param values [Array] values to look for
# @return [TSV] a :single TSV of field => count (empty when the command fails)
def self.field_match_counts(file, values, options = {})
  options = Misc.add_defaults options, :persist_prefix => "Field_Matches"
  persist_options = Misc.pull_keys options, :persist

  filename = TSV === file ? file.filename : file
  # :no_load => true is passed so the block result is used as a path below
  # -- presumably Persist returns the persisted file's path; confirm.
  path = Persist.persist filename, :string, persist_options.merge(:no_load => true) do
    tsv = TSV === file ? file : TSV.open(file, options)

    text = ""
    fields = nil
    tsv.tap{|e| e.unnamed =  true; fields = e.fields}.through do |gene, names|
      # One "name<TAB>field" line per non-empty value of every field.
      names.zip(fields).each do |list, format|
        list = [list] unless Array === list
        list.delete_if do |name| name.empty? end
        next if list.empty?
        text << list.collect{|name| [name, format] * "\t"} * "\n" << "\n"
      end
      # The key itself is listed under the key field.
      text << [gene, tsv.key_field] * "\t" << "\n"
    end
    text
  end

  TmpFile.with_file(values.uniq * "\n", false) do |value_file|
    # HEADERNOMATCH is glued to the field column so grep -w cannot match a
    # value against a field name; it is stripped again after the grep.
    # NOTE(review): path and value_file are interpolated into a shell command
    # inside single quotes; a quote in either would break the command.
    cmd = "cat '#{ path }' | sed 's/\\t/\\tHEADERNOMATCH/' | grep -w -F -f '#{ value_file }' | sed 's/HEADERNOMATCH//' |sort -u|cut -f 2  |sort|uniq -c|sed 's/^ *//;s/ /\t/'"
    begin
      TSV.open(CMD.cmd(cmd), :key_field => 1, :fields => [0], :type => :single, :cast => :to_i)
    rescue
      # Best effort: log the failure and answer with an empty table.
      Log.exception $!
      TSV.setup({}, :type => :single, :cast => :to_i)
    end
  end
end

.find_path(files, options = {}) ⇒ Object

May make an extra index!



239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
# File 'lib/rbbt/tsv/attach/util.rb', line 239

# Finds a chain of matching fields linking consecutive +files+.
#
# For every pair of neighbouring files the first field of the earlier file
# that matches (Misc.match_fields) some field of the later one is selected.
# With :in_namespace the candidates of the first file are restricted to that
# field, or to its namespace fields when the field itself is absent.
#
# @param files [Array] objects responding to #all_fields
# @return [Array<Array>, nil] pairs of [field, file], or nil when two
#   neighbouring files share no field
def self.find_path(files, options = {})
  options      = Misc.add_defaults options, :in_namespace => false
  in_namespace = options[:in_namespace]

  ids = if in_namespace
          first_file = files.first
          first_ids  = if first_file.all_fields.include? in_namespace
                         [in_namespace]
                       else
                         first_file.all_namespace_fields(in_namespace)
                       end
          [first_ids] + files[1..-1].collect{|f| f.all_fields}
        else
          files.collect{|f| f.all_fields }
        end

  id_list = []
  ids.each_cons(2) do |current, following|
    shared = current.select{|field|
      following.select{|f| Misc.match_fields(field, f) }.any?
    }
    return nil if shared.empty?
    id_list << shared.first
  end

  # Add the last file's first field as a final hop unless the chain already
  # ends on it. zip stops at the length of id_list either way.
  final_field = files.last.all_fields.first
  id_list << final_field unless Misc.match_fields(id_list.last, final_field)
  id_list.zip(files)
end

.find_traversal(tsv1, tsv2, options = {}) ⇒ Object



340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
# File 'lib/rbbt/tsv/attach/util.rb', line 340

# Searches for an index that translates between +tsv1+ and +tsv2+, going
# through their identifier files when the tables share no field directly.
#
# Candidate chains grow one file at a time: tsv1 followed by its identifier
# files (plus any given in options[:identifiers]) on one side, tsv2 followed
# by its identifier files on the other. The first chain for which
# build_traverse_index returns an index wins.
#
# The candidate lists are built as new arrays, so the arrays returned by
# #identifier_files are never modified.
#
# @return [Object, nil] the index, or nil when no chain works
def self.find_traversal(tsv1, tsv2, options = {})
  options = Misc.add_defaults options, :in_namespace => false

  identifiers1 = [tsv1] + (tsv1.identifier_files || [])
  identifiers1 += [options[:identifiers]].flatten if options[:identifiers]
  identifiers2 = [tsv2] + (tsv2.identifier_files || [])

  files1 = []
  identifiers1.each do |file|
    files1 << file
    identifiers2.each_index do |i|
      files2 = identifiers2[0..i]
      # The second half is reversed so the chain ends on tsv2.
      index = build_traverse_index(files1 + files2.reverse, options)
      return index unless index.nil?
    end
  end

  nil
end

.get_filename(file) ⇒ Object



70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/rbbt/tsv/util.rb', line 70

# Works out a name for +file+, checking in this order:
#
# 1. Step (when that class is loaded)  -> its path
# 2. Path                              -> the Path itself
# 3. String naming an existing or remote file -> the string
# 4. any other String                  -> "String-<digest of the content>"
# 5. object with #filename             -> that filename
# 6. object with #gets                 -> nil (anonymous stream)
#
# @raise [RuntimeError] when none of the above applies
def self.get_filename(file)
  return file.path if defined?(Step) && Step === file
  return file if Path === file

  if String === file
    return file if Open.exists?(file) || Open.remote?(file)
    return "String-#{Misc.digest file}"
  end

  return file.filename if file.respond_to?(:filename)
  return nil if file.respond_to?(:gets)

  raise "Cannot get filename from: #{file.inspect}"
end

.get_stream(file, open_options = {}) ⇒ Object



108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# File 'lib/rbbt/tsv/util.rb', line 108

# Turns +file+ into something readable as a stream, dispatching on its class.
# Branch order matters: the first matching `when` wins.
#
# Branches written as `when (defined? X and X)` evaluate to the class X when
# it is loaded and to nil otherwise; in the latter case the branch can only
# match a nil +file+.
#
# @param file [Object] stream, TSV object, Path, String, Step, Array or Set
# @param open_options [Hash] forwarded to the open calls
# @raise [RuntimeError] when the object cannot be turned into a stream, or
#   when a Step ended aborted or with an error
def self.get_stream(file, open_options = {})
  case file
  when Zlib::GzipReader
    file
  when (defined? Bgzf and Bgzf)
    file
  when TSV
    file.dumper_stream
  when TSV::Dumper
    file.stream
  when TSV::Parser
    file.stream
  when Path
    file.open(open_options)
  when (defined? Tempfile and Tempfile)
    # Rewind when possible; any failure doing so is deliberately ignored.
    # NOTE(review): `rescue Exception` also swallows interrupts here, unlike
    # the plain `rescue` used for IO below.
    begin
      pos = file.pos
      file.rewind if file.respond_to?(:rewind) and pos != 0
    rescue Exception
    end
    file
  when IO, StringIO, File
    # Same best-effort rewind; streams that cannot report a position are
    # returned as they are.
    begin
      pos = file.pos
      file.rewind if file.respond_to?(:rewind) and pos != 0
    rescue
    end
    file
  when String
    # A String is either a location to open or the content itself.
    if Open.remote?(file) || Open.ssh?(file) || Open.exist?(file) 
      Open.open(file, open_options)
    else
      StringIO.new file
    end
  when (defined? Step and Step)
    if file.respond_to?(:base_url) 
      # Steps exposing #base_url: use an IO result directly, otherwise wait
      # for the step and read its path without caching.
      if file.result and IO === file.result
        file.result
      else
        file.join
        get_stream(file.path, open_options.merge(:nocache => true))
      end
    else
      file.grace

      stream = file.get_stream
      if stream && ! stream.closed?
        stream
      else
        # No usable live stream: wait for the step and read its result file,
        # failing loudly when the step did not finish cleanly.
        file.join
        raise "Aborted stream from Step #{file.path}" if file.aborted?
        raise "Exception in stream from Step #{file.path}: #{file.messages.last}" if file.error?
        get_stream(file.path, open_options)
      end
    end
  when Array
    # Each element is written as one line into a pipe.
    Misc.open_pipe do |sin|
      file.each do |l|
        sin.puts l
      end
    end
  when Set
    get_stream(file.to_a, open_options)
  else
    raise "Cannot get stream from: #{file.inspect}"
  end
end

.get_streams_to_close(obj) ⇒ Object



549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
# File 'lib/rbbt/tsv/parallel/traverse.rb', line 549

# Collects the streams that have to be closed for +obj+.
#
# IO/File objects contribute themselves, parsers contribute nothing, and a
# Step contributes its result stream plus, recursively, the streams of its
# inputs and dependencies (those are placed before the Step's own).
#
# @param obj [Object]
# @return [Array] streams to close; empty for unrecognized objects
def self.get_streams_to_close(obj)
  close_streams = []
  case obj
  when IO, File
    close_streams << obj
  when TSV::Parser
  when TSV::Dumper
    # NOTE(review): obj is itself the Dumper here, yet #result is called on it;
    # the Step branch below uses obj.result.in_stream where obj.result is the
    # Dumper. This may have been meant to be obj.in_stream -- confirm.
    close_streams << obj.result.in_stream
  when (defined? Step and Step)
    # Evaluates to nil when Step is not loaded, so it then only matches nil.
    obj.mutex.synchronize do
      case obj.result
      when IO
        close_streams << obj.result
      when TSV::Dumper
        close_streams << obj.result.in_stream
      end
    end
    obj.inputs.each do |input|
      close_streams = get_streams_to_close(input) + close_streams
    end
    obj.dependencies.each do |dependency|
      close_streams = get_streams_to_close(dependency) + close_streams
    end
  end 
  close_streams
end

.guess_id(identifier_file, values, options = {}) ⇒ Object



32
33
34
35
# File 'lib/rbbt/tsv/util.rb', line 32

# Guesses which field of +identifier_file+ the given +values+ belong to: takes
# the per-field counts from TSV.field_match_counts and returns the entry with
# the highest count.
#
# @return [Array, nil] the [field, count] pair with the largest count, or nil
#   when there are no matches
def self.guess_id(identifier_file, values, options = {})
  field_matches = TSV.field_match_counts(identifier_file, values, options)
  field_matches.sort_by{|field, count| count.to_i}.last
end

.guess_max(obj) ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/rbbt/tsv/parallel/traverse.rb', line 18

# Estimates how many items +obj+ holds (used as a progress maximum).
#
# * Step        -> line count of its result file when done and present
# * TSV         -> its length
# * Array, Hash -> its size
# * File, Path, String -> line count of the file, unless it is gzip/bgzip
#                         compressed or does not exist
#
# Any failure is logged and turned into nil.
#
# @return [Integer, nil]
def self.guess_max(obj)
  # Line count through `wc -l`; to_i keeps the leading number of its output.
  count_lines = lambda {|file| CMD.cmd("wc -l '#{file}'").read.to_i }
  compressed  = lambda {|file| Open.gzip?(file) or Open.bgzip?(file) }

  case obj
  when (defined? Step and Step)
    return nil unless obj.done?
    path = obj.path
    path = path.find if path.respond_to? :find
    File.exist?(path) ? count_lines.call(path) : nil
  when TSV
    obj.length
  when Array, Hash
    obj.size
  when File
    compressed.call(obj) ? nil : count_lines.call(obj.path)
  when Path, String
    obj = obj.find if Path === obj
    if File.exist?(obj) && !compressed.call(obj)
      count_lines.call(obj)
    else
      nil
    end
  end
rescue Exception
  Log.exception $!
  nil
end

.header_lines(key_field, fields, entry_hash = nil) ⇒ Object



200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# File 'lib/rbbt/tsv/util.rb', line 200

# Builds the header text of a TSV file: an optional preamble line followed by
# the field line ("#Key<sep>Field1<sep>Field2").
#
# +entry_hash+ may carry :sep (defaults to a tab), :preamble and :header_hash
# (defaults to "#"). When no preamble is given and the hash holds any non-nil
# value, a "#: ..." preamble is generated from the hash with Misc.hash2string.
# The separator defaults to a tab even when +entry_hash+ is omitted.
#
# @param key_field [#to_s, nil] key field name; "ID" is used when nil
# @param fields [Array, nil] field names; no field line is written when nil
# @param entry_hash [Hash, nil] header options
# @return [String] the header text, "" when there is nothing to write
def self.header_lines(key_field, fields, entry_hash = nil)
  sep = "\t"
  preamble = nil
  header_hash = nil

  if Hash === entry_hash
    sep = entry_hash[:sep] || sep
    preamble = entry_hash[:preamble]
    header_hash = entry_hash[:header_hash]
  end

  header_hash = "#" if header_hash.nil?

  preamble = "#: " << Misc.hash2string(entry_hash.merge(:key_field => nil, :fields => nil)) << "\n" if preamble.nil? and entry_hash and entry_hash.values.compact.any?

  str = ""
  str << preamble.strip << "\n" if preamble and not preamble.empty?
  if fields
    key_name = (key_field || "ID").to_s
    if fields.empty?
      str << header_hash << key_name << "\n"
    else
      str << header_hash << key_name << sep << (fields * sep) << "\n"
    end
  end

  str
end

.identify_field(key_field, fields, field) ⇒ Object



176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# File 'lib/rbbt/tsv/util.rb', line 176

# Resolves a field specification to :key or to a position in +fields+.
#
# * nil or :key        -> :key
# * other Symbol       -> resolved through its String form
# * Integer            -> returned as the position
# * String, no fields  -> must be numeric and is used as the position
# * String with fields -> :key when equal to key_field, else its index in
#                         fields, else its numeric value when it is a number
#
# @raise [RuntimeError] when the field cannot be resolved
def self.identify_field(key_field, fields, field)
  case field
  when nil
    :key
  when Symbol
    return field if field == :key
    identify_field(key_field, fields, field.to_s)
  when Integer
    field
  when (fields.nil? and String)
    # Without field names only numeric positions can be understood.
    unless field =~ /^\d+$/
      raise "No field information available and specified field not numeric: #{ field }"
    end
    identify_field(key_field, fields, field.to_i)
  when String
    if key_field == field
      :key
    elsif (position = fields.index(field))
      position
    elsif field =~ /^\d+$/
      identify_field(key_field, fields, field.to_i)
    else
      raise "Field '#{ field }' was not found. Options: (#{key_field || "NO_KEY_FIELD"}), #{(fields || ["NO_FIELDS"]) * ", "}"
    end
  else
    raise "Field '#{ field }' was not found. Options: (#{key_field || "NO_KEY_FIELD"}), #{(fields || ["NO_FIELDS"]) * ", "}"
  end
end

.incidence(tsv) ⇒ Object



223
224
225
# File 'lib/rbbt/association/item.rb', line 223

# Builds an association index for +tsv+ (without persistence) and passes the
# index keys to AssociationItem.incidence, returning its result.
def self.incidence(tsv)
  AssociationItem.incidence Association.index(tsv, :persist => false).keys
end

.index(file, options = {}) ⇒ Object



125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/rbbt/tsv/index.rb', line 125

# Builds (or loads through Persist.persist_tsv) an index for +file+.
#
# Options whose handling is visible here: the :persist and :data groups are
# extracted with Misc.pull_keys, :target is used for the default persistence
# prefix "StaticIndex[<target or key>]", and :tsv_grep from the data group is
# used as :grep when no :grep was given. The remaining options go to the
# TSV's own #index.
#
# @param file [Object] something TSV.open accepts
# @return [Object] the result of Persist.persist_tsv
def self.index(file, options = {})
  persist_options = Misc.pull_keys options, :persist
  persist_options[:prefix] ||= "StaticIndex[#{options[:target] || :key}]"
   
  Log.debug "Static Index: #{ file } - #{Misc.fingerprint options}"
  Persist.persist_tsv nil, file, options, persist_options do |data|
    # Only runs when the index has to be built: open the file and index it
    # into the store handed in by Persist.
    data_options = Misc.pull_keys options, :data
    data_options[:grep] ||= data_options[:tsv_grep] if data_options[:tsv_grep]
    identifiers = TSV.open(file, data_options)
    identifiers.with_monitor :desc => "Creating Index for #{ file }" do
      identifiers.index(options.merge :persist_data => data, :persist => persist_options[:persist])
    end
  end
end

.index_for_traversal(path, persist_input = false) ⇒ Object



273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
# File 'lib/rbbt/tsv/attach/util.rb', line 273

# Composes a single index out of a traversal +path+ (a list of [field, file]
# pairs as produced by find_path), chaining one hop after another.
#
# The path array is consumed (shifted) while it is processed.
#
# @param path [Array<Array>] pairs of [field name, file]
# @param persist_input [Boolean] passed as :persist to the per-file indices
# @return [Object, nil] the composed index; nil when the path has a single
#   hop whose field is already the key of the data file
def self.index_for_traversal(path, persist_input = false)
  data_key, data_file = path.shift
  # The first hop only needs an index when the linking field is not already
  # the key of the data file.
  data_index = if data_key == data_file.key_field
                 Log.debug "Data index not required '#{data_file.key_field}' => '#{data_key}'"
                 nil
               else
                 Log.debug "Data index required"
                 data_file.index :target => data_key, :fields => [data_file.key_field], :persist => false, :type => (data_file.type == :single ? :single : :flat)
               end

  current_index = data_index
  current_key   = data_key
  while not path.empty?
    next_key, next_file = path.shift

    # Map the keys to the exact field names used by the next file.
    next_fields = next_file.all_fields
    corrected_next_key = next_fields.select{|f| Misc.match_fields(f, next_key)}.first
    corrected_current_key = next_fields.select{|f| Misc.match_fields(f, current_key)}.first 

    if current_index.nil?
      # No index so far: start from this file's index, restricted to the keys
      # present in the data file.
      current_index = next_file.index(:target => corrected_next_key, :fields => [corrected_current_key], :persist => persist_input)
      current_index = current_index.select :key => data_file.keys
    else
      # NOTE(review): this branch uses next_key/current_key rather than the
      # corrected_* names computed above -- confirm that is intended.
      next_index = next_file.index :target => next_key, :fields => [current_key], :persist => persist_input

      next_index.with_unnamed do
        current_index.with_unnamed do
          # Translate every value of the current index through the next one.
          current_index.process current_index.fields.first do |values|
            if values.nil?
              nil
            else
              new_values = next_index.values_at(*values).flatten
              if current_index.type == :single
                new_values.first
              else
                new_values
              end
            end
          end
          current_index.fields = [next_key]
        end
      end
    end
    current_key = next_key
  end

  current_index

end

.melt(tsv, key_field, header_field, fields, *info_fields, &block) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/rbbt/tsv/melt.rb', line 2

# Melts +tsv+ into a :list stream with one row per value: every entry is split
# into its individual values and each becomes a row keyed "<key>:<position>"
# holding the original key followed by the value(s).
#
# When a block is given each value is passed through it first; values for
# which it returns nil are skipped.
#
# NOTE(review): header_field and fields are accepted but not used here.
#
# @param key_field [String] name of the column holding the original key
# @param info_fields [Array] fields traversed and written out
# @return [Object] the result of TSV.traverse into the new TSV::Dumper
def self.melt(tsv, key_field, header_field, fields, *info_fields, &block)
  dumper = TSV::Dumper.new :key_field => "ID", :fields => [key_field] + info_fields, :type => :list
  dumper.init

  TSV.traverse tsv, :into => dumper, :fields => info_fields do |k,values|
    # Bring every type to a list of per-row values.
    values = case tsv.type
             when :single then [values]
             when :list   then values.collect{|v| [v]}
             when :double then Misc.zip_fields(values)
             else values
             end

    res = []
    values.each_with_index do |value,i|
      info_values = block_given? ? block.call(value) : value
      next if block_given? && info_values.nil?

      info_values = [info_values] unless tsv.type == :double
      res << [[k, i] * ":", [k] + [info_values].flatten]
    end
    res.extend MultipleResult
  end
end

.merge_different_fields(file1, file2, output, options = {}) ⇒ Object

Merge two files with the same keys and different fields



66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# File 'lib/rbbt/tsv/attach.rb', line 66

# Merges two TSV inputs that share keys but have different fields, writing
# one line per key to +output+ with the fields of file1 followed by those of
# file2. Values seen for the same key in the same column are joined with "|".
#
# Inputs may be a filename, the TSV text (String/StringIO) or a TSV; each is
# first passed through `sort` on its first column (lines starting with the
# separator are dropped) so both can be walked in key order.
#
# Options: :sep (default tab), :monitor (show a progress bar), :key_field and
# :fields (removed from the options; the header is taken from the inputs).
#
# NOTE(review): the trailing `rescue` aborts and joins the inputs but neither
# re-raises nor closes +output+, so failures end silently -- confirm intended.
# NOTE(review): filenames are interpolated unquoted into the sort command.
def self.merge_different_fields(file1, file2, output, options = {})
  options = Misc.add_defaults options, :sep => "\t"
  monitor, key_field, fields = Misc.process_options options, :monitor, :key_field, :fields
  sep = options[:sep] || "\t"

  # Normalize file1 into a sorted pipe; size is only needed for the monitor.
  case
  when (String === file1 and not file1 =~ /\n/ and file1.length < 250 and File.exist?(file1))
    size = CMD.cmd("wc -c '#{file1}'").read.to_f if monitor
    file1 = CMD.cmd("env LC_ALL=C sort -k1,1 -t'#{sep}' #{ file1 } | grep -v '^#{sep}' ", :pipe => true)
  when (String === file1 or StringIO === file1)
    size = file1.length if monitor
    file1 = CMD.cmd("env LC_ALL=C sort -k1,1 -t'#{sep}' | grep -v '^#{sep}'", :in => file1, :pipe => true)
  when TSV === file1
    size = file1.size if monitor
    file1 = CMD.cmd("env LC_ALL=C sort -k1,1 -t'#{sep}' | grep -v '^#{sep}'", :in => file1.to_s(:sort, true), :pipe => true)
  end

  # Same normalization for file2.
  case
  when (String === file2 and not file2 =~ /\n/ and file2.length < 250 and File.exist?(file2))
    file2 = CMD.cmd("env LC_ALL=C sort -k1,1 -t'#{sep}' #{ file2 } | grep -v '^#{sep}' ", :pipe => true)
  when (String === file2 or StringIO === file2)
    file2 = CMD.cmd("env LC_ALL=C sort -k1,1 -t'#{sep}' | grep -v '^#{sep}'", :in => file2, :pipe => true)
  when TSV === file2
    file2 = CMD.cmd("env LC_ALL=C sort -k1,1 -t'#{sep}' | grep -v '^#{sep}'", :in => file2.to_s(:sort, true), :pipe => true)
  end

  begin
    output = File.open(output, 'w') if String === output

    cols1 = nil
    cols2 = nil

    done1 = false
    done2 = false

    # Read past the header lines of file1 (keeping the last one as the field
    # names) up to its first data line.
    key1 = key2 = nil
    while key1.nil?
      while (line1 = file1.gets) =~ /^#/
        key_field1, *fields1 = line1.chomp.sub('#','').split(sep)
      end
      key1, *parts1 = line1.sub("\n",'').split(sep, -1)
      cols1 = parts1.length
    end

    # Same for file2.
    while key2.nil?
      while (line2 = file2.gets) =~ /^#/
        key_field2, *fields2 = line2.chomp.sub('#','').split(sep)
      end
      key2, *parts2 = line2.sub("\n",'').split(sep, -1)
      cols2 = parts2.length
    end

    #progress_monitor = Progress::Bar.new(size, 0, 100, "Merging fields") if monitor
    progress_monitor = Log::ProgressBar.new(size, :desc => "Merging fields") if monitor

    # The header is only written when both inputs had one.
    entry_hash = options
    entry_hash.delete :sep if entry_hash[:sep] == "\t"
    output.puts TSV.header_lines key_field1, fields1 + fields2, entry_hash if key_field1 and fields1 and fields2

    # Merge-join: always process the smallest pending key.
    key = key1 < key2 ? key1 : key2
    parts = [""] * (cols1 + cols2)
    while not (done1 and done2)
      # Consume every file1 line for the current key into the left columns.
      while (not done1 and key1 == key)
        parts1.each_with_index do |part, i|
          parts[i] = (parts[i].nil? or parts[i].empty?) ? part : parts[i] << "|" << part
        end
        key1 = nil
        while key1.nil? and not done1
          if file1.eof?; done1 = true; else key1, *parts1 = file1.gets.sub("\n",'').split(sep, -1) end
        end
        progress_monitor.tick if monitor
      end
      # Consume every file2 line for the current key into the right columns.
      while (not done2 and key2 == key)
        parts2.each_with_index do |part, i|
          i += cols1
          parts[i] = (parts[i].nil? or parts[i].empty?) ? part : parts[i] << "|" << part
        end
        key2 = nil
        while key2.nil? and not done2
          if file2.eof?; done2 = true; else key2, *parts2 = file2.gets.sub("\n",'').split(sep, -1) end
        end
      end

      output.puts [key, parts].flatten * sep
      parts = [""] * (cols1 + cols2)

      # Pick the next key from whichever input still has lines.
      case
      when done1
        key = key2
      when done2
        key = key1
      else
        key = key1 < key2 ? key1 : key2
      end
    end

    output.close
    file1.join if file1.respond_to? :join
    file2.join if file2.respond_to? :join
  rescue
    file1.abort if file1.respond_to? :abort
    file2.abort if file2.respond_to? :abort
    file1.join if file1.respond_to? :join
    file2.join if file2.respond_to? :join
  end
end

.merge_paste(files, delim = "$") ⇒ Object

Merge columns from different files



174
175
176
# File 'lib/rbbt/tsv/attach.rb', line 174

# Pastes +files+ side by side with `paste`, using +delim+ between files, and
# then removes with `sed` every occurrence of the delimiter together with the
# text following it up to the next tab. Returns the command as a pipe.
#
# NOTE(review): file names and the delimiter are interpolated into the shell
# command; names containing a single quote would break it, and the delimiter
# is used verbatim inside a sed regular expression.
def self.merge_paste(files, delim = "$")
  CMD.cmd("paste #{ files.collect{|f| "'#{f}'"} * " "} -d'#{delim}' |sed 's/#{delim}[^\\t]*//g'", :pipe => true)
end

.merge_row_fields(input, output, options = {}) ⇒ Object

Merge columns from different rows of a file



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/rbbt/tsv/attach.rb', line 6

# Merge rows of a TSV that share the same key (first column) into one row,
# joining the values found at each column position with "|".
#
# input   - a path to an existing file, TSV content as a String or StringIO,
#           or any other object, which is then read directly with +gets+.
#           Paths and content are sorted by key with the system `sort`
#           command, and lines starting with the separator are dropped.
# output  - destination handed to Open.write
# options - :sep (default "\t"), :key_field and :fields. When :key_field or
#           :fields is missing, both are taken from a TSV::Parser built on
#           the input. The remaining options go to TSV::Parser and
#           TSV.header_lines.
def self.merge_row_fields(input, output, options = {})
  options = Misc.add_defaults options, :sep => "\t"
  key_field, fields = Misc.process_options options, :key_field, :fields
  sep = options[:sep]

  is = case
       when (String === input and not input.index("\n") and input.length < 250 and File.exist?(input))
         # Quote the path so that file names containing spaces or other shell
         # metacharacters reach `sort` as a single argument
         CMD.cmd("env LC_ALL=C sort -k1,1 -t'#{sep}' '#{ input }' | grep -v '^#{sep}' ", :pipe => true)
       when (String === input or StringIO === input)
         CMD.cmd("env LC_ALL=C sort -k1,1 -t'#{sep}' | grep -v '^#{sep}'", :in => input, :pipe => true)
       else
         input
       end

  if key_field.nil? or fields.nil?
    # The parser consumes the header; its first data line seeds the loop
    parser = TSV::Parser.new(is, options.dup)
    fields ||= parser.fields
    key_field ||= parser.key_field
    line = parser.first_line
  else
    line = is.gets
  end

  current_key  = nil
  current_parts = []

  Open.write(output) do |os|
    # The default separator does not need to be recorded in the header
    options.delete :sep if options[:sep] == "\t"
    header_lines = TSV.header_lines(key_field, fields, options)
    os.puts header_lines unless header_lines.empty?

    while line
      key, *parts = line.sub("\n",'').split(sep, -1)
      current_key ||= key
      case
      when key.nil?
        # Empty line: nothing to accumulate
      when current_key == key
        # Same key as the pending row: append each value to its column
        parts.each_with_index do |part,i|
          if current_parts[i].nil?
            current_parts[i] = part
          else
            current_parts[i] = current_parts[i] << "|" << part
          end
        end
      when current_key != key
        # New key: flush the pending row and start accumulating the new one
        os.puts [current_key, current_parts].flatten * sep
        current_key = key
        current_parts = parts
      end

      line = is.gets
    end

    # Flush the last pending row
    os.puts [current_key, current_parts].flatten * sep unless current_key.nil?

  end
end

.obj_stream(obj) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
# File 'lib/rbbt/tsv/parallel/traverse.rb', line 3

# Return the stream behind +obj+, or nil when there is none.
#
# nil gives nil; a Step (only checked when that class is defined) gives its
# +result+; IO-like objects (IO, File, Zlib::GzipReader, Bgzf) are returned
# as they are; TSV::Dumper and TSV::Parser objects give their +stream+.
# Any other object gives nil.
def self.obj_stream(obj)
  return nil if obj.nil?

  step_class = (defined? Step and Step)
  return obj.result if step_class === obj

  case obj
  when IO, File, Zlib::GzipReader, Bgzf then obj
  when TSV::Dumper, TSV::Parser then obj.stream
  end
end

.open(source, type = nil, options = nil) ⇒ Object

options shift if type.nil?



67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# File 'lib/rbbt/tsv.rb', line 67

# Open a TSV from +source+, going through Persist.persist_tsv, and return it.
#
# source  - the TSV source; it is handed to get_filename, get_stream and
#           Persist.persist_tsv. A nil source raises a RuntimeError.
# type    - TSV type, stored as options[:type] unless already set. It may be
#           omitted: when +options+ is nil and +type+ is a Hash or a String
#           containing "~", +type+ is taken to be the options.
# options - Hash of options, or a String containing "~" that is parsed with
#           TSV.str2options. Keys handled here: the :persist family (pulled
#           with Misc.pull_keys), :serializer, :unnamed, :entity_options,
#           :tsv_grep / :grep and the :open family. The rest is passed on to
#           get_stream and parse.
def self.open(source, type = nil, options = nil)
  # Shift arguments when the second positional argument is really the options
  type, options = nil, type if options.nil? and (Hash === type or (String === type and type.include? "~"))
  options = TSV.str2options(options) if String === options and options.include? "~"
  options ||= {}
  options[:type] ||= type unless type.nil?

  persist_options = Misc.pull_keys options, :persist

  raise "TSV source is nil" if source.nil?

  filename = get_filename source
  serializer = Misc.process_options options, :serializer
  unnamed = Misc.process_options options, :unnamed
  entity_options = Misc.process_options options, :entity_options


  Log.debug "TSV open: #{ filename } - #{Misc.fingerprint options }.#{unnamed ? " [unnamed]" : "[not unnamed]"}"

  data = nil

  # Hold a lock derived from the filename while loading; no lock file is
  # used when the filename is nil
  lock_filename = filename.nil? ? nil : Persist.persistence_path(filename + '.open', {:dir => TSV.lock_dir})
  Misc.lock lock_filename  do
    # The block receives the container to fill and loads the source into it
    data = Persist.persist_tsv source, filename, options, persist_options do |data|
      if serializer
        data.extend TSV unless TSV === data
        data.serializer = serializer
      end

      # :grep is accepted as an alternative to :tsv_grep
      tsv_grep = Misc.process_options options, :tsv_grep
      tsv_grep ||= Misc.process_options options, :grep
      open_options = Misc.pull_keys options, :open

      stream = get_stream source, options.merge(open_options)
      parse stream, data, options.merge(:tsv_grep => tsv_grep)

      # Close the stream, and join it when possible, unless :noclose was
      # given among the :open options
      if ! open_options[:noclose]
        stream.close unless stream.closed?
        stream.join if stream.respond_to?(:join)
      end

      data.filename = filename.to_s unless filename.nil?

      # Pick up the identifier file associated with a Path filename when no
      # identifiers were set and that file exists
      if data.identifiers.nil? and Path === filename and filename.identifier_file_path
        data.identifiers = filename.identifier_file_path.find if filename.identifier_file_path.exists?
      end

      data
    end
  end


  data.unnamed = unnamed unless unnamed.nil?

  data.entity_options = entity_options

  if Path === source && data.identifiers
    Path.setup(data.identifiers, source.pkgdir, source.resource)
  end

  if data.respond_to? :persistence_path
    # Data with a persistence path: call +read+ on it and return it
    data.read
    data
  else
    # Otherwise: copy the content, clear the original, and return the
    # result of annotating the copy from the original
    h = data.dup
    data.clear
    data.annotate h
  end
end

.parse(stream, data, options = {}) ⇒ Object



155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# File 'lib/rbbt/tsv.rb', line 155

# Load the content of +stream+ into +data+ and return +data+ set up as a TSV.
#
# stream  - source stream, given to TSV::Parser unless a parser is provided
# data    - container to fill; it is extended with TSV if it is not one
# options - :parser supplies an existing parser instead of building one;
#           the remaining options go to TSV::Parser.new and parser.traverse
def self.parse(stream, data, options = {})

  parser = Misc.process_options options, :parser
  parser = TSV::Parser.new stream, options if parser.nil?

  # dump with tchmgr
  # NOTE(review): the condition below ends with a trailing `and`, so
  # `data.close` on the next line is part of the condition itself: the
  # database is closed while the test is evaluated, and the import only runs
  # if `close` returns a truthy value. Confirm this is intended.
  if defined? TokyoCabinet and TokyoCabinet::HDB === data and parser.straight and
    data.close
    begin
      bin = 'tchmgr'
      # Check that the tool is available before attempting the import
      CMD.cmd("#{bin} version", :log => false)
      FileUtils.mkdir_p File.dirname(data.persistence_path)
      CMD.cmd("#{bin} importtsv '#{data.persistence_path}'", :in => stream, :log => false, :dont_close_in => true)
    rescue
      # A failed import is only logged; the regular traversal below still runs
      Log.debug("tchmgr importtsv failed for: #{data.persistence_path}")
    end
    data.write
  end

  # make TSV
  data.extend TSV unless TSV === data
  data.unnamed = true

  # choose serializer
  # When the serializer is the placeholder :type, derive it from the parser's
  # cast and type. Each branch assigns the serializer and the case value is
  # assigned again by the outer assignment.
  if data.serializer == :type
    data.serializer = case
                      when parser.cast.nil?
                        data.serializer = parser.type
                      when (parser.cast == :to_i and (parser.type == :list or parser.type == :flat))
                        data.serializer = :integer_array
                      when (parser.cast == :to_i and parser.type == :single)
                        data.serializer = :integer
                      when (parser.cast == :to_f and parser.type == :single)
                        data.serializer = :float
                      when (parser.cast == :to_f and (parser.type == :list or parser.type == :flat))
                        data.serializer = :float_array
                      else
                        data.serializer = :marshal
                      end
  end

  # Add every parsed entry to the data
  parser.traverse(options) do |key,values|
    parser.add_to_data data, key, values
  end

  # setup the TSV
  parser.setup data

  data.unnamed = false

  data
end

.parse_header(stream, options = {}) ⇒ Object



136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# File 'lib/rbbt/tsv.rb', line 136

# Build a Parser for +stream+, which reads its header information.
#
# stream  - a Path (opened with its own +open+), a String shorter than 300
#           characters naming something Open reports as existing or remote
#           (opened with Open.open), or anything else, which is handed to the
#           Parser directly
# options - passed to Parser.new
#
# Streams opened here that are ConcurrentStream get +no_fail+ set to true.
# Returns the value of the opening block (the Parser) or the Parser itself.
def self.parse_header(stream, options = {})
  if Path === stream
    stream.open do |io|
      io.no_fail = true if ConcurrentStream === io
      Parser.new io, options
    end
  elsif String === stream && stream.length < 300 && (Open.exists?(stream) || Open.remote?(stream))
    Open.open(stream) do |io|
      io.no_fail = true if ConcurrentStream === io
      Parser.new io, options
    end
  else
    filename = if stream.respond_to?(:filename)
                 stream.filename
               else
                 Misc.fingerprint(stream)
               end
    Log.debug("Parsing header of open stream: #{filename}")
    Parser.new stream, options
  end
end

.paste_streams(streams, options = {}) ⇒ Object



22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
# File 'lib/rbbt/tsv/stream.rb', line 22

# Paste several TSV streams side by side into one stream, matching lines by
# key (first column).
#
# The header of each input is read with TSV::Parser, a combined header is
# written, and the data lines are then merged by repeatedly emitting the
# smallest key pending across the inputs.
#
# streams - list of inputs: Step objects (when that class is defined), Path,
#           TSV::Dumper, or stream objects used as they are
# options - :sep (default "\t"), :sort (default true), :preamble, :header,
#           :same_fields, :fix_flat, :all_match and :field_prefix; whatever
#           remains is passed to TSV::Parser and TSV.header_lines
#
# Returns the value of Misc.open_pipe, whose block does all the writing.
def self.paste_streams(streams, options = {})
  options = Misc.add_defaults options, :sep => "\t", :sort => true
  sort, sep, preamble, header, same_fields, fix_flat, all_match, field_prefix = Misc.process_options options, :sort, :sep, :preamble, :header, :same_fields, :fix_flat, :all_match, :field_prefix

  out = Misc.open_pipe do |sin|

    # Turn every input into a stream object and drop nil entries
    streams = streams.collect do |stream|
      case stream
      when (defined? Step and Step) 
        stream.grace
        # Use the Step's stream, or open its result file after joining it
        stream.get_stream || Open.open(stream.join.path)
      when Path
        stream.open
      when TSV::Dumper
        stream.stream
      else
        stream
      end
    end.compact

    # NOTE(review): num_streams is not used below
    num_streams = streams.length

    # With :sort, replace each stream by its sorted version, carrying over
    # annotations when the original supports +annotate+
    streams = streams.collect do |stream|
      sorted = Misc.sort_stream(stream)
      stream.annotate sorted if stream.respond_to? :annotate
      sorted
    end if sort

    lines         = []
    fields        = []
    sizes         = []
    key_fields    = []
    input_options = []
    empty         = []
    preambles     = []

    # Parse the header of each stream and record its first data line, key
    # field, fields, options and preamble. The collected value is the stream
    # from which the remaining lines will be read.
    streams = streams.collect do |stream|

      parser = TSV::Parser.new stream, options.dup
      sfields = parser.fields

      # Prefix field names with the entry of field_prefix at this stream's
      # position, joined with ":"
      if field_prefix
        index = streams.index stream
        prefix = field_prefix[index]

        sfields = sfields.collect{|f| [prefix, f] * ":" }
      end

      first_line = parser.first_line
      first_line = nil if first_line == ""

      lines         << first_line
      key_fields    << parser.key_field
      fields        << sfields
      sizes         << sfields.length if sfields
      input_options << parser.options
      preambles     << parser.preamble      if preamble and not parser.preamble.empty?

      # With :fix_flat, a flat stream is converted to double: the values of
      # the first line already read are joined with "|", and the rest of the
      # stream goes through TSV.stream_flat2double
      stream = if fix_flat and parser.type == :flat and first_line
                 parts = lines[-1].nil? ? [] : lines[-1].split("\t")
                 lines[-1] = [parts[0], (parts[1..-1] || [])*"|"] * "\t"
                 TSV.stream_flat2double(parser.stream, :noheader => true).stream
               else
                 parser.stream
               end

      # Remember streams that had no first data line
      empty         << stream               if parser.first_line.nil? || parser.first_line.empty?

      stream
    end

    # NOTE(review): all_fields is not used below
    all_fields = fields
    key_field = key_fields.compact.first
    # With :same_fields all inputs share the fields of the first one;
    # otherwise the output fields are the concatenation of all input fields
    if same_fields
      fields = fields.first
    else
      fields = fields.compact.flatten
    end
    options = options.merge(input_options.first || {})
    options[:type] = :list if options[:type] == :single
    options[:type] = :double if fix_flat

    # Preamble text: true uses the input preambles joined by newlines; a
    # String starting with "+" appends the rest of the string to the input
    # preambles; any other String is used as given
    preamble_txt = case preamble
                   when TrueClass
                     preambles * "\n"
                   when String
                     if preamble[0] == '+'
                       preambles * "\n" + "\n" + preamble[1..-1]
                     else
                       preamble
                     end
                   else
                     nil
                   end

    # Write the header given through :header, or one built from the inputs
    header ||= TSV.header_lines(key_field, fields, options.merge(:preamble => preamble_txt))
    sin.puts header

    # Remove the key_fields and input_options entries of the empty streams,
    # from the highest position down so earlier positions stay valid
    empty_pos = empty.collect{|stream| streams.index stream }
    empty_pos.sort.reverse.each do |i|
      key_fields.delete_at i
      input_options.delete_at i
    end

    begin
      # NOTE(review): done_streams is not used below
      done_streams = []

      # Split the pending line of each stream into its key and its values
      keys = []
      parts = []
      lines.each_with_index do |line,i|
        if line.nil? || line.empty?
          keys[i] = nil
          parts[i] = nil
        else
          vs = line.chomp.split(sep, -1) 
          key, *p = vs
          keys[i] = key
          parts[i] = p
        end
        # Fallback size for positions where none was recorded from the fields
        sizes[i] ||= parts[i].length-1 unless parts[i].nil?
      end

      # NOTE(review): last_min is not used below
      last_min = nil
      # Emit one output line per key, always taking the smallest pending key
      while lines.compact.any?
        min = keys.compact.sort.first
        break if min.nil?
        str = []

        # With :all_match, keys that are not pending in every stream are
        # consumed but not written
        skip = all_match && keys.uniq != [min]

        keys.each_with_index do |key,i|
          case key
          when min
            # This stream holds the current key: take its values and advance
            # the stream to its next line
            str << parts[i] * sep

            begin
              # A failed read is logged and treated as the end of the stream
              line = lines[i] = begin
                                  streams[i].gets
                                rescue
                                  Log.exception $!
                                  nil
                                end
              if line.nil?
                stream = streams[i]
                keys[i] = nil
                parts[i] = nil
              else
                k, *p = line.chomp.split(sep, -1)
                # A line repeating the key just consumed is skipped: read again
                raise TryAgain if k == keys[i]
                keys[i] = k
                parts[i] = p.collect{|e| e.nil? ? "" : e }
              end
            rescue TryAgain
              Log.debug "Skipping repeated key in stream #{i}: #{keys[i]}"
              retry
            end
          else
            # This stream does not hold the current key: pad with empty
            # values (sizes[i]-1 separators delimit sizes[i] empty fields)
            if sizes[i] and sizes[i] > 0
              p = sep * (sizes[i]-1)
              str << p
            end
          end
        end

        next if skip

        if same_fields

          # Merge the values of all streams position by position, joining
          # the values of each position with "|"
          values = nil
          str.each do |part|
            next if part.nil? or part.empty?
            _p = part.split(sep,-1)
            if values.nil?
              values = _p.collect{|v| [v]}
            else
              _p.each_with_index{|v,i| values[i] ||= []; values[i] << v}
            end
          end

          values = [[]] * str.length if values.nil?
          values = values.collect{|list| list * "|" } * sep

        else
          # Concatenate the values of all streams, separated by +sep+
          values = str.inject(nil) do |acc,part| 
            if acc.nil?
              acc = part.dup
            else
              acc << sep << part
            end
            acc
          end
        end
        text = [min, values] * sep
        sin.puts text
      end

      # Wait for the input streams that support +join+
      streams.each do |stream|
        stream.join if stream.respond_to? :join
      end
    rescue Aborted
      # On abort or any other exception: log, abort the inputs that support
      # it, and re-raise
      Log.error "Aborted pasting streams #{streams.inspect}: #{$!.message}"
      streams.each do |stream|
        stream.abort if stream.respond_to? :abort
      end
      raise $!
    rescue Exception
      Log.error "Exception pasting streams #{streams.inspect}: #{$!.message}"
      streams.each do |stream|
        stream.abort if stream.respond_to? :abort
      end
      raise $!
    end
  end

  out
end

.pos_index(file, pos_field = nil, options = {}) ⇒ Object



177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# File 'lib/rbbt/tsv/index.rb', line 177

# Build, through Persist.persist with the :fwt type, a position index for
# the TSV in +file+.
#
# file      - a String or Path, or any object accepted by TSV.open
# pos_field - field holding the positions (default "Position")
# options   - the :data family is passed to TSV.open and the :persist family
#             to Persist.persist; :filters is a collection of
#             [match, value] pairs added as filters to the TSV before
#             indexing; the rest is passed to the TSV's own +pos_index+
def self.pos_index(file, pos_field = nil, options = {})
  pos_field ||= "Position"

  data_options = Misc.pull_keys options, :data

  # Name under which the index is persisted
  if String === file or Path === file
    filename = file
  elsif file.respond_to?(:filename)
    filename = file.filename
  else
    filename = file.object_id.to_s
  end

  persist_options = Misc.pull_keys options, :persist
  persist_options[:prefix] ||= "StaticPosIndex[#{pos_field}]"

  filters = Misc.process_options options, :filters

  # Filtered indices are persisted under their own name
  if filters
    filter_description = filters.collect { |pair| pair * "=" } * ", "
    filename += ":Filtered[#{filter_description}]"
  end

  Persist.persist(filename, :fwt, persist_options) do
    tsv = TSV.open(file, data_options)

    if filters
      tsv.filter
      filters.each { |match, value| tsv.add_filter match, value }
    end

    tsv.pos_index(pos_field, options)
  end
end

.range_index(file, start_field = nil, end_field = nil, options = {}) ⇒ Object



248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
# File 'lib/rbbt/tsv/index.rb', line 248

# Build, through Persist.persist with the :fwt type, a range index for the
# TSV in +file+.
#
# file        - a String or Path, or any object accepted by TSV.open
# start_field - field holding the range starts (default "Start")
# end_field   - field holding the range ends (default "End")
# options     - the :data family is passed to TSV.open and the :persist
#               family to Persist.persist; :filters is a collection of
#               [match, value] pairs added as filters to the TSV before
#               indexing; the rest is passed to the TSV's own +range_index+
def self.range_index(file, start_field = nil, end_field = nil, options = {})
  start_field ||= "Start"
  end_field ||= "End"

  data_options = Misc.pull_keys options, :data

  # Name under which the index is persisted
  if String === file or Path === file
    filename = file
  elsif file.respond_to?(:filename)
    filename = file.filename
  else
    filename = file.object_id.to_s
  end

  persist_options = Misc.pull_keys options, :persist
  persist_options[:prefix] ||= "StaticRangeIndex[#{start_field}-#{end_field}]"

  filters = Misc.process_options options, :filters

  # Filtered indices are persisted under their own name
  if filters
    filter_description = filters.collect { |pair| pair * "=" } * ", "
    filename += ":Filtered[#{filter_description}]"
  end

  Persist.persist(filename, :fwt, persist_options) do
    tsv = TSV.open(file, data_options)

    if filters
      tsv.filter
      filters.each { |match, value| tsv.add_filter match, value }
    end

    tsv.range_index(start_field, end_field, options)
  end
end

.read_matrix(tsv, field_format = "ID", value_format = "Value", *others) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/rbbt/tsv/matrix.rb', line 4

# Re-read a matrix-like TSV as a :double TSV with two fields: one holding
# the original field names (+field_format+) and one holding the
# corresponding values (+value_format+).
#
# tsv          - a TSV, or something TSV.open can open
# field_format - name of the field that receives the original field names
# value_format - name of the field that receives the values
# others       - fields kept out of the matrix; they are sliced off first
#                and attached to the result (as a double TSV) at the end
#
# Returns the new TSV, or the result of +attach+ when +others+ are given.
def self.read_matrix(tsv, field_format = "ID", value_format = "Value", *others)
  tsv = TSV.open(tsv) unless TSV === tsv

  keep_others = others.any?
  other_tsv = nil
  if keep_others
    other_tsv = tsv.slice(others)
    tsv = tsv.slice(tsv.fields - others)
  end

  key_field, *fields = tsv.all_fields
  options = tsv.options.merge(
    :key_field => key_field,
    :fields => [field_format, value_format],
    :type => :double,
    :cast => nil
  )

  options[:filename] ||= tsv.filename
  options[:identifiers] ||= tsv.identifier_files.first

  dumper = TSV::Dumper.new(options)
  dumper.init

  # Each entry becomes: key -> [all field names, the entry's values]
  TSV.traverse(tsv, :into => dumper) { |key, values| [key, [fields, values]] }

  res = TSV.open(dumper.stream, options)
  return res unless keep_others

  res.attach other_tsv.to_double, :one2one => true
end

.reorder_stream(stream, positions, sep = "\t") ⇒ Object



254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
# File 'lib/rbbt/tsv/stream.rb', line 254

# Reorder the columns of every line of +stream+, writing the result into
# the pipe opened with Misc.open_pipe.
#
# stream    - object responding to +gets+
# positions - Array of column positions to output, in order, or a Hash of
#             position pairs to swap; the Hash is expanded into a full
#             column order using the width of the first line that needs it
# sep       - column separator (default "\t")
#
# Leading lines starting with "#:" are copied unchanged; following lines
# starting with "#" are reordered as header lines (dropping missing
# columns); the remaining lines are reordered as data.
# Returns the value of Misc.open_pipe.
def self.reorder_stream(stream, positions, sep = "\t")
  Misc.open_pipe do |sin|
    # Next line without its line terminator, or nil at the end of the stream
    read_line = lambda do
      text = stream.gets
      text.chomp! unless text.nil?
      text
    end

    # Expand a Hash of swaps into a full column order sized after +text+
    expand_positions = lambda do |text|
      if Hash === positions
        order = (0...text.split(sep, -1).length).to_a
        positions.each do |from, to|
          order[from] = to
          order[to] = from
        end
        positions = order
      end
    end

    line = read_line.call

    while line =~ /^#\:/
      sin.puts line
      line = read_line.call
    end

    while line  =~ /^#/
      expand_positions.call(line)
      sin.puts "#" + line.sub(/^#/,'').chomp.split(sep).values_at(*positions).compact * sep
      line = read_line.call
    end

    until line.nil?
      expand_positions.call(line)
      reordered = line.split(sep, -1).values_at(*positions)
      sin.puts reordered * sep
      line = read_line.call
    end
  end
end

.reorder_stream_tsv(stream, key_field, fields = nil, zipped = true, bar = nil) ⇒ Object



298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
# File 'lib/rbbt/tsv/stream.rb', line 298

# Re-key a TSV stream: parse +stream+ using +key_field+ as the key and
# +fields+ as the values, and feed the traversed entries into a
# TSV::Dumper built from the parser's options.
#
# stream    - source, resolved with TSV.get_stream
# key_field - field to use as the new key
# fields    - fields to keep (nil leaves the choice to the parser)
# zipped    - for :double TSVs, whether each new key only takes the value
#             at its own position from lists holding more than one value
# bar       - passed to TSV.traverse as :bar
#
# Returns the dumper. Raises a RuntimeError for an unknown parser type.
def self.reorder_stream_tsv(stream, key_field, fields=nil, zipped = true, bar = nil)
  parser = TSV::Parser.new TSV.get_stream(stream), :key_field => key_field, :fields => fields
  dumper_options = parser.options
  dumper = TSV::Dumper.new dumper_options
  dumper.init
  case parser.type
  when :single
    TSV.traverse parser, :into => dumper, :bar => bar do |keys,values|
      key = keys.first
      [key, [values]]
    end
  when :double
    # One output entry per new key, collected as a MultipleResult
    TSV.traverse parser, :into => dumper, :bar => bar do |keys,values|
      res = []
      keys.each_with_index do |key,i|
        vs = zipped ?  values.collect{|l| l.length == 1 ? l : [l[i]] } : values
        res << [key, vs]
      end
      res.extend MultipleResult
      res
    end
  when :list, :flat
    TSV.traverse parser, :into => dumper, :bar => bar do |keys,values|
      # The class must be the receiver of ===; `keys === Array` never matched,
      # so keys given as an Array were passed through unchanged
      key = Array === keys ? keys.first : keys
      [key, values]
    end
  else
    raise "Unknown type: #{parser.type}"
  end
  dumper
end

.report(msg, obj, into) ⇒ Object



63
64
65
66
67
# File 'lib/rbbt/tsv/parallel/traverse.rb', line 63

# Log, at low severity, a message of the form "<msg> <source> -> <target>",
# naming both ends with stream_name.
#
# msg  - text that starts the message
# obj  - source object
# into - target object, or a Hash holding the target under :into
def self.report(msg, obj, into)
  target = (Hash === into && into.include?(:into)) ? into[:into] : into

  Log.low do
    "#{ msg } #{stream_name(obj)} -> #{stream_name(target)}"
  end
end

.setup(hash, type = nil, options = nil) ⇒ Object



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/rbbt/tsv.rb', line 44

# Turn +hash+ into a TSV by extending it with the TSV module and assigning
# the entries found in +options+.
#
# hash    - a Hash, or an Array that is first converted with
#           Misc.array2hash using the :default_value option (default [])
# type    - TSV type, stored as options[:type] unless already set. It may be
#           omitted: when +options+ is nil and +type+ is a Hash or a String
#           containing "~", +type+ is taken to be the options.
# options - Hash of options, or a String containing "~" that is parsed with
#           TSV.str2options. Every name in ENTRIES present in the options,
#           as a String or as a Symbol, is assigned through its setter.
#           :unnamed defaults to TSV.unnamed.
#
# Returns the extended hash.
def self.setup(hash, type = nil, options = nil)
  # Shift arguments when the second positional argument is really the options
  if options.nil? && (Hash === type || (String === type && type.include?("~")))
    type, options = nil, type
  end
  options = TSV.str2options(options) if String === options && options.include?("~")
  options ||= {}
  options[:type] ||= type unless type.nil?

  options = Misc.add_defaults options, :default_value => [], :unnamed => TSV.unnamed
  default_value = Misc.process_options options, :default_value
  hash = Misc.array2hash(hash, default_value) if Array === hash
  hash.extend TSV

  IndiferentHash.setup(options)
  ENTRIES.each do |entry|
    setter = "#{ entry }="
    [entry.to_s, entry.to_sym].each do |option_key|
      hash.send(setter, options[option_key]) if options.include? option_key
    end
  end

  hash.unnamed = options[:unnamed]

  hash
end

.store_into(store, value) ⇒ Object



497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
# File 'lib/rbbt/tsv/parallel/traverse.rb', line 497

# Store +value+ into +store+, in the way that suits the kind of store.
#
# store - TSV, Hash, TSV::Dumper, IO, or anything responding to <<
# value - the value to store. A MultipleResult has each of its elements
#         stored in turn. TSV/Hash values are merged into TSV and Hash
#         stores; other values are treated as [key, value] pairs, or
#         splatted into +zip_new+ for :double and :flat TSVs and into +add+
#         for dumpers. For IO stores the value is chomped in place and
#         written with +puts+.
#
# Returns nil after storing a MultipleResult, false when +value+ is nil and
# true otherwise. On any exception the store is aborted with abort_stream
# and the exception is re-raised.
def self.store_into(store, value)
  if MultipleResult === value
    value.each { |item| store_into store, item }
    return
  end

  begin
    return false if value.nil?

    case store
    when TSV
      if store.type == :double || store.type == :flat
        case value
        when TSV, Hash then store.merge_zip value
        else store.zip_new(*value)
        end
      else
        key, content = value
        store[key] = content
      end
    when Hash
      case value
      when TSV, Hash
        store.merge! value
      else
        key, content = value
        store[key] = content
      end
    when TSV::Dumper
      store.add(*value)
    when IO
      value.chomp!
      store.puts value
    else
      store << value
    end

    true
  rescue Aborted, Interrupt
    Log.low "Aborted storing into #{Misc.fingerprint store}"
    abort_stream(store, $!)
    raise $!
  rescue Exception
    Log.low "Exception storing into #{Misc.fingerprint store}: #{$!.message}"
    abort_stream(store, $!)
    raise $!
  end
end

.str2options(str) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
# File 'lib/rbbt/tsv.rb', line 32

# Parse a compact option string of the form "Key~Field1,Field2#options"
# into an options Hash.
#
# The text before "#" gives the key field and, after "~", a comma-separated
# list of fields. The text after "#" is parsed with Misc.string2hash; when
# it is a single word (optionally preceded by ":") it is taken as the value
# of :type.
#
# Returns a Hash with :key_field and :fields, merged with the parsed options.
def self.str2options(str)
  field_spec, _separator, extra = str.partition("#")
  key, field_list = field_spec.split("~")

  fields = field_list ? field_list.split(/,\s*/) : []

  # A bare word is shorthand for the :type option
  extra = ":type=#{extra}" if extra =~ /^:?\w+$/
  extra_options = extra.nil? ? {} : Misc.string2hash(extra)

  {:key_field => key, :fields => fields}.merge(extra_options)
end

.stream_column(file, column) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/rbbt/tsv/util.rb', line 4

# Stream the values of one column of a TSV file.
#
# file   - TSV source, passed to TSV.parse_header and TSV.traverse
# column - name of the field to extract; it must be one of the fields listed
#          in the header
#
# What is emitted for each line depends on the type found in the header:
#   double (or no type) - the column, with the :sep2 separator (default "|")
#                         replaced by newlines
#   list                - the column as it is
#   single              - the second column
#   flat                - every column after the key, joined by newlines
#
# Lines starting with "#" are skipped. Returns the result of TSV.traverse
# called with :into => :stream, or nil for any other type.
def self.stream_column(file, column)
  header = TSV.parse_header(file)
  pos = header.fields.index(column) + 1
  sep2 = header.options[:sep2] || "|"
  case header.type.to_s
  # A missing type stringifies to "", which `when nil` could never match
  when "", "double"
    TSV.traverse file, :type => :array, :into => :stream do |line|
      next if line =~ /^#/
      line.split("\t")[pos].gsub(sep2, "\n")
    end
  when "single"
    TSV.traverse file, :type => :array, :into => :stream do |line|
      next if line =~ /^#/
      line.split("\t")[1]
    end
  when "flat"
    TSV.traverse file, :type => :array, :into => :stream do |line|
      next if line =~ /^#/
      line.split("\t")[1..-1] * "\n"
    end
  when 'list'
    TSV.traverse file, :type => :array, :into => :stream do |line|
      next if line =~ /^#/
      line.split("\t")[pos]
    end
  end
end

.stream_flat2double(stream, options = {}) ⇒ Object



239
240
241
242
243
244
245
246
247
248
249
250
251
# File 'lib/rbbt/tsv/stream.rb', line 239

# Convert a :flat TSV stream into a :double one with a single list of
# values per key.
#
# stream  - source, resolved with TSV.get_stream and parsed as :flat
# options - :noheader skips the dumper's +init+ call; the remaining options
#           are merged over the parser's options to configure the dumper
#
# Returns the TSV::Dumper receiving the converted entries.
def self.stream_flat2double(stream, options = {})
  noheader = Misc.process_options options, :noheader

  parser = TSV::Parser.new TSV.get_stream(stream), :type => :flat
  dumper = TSV::Dumper.new parser.options.merge(options).merge(:type => :double)
  dumper.init unless noheader

  TSV.traverse parser, :into => dumper do |key, values|
    single_key = Array === key ? key.first : key
    flat_values = Array === values ? values.flatten : [values]
    [single_key, [flat_values]]
  end

  dumper
end

.stream_name(obj) ⇒ Object



55
56
57
58
59
60
61
# File 'lib/rbbt/tsv/parallel/traverse.rb', line 55

# Short name for +obj+, used in log messages: the class of the object,
# a dash, and the fingerprint of its stream (as given by obj_stream), or of
# the object itself when it has no stream. nil is named "nil".
def self.stream_name(obj)
  return "nil" if obj.nil?

  target = obj_stream(obj) || obj
  "#{obj.class}-" + Misc.fingerprint(target)
end

.swap_id(tsv, field, format, options = {}, &block) ⇒ Object



56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/rbbt/tsv/change_id.rb', line 56

# Replace the values of +field+ in +tsv+ by their translation to +format+,
# using an index built from an identifiers file, and rename the field to
# +format+.
#
# tsv     - the TSV to modify
# field   - name of the field whose values are translated
# format  - target format; it becomes the new name of the field
# options - :identifiers (default tsv.identifiers), :persist (default
#           false) and :compact (default true: untranslated values are
#           removed from the translated lists)
# block   - passed to +to_list+ when the original TSV was of type :list
#
# Returns the modified TSV, converted back to its original type when that
# was :flat or :list.
def self.swap_id(tsv, field, format, options = {}, &block)
  options = Misc.add_defaults options, :persist => false, :identifiers => tsv.identifiers, :compact => true

  # NOTE(review): persist_input is extracted here but only referenced by the
  # commented-out index call below
  identifiers, persist_input, compact = Misc.process_options options, :identifiers, :persist, :compact
  # Fall back to the first identifier file of the TSV and then, when
  # Organism is defined, to the identifiers of the TSV's namespace
  identifiers = tsv.identifier_files.first if identifiers.nil?
  identifiers = Organism.identifiers(tsv.namespace) if defined?(Organism) && identifiers.nil? && tsv.namespace && Organism.identifiers(tsv.namespace).exists?
  # NOTE(review): this line raises NoMethodError if identifiers is still nil,
  # although the next statement guards against a nil identifiers
  identifiers.namespace ||= tsv.namespace

  # Restrict the index to +field+ only when the identifiers file has it
  fields = (identifiers and identifiers.all_fields.include?(field))? [field] : nil 
  #index = identifiers.index :target => format, :fields => fields, :persist => persist_input, :order => true

  # Entries in the namespace's gene blacklist, when it exists, are excluded
  # from the index (inverted grep)
  grep = Organism.blacklist_genes(tsv.namespace).list  if defined?(Organism) && identifiers.namespace && Organism.blacklist_genes(tsv.namespace).exists?
  if fields.nil?
    index = identifiers.index(:data_tsv_grep => grep, :data_invert_grep => true, :target => format, :persist => true, :order => true, :unnamed => true, :data_persist => true)
  else
    index = identifiers.index(:data_tsv_grep => grep, :data_invert_grep => true, :target => format, :fields => fields, :order => true, :unnamed => true, :persist => true, :data_persist => true)
  end

  orig_type = tsv.type 
  tsv = tsv.to_double if orig_type != :double

  pos = tsv.fields.index field
  tsv.with_unnamed do
    # NOTE(review): after the to_double conversion above, this first branch
    # looks unreachable unless to_double can return a :list or :single TSV
    if tsv.type == :list or tsv.type == :single
      tsv.through do |k,v|
        v[pos] = index[v[pos]]
        tsv[k] = v
      end
    else
      # Translate every value of the field; with :compact, values missing
      # from the index are dropped
      tsv.through do |k,v|
        _values = index.values_at(*v[pos])
        _values.compact! if compact
        v[pos] = _values
        tsv[k] = v
      end
    end

    tsv.fields = tsv.fields.collect{|f| f == field ? format : f}
  end

  tsv = tsv.to_flat  if orig_type == :flat

  tsv = tsv.to_list(&block)  if orig_type == :list

  tsv
end

.translate(tsv, field, format, options = {}) ⇒ Object



172
173
174
175
176
177
# File 'lib/rbbt/tsv/change_id.rb', line 172

# Translate +field+ of +tsv+ to +format+ and load the result as a new TSV.
#
# The translation is produced by translate_stream and opened with TSV.open.
# The :persist family of options is pulled from +options+ and passed to
# TSV.open as :persist, :persist_data and :persist_file; the rest goes to
# translate_stream.
#
# Returns the new TSV, sharing the identifiers of the original.
def self.translate(tsv, field, format, options = {})
  persist_options = Misc.pull_keys(options, :persist)

  translated_stream = translate_stream(tsv, field, format, options)
  open_options = {
    :persist => persist_options[:persist],
    :persist_data => persist_options[:data],
    :persist_file => persist_options[:file]
  }

  translated = TSV.open(translated_stream, open_options)
  translated.identifiers = tsv.identifiers
  translated
end

.translate_stream(tsv, field, format, options = {}, &block) ⇒ Object



179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/rbbt/tsv/change_id.rb', line 179

# Translate +field+ of +tsv+ to +format+ and return the result as the stream
# of a TSV::Dumper.
#
# tsv     - the TSV to translate; it is converted to :double when needed
# field   - field (or key field) to translate
# format  - target format; it replaces the name of the translated field
# options - :identifier_files (default tsv.identifier_files), :identifiers,
#           :persist and :compact are extracted here; the rest is passed to
#           translation_index
#
# Raises a RuntimeError when no translation index can be built.
def self.translate_stream(tsv, field, format, options = {}, &block)
  options = Misc.add_defaults options, :persist => false, :identifier_files => tsv.identifier_files, :compact => true

  identifier_files, identifiers, _persist_input, _compact = Misc.process_options options, :identifier_files, :identifiers, :persist, :compact
  # Without identifier files, look in the TSV itself and in :identifiers
  if identifier_files.nil? || identifier_files.empty?
    identifier_files = [tsv, identifiers].compact
  end

  identifier_files.uniq!

  index = translation_index identifier_files, format, field, options.dup
  if index.nil?
    raise "No index: #{Misc.fingerprint([identifier_files, field, format])}"
  end

  tsv = tsv.to_double unless tsv.type == :double

  pos = tsv.identify_field field

  dumper_options = tsv.options
  dumper_options[:identifiers] = tsv.identifiers.find if tsv.identifiers

  if :key == pos
    # Translating the key: values are left untouched
    dumper_options[:key_field] = format if tsv.key_field == field
    dumper = TSV::Dumper.new dumper_options
    dumper.init
    TSV.traverse tsv, :into => dumper do |key, values|
      [index[key], values]
    end
  else
    # Translating a value field: rename it and translate it for each entry
    dumper_options[:fields] = tsv.fields.collect { |name| name == field ? format : name }
    dumper = TSV::Dumper.new dumper_options
    dumper.init

    case tsv.type
    when :double
      TSV.traverse tsv, :into => dumper do |key, values|
        values[pos] = index.values_at(*values[pos])
        [key, values]
      end
    when :list
      TSV.traverse tsv, :into => dumper do |key, values|
        values[pos] = index[values[pos]]
        [key, values]
      end
    when :flat
      TSV.traverse tsv, :into => dumper do |key, values|
        [key, index.values_at(*values)]
      end
    when :single
      TSV.traverse tsv, :into => dumper do |key, original|
        [key, index[original]]
      end
    end
  end

  dumper.stream
end

.translation_index(files, target = nil, source = nil, options = {}) ⇒ Object



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# File 'lib/rbbt/tsv/change_id.rb', line 107

# Returns an index that translates identifiers in the +source+ format into
# the +target+ format, built from the identifier +files+ provided.
#
# Two strategies are tried, in order:
#
# 1. Direct: the first file whose fields include +target+ (and +source+, when
#    one is given) is indexed on its own.
# 2. Bridged: for each ordered pair of distinct files that share at least one
#    field, where the first contains +source+ and the second contains
#    +target+, both are indexed through the first shared field and joined.
#
# files   - collection of TSV objects and/or file-like objects; non-TSV
#           entries are asked `exists?` and handed to TSV.parse_header and
#           TSV.index (presumably Path objects -- TODO confirm with callers)
# target  - name of the format to translate into; when nil it is taken from
#           the first file examined (first field of a TSV, second entry of a
#           file header's all_fields)
# source  - name of the format to translate from; when nil or empty the
#           index calls receive :fields => nil
# options - passed on to the index calls; :persist defaults to true
#
# Returns the index, or nil when +source+ equals +target+ or no file (or pair
# of files) covers the requested formats.
def self.translation_index(files, target = nil, source = nil, options = {})
  return nil if source == target
  # Work on a copy so the caller's hash is not modified
  options = Misc.add_defaults options.dup, :persist => true

  # Replace each name with what Entity.formats.find returns for it, when it
  # returns something
  target = Entity.formats.find(target) if Entity.formats.find(target)
  source = Entity.formats.find(source) if Entity.formats.find(source)
  fields = (source and not source.empty?) ? [source] : nil

  # First pass: a single file that covers both formats
  files.each do |file|
    if TSV === file
      all_fields = file.all_fields
      # Once defaulted here, target keeps this value for the remaining files
      target = file.fields.first if target.nil?
      if (source.nil? or all_fields.include? source) and all_fields.include? target
        return file.index(options.merge(:target => target, :fields => fields, :order => true)) 
      end
    else
      next unless file.exists?
      begin
        all_fields = TSV.parse_header(file).all_fields
        target = all_fields[1] if target.nil?
        if (source.nil? or all_fields.include? source) and all_fields.include? target
          index = TSV.index(file, options.merge(:target => target, :fields => fields, :order => true)) 
          return index
        end
      rescue Exception
        # Best effort: a file that cannot be read is logged and skipped
        Log.exception $!
        Log.error "Exception reading identifier file: #{file.find}"
      end
    end
  end

  # Second pass: bridge two files through a field they have in common.
  # NOTE(review): unlike the first pass there is no `exists?` guard and no
  # rescue around TSV.parse_header here -- confirm missing files cannot reach
  # this point.
  files.each do |file|
    all_fields = TSV === file ? file.all_fields : TSV.parse_header(file).all_fields 

    files.each do |other_file|
      next if file == other_file

      other_all_fields = TSV === other_file ? other_file.all_fields : TSV.parse_header(other_file).all_fields 

      # Only the first shared field is considered as the bridge
      common_field = (all_fields & other_all_fields).first

      if common_field and (source.nil? or source.empty? or all_fields.include? source) and other_all_fields.include? target 

        index = Persist.persist_tsv(nil, Misc.fingerprint(files), {:files => files, :source => source, :target => target}, :prefix => "Translation index", :persist => options[:persist]) do |data|

          # source -> common_field (this assigns the same `index` local that
          # receives the persist_tsv result once the block finishes)
          index = TSV === file ? 
            file.index(options.merge(:target => common_field, :fields => fields)) :
            TSV.index(file, options.merge(:target => common_field, :fields => fields))

          # common_field -> target
          other_index = TSV === other_file ? 
            other_file.index(options.merge(:target => target, :fields => [common_field])) :
            TSV.index(other_file, options.merge(:target => target, :fields => [common_field]))

          data.serializer = :clean
          
          # ToDo: remove the need to to the `to_list` transformation
          # Chain both indices and keep only the target column
          data.merge! index.to_list.attach(other_index.to_list).slice([target]).to_single
        end
        return index
      end
    end
  end
  return nil
end

.traverse(obj, options = {}, &block) ⇒ Object



621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
# File 'lib/rbbt/tsv/parallel/traverse.rb', line 621

# Entry point for traversals: iterates over +obj+ running +block+ on each
# element, optionally collecting the results +into+ a destination and
# reporting progress.
#
# obj     - object to traverse; it is handed on to traverse_run or
#           traverse_stream
# options - Hash of options. The ones consumed here:
#           :into           - destination for the block results. A filename is
#                             opened for writing; :stream returns the reading
#                             end of a pipe; :dumper creates a TSV::Dumper;
#                             any other value receives results via store_into
#           :threads, :cpus - parallelism; values <= 1 are treated as unset
#           :keys           - true, or a String naming the key field, to
#                             traverse keys only
#           :bar, :progress - progress reporting: a String (description),
#                             true, a Numeric (the maximum), a Hash of bar
#                             options, a Log::ProgressBar, or a Step
# block   - work to run for each element
#
# Returns +into+ when one was given (the pipe for :stream, the dumper for
# :dumper); otherwise whatever traverse_run returns.
def self.traverse(obj, options = {}, &block)
  into = options[:into]

  into = options[:into] = Open.open(into, :mode => "w") if Misc.is_filename?(into)

  case into
  when :stream
    # Run the traversal writing into a pipe and hand back its reading end
    sout = Misc.open_pipe false, false do |sin|
      begin
        traverse(obj, options.merge(:into => sin), &block)
      rescue Exception
        Log.exception $!
        begin
          sout.abort if sout.respond_to? :abort
          sout.join if sout.respond_to? :join
        ensure
          raise $!
        end
      end
    end
    return sout
  when :dumper
    obj_options = obj.respond_to?(:options) ? obj.options : {}
    dumper = TSV::Dumper.new obj_options.merge(options)
    dumper.init
    _options = options.merge(obj_options).merge(:into => dumper)
    traverse(obj, _options, &block)
    return dumper
  end

  threads = Misc.process_options options, :threads
  cpus = Misc.process_options options, :cpus
  threads = nil if threads and threads.to_i <= 1
  cpus = nil if cpus and cpus.to_i <= 1

  if options[:keys]
    case options[:keys]
    when TrueClass
      options[:type] = :keys
    when String
      options[:type] = :keys
      options[:key_field] = options[:keys]
      options[:fields] = []
    end
  end

  bar = Misc.process_options options, :bar
  bar ||= Misc.process_options options, :progress
  options[:bar] = case bar
                  when String
                    max = guess_max(obj)
                    Log::ProgressBar.new_bar(max, {:desc => bar})
                  when TrueClass
                    max = guess_max(obj)
                    Log::ProgressBar.new_bar(max, nil)
                  when Numeric
                    # The number itself is the maximum, so there is nothing
                    # to guess
                    Log::ProgressBar.new_bar(bar)
                  when Hash
                    # An explicit :max wins; otherwise guess it like the other
                    # branches do. (This used to fall back to the still
                    # unassigned local `max`, which is always nil.)
                    max = Misc.process_options(bar, :max) || guess_max(obj)
                    Log::ProgressBar.new_bar(max, bar)
                  when Log::ProgressBar
                    bar.max ||= guess_max(obj)
                    bar
                  else
                    if (defined? Step and Step === bar)
                      max = guess_max(obj)
                      Log::ProgressBar.new_bar(max, {:desc => bar.status, :file => bar.file(:progress)})
                    else
                      bar
                    end
                  end

  if into
    # The bar is ticked from the callback below, so it is taken out of the
    # options passed down
    bar = Misc.process_options options, :bar

    options[:join] = Proc.new do |error|
      error = false if error.nil?
      Log::ProgressBar.remove_bar(bar, error) if bar
    end if bar

    # Every block result is stored into the destination; a failure to store
    # aborts the destination stream
    options[:callback] = Proc.new do |e|
      begin
        store_into into, e
      rescue Aborted
        Log.low "Aborted callback #{stream_name(obj)} #{Log.color :green, "->"} #{stream_name(options[:into])}"
        abort_stream(into, $!)
        raise $!
      rescue Exception
        Log.low "Exception callback #{stream_name(obj)} #{Log.color :green, "->"} #{stream_name(options[:into])}"
        abort_stream(into, $!)
        raise $!
      ensure
        bar.tick if bar
      end
    end

    bar.init if bar
    begin
      case into
      when TSV::Dumper, IO
        traverse_stream(obj, threads, cpus, options, &block)
      else
        traverse_run(obj, threads, cpus, options, &block)
      end
    rescue Exception
      Log.exception $!
      abort_stream(into, $!)
      raise $!
    end

    into
  else
    traverse_run(obj, threads, cpus, options, &block)
  end
end

.traverse_array(array, options = {}, &block) ⇒ Object



144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# File 'lib/rbbt/tsv/parallel/traverse.rb', line 144

# Runs the block on every element of +array+, in order.
#
# array   - collection responding to `each`
# options - :callback (receives each block result), :bar (progress bar,
#           ticked once per element) and :join (called with the error flag
#           when the traversal ends) are removed from the hash
#
# Returns what `array.each` returns. Exceptions raised by the block or the
# callback are logged with the offending element and re-raised.
def self.traverse_array(array, options = {}, &block)
  callback, bar, join = Misc.process_options options, :callback, :bar, :join

  begin
    error = false
    bar.init if bar

    array.each do |element|
      begin
        outcome = yield(element)
        callback.call outcome if callback
      rescue Exception
        Log.warn "Traverse exception on element: #{Misc.fingerprint(element)}"
        raise $!
      ensure
        bar.tick if bar
      end
    end
  rescue
    # A bare rescue only sees StandardError, so only those set the flag
    error = true
    raise $!
  ensure
    join.call(error) if join
    Log::ProgressBar.remove_bar(bar, error) if bar
  end
end

.traverse_cpus(num, obj, options, &block) ⇒ Object



452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
# File 'lib/rbbt/tsv/parallel/traverse.rb', line 452

# Traverses +obj+ distributing the work over an RbbtProcessQueue created
# with +num+ cpus.
#
# num     - number of cpus handed to RbbtProcessQueue.new
# obj     - object to traverse; its elements are read with traverse_obj and
#           pushed to the queue
# options - traversal options; :callback, :cleanup, :join, :respawn and :bar
#           are removed from the hash here, the rest goes to traverse_obj
# block   - work to run on each element; it is given to the queue's `init`
#
# Raises RuntimeError ("Traversal aborted") on Interrupt or Aborted; any
# other exception is re-raised after the queue and the streams are aborted.
def self.traverse_cpus(num, obj, options, &block)
  begin
    error = false
    
    callback, cleanup, join, respawn, bar = Misc.process_options options, :callback, :cleanup, :join, :respawn, :bar
    # The environment variable can force respawning regardless of the option
    respawn = true if ENV["RBBT_RESPAWN"] and ENV["RBBT_RESPAWN"] == "true"

    Log.low "Traversing in #{ num } cpus: #{respawn ? "respawn" : "no respawn"}"
    q = RbbtProcessQueue.new num, cleanup, join, respawn, !!bar
    # Without a callback the bar still needs ticking for each result
    callback = Proc.new{ bar.tick } if callback.nil? and bar
    q.callback &callback
    q.init &block

    bar.init if bar
    # Feed every traversed element to the worker queue
    traverse_obj(obj, options) do |*p|
      q.process *p
    end

    q.join

  # NOTE(review): if RbbtProcessQueue.new itself raises, `q` is still nil in
  # the handlers below and `q.abort` would raise NoMethodError -- confirm
  # whether that can happen.
  rescue Interrupt, Aborted
    error = true
    Log.low{"Aborted traversal in CPUs for #{stream_name(obj) || Misc.fingerprint(obj)}: #{$!.backtrace*","}"}
    q.abort
    # Abort both ends: the source stream and the destination stream
    stream = obj_stream(obj)
    stream.abort if stream.respond_to? :abort
    stream = obj_stream(options[:into])
    stream.abort if stream.respond_to? :abort
    q.join
    # The original exception is replaced by a RuntimeError here
    raise "Traversal aborted"
  rescue Exception
    error = true
    Log.low{"Exception during traversal in CPUs for #{stream_name(obj) || Misc.fingerprint(obj)}: #{$!.message}"}
    q.abort
    stream = obj_stream(obj)
    stream.abort if stream.respond_to? :abort
    stream = obj_stream(options[:into])
    stream.abort if stream.respond_to? :abort
    q.join
    raise $!
  ensure
    Log::ProgressBar.remove_bar(bar, error) if bar
  end
end

.traverse_hash(hash, options = {}, &block) ⇒ Object



108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# File 'lib/rbbt/tsv/parallel/traverse.rb', line 108

# Runs the block on every key/value pair of +hash+.
#
# hash    - collection whose `each` yields key and value
# options - :callback (receives each block result), :bar (progress bar,
#           ticked once per pair) and :join (called with the error flag when
#           the traversal ends) are removed from the hash
#
# Returns what `hash.each` returns. Exceptions raised by the block or the
# callback are logged with the offending pair and re-raised; the warning used
# to be missing when a callback was in use.
def self.traverse_hash(hash, options = {}, &block)
  callback, bar, join = Misc.process_options options, :callback, :bar, :join

  begin
    error = false
    bar.init if bar

    hash.each do |k,v|
      begin
        res = yield(k,v)
        callback.call res if callback
      rescue Exception
        Log.warn "Traverse exception on element: #{Misc.fingerprint([k, v])}"
        raise $!
      ensure
        bar.tick if bar
      end
    end
  rescue
    # NOTE(review): a bare rescue only sees StandardError, so an Interrupt
    # reaches `join` with error == false -- confirm that is intended.
    error = true
    raise $!
  ensure
    join.call(error) if join
    Log::ProgressBar.remove_bar(bar, error) if bar
  end
end

.traverse_io(io, options = {}, &block) ⇒ Object



280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
# File 'lib/rbbt/tsv/parallel/traverse.rb', line 280

# Traverses the entries that TSV::Parser.traverse reads from +io+.
#
# io      - stream to parse; a closed File is reopened for reading first
# options - :callback, :bar and :join are removed from the hash; the rest,
#           plus :monitor => bar, is passed to TSV::Parser.traverse
# block   - receives the three values yielded by the parser (named k, v and
#           f here)
#
# Raises RuntimeError when a closed File cannot be reopened; exceptions from
# the block or callback are logged and re-raised.
def self.traverse_io(io, options = {}, &block)
  callback, bar, join = Misc.process_options options, :callback, :bar, :join

  begin
    error = false
    if File === io and io.closed? 
      begin
        Log.low{"Rewinding stream #{stream_name(io)}"}
        io.reopen io.filename, "r"
      rescue
        Log.exception $!
        raise "File closed and could not reopen #{stream_name(io)}"
      end
    end

    # The progress bar is handed to the parser as its monitor
    options[:monitor] = bar
    if callback
      bar.init if bar
      exception = nil
      begin
        TSV::Parser.traverse(io, options) do |k,v,f|
          begin
            callback.call yield k, v, f
          rescue Exception
            # Remember the failure so it is raised again below even if the
            # parser does not let it through
            exception = $!
            Log.warn "Traverse exception on element: #{Misc.fingerprint([k, v, f])}"
            raise $!
          end
          # Unlike the sibling traversers this tick is not in an `ensure`, so
          # a failing element is not counted
          bar.tick if bar
        end
      ensure
        raise exception if exception
      end
    else
      # options[:monitor] was already set above; the merge repeats it
      TSV::Parser.traverse(io, options.merge(:monitor => bar), &block)
    end
  rescue
    error = true
    raise $!
  ensure
    join.call(error) if join
    Log::ProgressBar.remove_bar(bar, error) if bar
  end
end

.traverse_io_array(io, options = {}, &block) ⇒ Object



226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
# File 'lib/rbbt/tsv/parallel/traverse.rb', line 226

# Traverses +io+ line by line, running the block on each line with its line
# terminator removed.
#
# io      - stream responding to `gets` and `getc`; a closed File is reopened
#           for reading first
# options - :callback (receives each block result), :bar (progress bar,
#           ticked once per line) and :join (called with the error flag when
#           the traversal ends) are removed from the hash
#
# A line returned by `gets` without a trailing newline is completed
# character by character before it is processed. This used to happen only
# when a callback was given; it now applies to both modes.
#
# Raises RuntimeError when a closed File cannot be reopened; exceptions from
# the block or callback are logged with the offending line and re-raised.
def self.traverse_io_array(io, options = {}, &block)
  callback, bar, join = Misc.process_options options, :callback, :bar, :join
  begin
    error = false
    if File === io and io.closed?
      begin
        Log.low{"Rewinding stream #{stream_name(io)}"}
        io.reopen io.filename, "r"
      rescue
        Log.exception $!
        raise "File closed and could not reopen #{stream_name(io)}"
      end
    end

    bar.init if bar
    while line = io.gets
      # Complete a partial read up to the next newline (or end of input)
      if line[-1] != "\n"
        while c = io.getc
          line << c
          break if c=="\n"
        end
      end

      begin
        res = yield line.chomp
        callback.call res if callback
      rescue Exception
        Log.warn "Traverse exception on element: #{Misc.fingerprint(line)}"
        raise $!
      ensure
        bar.tick if bar
      end
    end
  rescue
    error = true
    raise $!
  ensure
    join.call(error) if join
    Log::ProgressBar.remove_bar(bar, error) if bar
  end
end

.traverse_obj(obj, options = {}, &block) ⇒ Object



325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
# File 'lib/rbbt/tsv/parallel/traverse.rb', line 325

# Dispatches the traversal of +obj+ to the traverser that matches its class.
#
# obj     - object to traverse. Handled here: FastContainers::PriorityQueue,
#           TSV, Hash, TSV::Parser, IO-like streams, Path, TSV::Dumper, Step,
#           Array, Set and String (remote, ssh or local filename)
# options - traversal options, passed on to the selected traverser; a :type
#           of :keys is rewritten as :single with no fields
# block   - work to run on each element
#
# Raises RuntimeError for nil, for strings that cannot be opened and for
# unsupported classes. On any exception the source stream and options[:into]
# are aborted before the exception is re-raised.
def self.traverse_obj(obj, options = {}, &block)
  if options[:type] == :keys
    options[:fields] = []
    options[:type] = :single
  end

  # Optional classes are tested through predicates so that an undefined
  # constant never matches anything. The previous inline forms misbehaved:
  # `defined? A && B === obj` parses as `defined?(A && B === obj)`, which is
  # not a class test, and `(defined? Step and Step)` is nil when Step is
  # undefined, which made the clause match a nil +obj+.
  priority_queue = lambda{|o| defined?(FastContainers::PriorityQueue) && FastContainers::PriorityQueue === o }
  step = lambda{|o| defined?(Step) && Step === o }

  name = stream_name(obj)
  Log.low{"Traversing #{name} #{Log.color :green, "->"} #{stream_name(options[:into])}"}
  begin
    case obj
    when priority_queue
      traverse_priority_queue(obj, options, &block)
    when TSV
      traverse_tsv(obj, options, &block)
    when Hash
      traverse_hash(obj, options, &block)
    when TSV::Parser
      callback = Misc.process_options options, :callback
      if callback
        obj.traverse(options) do |k,v|
          callback.call yield k, v
        end
      else
        obj.traverse(options, &block)
      end
    when IO, File, Zlib::GzipReader, Bgzf, StringIO
      begin
        if options[:type] == :array or options[:type] == :line
          traverse_io_array(obj, options, &block)
        else
          traverse_io(obj, options, &block)
        end
      rescue Aborted
        obj.abort if obj.respond_to? :abort
        raise $!
      rescue Exception
        obj.abort if obj.respond_to? :abort
        raise $!
      ensure
        # The stream is always closed (and joined) once traversed
        obj.close if obj.respond_to? :close and not obj.closed?
        obj.join if obj.respond_to? :join
      end
    when Path
      obj.open do |stream|
        traverse_obj(stream, options, &block)
      end
    when TSV::Dumper
      traverse_obj(obj.stream, options, &block)
    when step

      # Make sure the step has been started before asking for its stream
      obj.clean if obj.aborted? or obj.recoverable_error?
      obj.run(true) unless obj.done? || obj.started? || obj.result

      stream = obj.get_stream
      options = {:type => :array}.merge(options) if obj.result_type == :array

      if stream
        traverse_obj(stream, options, &block)
      else
        # No stream available: wait for the step and read its result file
        obj.join
        traverse_obj(obj.path, options, &block)
      end
    when Array
      traverse_array(obj, options, &block)
    when Set
      traverse_array(obj.to_a, options, &block)
    when String
      if Open.remote?(obj) || Open.ssh?(obj) || Misc.is_filename?(obj)
        Open.open(obj) do |s|
          traverse_obj(s, options, &block)
        end
      else
        raise "Can not open obj for traversal #{Misc.fingerprint obj}"
      end
    when nil
      raise "Can not traverse nil object into #{stream_name(options[:into])}"
    else
      raise "Unknown object for traversal: #{Misc.fingerprint obj }"
    end
  rescue IOError
    Log.low{"IOError traversing #{stream_name(obj)}: #{$!.message}"}
    abort_stream obj
    abort_stream options[:into], $!
    raise $!
  rescue Errno::EPIPE
    Log.low{"Pipe closed while traversing #{stream_name(obj)}: #{$!.message}"}
    abort_stream obj
    abort_stream options[:into], $!
    raise $!
  rescue Aborted
    Log.low{"Aborted traversing #{stream_name(obj)}"}
    abort_stream obj
    abort_stream options[:into], $!
    raise $!
  rescue Exception
    Log.low{"Exception traversing #{stream_name(obj)}"}
    abort_stream obj unless String === obj
    abort_stream options[:into], $!
    raise $!
  end
end

.traverse_priority_queue(queue, options = {}, &block) ⇒ Object



184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# File 'lib/rbbt/tsv/parallel/traverse.rb', line 184

# Drains +queue+, running the block on every element it pops.
#
# queue   - object responding to `any?` and `pop`; it is consumed
# options - :callback (receives each block result), :bar (progress bar,
#           ticked once per element) and :join (called with the error flag
#           when the traversal ends) are removed from the hash
#
# Exceptions raised by the block or the callback are logged with the
# offending element and re-raised.
def self.traverse_priority_queue(queue, options = {}, &block)
  callback, bar, join = Misc.process_options options, :callback, :bar, :join

  begin
    error = false
    bar.init if bar

    while queue.any?
      item = queue.pop
      begin
        outcome = yield(item)
        callback.call outcome if callback
      rescue Exception
        Log.warn "Traverse exception on element: #{Misc.fingerprint(item)}"
        raise $!
      ensure
        bar.tick if bar
      end
    end
  rescue
    # A bare rescue only sees StandardError, so only those set the flag
    error = true
    raise $!
  ensure
    join.call(error) if join
    Log::ProgressBar.remove_bar(bar, error) if bar
  end
end

.traverse_run(obj, threads, cpus, options = {}, &block) ⇒ Object



576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
# File 'lib/rbbt/tsv/parallel/traverse.rb', line 576

# Chooses how to run a traversal: in this process, in threads or in worker
# processes.
#
# obj     - object to traverse
# threads - number of threads; nil or 1 means none
# cpus    - number of processes; nil or 1 means none
# options - traversal options; when processes are used, :close_streams is
#           extended with get_streams_to_close(obj) and :cleanup is set to a
#           proc that closes them
#
# Setting the environment variable RBBT_NO_MAP_REDUCE to "true" forces the
# in-process traversal. Threads take precedence over cpus.
def self.traverse_run(obj, threads, cpus, options = {}, &block)
  threads = nil if threads == 1
  cpus = nil if cpus == 1

  in_process = ENV["RBBT_NO_MAP_REDUCE"] == "true" || (threads.nil? && cpus.nil?)
  return traverse_obj(obj, options, &block) if in_process
  return traverse_threads(threads, obj, options, &block) if threads

  # Gather every stream to close: the ones requested plus the ones that
  # belong to obj
  to_close = Misc.process_options(options, :close_streams) || []
  to_close = [to_close] unless Array === to_close
  to_close.concat(get_streams_to_close(obj))
  options[:close_streams] = to_close

  if to_close.any?
    options[:cleanup] = Proc.new do
      to_close.uniq.each{|stream| stream.close unless stream.closed? }
    end
  end

  traverse_cpus cpus, obj, options, &block
end

.traverse_stream(obj, threads = nil, cpus = nil, options = {}, &block) ⇒ Object



604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
# File 'lib/rbbt/tsv/parallel/traverse.rb', line 604

# Runs the traversal in a background thread while the destination stream is
# returned right away.
#
# obj           - object to traverse
# threads, cpus - parallelism settings passed to traverse_run
# options       - traversal options; options[:into] is the destination
#
# When the traversal finishes the destination is closed, unless it reports
# being closed already. On failure both +obj+ and the destination are
# aborted and the exception is re-raised inside the thread.
#
# Returns the result of ConcurrentStream.setup on the destination stream,
# with the background thread attached.
def self.traverse_stream(obj, threads = nil, cpus = nil, options = {}, &block)
  sink = options[:into]

  worker = Thread.new do
    begin
      traverse_run(obj, threads, cpus, options, &block)
      if sink.respond_to?(:close)
        already_closed = sink.respond_to?(:closed?) && sink.closed?
        sink.close unless already_closed
      end
    rescue Exception
      abort_stream obj
      abort_stream sink
      raise $!
    end
  end

  ConcurrentStream.setup(obj_stream(sink), :threads => worker)
end

.traverse_threads(num, obj, options, &block) ⇒ Object



427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
# File 'lib/rbbt/tsv/parallel/traverse.rb', line 427

# Traverses +obj+ handing each element to an RbbtThreadQueue of +num+
# threads.
#
# num     - number of threads given to RbbtThreadQueue.new
# obj     - object to traverse; its elements are read with traverse_obj
# options - traversal options; :callback is removed from the hash here
# block   - work to run on each element
#
# With a callback, the worker takes the last argument it receives as a
# mutex, runs the block on the remaining arguments and calls the callback
# with the result while holding that mutex.
#
# Returns nil.
def self.traverse_threads(num, obj, options, &block)
  on_result = Misc.process_options options, :callback

  pool = RbbtThreadQueue.new num

  worker = block
  if on_result
    worker = Proc.new do |*args|
      lock = args.pop
      outcome = yield(*args)
      lock.synchronize{ on_result.call outcome }
    end
  end

  pool.init true, &worker

  traverse_obj(obj, options){|*item| pool.process item }

  pool.join
  nil
end

.traverse_tsv(tsv, options = {}, &block) ⇒ Object

{{{ TRAVERSE OBJECTS



71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/rbbt/tsv/parallel/traverse.rb', line 71

# Runs the block on every entry of +tsv+, passing the key, the values and
# the field names of the TSV.
#
# tsv     - object responding to `fields` and `through`
# options - :callback (receives each block result), :bar (progress bar,
#           ticked once per entry) and :join (called with the error flag when
#           the traversal ends) are removed from the hash; :key_field and
#           :fields are passed to `tsv.through`
#
# Returns what `tsv.through` returns. Exceptions raised by the block or the
# callback are logged with the offending entry and re-raised; the warning
# used to be missing when no callback was in use.
def self.traverse_tsv(tsv, options = {}, &block)
  callback, bar, join = Misc.process_options options, :callback, :bar, :join

  begin
    error = false
    fields = tsv.fields
    bar.init if bar

    tsv.through options[:key_field], options[:fields] do |k,v|
      begin
        res = yield(k,v,fields)
        callback.call res if callback
      rescue Exception
        Log.warn "Traverse exception on element: #{Misc.fingerprint([k, v, fields])}"
        raise $!
      ensure
        bar.tick if bar
      end
    end
  rescue
    # NOTE(review): a bare rescue only sees StandardError, so an Interrupt
    # reaches `join` with error == false -- confirm that is intended.
    error = true
    raise $!
  ensure
    join.call(error) if join
    Log::ProgressBar.remove_bar(bar, error) if bar
  end
end

.xls(filename, options = {}) ⇒ Object



267
268
269
270
271
272
273
274
275
276
# File 'lib/rbbt/tsv/excel.rb', line 267

# Reads an XLS spreadsheet with TSV::XLS.read.
#
# filename - location of the spreadsheet; when Open.remote? says it is
#            remote, it is first downloaded into a temporary file with the
#            'xls' extension
# options  - passed to TSV::XLS.read
#
# Returns what TSV::XLS.read returns.
def self.xls(filename, options ={})
  return TSV::XLS.read(filename, options) unless Open.remote? filename

  TmpFile.with_file nil, :extension => 'xls' do |local_copy|
    Open.download(filename, local_copy)
    TSV::XLS.read(local_copy, options)
  end
end

.xlsx(filename, options = {}) ⇒ Object



278
279
280
281
282
283
284
285
286
287
288
# File 'lib/rbbt/tsv/excel.rb', line 278

# Reads an XLSX spreadsheet with TSV::XLSX.read.
#
# filename - location of the spreadsheet; when Open.remote? says it is
#            remote, it is first downloaded into a temporary file with the
#            'xlsx' extension
# options  - passed to TSV::XLSX.read
#
# Returns what TSV::XLSX.read returns.
def self.xlsx(filename, options ={})
  return TSV::XLSX.read(filename, options) unless Open.remote? filename

  TmpFile.with_file nil, :extension => 'xlsx' do |local_copy|
    Open.download(filename, local_copy)
    TSV::XLSX.read(local_copy, options)
  end
end

.zip_fields(list, fields = nil) ⇒ Object



503
504
505
506
507
508
509
# File 'lib/rbbt/tsv/accessor.rb', line 503

# Transposes a list of parallel value lists, so that the n-th entry of the
# result gathers the n-th value of every list.
#
# list   - Array of Arrays; nil or empty gives []
# fields - optional field names; when missing they are taken from
#          `list.fields` if the list responds to it. When fields are
#          available every zipped entry is passed through setup_array
#
# Returns the Array of zipped entries.
def self.zip_fields(list, fields = nil)
  return [] if list.nil? || list.empty?

  fields = list.fields if !fields && list.respond_to?(:fields)

  head = list[0]
  rows = head.zip(*list[1..-1])
  return rows unless fields

  rows.collect{|row| setup_array(row, fields) }
end

Instance Method Details

#[](key, clean = false) ⇒ Object



234
235
236
237
238
239
240
241
242
243
244
245
246
# File 'lib/rbbt/tsv/accessor.rb', line 234

# Reads the value stored under +key+.
#
# key   - key to look up
# clean - when true the value is returned exactly as `super` gives it
#
# A nil value is returned as is. Otherwise the value goes through
# prepare_value; for a MultipleResult each member is prepared and the
# resulting collection is extended with MultipleResult again.
def [](key, clean = false)
  stored = super(key)
  return stored if clean || stored.nil?

  @serializer_module ||= self.serializer_module

  return prepare_value(key, stored) unless MultipleResult === stored

  prepared = stored.collect{|item| prepare_value key, item }
  prepared.extend MultipleResult
  prepared
end

#[]=(key, value, clean = false) ⇒ Object



248
249
250
251
# File 'lib/rbbt/tsv/accessor.rb', line 248

# Stores +value+ under +key+.
#
# key   - key to store under
# value - value to store; nil is stored as is
# clean - when true the value is stored as is
#
# The value is also stored as is when the serializer module is
# TSV::CleanSerializer; otherwise it is stored in the form returned by the
# serializer module's `dump`.
def []=(key, value, clean = false)
  return super(key, value) if clean || value.nil? || TSV::CleanSerializer == self.serializer_module 
  # Memoize here as well, so that writing does not depend on a previous read
  # having filled in @serializer_module
  @serializer_module ||= self.serializer_module
  super(key, @serializer_module.dump(value))
end

#add_field(name = nil) ⇒ Object



656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
# File 'lib/rbbt/tsv/manipulate.rb', line 656

# Appends a new field to every entry, taking its value from the block.
#
# name  - name of the new field; it is added to `fields` only when the TSV
#         already has fields and a name is given
# block - receives key and values and returns the new value; for :double
#         TSVs a value that is not an Array is wrapped in one
#
# While traversing, a `monitor` of true is replaced by a description of the
# operation; the previous @monitor is restored afterwards, also when the
# traversal raises.
#
# Returns self.
def add_field(name = nil)
  old_monitor = @monitor
  @monitor = {:desc => "Adding field #{ name }"} if TrueClass === monitor

  begin
    through do |key, values|
      new_values = yield(key, values)
      new_values = [new_values] if type == :double and not Array === new_values

      case
      when (values.nil? and (fields.nil? or fields.empty?))
        values = [new_values]
      when values.nil?  
        # Pad the missing values of the existing fields
        values = [nil] * fields.length + [new_values]
      when Array === values
        values += [new_values]
      else
        values << new_values
      end

      self[key] = values
    end
  ensure
    @monitor = old_monitor
  end

  if not fields.nil? and not name.nil?
    new_fields = self.fields + [name]
    self.fields = new_fields
  end

  self
end

#add_fields(names = []) ⇒ Object



687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
# File 'lib/rbbt/tsv/manipulate.rb', line 687

# Appends several new fields to every entry, taking their values from the
# block.
#
# names - names of the new fields; they are added to `fields` only when the
#         TSV already has fields and names is not empty
# block - receives key and values and returns an Array with one value per
#         new field (nil stands for one nil per name). For :double TSVs a
#         flat Array is turned into one single-element Array per value
#
# Entries without values are first padded with one nil per existing field.
# Only :double and :list TSVs have their values extended.
#
# While traversing, a `monitor` of true is replaced by a description of the
# operation; the previous @monitor is restored afterwards, also when the
# traversal raises.
#
# Returns self.
def add_fields(names = [])
  old_monitor = @monitor
  @monitor = {:desc => "Adding field #{ names * ", " }"} if TrueClass === monitor

  begin
    through do |key, values|
      # One nil per existing field (this used to multiply by the fields
      # Array itself, which raises TypeError)
      values ||= fields ? [nil] * fields.length : []
      new_values = yield(key, values)

      case type
      when :double
        new_values = new_values.collect{|v| [v] } if Array === new_values and new_values.first and not Array === new_values.first
        values += new_values || [nil] * names.length
      when :list
        values += new_values || [nil] * names.length
      end

      self[key] = values
    end
  ensure
    @monitor = old_monitor
  end

  if not fields.nil? and not (names.nil? or names.empty?)
    new_fields = self.fields + names
    self.fields = new_fields
  end

  self
end

#all_fieldsObject



542
543
544
545
# File 'lib/rbbt/tsv/accessor.rb', line 542

# Lists the key field followed by the regular fields.
#
# Returns an Array with every field name, or nil when either the key field
# or the fields are not set.
def all_fields
  key = key_field
  return nil if key.nil?

  rest = fields
  return nil if rest.nil?

  [key] + rest
end

#annotate(tsv) ⇒ Object



17
18
19
20
21
22
# File 'lib/rbbt/tsv/accessor.rb', line 17

# Sets up +tsv+ with the info of this TSV and copies the entity options and
# entity templates over to it.
#
# tsv - object to set up; it is passed to TSV.setup together with `info`
#
# Returns +tsv+.
def annotate(tsv)
  TSV.setup(tsv, info)

  tsv.tap do |target|
    target.entity_options = self.entity_options
    target.entity_templates = self.entity_templates
  end
end

#attach(other, options = {}) ⇒ Object



196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
# File 'lib/rbbt/tsv/attach.rb', line 196

# Attaches fields from +other+ to this TSV, matching entries through the
# keys, through one of this TSV's fields, or through an identifier index.
#
# other   - TSV to take the fields from; anything else is opened with
#           TSV.open
# options - Hash of options:
#           :fields        - fields to attach; nil or :all takes every field
#                            of +other+ this TSV does not already have
#           :one2one       - passed to attach_source_key
#           :complete      - when set, keys present only in +other+ are added
#                            too; a value other than true is used to fill
#                            their empty cells
#           :in_namespace  - restrict the fields considered to the namespace
#                            (default false)
#           :persist_input - persist +other+ when it has to be opened
#                            (default false)
#
# Raises FieldNotFoundError when no way of relating both TSVs is found.
#
# Returns self.
def attach(other, options = {})
  options      = Misc.add_defaults options, :in_namespace => false, :persist_input => false
  fields, one2one, complete = Misc.process_options options, :fields, :one2one, :complete
  in_namespace = options[:in_namespace]

  unless TSV === other
    other_identifier_file = other.identifier_files.first if other.respond_to? :identifier_files
    other = TSV.open(other, :persist => options[:persist_input].to_s == "true")
    other.identifiers ||= other_identifier_file
  end

  fields = other.fields - [key_field].concat(self.fields) if other.fields and (fields.nil? or fields == :all)
  if in_namespace
    fields = other.fields_in_namespace - [key_field].concat(self.fields) if fields.nil?
  else
    fields = other.fields - [key_field].concat(self.fields) if fields.nil?
  end

  other_filename = other.respond_to?(:filename) ? other.filename : other.inspect
  Log.low("Attaching fields:#{Misc.fingerprint fields } from #{other_filename}.")

  # The same-key strategy is tried first; if anything fails while it is
  # enabled, the whole selection is retried once without it
  same_key = true
  begin
    case
    when (Misc.match_fields(key_field, other.key_field) and same_key)
      Log.debug "Attachment with same key: #{other.key_field}"
      attach_same_key other, fields
    when (not in_namespace and self.fields.select{|f| Misc.match_fields(f, other.key_field)}.any?)
      Log.debug "Found other key field: #{other.key_field}"
      attach_source_key other, other.key_field, :fields => fields, :one2one => one2one
    when (in_namespace and self.fields_in_namespace.select{|f| Misc.match_fields(f, other.key_field)}.any?)
      Log.debug "Found other key field in #{in_namespace}: #{other.key_field}"
      attach_source_key other, other.key_field, :fields => fields, :one2one => one2one
    else
      index = TSV.find_traversal(self, other, options)
      raise FieldNotFoundError, "Cannot traverse identifiers" if index.nil?
      Log.debug "Attachment with index: #{other.key_field}"
      attach_index other, index, fields
    end
  rescue Exception
    if same_key
      Log.warn "Could not translate identifiers with same_key"
      same_key = false
      retry
    else
      raise $!
    end
  end
  Log.debug("Attachment of fields:#{Misc.fingerprint fields } from #{other.filename.inspect} finished.")

  if complete
    # `index` is only set when the index strategy above was used
    Log.warn "Attaching through index and completing empty rows; keys with wrong format may appear (#{other.key_field} insted of #{self.key_field})" if index
    fill = TrueClass === complete ? nil : complete
    field_length = self.fields.length 
    common_fields = (other.fields & self.fields)
    other_common_pos = common_fields.collect{|f| other.fields.index f}
    this_common_pos = common_fields.collect{|f| self.fields.index f}
    missing = other.keys - self.keys

    other = other.to_list if other.type == :single

    case type
    when :single
      missing.each do |k|
        self[k] = fill
      end
    when :list
      missing.each do |k|
        values = [fill] * field_length
        other_values = other[k]
        other_common_pos.zip(this_common_pos).each do |o,t|
          values[t] = other_values[o]
        end
        self[k] = values
      end
    when :double
      fill = [] if fill.nil?
      missing.each do |k|
        # Every cell gets its own copy of the fill: sharing one Array across
        # cells and rows would let a later in-place change to one of them
        # show up in all the others
        values = Array.new(field_length){ Array === fill ? fill.dup : fill }
        other_values = other[k]
        other_common_pos.zip(this_common_pos).each do |o,t|
          values[t] = other_values[o]
        end
        self[k] = values
      end
    when :flat
      fill = [] if fill.nil?
      missing.each do |k|
        # One copy per row, for the same reason as in the :double case
        self[k] = Array === fill ? fill.dup : fill
      end
    end
  end

  self
end

#attach_index(other, index, fields = nil) ⇒ Object



158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
# File 'lib/rbbt/tsv/attach/util.rb', line 158

# Attaches fields from +other+ to this TSV, translating each of our keys into
# keys of +other+ through +index+.
#
# other  - a TSV, or an object answering #tsv, that provides the new values.
# index  - lookup answering index[key] with one key of +other+, an Array of
#          such keys, or nil.
# fields - a field name or an Array of names to bring over; defaults to every
#          field of +other+ that is neither our key field nor one of our
#          fields.
#
# Every row is extended, a :single TSV is promoted to :list and the attached
# field names are appended to #fields. The return value is the value assigned
# to #fields.
def attach_index(other, index, fields = nil)
  fields = other.fields - [key_field].concat(self.fields) if fields.nil?
  fields = [fields] unless Array === fields

  other = other.tsv unless TSV === other
  field_positions = fields.collect{|field| other.identify_field field}
  field_names     = field_positions.collect{|pos| pos == :key ? other.key_field : other.fields[pos] }

  length = self.fields.length
  other.with_unnamed do
    index.with_unnamed do
      with_unnamed do
        through do |key, values|
          # A scalar (or missing) index hit is treated as a one-element list;
          # entries that +other+ does not contain are skipped below.
          source_keys = index[key]
          source_keys = [source_keys] unless Array === source_keys

          all_new_values = []
          source_keys.each do |source_key|
            next unless other.include? source_key
            new_values = field_positions.collect do |pos|
              if pos == :key
                if other.type == :double
                  [source_key]
                else
                  source_key
                end
              else
                if other.type == :flat
                  other[source_key]
                else
                  other[source_key][pos]
                end
              end
            end
            # Adapt the shape of the values taken from +other+ to our own type.
            new_values.collect!{|v| v.nil? ? [[]] : [v]}    if     type == :double and not other.type == :double
            new_values.collect!{|v| v.nil? ? nil : (other.type == :single ? v : v.first)} if not type == :double and     other.type == :double
            new_values.flatten! if type == :flat
            all_new_values << new_values
          end

          # Nothing could be translated: pad with one empty cell per new field.
          if all_new_values.empty?
            if type == :double
              all_new_values = [[[]] * field_positions.length]
            else
              all_new_values = [[nil] * field_positions.length]
            end
          end

          current = self[key] || [[]] * fields.length

          current = [current] unless Array === current

          # Cells beyond our declared fields are cut off the row and merged
          # with the newly attached values.
          if current.length > length
            all_new_values << current.slice!(length..current.length - 1)
          end

          if type == :double
            all_new_values = TSV.zip_fields(all_new_values).collect{|l| l.flatten}
          else
            all_new_values = all_new_values.first
          end

          current += all_new_values

          # Assign instead of mutating the stored value in place: a :single
          # row is a scalar and cannot be #replace'd with an Array.
          self[key] = current
        end
      end
    end
  end

  self.type = :list if self.type == :single

  self.fields = self.fields.concat field_names
end

#attach_same_key(other, fields = nil) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/rbbt/tsv/attach/util.rb', line 3

# Attaches fields from +other+ to this TSV when both are keyed by the same
# identifiers, so rows are matched directly by key.
#
# other  - object answering #fields, #type, #include?, #[], #identify_field
#          and #with_unnamed.
# fields - a field name or an Array of names to attach; defaults to every
#          field of +other+ that is neither our key field nor one of our
#          fields.
#
# Requested fields that we already have are not appended as new columns;
# instead the values found in +other+ are added onto our existing cell with
# `+=`. Keys missing from +other+ are padded with empty cells. A :single TSV
# is promoted to :list. Returns self.
def attach_same_key(other, fields = nil)
  fields = other.fields - [key_field].concat(self.fields) if fields.nil?

  fields = [fields].compact unless Array === fields

  common_fields = self.fields & fields

  fields = fields - common_fields

  num_fields = fields.length

  field_positions = fields.collect{|field| other.identify_field field}

  if common_fields.any?
    common_field_positions = common_fields.collect{|field| self.identify_field field}
    common_field_positions_other = common_fields.collect{|field| other.identify_field field}
  end

  other.with_unnamed do
    with_unnamed do
      through do |key, values|
        self[key] = [] unless self.include? key
        current = self[key]
        current = [current] unless Array === current
        if other.include? key
          case
          when other.type == :flat
            if type == :flat
              new_values = other[key]
            else
              new_values = [other[key]]
            end
          when other.type == :single
            new_values = [other[key]]
          else
            other_values = other[key] || [nil] * other.fields.length
            new_values = field_positions.collect do |pos|
              pos == :key ? key : other_values[pos]
            end

            if common_fields.any?
              common_field_positions.zip(common_field_positions_other).each do |p1,p2|
                current[p1] += other_values[p2]
              end
            end
          end

          # Adapt the shape of the values taken from +other+ to our own type.
          new_values.collect!{|v| [v]}     if     type == :double and not (other.type == :double or other.type == :flat)
          new_values.collect!{|v| v.nil? ? nil : (other.type == :single ? v : v.first)} if not type == :double and     other.type == :double

          # Non-destructive flatten, reassigned: new_values may be the very
          # array stored in +other+, which must not be modified.
          new_values = new_values.flatten if type == :flat

          self[key] = current + new_values
        else
          if type == :double
            self[key] = current + [[]] * num_fields
          else
            self[key] = current + [nil] * num_fields
          end
        end
      end
    end
  end

  self.type = :list if self.type == :single

  self.fields = self.fields.concat fields

  self
end

#attach_source_key(other, source, options = {}) ⇒ Object



74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'lib/rbbt/tsv/attach/util.rb', line 74

# Attaches fields from +other+ by using the values of one of our own fields
# (+source+) as keys into +other+.
#
# other   - a TSV, or an object answering #tsv, providing the new values.
# source  - name matched against our fields with Misc.match_fields; the first
#           matching field supplies the lookup keys.
# options - :fields  names of the fields of +other+ to attach; defaults to
#                    the fields of +other+ that are neither our key field
#                    nor one of our fields.
#           :one2one when set, only the first value of each attached cell is
#                    kept for every source key.
#
# Rows are extended in place and the attached names appended to #fields.
# Returns self.
def attach_source_key(other, source, options = {})
  fields = Misc.process_options options, :fields
  one2one = Misc.process_options options, :one2one

  fields = other.fields - [key_field].concat(self.fields) if fields.nil?

  other = other.tsv(:persistence => :no_create) unless TSV === other
  field_positions = fields.collect{|field| other.identify_field field}
  field_names     = field_positions.collect{|pos| pos == :key ? other.key_field : other.fields[pos] }

  # Resolve +source+ to the actual field name and then to its position.
  corrected_source = all_fields.select{|f| Misc.match_fields(f, source) }.first
  source_pos = identify_field corrected_source

  other.with_unnamed do
    with_unnamed do
      through do |key, values|
        source_keys = values[source_pos]

        case
        when (source_keys.nil? or (Array === source_keys and source_keys.empty?))
          # No source key for this row: pad with one empty cell per new field.
          if type == :double
            self[key] = values.concat field_positions.collect{|v| []}
          else
            self[key] = values.concat [nil] * field_positions.length
          end
        when Array === source_keys
          # Several source keys: collect, per source key, one array of values
          # for each requested field.
          all_new_values = source_keys.collect do |source_key|
            positions = field_positions.collect do |pos|
              if pos == :key
                [source_key]
              else
                if other.include? source_key
                  # Every branch yields an Array so the results can be zipped
                  # and flattened uniformly below.
                  v = case other.type
                      when :flat
                        other[source_key] 
                      when :single
                        [other[source_key]]
                      when :double
                        other[source_key][pos]
                      when :list
                        [other[source_key][pos]]
                      end
                else
                  [nil]
                end
              end
            end

            positions.collect!{|v| v[0..0]} if one2one
            positions
          end

          # Regroup by field, then merge the per-source-key arrays and drop
          # the nils left by source keys missing from +other+.
          new = Misc.zip_fields(all_new_values).each{|field_entry|
            field_entry.flatten!
            field_entry.compact!
          }

          self[key] = values.concat new
        else
          # A single scalar source key: attach one scalar per field.
          source_key = source_keys
          all_new_values = field_positions.collect do |pos|
            if pos == :key
              source_key
            else
              if other.include? source_key
                v = other[source_key][pos]
                Array === v ? v.first : v
              else
                nil
              end
            end
          end

          self[key] = values.concat all_new_values
        end

      end
    end
  end

  self.fields = self.fields.concat field_names
  self
end

#change_key(format, options = {}, &block) ⇒ Object



51
52
53
54
# File 'lib/rbbt/tsv/change_id.rb', line 51

# Delegates to TSV.change_key with this TSV as the subject. The given options
# are first passed through Misc.add_defaults together with
# :identifiers => self.identifiers.
def change_key(format, options = {}, &block)
  with_identifiers = Misc.add_defaults(options, :identifiers => self.identifiers)
  TSV.change_key(self, format, with_identifiers, &block)
end

#chunked_values_at(keys, max = 5000) ⇒ Object

def _values_at(*keys)

keys.collect do |key|
  self[key]
end

end



378
379
380
381
382
383
384
# File 'lib/rbbt/tsv/accessor.rb', line 378

# Looks up +keys+ with #values_at in chunks of at most +max+ keys (as split by
# Misc.ordered_divide) and returns all the values in a single Array.
#
# If the values of the first chunk answer #annotate, it is called with the
# still-empty result array before anything is appended.
def chunked_values_at(keys, max = 5000)
  collected = []
  Misc.ordered_divide(keys, max).each do |chunk|
    chunk_values = values_at(*chunk)
    chunk_values.annotate(collected) if chunk_values.respond_to?(:annotate) && collected.empty?
    collected.concat(chunk_values)
  end
  collected
end

#close ⇒ Object



102
103
104
105
106
107
108
# File 'lib/rbbt/tsv/accessor.rb', line 102

# Best-effort close: defers to the parent implementation and, if that raises
# anything at all, swallows the error and returns self instead.
def close
  super
rescue Exception
  self
end

#collect ⇒ Object



335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
# File 'lib/rbbt/tsv/accessor.rb', line 335

# Maps over the entries through the parent #collect, skipping the internal
# ENTRY_KEYS (those positions become nil in the result).
#
# Values are loaded with the serializer module unless there is none or it is
# TSV::CleanSerializer. Unless the TSV is unnamed, Array rows of :double/:list
# tables go through #setup_array, :flat/:single values and every key go
# through #prepare_entity.
#
# Yields key and value when a block is given; otherwise each entry maps to
# the pair [key, value].
def collect
  loader = self.serializer_module
  super do |key, value|
    next if ENTRY_KEYS.include?(key)

    # TODO Update this to be more efficient
    skip_load = loader.nil? || TSV::CleanSerializer == loader
    value = loader.load(value) unless skip_load

    unless @unnamed
      unless fields.nil?
        case type
        when :double, :list
          setup_array(value, fields, key, entity_options) if Array === value
        when :flat, :single
          value = prepare_entity(value, fields.first, entity_options)
        end
      end
      key = prepare_entity(key, key_field, entity_options)
    end

    block_given? ? yield(key, value) : [key, value]
  end
end

#column(field, cast = nil) ⇒ Object



565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
# File 'lib/rbbt/tsv/manipulate.rb', line 565

# Returns the result of #slice(field) collapsed to one value per key: every
# row is replaced by its first element, optionally converted by sending it
# the +cast+ method name.
#
# The result is typed :flat when this TSV is :double or :flat, and :single
# otherwise.
def column(field, cast = nil)
  sliced = slice(field)

  sliced.with_unnamed do
    sliced.each do |key, row|
      cell = row.first
      cell = cell.send(cast) unless cast.nil?
      sliced[key] = cell
    end
  end

  sliced.type = [:double, :flat].include?(type) ? :flat : :single
  sliced
end

#column_values(field, options = {}) ⇒ Object



586
587
588
589
590
591
592
593
# File 'lib/rbbt/tsv/manipulate.rb', line 586

# Collects every value found under +field+ across all rows into one flat
# Array and passes it through #prepare_entity for that field.
#
# field   - the field whose values are gathered.
# options - forwarded to #prepare_entity. They used to be overwritten with an
#           empty Hash by an `options = {}` assignment in the call.
#
# Array cells are flattened; scalar cells contribute a single value.
def column_values(field, options = {})
  all = []
  through :key, field do |k,values|
    values = Array === values ? values.flatten : [values]
    all.concat values
  end
  prepare_entity(all, field, options)
end

#detach(file) ⇒ Object



292
293
294
295
296
297
# File 'lib/rbbt/tsv/attach.rb', line 292

# Calls #reorder with :key and the positions of those of our fields whose
# #fullname also appears among the field fullnames of +file+.
def detach(file)
  foreign_names = file.fields.collect { |field| field.fullname }
  shared = self.fields.each_with_index.select { |field, _| foreign_names.include?(field.fullname) }
  reorder :key, shared.collect { |_, position| position }
end

#dump_entry_value(value) ⇒ Object



154
155
156
157
# File 'lib/rbbt/tsv/accessor.rb', line 154

# Serializes an entry value for storage. Objects that do not answer
# #persistence_path get the value back untouched; otherwise nil (or an
# already serialized nil) becomes SERIALIZED_NIL and anything else is dumped
# with TSV_SERIALIZER.
def dump_entry_value(value)
  return value unless respond_to?(:persistence_path)
  return SERIALIZED_NIL if value.nil? || value == SERIALIZED_NIL

  TSV_SERIALIZER.dump(value)
end

#dumper_stream(keys = nil, no_options = false, unmerge = false) ⇒ Object



570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
# File 'lib/rbbt/tsv/accessor.rb', line 570

# Streams this TSV through TSV::Dumper.stream and returns whatever that call
# returns.
#
# keys       - when given, only these keys are dumped, in this order;
#              otherwise every entry is traversed with #each (unnamed).
# no_options - false or nil calls dumper.init without arguments, a Hash is
#              passed to dumper.init, any other value skips init.
# unmerge    - only honoured for :double tables. When set, each row is
#              written as one line per zipped value and the dumper is told
#              the type is :list. With :expand, single-valued cells are
#              repeated to match the longest cell of the row.
#
# Exceptions raised while dumping are logged with Log.exception and re-raised.
def dumper_stream(keys = nil, no_options = false, unmerge = false)
  unmerge = false unless type == :double

  dumper_options = self.options
  dumper_options[:type] = :list if unmerge

  TSV::Dumper.stream dumper_options do |dumper|
    case no_options
    when FalseClass, nil
      dumper.init
    when Hash
      dumper.init(no_options)
    end

    # Writes one entry, unmerging it into several lines when requested.
    write_entry = lambda do |key, value_list|
      unless unmerge
        dumper.add key, value_list
        next
      end

      longest = value_list.collect { |cell| cell.length }.max

      if unmerge == :expand and longest > 1
        value_list = value_list.collect do |cell|
          cell.length == 1 ? [cell.first] * longest : cell
        end
      end

      Misc.zip_fields(value_list).each do |row|
        dumper.add key, row
      end
    end

    begin
      if keys
        keys.each { |key| write_entry.call(key, self[key]) }
      else
        with_unnamed do
          each { |key, value_list| write_entry.call(key, value_list) }
        end
      end
    rescue Exception
      Log.exception $!
      raise $!
    end
    dumper.close
  end
end

#each ⇒ Object



307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
# File 'lib/rbbt/tsv/accessor.rb', line 307

# Iterates over the entries through the parent #each, skipping the internal
# ENTRY_KEYS and yielding each key with its prepared value.
#
# Non-nil values are loaded with the serializer module unless there is none
# or it is TSV::CleanSerializer. Unless the TSV is unnamed:
#   * Array rows of :double/:list tables are passed to #setup_array. The
#     check uses Array === value; the previous `Array == value` compared the
#     row against the Array class and was never true.
#   * :flat/:single values are replaced by the result of #prepare_entity,
#     which may be a different object than the one passed in.
#   * keys are passed through #prepare_entity.
def each
  fields = self.fields

  serializer_module = self.serializer_module
  super do |key, value|
    next if ENTRY_KEYS.include? key

    # TODO Update this to be more efficient
    value = serializer_module.load(value) unless value.nil? or serializer_module.nil? or TSV::CleanSerializer == serializer_module

    # Annotated with Entity and NamedArray
    if not @unnamed
      if not fields.nil? 
        case type
        when :double, :list
          setup_array value, fields, key, entity_options, entity_templates if Array === value
        when :flat, :single
          value = prepare_entity(value, fields.first, entity_options)
        end
      end
      key = prepare_entity(key, key_field, entity_options)
    end

    yield key, value if block_given?
    [key, value]
  end
end

#empty? ⇒ Boolean

Returns:

  • (Boolean)


212
213
214
# File 'lib/rbbt/tsv/accessor.rb', line 212

# True when #length reports zero entries.
def empty?
  length.zero?
end

#excel(filename, options = {}) ⇒ Object



258
259
260
261
262
263
264
# File 'lib/rbbt/tsv/excel.rb', line 258

# Dispatches to #xlsx when +filename+ matches /\.xlsx$/ and to #xls
# otherwise, forwarding +options+ unchanged.
def excel(filename, options ={})
  return xlsx(filename, options) if filename =~ /\.xlsx$/

  xls(filename, options)
end

#field_index(field) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/rbbt/tsv/field_index.rb', line 13

# Returns, memoized per field in @field_indices, an index from each value of
# +field+ to the sorted list of keys whose row holds that value.
#
# The index is produced inside Persist.persist_tsv (prefix "FieldIndex",
# directory TSV.field_index_dir). For :single and :list tables the cell is
# one value; for the other types every element of the cell is indexed.
def field_index(field)
  @field_indices ||= {}
  @field_indices[field] ||= Persist.persist_tsv(self, filename, {:field => field}, :prefix => "FieldIndex", :dir => TSV.field_index_dir, :persist => true, :serializer => :list, :engine => "BDB" ) do |data|
    keys_by_value = {}
    scalar_cells = [:single, :list].include?(type)

    through :key, [field] do |key, values|
      cell = values.first
      indexed_values = scalar_cells ? [cell] : cell
      indexed_values.each do |value|
        (keys_by_value[value] ||= []) << key
      end
    end

    keys_by_value.each do |value, keys|
      data[value] = keys.sort
    end

    data
  end
end

#field_index_select(matches) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/rbbt/tsv/field_index.rb', line 41

# Returns the keys satisfying every field => value(s) condition in +matches+,
# using #field_index for each field.
#
# For an Array of values the key lists of all the values found in the index
# are merged with Misc.merge_sorted_arrays; for a single value the indexed
# list (or an empty one) is used. The per-field lists are combined with
# Misc.intersect_sorted_arrays. Returns nil when +matches+ is empty.
def field_index_select(matches)
  matches.inject(nil) do |selected, (field, values)|
    field_idx = field_index(field)

    keys = if Array === values
             values.inject([]) do |merged, value|
               hits = field_idx[value]
               hits.nil? ? merged : Misc.merge_sorted_arrays(merged, hits)
             end
           else
             field_idx[values] || []
           end

    selected.nil? ? keys : Misc.intersect_sorted_arrays(selected, keys)
  end
end

#fields ⇒ Object



481
482
483
484
485
486
487
488
489
# File 'lib/rbbt/tsv/accessor.rb', line 481

# Returns the field names, read once from the "__tsv_hash_fields" entry
# (decoded with #load_entry_value) and memoized in @fields.
#
# The NamedArray branch that used to follow was unreachable, because its
# condition started with `true or`; it has been removed, so the plain
# @fields value is returned exactly as before.
def fields
  @fields ||= load_entry_value(self.send(:[], "__tsv_hash_fields", :entry_key))
end

#fields=(value) ⇒ Object



496
497
498
499
500
501
# File 'lib/rbbt/tsv/accessor.rb', line 496

# Stores +value+ as the field names: writes its dumped form under the
# "__tsv_hash_fields" entry (passing true as the extra argument of #[]=),
# caches it in @fields and clears the @named_fields cache.
def fields=(value)
  serialized = dump_entry_value(value)
  send(:[]=, "__tsv_hash_fields", serialized, true)
  @fields = value
  @named_fields = nil
end

#filter(filter_dir = nil) ⇒ Object



287
288
289
290
291
292
# File 'lib/rbbt/tsv/filter.rb', line 287

# Extends this object with Filtered, sets its filter directory to
# +filter_dir+, resets its filters to an empty list and returns self.
def filter(filter_dir = nil)
  extend Filtered
  tap do |filtered|
    filtered.filter_dir = filter_dir
    filtered.filters = []
  end
end

#head_str(times = 10) ⇒ Object



686
687
688
689
690
691
692
693
694
# File 'lib/rbbt/tsv/accessor.rb', line 686

# Returns the first +times+ lines produced by #dumper_stream as one String,
# stopping early when the stream reaches its end.
def head_str(times=10)
  stream = dumper_stream
  times.times.each_with_object("") do |_, text|
    break text if stream.eof?
    text << stream.gets
  end
end

#head_tsv(times = 10) ⇒ Object Also known as: head



696
697
698
699
700
701
702
703
704
705
# File 'lib/rbbt/tsv/accessor.rb', line 696

# Returns a new table, created with #annotate on an empty Hash, holding the
# first +times+ entries yielded by #each.
def head_tsv(times = 10)
  subset = self.annotate({})
  taken = 0
  self.each do |key, value|
    break if taken == times
    subset[key] = value
    taken += 1
  end
  subset
end

#identifier_files ⇒ Object



511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
# File 'lib/rbbt/tsv/accessor.rb', line 511

# Returns the identifier sources of this TSV as an Array.
#
# When #identifiers is set: a TSV is wrapped in an Array; an Array is
# returned as is if it is empty or starts with a TSV, otherwise its elements
# are turned into Path objects; any other value is wrapped as a single Path.
# Without identifiers, the answer comes from #identifier_files of #filename
# (set up as a Path when it is not one already), or is an empty Array when
# there is no filename.
def identifier_files
  ids = identifiers

  if ids
    return [ids] if TSV === ids

    if Array === ids
      return ids if TSV === ids.first or ids.empty?
      return ids.collect{|f| Path === f ? f : Path.setup(f)}
    end

    return [ Path === ids ? ids : Path.setup(ids) ]
  end

  return filename.identifier_files if Path === filename
  return Path.setup(filename.dup).identifier_files if filename

  []
end

#identify_field(field) ⇒ Object



224
225
226
# File 'lib/rbbt/tsv/util.rb', line 224

# Resolves +field+ against this TSV's own key field and fields by delegating
# to TSV.identify_field.
def identify_field(field)
  own_key, own_fields = key_field, fields
  TSV.identify_field(own_key, own_fields, field)
end

#index(options = {}) ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/rbbt/tsv/index.rb', line 12

# Builds an index from the values found in this TSV to the values of a target
# field, produced inside Persist.persist_tsv.
#
# options - :target  field the index points to (default :key).
#           :fields  fields whose values become index keys (default nil).
#           :type    type of the index: :single keeps one target per value,
#                    :flat a list of targets, :double a list wrapped in an
#                    Array (default :single).
#           :order   when true the index is first built in memory so that
#                    targets reached through earlier field positions come
#                    first (default false).
#           Keys pulled out by Misc.pull_keys(options, :persist) are handed
#           to Persist.persist_tsv as persistence options.
#
# The block returns the result of TSV.setup on the index object.
def index(options = {})
  options = Misc.add_defaults options, 
    :target => :key, :fields => nil, :type => :single, :order => false

  persist_options = Misc.pull_keys options, :persist
  persist_options[:prefix] ||= "Index[#{options[:target] || :key}]"

  Log.debug "Index: #{ filename } - #{Misc.fingerprint options}"
  Persist.persist_tsv self, filename, options, persist_options do |new|
    with_unnamed do
      target, fields, index_type, order = Misc.process_options options, :target, :fields, :type, :order

      # NOTE(review): :type looks like a placeholder serializer meaning "use
      # the index type" - confirm against Persist.persist_tsv.
      new.serializer = index_type if new.respond_to? :serializer and new.serializer == :type

      if order

        # Maybe best to do the stuff in memory first instead of the original
        # object, which could be persisted
        save = new
        new = {} 

        new_key_field, new_fields = through target, fields, true do |key, values|
          next if key.empty? 
          # Put the key itself in front of the row so it is indexed too,
          # shaped like the other cells of this TSV's type.
          case type
          when :single
            values = [values]
            values.unshift key
          when :double
            values = values.dup
            values.unshift [key]
          when :list, :flat
            values = values.dup
            values.unshift key
          end

          # For every value, keep one "|"-joined string of keys per position
          # of the row in which the value was seen.
          values.each_with_index do |list, i|
            list = [list] unless type == :double

            list.uniq.each do |value|
              if new.include? value
                new_value = new[value]
              else
                new_value = []
              end

              if new_value[i].nil?
                new_value[i] =  key
              else
                new_value[i] += "|" <<  key 
              end
              new[value] = new_value
            end
          end
        end

        # Update original object
        # Positions are visited in order, so keys found through earlier
        # positions come first once the strings are split and flattened.
        new.each do |key, values|
          case
          when index_type == :double
            save[key] = [values.compact.collect{|v| v.split "|"}.flatten.uniq]
          when index_type == :flat
            save[key] = values.compact.collect{|v| v.split "|"}.flatten.uniq
          when index_type == :single
            save[key] = values.compact.collect{|v| v.split "|"}.flatten.first
          end
        end

        new = save
      else
        new_key_field, new_fields = through target, fields, true do |key, values|
          # Normalize the row to a flat list of values.
          case
          when type == :single
            values = [values]
          when type == :double
            values = values.flatten
          else
            values = values.dup
          end

          # The key is indexed as one more value.
          values.unshift key

          values.uniq.each do |value|
            case index_type
            when :double
              if not new.include? value
                new[value] = [[key]]
              else
                # Read, modify and write back so the change is stored even
                # if +new+ does not hand out the stored object itself.
                current = new[value]
                current[0] << key
                new[value] = current
              end
            when :flat
              if not new.include? value
                new[value] = [key]
              else
                current = new[value]
                current << key
                new[value] = current
              end

            else
              # Any other index type keeps only the first key seen.
              new[value] = key unless new.include? value
            end
          end
        end
      end

      new_fields = [] if new_fields.nil?
      # The index is keyed by the traversed fields (joined with ", ") and
      # its single field is the key field of the traversal.
      TSV.setup(new, :type => index_type, :filename => filename, :fields => [new_key_field], :key_field => new_fields * ", ", :namespace => namespace)
    end
  end
end

#info ⇒ Object



13
14
15
# File 'lib/rbbt/tsv/accessor.rb', line 13

# Returns a Hash describing this TSV (key field, fields, namespace, entity
# options, type, filename, identifiers, unnamed flag and cast), leaving out
# every attribute whose value is nil.
def info
  attributes = {
    :key_field => key_field,
    :fields => fields,
    :namespace => namespace,
    :entity_options => entity_options,
    :type => type,
    :filename => filename,
    :identifiers => identifiers,
    :unnamed => unnamed,
    :cast => cast
  }
  attributes.reject { |_name, value| value.nil? }
end

#keys ⇒ Object



284
285
286
287
288
289
# File 'lib/rbbt/tsv/accessor.rb', line 284

# Returns the parent's keys without the internal ENTRY_KEYS. Unless the TSV
# is unnamed or has no key field, the list is passed through #prepare_entity
# for the key field, with :dup_array => true added to the entity options.
def keys
  data_keys = super - ENTRY_KEYS.to_a

  if @unnamed or key_field.nil?
    data_keys
  else
    prepare_entity(data_keys, key_field, entity_options.merge(:dup_array => true))
  end
end

#length ⇒ Object



368
369
370
# File 'lib/rbbt/tsv/accessor.rb', line 368

# Number of entries, counted as the size of the list returned by #keys.
def length
  keys.size
end

#load_entry_value(value) ⇒ Object



149
150
151
152
# File 'lib/rbbt/tsv/accessor.rb', line 149

# Decodes a stored entry value. Objects that do not answer #persistence_path
# get the value back untouched; otherwise nil and SERIALIZED_NIL decode to
# nil and anything else is loaded with TSV_SERIALIZER.
def load_entry_value(value)
  return value unless respond_to?(:persistence_path)
  return nil if value.nil? || value == SERIALIZED_NIL

  TSV_SERIALIZER.load(value)
end

#marshal_dump ⇒ Object



2
3
4
5
6
7
8
# File 'lib/rbbt/tsv/marshal.rb', line 2

# Marshal hook. Objects that are a Persist::TCAdapter (when that constant is
# defined) defer to the parent implementation; everything else is dumped as
# the pair [info, to_hash].
def marshal_dump
  return [info, to_hash] unless defined?(Persist::TCAdapter) && Persist::TCAdapter === self

  super
end

#matrix_melt(*args) ⇒ Object



34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/rbbt/tsv/matrix.rb', line 34

# Reads this TSV with TSV.read_matrix (forwarding +args+), indexes the result
# with Association.index (not persisted, :recycle => true) and adds two
# fields to that index, named after the two halves of its "~"-separated key
# field and filled with the matching halves of each key. Returns the index.
def matrix_melt(*args)
  matrix = TSV.read_matrix(self, *args)

  melted = Association.index(matrix, :persist => false, :recycle => true)
  source_field, _, target_field = melted.key_field.partition("~")

  melted.add_field(source_field) { |pair, _values| pair.partition("~").first }
  melted.add_field(target_field) { |pair, _values| pair.partition("~").last }
  melted
end

#melt(header_field = nil, *info_fields, &block) ⇒ Object



29
30
31
32
# File 'lib/rbbt/tsv/melt.rb', line 29

# Delegates to TSV.melt with this TSV, its key field, +header_field+, its
# fields and the chosen info fields (all fields when none are given).
def melt(header_field = nil, *info_fields, &block)
  selected = (info_fields.nil? || info_fields.empty?) ? fields : info_fields
  TSV.melt(self, key_field, header_field, fields, *selected, &block)
end

#merge_different_fields(other, options = {}) ⇒ Object



178
179
180
181
182
183
184
185
186
# File 'lib/rbbt/tsv/attach.rb', line 178

# Merges this TSV with +other+ into a temporary file using
# TSV.merge_different_fields and returns the TSV opened from that file.
#
# The result gets our key field when we have one, and the concatenation of
# both field lists when neither list is nil.
def merge_different_fields(other, options = {})
  TmpFile.with_file do |output|
    TSV.merge_different_fields(self, other, output, options)

    TSV.open(output, options).tap do |merged|
      merged.key_field = self.key_field unless self.key_field.nil?
      merged.fields = self.fields + other.fields unless self.fields.nil? or other.fields.nil?
    end
  end
end

#merge_zip(other) ⇒ Object



188
189
190
191
192
193
# File 'lib/rbbt/tsv/attach.rb', line 188

# Feeds every key/value pair of +other+ to #zip_new and returns self.
def merge_zip(other)
  other.each { |key, values| zip_new(key, values) }
  self
end

#namespace=(value) ⇒ Object



491
492
493
494
# File 'lib/rbbt/tsv/accessor.rb', line 491

# Stores +value+ as the namespace: writes its dumped form under the
# "__tsv_hash_namespace" entry (passing true as the extra argument of #[]=)
# and caches it in @namespace.
def namespace=(value)
  serialized = dump_entry_value(value)
  send(:[]=, "__tsv_hash_namespace", serialized, true)
  @namespace = value
end

#options ⇒ Object



533
534
535
536
537
538
539
# File 'lib/rbbt/tsv/accessor.rb', line 533

# Returns a Hash with one symbol key per name in ENTRIES, each mapped to the
# result of calling the method of that name, set up as an IndiferentHash.
def options
  collected = ENTRIES.each_with_object({}) do |entry, acc|
    acc[entry.to_sym] = send(entry)
  end
  IndiferentHash.setup collected
end

#page(pnum, psize, field = nil, just_keys = false, reverse = false, &block) ⇒ Object

Starts in page 1



466
467
468
469
470
471
472
473
474
475
476
477
478
# File 'lib/rbbt/tsv/accessor.rb', line 466

# Returns page +pnum+ (counting from 1) of +psize+ entries, ordered with
# #sort_by on +field+ (the key when no field is given or it is "key").
#
# reverse   - reverses the sorted keys before cutting the page.
# just_keys - when true the page's keys are returned; otherwise the result
#             of #select restricted to those keys.
def page(pnum, psize, field = nil, just_keys = false, reverse = false, &block)
  first = psize * (pnum - 1)
  last = psize * pnum - 1
  field = :key if field == "key"

  sorted = sort_by(field || :key, true, &block)
  sorted.reverse! if reverse

  page_keys = sorted[first..last]
  just_keys ? page_keys : select(:key => page_keys)
end

#pos_index(pos_field = nil, options = {}) ⇒ Object



140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# File 'lib/rbbt/tsv/index.rb', line 140

# Builds, through Persist.persist (type :fwt), a FixWidthTable of points
# taken from +pos_field+ ("Position" by default).
#
# Every key contributes one [key, position.to_i] point per value of the
# field; the table is created with the length of the longest key. Options
# default to no persistence, and those pulled by Misc.pull_keys(options,
# :persist) are handed to Persist.persist.
def pos_index(pos_field = nil, options = {})
  pos_field ||= "Position"

  options = Misc.add_defaults options,
    :persist => false, :persist_file => nil, :persist_update => false 

  persist_options = Misc.pull_keys options, :persist
  persist_options[:prefix] ||= "PosIndex[#{pos_field}]"

  Persist.persist(filename || self.object_id.to_s, :fwt, persist_options) do 
    widest_key = 0
    points = []
    with_unnamed do
      with_monitor :desc => "Creating Index Data", :step => 10000 do
        through :key, pos_field do |key, values|
          widest_key = [widest_key, key.length].max

          pos = values.first
          positions = Array === pos ? pos : [pos]
          positions.each do |position|
            points << [key, position.to_i]
          end
        end
      end
    end

    table = FixWidthTable.get(:memory, widest_key, false)
    table.add_point points
    table.read
    table
  end
end

#ppthrough(num_procs = 7, new_key_field = nil, new_fields = nil, uniq = false, zipped = false, &block) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/rbbt/tsv/parallel/through.rb', line 23

# Traverses the TSV with #through while handing every traversed entry to a
# RbbtProcessQueue of +num_procs+ workers, each of which calls +block+ with
# the key and values.
#
# A callback registered beforehand with #ppthrough_callback is installed on
# the queue and then forgotten. new_key_field, new_fields, uniq and zipped
# are forwarded to #through. The queue is always cleaned, even on error.
#
# Returns whatever #through returned.
def ppthrough(num_procs = 7, new_key_field = nil, new_fields = nil, uniq = false, zipped = false, &block)

  q = RbbtProcessQueue.new num_procs

  q.callback &@ppthrough_callback
  @ppthrough_callback = nil

  q.init do |k,v|
    block.call k,v
  end

  begin
    res = through(new_key_field, new_fields, uniq, zipped) do |*p|
      # Enqueue the traversed entry; this used to enqueue the queue itself.
      q.process p
    end
    q.join
  ensure
    q.clean
  end

  res
end

#ppthrough_callback(&block) ⇒ Object



19
20
21
# File 'lib/rbbt/tsv/parallel/through.rb', line 19

# Remembers the given block in @ppthrough_callback, where #ppthrough picks it
# up as the queue callback.
def ppthrough_callback(&callback)
  @ppthrough_callback = callback
end

#prepare_entity(entity, field, options = {}) ⇒ Object



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/rbbt/tsv/accessor.rb', line 47

# Annotates +entity+ (a String or an Array) with the entity template of
# +field+ and returns it. nil is returned as is, and nothing is done when the
# Entity constant is not defined.
#
# entity  - value, or list of values, to annotate.
# field   - field name used to look up or build the template.
# options - forwarded to Misc.prepare_entity when a template has to be
#           built. The :dup_array entry is always removed first; when it is
#           set and +entity+ is an Array, a copy is annotated instead of the
#           caller's array (the flag used to be consumed without any effect).
#
# Templates are cached per field in #entity_templates; a cached nil records
# that the field has no template. Frozen entities are duplicated before
# being annotated, and annotated Arrays are extended with AnnotatedArray.
def prepare_entity(entity, field, options = {})
  return entity if entity.nil?
  return entity unless defined? Entity

  dup_array = options.delete :dup_array
  entity = entity.dup if dup_array and Array === entity

  template = entity_templates[field]
  unless template and template.respond_to?(:annotate)
    # The field is already known not to provide a usable template.
    return entity if entity_templates.include? field

    template = Misc.prepare_entity("TEMPLATE", field, options)
    unless template.respond_to?(:annotate)
      entity_templates[field] = nil
      return entity
    end
    entity_templates[field] = template
  end

  if String === entity or Array === entity
    entity = entity.dup if entity.frozen? 
    template.annotate entity
    entity.extend AnnotatedArray if Array === entity
  end
  entity
end

#prepare_value(key, value) ⇒ Object

{{{ GETTERS AND SETTERS



218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
# File 'lib/rbbt/tsv/accessor.rb', line 218

# Prepares a stored value for the caller.
#
# The value is first loaded with @serializer_module, unless there is none or
# it is TSV::CleanSerializer. When the TSV is unnamed or has no fields it is
# returned as is. Otherwise rows of :double/:list tables are passed to
# #setup_array, and values of :flat/:single tables are duplicated when frozen
# (ignoring failures to do so) and passed through #prepare_entity.
def prepare_value(key, value)
  if @serializer_module and not TSV::CleanSerializer == @serializer_module
    value = @serializer_module.load(value)
  end

  return value if @unnamed or fields.nil?

  kind = type
  if kind == :double or kind == :list
    setup_array value, fields, key, entity_options, entity_templates
  elsif kind == :flat or kind == :single
    if value.frozen?
      begin
        value = value.dup
      rescue
      end
    end

    value = prepare_entity(value, fields.first, entity_options)
  end
  value
end

#process(field, &block) ⇒ Object



612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
# File 'lib/rbbt/tsv/manipulate.rb', line 612

# Rewrites the values of +field+ in place with the result of the block and
# returns self.
#
# The block receives the field value and, depending on its arity, also the
# key (arity 2) and the whole row (arity 3); any other arity raises. For
# :single and :flat tables the whole value is the field value and is
# replaced by the result. For the other types nil rows are skipped and the
# cell is updated with #replace when it is not frozen and both the old and
# new values are Strings, or both Arrays; otherwise it is reassigned.
def process(field, &block)
  field_pos = identify_field field

  through do |key, values|
    whole_row = [:single, :flat].include?(type)

    if whole_row
      field_values = values
    else
      next if values.nil?
      field_values = values[field_pos]
    end

    new_values = case block.arity
                 when 1
                   yield(field_values)
                 when 2
                   yield(field_values, key)
                 when 3
                   yield(field_values, key, values)
                 else
                   raise "Unexpected arity in block, must be 1, 2 or 3: #{block.arity}"
                 end

    if whole_row
      self[key] = new_values
    else
      cell = values[field_pos]
      same_kind = (String === cell && String === new_values) ||
        (Array === cell && Array === new_values)

      if ! cell.frozen? && same_kind
        cell.replace new_values
      else
        values[field_pos] = new_values
      end
      self[key] = values
    end
  end

  self
end

#process_key(&block) ⇒ Object



596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
# File 'lib/rbbt/tsv/manipulate.rb', line 596

# Returns a new table (created with #annotate on an empty Hash) in which
# every key has been replaced by the result of the block; values are kept.
#
# The block receives the key and, when its arity is 2, also the values.
#
# Raises RuntimeError for any other block arity. The message now lists only
# the arities that are actually accepted (it used to mention 3 as well).
def process_key(&block)
  new = annotate({})
  through do |key, values|
    key = case 
          when block.arity == 1
            yield(key)
          when block.arity == 2
            yield(key, values)
          else
            raise "Unexpected arity in block, must be 1 or 2: #{block.arity}"
          end
    new[key] = values
  end
  new
end

#pthrough(num_threads = 10, new_key_field = nil, new_fields = nil, uniq = false, zipped = false, &block) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# File 'lib/rbbt/tsv/parallel/through.rb', line 3

# Traverses the TSV with #through while handing every traversed entry to a
# RbbtThreadQueue of +num_threads+ workers initialised with +block+.
#
# new_key_field, new_fields, uniq and zipped are forwarded to #through. The
# queue is always cleaned, even on error.
#
# Returns whatever #through returned, as #ppthrough does; the result used to
# be computed and then dropped.
def pthrough(num_threads = 10, new_key_field = nil, new_fields = nil, uniq = false, zipped = false, &block)
  q = RbbtThreadQueue.new num_threads

  q.init(true, &block)

  begin
    res = through(new_key_field, new_fields, uniq, zipped) do |*p|
      q.process p
    end
    q.join
  ensure
    q.clean
  end

  res
end

#R(script, source = nil, open_options = {}) ⇒ Object



173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# File 'lib/rbbt/util/R.rb', line 173

# Runs an R script over this table and reads the result back.
#
# The table is written (via +to_s+) to a temporary file, loaded in R as
# +data+ with +rbbt.tsv+, the script is run, and +data+ is written back to
# the same file unless the script set it to NULL.
#
# @param script [String] R code to run after +data+ is loaded
# @param source name or list of names turned into R +source(...)+ calls that
#   are prepended to the script; a name is replaced by R::LIB_DIR["name.R"]
#   when that entry exists. May be omitted: a Hash given in this position is
#   taken as +open_options+.
# @param open_options [Hash] options used when re-opening the result; the
#   keys :source, :raw, :monitor, :method, :debug, :key and :ignore_output
#   are read here, and keys selected by Misc.pull_keys(open_options, :R)
#   become the R run options.
# @return the raw file content when :raw is set, nil when :ignore_output is
#   set, otherwise the table opened from the temporary file
def R(script, source = nil, open_options = {})
  # Support the R(script, options) call shape.
  open_options, source = source, nil if Hash === source

  source ||= Misc.process_options open_options, :source
  source = [source] unless Array === source 

  require_sources = source.collect{|src|
    lib_file = R::LIB_DIR["#{src.to_s}.R"]
    src = lib_file if lib_file.exists?
    "source('#{src}')"
  } * ";\n" if Array === source and source.any?

  script = require_sources + "\n\n" + script if require_sources

  r_options = Misc.pull_keys open_options, :R

  r_options[:monitor] = open_options[:monitor] if open_options.include?(:monitor)
  r_options[:method] = open_options[:method] if open_options.include?(:method)
  r_options[:debug] = open_options[:debug] if open_options.include?(:debug)

  # Debugging forces a monitored shell run and keeps the temporary file.
  r_options[:debug] = true if r_options[:method] == :debug
  if r_options.delete :debug
    r_options[:monitor] = true
    r_options[:method] = :shell
    erase = false
  else
    erase = true
  end

  # Extra arguments appended verbatim to the rbbt.tsv(...) call in R.
  tsv_R_option_str = r_options.delete :open
  tsv_R_option_str = ", "  + tsv_R_option_str if String === tsv_R_option_str and not tsv_R_option_str.empty?

  raw = open_options.delete :raw
  TmpFile.with_file nil, erase do |f|
    Open.write(f, self.to_s)

    script = <<-EOF
## Loading tsv into data
data = rbbt.tsv('#{f}'#{tsv_R_option_str});

#{script.strip}

## Resaving data
if (! is.null(data)){ rbbt.tsv.write('#{f}', data); }
NULL
    EOF

    case r_options.delete :method
    when :eval
      R.eval_run script
    else 
      R.run script, r_options
    end

    open_options = Misc.add_defaults open_options, :type => :list
    if raw
      Open.read(f)
    elsif open_options[:ignore_output]
      # Nothing is read back, so there is no table to configure; previously
      # the nil result was dereferenced below and raised NoMethodError.
      nil
    else
      tsv = TSV.open(f, open_options)
      tsv.key_field = open_options[:key] if open_options.include? :key
      tsv.namespace ||= self.namespace if self.namespace
      tsv
    end
  end
end

#R_console(pre_script = nil) ⇒ Object



257
258
259
260
261
262
263
264
265
266
267
# File 'lib/rbbt/util/R.rb', line 257

# Opens an R console with this table written to a temporary file.
#
# The script passed to R.console defines +data_file+ as the path of that
# file. When +pre_script+ is given it is logged at debug level and appended
# to the script between the Pre-script fold markers.
#
# @param pre_script [String, nil] optional R code appended after the
#   +data_file+ assignment
# @return the value returned by the outer TmpFile.with_file call
def R_console(pre_script = nil)
  TmpFile.with_file do |f|
    Log.debug{"R Console:\n" << pre_script } if pre_script
    TmpFile.with_file(pre_script) do |_script_file|
      Open.write(f, self.to_s)
      script = "data_file = '#{f}';\n"
      # pre_script defaults to nil; appending nil to a String raises
      # TypeError, so the section is only added when there is something to add.
      script << "\n#\{{{Pre-script:\n\n" << pre_script << "\n#}}}Pre-script\n\n" if pre_script
      R.console(script)
    end
  end
end

#R_interactive(script = nil, source = []) ⇒ Object



238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
# File 'lib/rbbt/util/R.rb', line 238

# Starts an interactive R session preloaded with this table.
#
# The table is written (via +to_s+) to a temporary file; the script handed
# to R.interactive defines +data_file+, loads it into +data+ with
# +rbbt.tsv+, and then includes the caller's +script+ (empty when nil).
#
# @param script [String, nil] R code placed after the loading preamble
# @param source accepted but not used by this method
# @return the value returned by TmpFile.with_file
def R_interactive(script = nil, source = [])
  TmpFile.with_file do |tmp_path|
    Open.write(tmp_path, to_s)

    if script
      Log.debug { "R Interactive:\n" << script }
    end

    script = <<-EOF
# Loading data
data_file = '#{tmp_path}'
data = rbbt.tsv(data_file)

# Script
#{script}
    EOF

    R.interactive(script)
  end
end

#range_index(start_field = nil, end_field = nil, options = {}) ⇒ Object



210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
# File 'lib/rbbt/tsv/index.rb', line 210

# Builds a range index from two position fields of this table.
#
# For every entry a (key, [start, end]) pair is collected; when the start
# value is an Array, one pair is collected per zipped start/end element.
# Positions are converted with +to_i+. The pairs are loaded into a
# FixWidthTable sized for the longest key.
#
# @param start_field field with range starts (defaults to "Start")
# @param end_field field with range ends (defaults to "End")
# @param options [Hash] defaults :persist, :persist_file and :persist_update
#   are added; the keys selected by Misc.pull_keys(options, :persist) are
#   passed to Persist.persist
# @return the value returned by Persist.persist
def range_index(start_field = nil, end_field = nil, options = {})
  start_field ||= "Start"
  end_field ||= "End"

  options = Misc.add_defaults options,
    :persist => false, :persist_file => nil, :persist_update => false 

  persist_options = Misc.pull_keys options, :persist
  persist_options[:prefix] ||= "RangeIndex[#{start_field}-#{end_field}]"

  Persist.persist(filename || self.object_id.to_s, :fwt, persist_options) do 
    widest_key = 0
    ranges = []
    with_unnamed do
      with_monitor :desc => "Creating Index Data", :step => 10000 do
        through :key, [start_field, end_field] do |key, values|
          key_length = key.length
          widest_key = key_length if key_length > widest_key

          starts, ends = values
          if Array === starts
            starts.zip(ends).each { |s, e| ranges << [key, [s.to_i, e.to_i]] }
          else
            ranges << [key, [starts.to_i, ends.to_i]]
          end
        end
      end
    end

    table = FixWidthTable.get(:memory, widest_key, true)
    table.add_range ranges
    table.read
    table
  end
end

#read(force = false) ⇒ Object



110
111
112
113
114
115
116
117
118
# File 'lib/rbbt/tsv/accessor.rb', line 110

# Calls the inherited +read+ with the same arguments. If that raises
# anything, the exception is logged with Log.exception, the table is flagged
# as not writable, and +self+ is returned instead.
#
# @param force forwarded to the inherited implementation
# @return the inherited method's result, or +self+ after a failure
def read(force = false)
  super
rescue Exception => error
  Log.exception error
  @writable = false
  self
end

#remove_duplicates(pivot = 0) ⇒ Object



835
836
837
838
839
840
841
# File 'lib/rbbt/tsv/accessor.rb', line 835

# Returns an annotated copy in which, for every entry, the values are
# zipped with Misc.zip_fields, repeated zipped rows are dropped, and the
# remainder is zipped back.
#
# @param pivot accepted but not used by this method
# @return the annotated hash with de-duplicated values
def remove_duplicates(pivot = 0)
  deduplicated = annotate({})
  through do |key, columns|
    rows = Misc.zip_fields(columns)
    deduplicated[key] = Misc.zip_fields(rows.uniq)
  end
  deduplicated
end

#rename_field(field, new) ⇒ Object



228
229
230
231
# File 'lib/rbbt/tsv/util.rb', line 228

# Replaces every field name equal to +field+ with +new+ and assigns the
# resulting list back through +fields=+.
#
# @param field the current field name
# @param new the replacement name
# @return [self]
def rename_field(field, new)
  renamed = fields.map do |f|
    if f == field
      new
    else
      f
    end
  end
  self.fields = renamed
  self
end

#reorder(new_key_field = nil, new_fields = nil, options = {}) ⇒ Object



251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
# File 'lib/rbbt/tsv/manipulate.rb', line 251

# Builds a table keyed by +new_key_field+ that holds +new_fields+.
#
# The :zipped, :uniq and :merge entries are taken from +options+ with
# Misc.process_options; the keys selected by Misc.pull_keys(options,
# :persist) are passed to Persist.persist_tsv, which supplies the +data+
# container that the block fills by traversing this table with +through+.
#
# The block ends by annotating +data+ from this table, setting its key
# field and fields to the names returned by +through+, copying the entity
# options and the matching entity templates, and setting its type.
#
# @return the value returned by Persist.persist_tsv
def reorder(new_key_field = nil, new_fields = nil, options = {}) 
  zipped, uniq, merge = Misc.process_options options, :zipped, :uniq, :merge

  persist_options = Misc.pull_keys options, :persist
  persist_options[:prefix] = "Reorder"

  Persist.persist_tsv self, self.filename, self.options.merge({:key_field => new_key_field, :fields => new_fields}), persist_options do |data|
    # NOTE(review): this compares the serializer with the literal symbol
    # :type before replacing it with this table's type -- confirm that :type
    # is the placeholder value used by the persistence layer.
    data.serializer = type if data.respond_to? :serializer and data.serializer == :type

    new_key_field_name, new_field_names = nil, nil
    with_unnamed do
      # Zipped traversals and non-double/non-flat tables share one code path.
      if zipped or (type != :double and type != :flat)
        new_key_field_name, new_field_names = through new_key_field, new_fields, uniq, zipped do |key, value|
          if merge 
            if data[key]
              # Append each new value to the list already stored at that position.
              new_values = data[key].dup
              value.each_with_index do |v,i|
                new_values[i] += [v]
              end
              data[key] = new_values if Array === value
            else
              # First occurrence: wrap every value in its own list.
              data[key] = value.collect{|v| [v]} if Array === value
            end
          else
            # Without merging, a later entry replaces an earlier one; only
            # Array values are stored.
            data[key] = value.clone if Array === value
          end
        end
      else
        case type 
        when :double
          new_key_field_name, new_field_names = through new_key_field, new_fields, uniq, zipped do |keys, value|
            keys = [keys] unless Array === keys
            keys.each do |key|
              if data[key] 
                # Concatenate position by position with what is already stored.
                current = data[key].dup
                value.each_with_index do |v, i|
                  if current[i]
                    current[i] += v if v
                  else
                    current[i] = v || []
                  end
                end
                data[key] = current 
              else
                # Store copies so later concatenation does not touch the
                # arrays yielded by the traversal.
                data[key] = value.collect{|v| v.nil? ? nil : v.dup}
              end
            end
          end
        when :flat
          new_key_field_name, new_field_names = through new_key_field, new_fields, uniq, zipped do |key, value|
            # Accumulate all values seen for the key into one list.
            data[key] ||= []
            data[key] += value
          end
        else
          raise "Unkown type #{type}"
        end
      end
    end

    #if real_data and real_data.respond_to? :persistence_path
    #  real_data.serializer = type if real_data.respond_to? :serializer
    #  real_data.merge!(data)
    #  data = real_data
    #end

    data.extend TSV unless TSV === data
    self.annotate(data)
    data.entity_options = self.entity_options
    data.entity_templates = self.entity_templates

    data.key_field = new_key_field_name
    data.fields = new_field_names
    data.fields.each do |field|
      data.entity_templates[field] = entity_templates[field] if entity_templates.include? field
    end
    # A zipped result is :double when merged and :list otherwise; a
    # non-zipped result keeps this table's type.
    data.type = zipped ? (merge ? :double : :list) : type

    data
  end
end

#reset_filters ⇒ Object



294
295
296
297
298
299
300
301
302
303
# File 'lib/rbbt/tsv/filter.rb', line 294

# Resets the filters of this table.
#
# With no filter directory set (nil or empty), +reset+ is called on every
# element of @filters when it is an Array, and nil is returned. Otherwise
# every '*.filter' file inside the filter directory is removed and the list
# of matched paths is returned.
def reset_filters
  dir_configured = !(@filter_dir.nil? || @filter_dir.empty?)

  if dir_configured
    pattern = File.join(@filter_dir, '*.filter')
    Dir.glob(pattern).each { |path| FileUtils.rm path }
  else
    @filters.each { |filter| filter.reset } if Array === @filters
    nil
  end
end

#select(method = nil, invert = false, &block) ⇒ Object



369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
# File 'lib/rbbt/tsv/manipulate.rb', line 369

# Returns a new annotated table with the entries that satisfy a criterion.
#
# The criterion is taken from +method+ and/or the block:
#
# * no +method+, with a block: the block receives (key, values).
# * Array: the key or any of the values is in the array.
# * Regexp: the key or any of the values matches.
# * String or Symbol with a block: the block receives the key (when
#   +method+ is the key field or :key), the value (:single and :flat
#   tables), or the value of the field named by +method+. A two-argument
#   block receives the key first.
# * String or Symbol without a block: the key or any value equals +method+.
# * Hash {field => criterion}: only the first pair is used. The criterion
#   may be an Array (membership), a Regexp, a "name:..." String (matched
#   against the +name+ of the values; "/.../" makes it a regular
#   expression), a comparison String such as ">3" or "<=0.5" (applied to
#   the first value converted with +to_f+), any other String (equality),
#   a Numeric (minimum number of values) or a Proc (called on each value).
#
# @param method the criterion, see above
# @param invert [Boolean] keep the entries that do NOT satisfy the criterion
# @return the new table
# @raise [RuntimeError] when a one-argument block is used with a field that
#   cannot be identified
def select(method = nil, invert = false, &block)
  new = TSV.setup({}, :key_field => key_field, :fields => fields, :type => type, :filename => filename, :identifiers => identifiers)

  self.annotate(new)
  
  case
  when (method.nil? and block_given?)
    through do |key, values|
      new[key] = values if invert ^ (yield key, values)
    end
  when Array === method
    method = Set.new method
    with_unnamed do
      case type
      when :single
        through do |key, value|
          new[key] = value if invert ^ (method.include? key or method.include? value)
        end
      when :list, :flat
        through do |key, values|
          new[key] = values if invert ^ (method.include? key or (method & values).length > 0)
        end
      else
        through do |key, values|
          new[key] = values if invert ^ (method.include? key or (method & values.flatten).length > 0)
        end
      end
    end
  when Regexp === method
    with_unnamed do
      through do |key, values|
        new[key] = values if invert ^ ([key,values].flatten.select{|v| v =~ method}.any?)
      end
    end
  when (String === method || Symbol === method)
    if block_given?
      case 
      when block.arity == 1
        with_unnamed do
          case
          when (method == key_field or method == :key)
            through do |key, values|
              new[key] = values if invert ^ (yield(key))
            end
          when (type == :single or type == :flat)
            through do |key, value|
              new[key] = value if invert ^ (yield(value))
            end
          else
            pos = identify_field method
            raise "Field #{ method } not identified. Available: #{ fields * ", " }" if pos.nil?

            through do |key, values|
              new[key] = values if invert ^ (yield(values[pos]))
            end
          end
        end
      when block.arity == 2
        with_unnamed do
          case
          when (method == key_field or method == :key)
            through do |key, values|
              new[key] = values if invert ^ (yield(key, key))
            end
          when (type == :single or type == :flat)
            through do |key, value|
              new[key] = value if invert ^ (yield(key, value))
            end
          else
            pos = identify_field method
            through do |key, values|
              new[key] = values if invert ^ (yield(key, values[pos]))
            end
          end

        end
      end

    else
      with_unnamed do
        through do |key, values|
          new[key] = values if invert ^ ([key,values].flatten.select{|v| v == method}.any?)
        end
      end
    end
  when Hash === method
    # Only the first field => criterion pair is considered.
    key  = method.keys.first
    method = method.values.first
    case
    when (Array === method and (key == :key or key_field == key))
      with_unnamed do
        Annotated.purge(method).each{|key| 
          new[key] = self[key] if invert ^ (self.include? key)
        }
      end
    when Array === method
      with_unnamed do
        method = Set.new method unless Set === method
        case type
        when :single
          through :key, key do |key, value|
            new[key] = self[key] if invert ^ (method.include? value)
          end
        when :list
          through :key, key do |key, values|
            new[key] = self[key] if invert ^ (method.include? values.first)
          end
        when :flat #untested
          through :key, key do |key, values|
            new[key] = self[key] if invert ^ ((method & values.flatten).any?)
          end
        else
          through :key, key do |key, values|
            new[key] = self[key] if invert ^ ((method & values.flatten).any?)
          end
        end
      end

    when Regexp === method
      with_unnamed do
        through :key, key do |key, values|
          values = [values] if type == :single
          new[key] = self[key] if invert ^ (values.flatten.select{|v| v =~ method}.any?)
        end
      end

    when (String === method and method =~ /name:(.*)/)
      name = $1
      # Names live on the annotated values, so the traversal must be named.
      old_unnamed = self.unnamed
      self.unnamed = false
      if name.strip =~ /^\/(.*)\/$/
        regexp = Regexp.new $1
        through :key, key do |key, values|
          case type
          when :single
            values = values.annotate([values])
          when :double
            values = values[0]
          end
          new[key] = self[key] if invert ^ (values.select{|v| v.name =~ regexp}.any?)
        end
      else
        through :key, key do |key, values|
          case type
          when :single
            values = values.annotate([values])
          when :double
            values = values[0]
          end
          new[key] = self[key] if invert ^ (values.select{|v| v.name == name}.any?)
        end
      end
      self.unnamed = old_unnamed

    when String === method
      if method =~ /^([<>]=?)(.*)/
        # $1 is the comparison operator and $2 the threshold from the match above.
        with_unnamed do
          through :key, key do |key, values|
            value = Array === values ? values.flatten.first : values
            # Honour +invert+ here as every other branch does.
            new[key] = self[key] if invert ^ (value.to_f.send($1, $2.to_f))
          end
        end
      else
        with_unnamed do
          through :key, key do |key, values|
            values = [values] if type == :single
            new[key] = self[key] if invert ^ (values.flatten.select{|v| v == method}.length > 0)
          end
        end
      end
    when Numeric === method
      with_unnamed do
        through :key, key do |key, values|
          new[key] = self[key] if invert ^ (values.flatten.length >= method)
        end
      end
    when Proc === method
      with_unnamed do
        through :key, key do |key, values|
          values = [values] if type == :single
          new[key] = self[key] if invert ^ (values.flatten.select{|v| method.call(v)}.length > 0)
        end
      end
    end
  end

  new
end

#serializer=(serializer) ⇒ Object



196
197
198
199
200
# File 'lib/rbbt/tsv/accessor.rb', line 196

# Sets the serializer of this table.
#
# The value is kept in @serializer and also stored, after
# +dump_entry_value+, under the KEY_PREFIX + 'serializer' entry key. The
# module used for serialization is then resolved: TSV::CleanSerializer for
# nil, the value itself when it is a Module, otherwise the SERIALIZER_ALIAS
# entry for the value converted with +to_sym+.
#
# @param serializer [nil, Module, #to_sym]
def serializer=(serializer)
  @serializer = serializer

  entry_name = KEY_PREFIX + 'serializer'
  send(:[]=, entry_name, dump_entry_value(serializer), :entry_key)

  @serializar_module =
    if serializer.nil?
      TSV::CleanSerializer
    elsif Module === serializer
      serializer
    else
      SERIALIZER_ALIAS[serializer.to_sym]
    end
end

#setup_array(*args) ⇒ Object



79
80
81
82
83
84
# File 'lib/rbbt/tsv/accessor.rb', line 79

# Passes the arguments to NamedArray.setup and, when the result is not nil,
# stores this table's entity templates on it in @entity_templates.
#
# @return the value returned by NamedArray.setup (possibly nil)
def setup_array(*args)
  named = NamedArray.setup(*args)
  named.instance_variable_set(:@entity_templates, entity_templates) unless named.nil?
  named
end

#size ⇒ Object



364
365
366
# File 'lib/rbbt/tsv/accessor.rb', line 364

# Number of entries, not counting the ENTRY_KEYS present in this table:
# the inherited size minus how many ENTRY_KEYS this table includes.
def size
  entry_keys_present = ENTRY_KEYS.count { |k| include?(k) }
  super - entry_keys_present
end

#slice(fields) ⇒ Object



332
333
334
# File 'lib/rbbt/tsv/manipulate.rb', line 332

# Returns the result of +reorder+ called with :key as the key field and the
# given +fields+.
#
# @param fields passed to +reorder+ as its second argument
def slice(fields)
  reorder :key, fields
end

#sort(*fields) ⇒ Object



336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
# File 'lib/rbbt/tsv/manipulate.rb', line 336

# Returns the keys of the table ordered by a per-entry sort value.
#
# The table is traversed with +through+ over the given fields (all fields
# when none are given). With a block, the sort value is the block's result
# for the splatted entry values. Without one it is the value itself for
# :single tables, the first element of the first field for :double tables,
# and the first element otherwise.
#
# @param fields fields passed to +through+
# @return [Array] the keys, sorted by their sort value
def sort(*fields)
  fields = nil if fields.empty?

  pairs = []
  through :key, fields do |key, value|
    sort_value =
      if block_given?
        yield(*value)
      elsif type == :single
        value
      elsif type == :double
        value.first.first
      else
        value.first
      end
    pairs << [key, sort_value]
  end

  ordered = pairs.sort_by { |_key, sort_value| sort_value }
  ordered.map { |key, _sort_value| key }
end

#sort_by(field = nil, just_keys = false, &block) ⇒ Object

{{{ Sorting



388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
# File 'lib/rbbt/tsv/accessor.rb', line 388

# Sorts the entries of the table.
#
# * With +field+ nil or :all the elements are the result of +collect+ and,
#   without a block, they are ordered by key.
# * With a field, the elements are [key, value] pairs taken from
#   +through+ (the value itself for :single tables, the first element
#   otherwise). Without a block, nil or empty values come first; Arrays are
#   compared by their only element when both have exactly one, otherwise by
#   length; anything else is compared with <=>.
# * With a block, the elements are ordered with +sort_by+ using the block.
#
# @param field field to sort on, or nil / :all
# @param just_keys [Boolean] return only the keys (prepared as entities
#   unless the table is unnamed) instead of pairs
# @return [Array] keys, or pairs. Pairs are [key, self[key]], except when
#   sorting by key without a block, where the collected elements are
#   returned.
def sort_by(field = nil, just_keys = false, &block)
  field = :all if field.nil?

  if field == :all
    elems = collect
  else
    elems = []
    case type
    when :single
      through :key, field do |key, value|
        elems << [key, value]
      end
    when :list, :flat, :double
      through :key, field do |key, values|
        elems << [key, values.first]
      end
    end
  end

  if block_given?
    sorted = elems.sort_by(&block)
  elsif field == :all
    # This branch used to test `fields == :all` (the table's field list),
    # which is never true, so sorting by key was unreachable.
    by_key = elems.sort_by { |key, _value| key }
    return by_key unless just_keys

    keys = by_key.collect { |key, _value| key }
    keys = prepare_entity(keys, key_field, entity_options.merge(:dup_array => true)) unless @unnamed
    return keys
  else
    sorted = elems.sort do |a, b|
      a_value = a.last
      b_value = b.last
      # `||`/`&&` are required here: with `or`, the assignment binds first
      # and the emptiness check is discarded.
      a_empty = a_value.nil? || (a_value.respond_to?(:empty?) && a_value.empty?)
      b_empty = b_value.nil? || (b_value.respond_to?(:empty?) && b_value.empty?)
      case
      when (a_empty and b_empty)
        0
      when a_empty
        -1
      when b_empty
        1
      when Array === a_value
        if a_value.length == 1 and b_value.length == 1
          a_value.first <=> b_value.first
        else
          a_value.length <=> b_value.length
        end
      else
        a_value <=> b_value
      end
    end
  end

  if just_keys
    keys = sorted.collect { |key, _value| key }
    keys = prepare_entity(keys, key_field, entity_options.merge(:dup_array => true)) unless @unnamed
    keys
  else
    sorted.collect { |key, _value| [key, self[key]] }
  end
end

#subset(keys) ⇒ Object



359
360
361
362
363
364
365
366
367
# File 'lib/rbbt/tsv/manipulate.rb', line 359

# Returns a new table, set up with this table's key field, fields, type,
# filename and identifiers, holding the entries of the given keys that this
# table includes. The entries are read inside +with_unnamed+.
#
# @param keys [Enumerable] keys to copy
# @return the new table
def subset(keys)
  picked = TSV.setup({}, :key_field => key_field, :fields => fields, :type => type, :filename => filename, :identifiers => identifiers)

  with_unnamed do
    keys.each { |k| picked[k] = self[k] if include?(k) }
  end

  picked
end

#summary ⇒ Object



709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
# File 'lib/rbbt/tsv/accessor.rb', line 709

# Returns a multi-line text description of the table: filename, key field,
# fields, type, serializer, size, namespace, identifiers, and the first
# entry yielded by +each+ as an example.
#
# The filename shown is "No filename" when @filename is nil or empty; when
# the table responds to +persistence_path+ and has one, the base names of
# both the filename and the persistence path are shown.
#
# @return [String]
def summary
  sample_key = sample_values = nil
  each do |k, v|
    sample_key, sample_values = k, v
    break
  end

  label = @filename
  label = "No filename" if label.nil? || label.empty?
  label.find if Path === label
  if respond_to?(:persistence_path) && persistence_path
    label = File.basename(label) + " [" + File.basename(persistence_path) + "]"
  end

  with_unnamed do
    <<-EOF
Filename = #{label}
Key field = #{key_field || "*No key field*"}
Fields = #{fields ? Misc.fingerprint(fields) : "*No field info*"}
Type = #{type}
Serializer = #{serializer.inspect}
Size = #{size}
namespace = #{Misc.fingerprint namespace}
identifiers = #{Misc.fingerprint identifiers}
Example:
- #{sample_key} -- #{Misc.fingerprint sample_values }
    EOF
  end
end

#swap_id(*args) ⇒ Object



103
104
105
# File 'lib/rbbt/tsv/change_id.rb', line 103

# Delegates to TSV.swap_id, passing this table followed by the given
# arguments, and returns its result.
def swap_id(*args)
  TSV.swap_id(self, *args)
end

#through(new_key_field = nil, new_fields = nil, uniq = false, zipped = false) ⇒ Object

{{{ Methods



163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
# File 'lib/rbbt/tsv/manipulate.rb', line 163

# Iterates over the table, optionally re-keyed and restricted to a set of
# fields, yielding (key, value) pairs.
#
# Every entry from +each+ is transformed by a Traverser built from the
# current and requested key field and fields. Entries for which the
# traverser returns a nil key are skipped. Unless the table is unnamed,
# keys are prepared as entities with Misc.prepare_entity and, when the new
# field names are known, values are set up as NamedArray (:double and :list
# tables) or passed to +prepare_entity+ (:flat and :single tables).
#
# When @monitor is set, a progress bar is ticked once per entry: @monitor
# itself if it is a Log::ProgressBar, otherwise a new bar described by the
# String or by the :desc entry of the Hash stored in @monitor.
#
# @param new_key_field field to use as key
# @param new_fields fields to include in the values
# @param uniq passed to the Traverser
# @param zipped [Boolean] when true, the i-th key is yielded with the i-th
#   element of every value list (falling back to the first element)
# @yield [key, value]
# @return [Array] the new key field name and the new field names
def through(new_key_field = nil, new_fields = nil, uniq = false, zipped = false)

  traverser = Traverser.new key_field, fields, new_key_field, new_fields, type, uniq

  if @monitor
    if Log::ProgressBar === @monitor
      @monitor.max = size
      progress_monitor = @monitor
    else
      desc = "Iterating TSV"
      step = 100
      if Hash === @monitor
        desc = @monitor[:desc] if @monitor.include? :desc 
        # NOTE(review): step is read but not passed on to the progress bar.
        step = @monitor[:step] if @monitor.include? :step 
      elsif String === @monitor
        desc = @monitor
      end
      progress_monitor = Log::ProgressBar.new_bar(size, :desc => desc)
    end
  else
    progress_monitor = nil
  end

  each do |key, value|
    progress_monitor.tick if progress_monitor

    keys, value = traverser.process(key, value)

    next if keys.nil?
    
    keys = [keys].compact unless Array === keys

    # Annotated with Entity and NamedArray
    if not @unnamed and not traverser.new_field_names.nil? 

      case type
      when :double, :list
        #Log.warn "Value frozen: #{ value }" if value.frozen?

        value.nil? ?
          nil :
          NamedArray.setup(value, traverser.new_field_names, key, entity_options, entity_templates)

      when :flat, :single
        prepare_entity(value, traverser.new_field_names.first, entity_options)
      end
    end



    if zipped

      keys.each_with_index do |k,i|
        v = value.collect{|v|
          r = v[i]
          r = v[0] if r.nil?
          r
        }

        if not @unnamed 
          k = Misc.prepare_entity(k, traverser.new_key_field_name, entity_options)
        end

        v.key = k if NamedArray === v

        yield k, v
 
      end

    else

      keys.each do |key|
        if not @unnamed
          # Prepare the key that is actually yielded. This used to read and
          # assign an unrelated, unset local, so the prepared entity was
          # discarded and the raw key was yielded.
          key = Misc.prepare_entity(key, traverser.new_key_field_name, entity_options)
        end
        value.key = key if NamedArray === value
        yield key, value
      end

    end

  end

  Log::ProgressBar.remove_bar progress_monitor if progress_monitor

  [traverser.new_key_field_name, traverser.new_field_names]
end

#to_double ⇒ Object



284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
# File 'lib/rbbt/tsv/util.rb', line 284

# Returns a :double version of the table (or +self+ when it already is).
#
# * :flat   - each value list becomes the only field ([] for nil values)
# * :single - each value becomes [[value]] ([[]] for nil values)
# * :list   - each element is wrapped in an Array, or replaced by the
#             block's result when a block is given; nil values stay nil
#
# @yield [element] optional converter, used for :list tables only
# @return the annotated table with type :double
def to_double
  current = type
  return self if current == :double

  doubled = {}
  case current
  when :flat
    through { |k, v| doubled[k] = v.nil? ? [] : [v] }
  when :single
    through { |k, v| doubled[k] = v.nil? ? [[]] : [[v]] }
  when :list
    through do |k, v|
      doubled[k] =
        if v.nil?
          nil
        elsif block_given?
          v.collect { |e| yield e }
        else
          v.collect { |e| [e] }
        end
    end
  end

  annotate(doubled)
  doubled.type = :double
  doubled
end

#to_flat(field = nil) ⇒ Object



321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
# File 'lib/rbbt/tsv/util.rb', line 321

# Returns a :flat version of the table (or +self+ when it already is).
#
# * :double - the first field by default; with +field+ :all, every value
#             flattened and compacted; otherwise the field identified by
#             +field+
# * :single - each value wrapped in an Array
# * :list   - the first element wrapped in an Array
#
# When the result has fields they are reduced to the first one, replaced
# by all names joined with "+" (:all), or replaced by +field+.
#
# @param field [nil, :all, Object] which field of a :double table to keep
# @return the annotated table with type :flat
def to_flat(field = nil)
  current = type
  return self if current == :flat

  flat = {}
  case current
  when :double
    if field.nil?
      through { |k, v| flat[k] = v.first }
    elsif field == :all
      through { |k, v| flat[k] = v.flatten.compact }
    else
      pos = identify_field field
      through { |k, v| flat[k] = v[pos] }
    end
  when :single
    through { |k, v| flat[k] = [v] }
  when :list
    through { |k, v| flat[k] = [v.first] }
  end

  annotate(flat)
  if flat.fields
    flat.fields =
      case field
      when nil  then flat.fields[0..0]
      when :all then [flat.fields * "+"]
      else           [field]
      end
  end
  flat.type = :flat
  flat
end

#to_hash ⇒ Object



740
741
742
743
744
# File 'lib/rbbt/tsv/accessor.rb', line 740

# Returns a +dup+ of this table from which every key in ENTRY_KEYS has been
# deleted.
def to_hash
  copy = dup
  ENTRY_KEYS.each { |entry| copy.delete(entry) }
  copy
end

#to_list(&block) ⇒ Object



251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
# File 'lib/rbbt/tsv/util.rb', line 251

# Returns a :list version of the table (or +self+ when it already is).
#
# * :double - each field is reduced to its first element; a one-argument
#             block converts each field instead, and a block with any other
#             arity receives (key, values) and returns the new value
# * :flat   - the first element wrapped in an Array
# * :single - each value wrapped in an Array
#
# @yield [field] or [key, values], for :double tables only
# @return the annotated table with type :list
def to_list(&block)
  current = type
  return self if current == :list

  listed = {}
  case current
  when :double
    if block_given?
      through do |k, v|
        listed[k] = block.arity == 1 ? v.collect { |e| yield e } : yield(k, v)
      end
    else
      through { |k, v| listed[k] = v.collect { |e| e.first } }
    end
  when :flat
    through { |k, v| listed[k] = [v.first] }
  when :single
    through { |k, v| listed[k] = [v] }
  end

  annotate(listed)
  listed.type = :list
  listed
end

#to_onehot(boolean = false) ⇒ Object



392
393
394
395
396
397
398
399
400
401
402
# File 'lib/rbbt/tsv/util.rb', line 392

# Builds a :list table with one field per distinct value of this table.
#
# The fields are the flattened, de-duplicated values converted with +to_s+
# and sorted. Each entry holds, for every such field, whether the entry's
# values include it: true/false when +boolean+ is set, otherwise 1/0 (and
# the result's +cast+ is set to :to_i).
#
# @param boolean [Boolean] keep booleans instead of converting to 1/0
# @return the new table
def to_onehot(boolean = false)
  labels = values.flatten.uniq.map { |value| value.to_s }.sort
  onehot = TSV.setup({}, :key_field => key_field, :fields => labels, :type => :list)
  onehot.cast = :to_i unless boolean

  through do |key, entry_values|
    flags = labels.map { |label| entry_values.include?(label) }
    flags = flags.map { |present| present ? 1 : 0 } unless boolean
    onehot[key] = flags
  end

  onehot
end

#to_s(keys = nil, no_options = false, unmerge = false) ⇒ Object



642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
# File 'lib/rbbt/tsv/accessor.rb', line 642

# Renders the table as text by reading +dumper_stream+ to the end in
# chunks of Misc::BLOCK_SIZE.
#
# @param keys keys passed to +dumper_stream+; :sort is replaced by this
#   table's keys sorted (read inside +with_unnamed+). A true, false or Hash
#   value given here is taken as +no_options+ instead.
# @param no_options passed to +dumper_stream+
# @param unmerge passed to +dumper_stream+
# @return [String]
def to_s(keys = nil, no_options = false, unmerge = false)
  case keys
  when FalseClass, TrueClass, Hash
    no_options, keys = keys, nil
  end

  if keys == :sort
    with_unnamed { keys = self.keys.sort }
  end

  stream = dumper_stream(keys, no_options, unmerge)

  text = ''
  while (chunk = stream.read(Misc::BLOCK_SIZE))
    text << chunk
  end
  text
end

#to_single ⇒ Object



359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
# File 'lib/rbbt/tsv/util.rb', line 359

# Returns a :single version of the table.
#
# With a block, each value is replaced by the block's result, whatever the
# table type. Without one, :double tables keep the first element of the
# first field, :flat and :list tables keep the first element, and :single
# tables are returned as they are.
#
# When the result has more than one field, only the first is kept.
#
# @yield [values] optional converter returning the single value
# @return the annotated table with type :single
def to_single
  new = {}

  if block_given?
    through do |k,v|
      new[k] = yield v
    end
  else
    case type
    when :double
      through do |k,v|
        new[k] = v.first.first
      end
    when :flat, :list
      through do |k,v|
        new[k] = v.first
      end
    when :single
      return self
    end
  end

  self.annotate(new)
  new.type = :single
  # Tables without field information have nil fields; leave them untouched
  # instead of failing on nil.length.
  current_fields = new.fields
  new.fields = [current_fields.first] if current_fields && current_fields.length > 1
  new
end

#to_unmerged_expanded_s(keys = nil, no_options = false) ⇒ Object



668
669
670
# File 'lib/rbbt/tsv/accessor.rb', line 668

# Same as +to_s+ with the +unmerge+ argument set to :expand.
#
# @param keys passed to +to_s+
# @param no_options passed to +to_s+
def to_unmerged_expanded_s(keys = nil, no_options = false)
  to_s keys, no_options, :expand
end

#to_unmerged_s(keys = nil, no_options = false) ⇒ Object



664
665
666
# File 'lib/rbbt/tsv/accessor.rb', line 664

# Same as +to_s+ with the +unmerge+ argument set to true.
#
# @param keys passed to +to_s+
# @param no_options passed to +to_s+
def to_unmerged_s(keys = nil, no_options = false)
  to_s keys, no_options, true
end

#transpose(key_field = "Unkown ID") ⇒ Object



738
739
740
741
742
743
744
745
746
747
# File 'lib/rbbt/tsv/manipulate.rb', line 738

# Transposes the table, choosing the implementation by type: :single and
# :flat tables are first converted with +to_list+ and then transposed with
# +transpose_list+, :list tables use +transpose_list+ and :double tables
# use +transpose_double+. Any other type yields nil.
#
# @param key_field name given to the key field of the result
def transpose(key_field = "Unkown ID")
  kind = type

  if kind == :single || kind == :flat
    to_list.transpose_list(key_field)
  elsif kind == :list
    transpose_list(key_field)
  elsif kind == :double
    transpose_double(key_field)
  end
end

#transpose_double(key_field = "Unkown ID") ⇒ Object



731
732
733
734
735
736
# File 'lib/rbbt/tsv/manipulate.rb', line 731

# Transposes a :double table by way of a :list table.
#
# Each value list is joined into one string with a separator that contains
# a random number, the resulting list table is transposed with
# +transpose_list+, and each cell is split on the separator again.
#
# @param key_field name given to the key field of the result
# @return the result of +to_double+ on the transposed table
def transpose_double(key_field = "Unkown ID")
  separator = "-!SEP--#{rand 10000}!-"

  joined = to_list { |cell| cell * separator }
  transposed = joined.transpose_list(key_field)
  transposed.to_double { |cell| cell.split(separator) }
end

#transpose_list(key_field = "Unkown ID") ⇒ Object



715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
# File 'lib/rbbt/tsv/manipulate.rb', line 715

# Transposes a :list table: the current keys become the fields of the
# result and the current fields become its keys.
#
# The values are loaded as the rows of a Matrix and transposed; each
# current field is then stored with its transposed row.
#
# @param key_field name given to the key field of the result
# @return the new table
def transpose_list(key_field="Unkown ID")
  column_names = keys.dup
  transposed = annotate({})
  TSV.setup(transposed, :key_field => key_field, :fields => column_names, :type => type, :filename => filename, :identifiers => identifiers)

  require 'matrix'
  columns = Matrix.rows(values).transpose.to_a

  fields.zip(columns) { |field, column| transposed[field] = column }

  transposed
end

#tsv_sort(&block) ⇒ Object



461
462
463
# File 'lib/rbbt/tsv/accessor.rb', line 461

# Returns the result of +collect+ sorted with Array#sort, using the given
# block as comparator when one is passed.
def tsv_sort(&block)
  collect.sort &block
end

#unzip(field = 0, merge = false, sep = ":", delete = true) ⇒ Object



746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
# File 'lib/rbbt/tsv/accessor.rb', line 746

# Splits every entry into one entry per value of +field+.
#
# The new keys are the original key and the field value joined with +sep+;
# the values are the matching elements of the remaining fields (as zipped
# by Misc.zip_fields), padded with nils when missing. Entries whose field
# value is nil are skipped.
#
# @param field field to unzip, resolved with +identify_field+
# @param merge [Boolean] when true, the result is :double and repeated new
#   keys accumulate their values; when false, the result is :list and a
#   repeated new key overwrites the previous one
# @param sep [String] separator used to build the new keys and key field
# @param delete [Boolean] remove the unzipped field from the values
# @return the new annotated table
def unzip(field = 0, merge = false, sep = ":", delete = true)
  new = {}
  self.annotate new

  field_pos = self.identify_field field
  new.with_unnamed do
    self.through do |key,values|
      field_values = values[field_pos]
      if delete
        # Work on a copy: the yielded row may be the one stored in this
        # table, and deleting from it in place would corrupt the source.
        values = values.dup
        values.delete_at(field_pos) 
      end
      next if field_values.nil?
      zipped = Misc.zip_fields(values)
      field_values.zip(zipped).each do |field_value,rest|
        rest = [nil] * values.length if rest.nil?
        k = [key,field_value]*sep
        if merge
          if new.include? k
            new[k] = Misc.zip_fields(Misc.zip_fields(new[k]) << rest)
          else
            new[k] = rest.collect{|v| [v]}
          end
        else
          new[k] = rest
        end
      end
    end
    new.type = merge ? :double : :list
  end

  if self.key_field and self.fields
    new.key_field = [self.key_field, self.fields[field_pos]] * sep
    new_fields = self.fields.dup 
    new_fields.delete_at(field_pos) if delete
    new.fields = new_fields
  end

  new
end

#unzip_replicates ⇒ Object



233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
# File 'lib/rbbt/tsv/util.rb', line 233

# Expands every entry of a :double TSV into one entry per replicate.
#
# Each array returned by Misc.zip_fields for an entry's values becomes an
# entry of its own, keyed by the original key followed by the replicate
# position in parentheses ("key(0)", "key(1)", ...).
#
# Returns a new Hash annotated from this object and typed :list.
# Raises a RuntimeError when this object is not of type :double.
def unzip_replicates
  raise "Can only unzip replicates in :double TSVs" if type != :double

  replicates = {}

  with_unnamed do
    through do |key, columns|
      Misc.zip_fields(columns).each_with_index do |replicate, position|
        replicates[key + "(#{position})"] = replicate
      end
    end
  end

  annotate(replicates)
  replicates.type = :list
  replicates
end

#value_peek ⇒ Object



672
673
674
675
676
677
678
679
680
681
682
683
684
# File 'lib/rbbt/tsv/accessor.rb', line 672

# Collects a small sample of entries by traversing with +through+.
#
# The traversal is aborted by raising once more than 10 entries have been
# seen, so the sample holds at most 11 entries. Any StandardError raised
# during the traversal is swallowed as well, in which case whatever was
# gathered up to that point is returned.
#
# Returns a plain Hash of key => value.
def value_peek
  sample = {}
  seen = 0

  through do |key, value|
    sample[key] = value
    seen += 1
    # Raising is how the traversal is cut short
    raise "STOP" if seen > 10
  end

  sample
rescue StandardError
  sample
end

#values ⇒ Object



291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
# File 'lib/rbbt/tsv/accessor.rb', line 291

# Returns the values for all keys, fetched with +chunked_values_at+.
#
# When the object is unnamed (@unnamed is set) or has no fields, the raw
# values are returned untouched. Otherwise they are processed by type:
#
# :double, :list:: +setup_array+ is called on each value with the fields and
#                  the entity options; the same collection is returned
# :single::        the whole collection is passed to +prepare_entity+ with
#                  the first field
# :flat::          each value is passed to +prepare_entity+ with the first
#                  field
#
# Any other type returns the raw values.
def values
  all_values = chunked_values_at(keys)
  return all_values if @unnamed || fields.nil?

  case type
  when :double, :list
    all_values.each { |value| setup_array(value, fields, nil, entity_options) }
    all_values
  when :single
    prepare_entity(all_values, fields.first, entity_options)
  when :flat
    all_values.collect { |value| prepare_entity(value, fields.first, entity_options) }
  else
    all_values
  end
end

#values_to_s(values) ⇒ Object



547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
# File 'lib/rbbt/tsv/accessor.rb', line 547

# Renders the values of one entry as the tail of a tab-separated line.
#
# The result always ends in a newline. When there are no fields (nil or
# empty) it is just the newline. Otherwise it starts with a tab, followed by:
#
# nil::   one empty cell per field
# Array:: the elements joined by tabs, with nested arrays joined by "|"
# other:: the value converted with +to_s+
def values_to_s(values)
  return "\n" if fields.nil? || fields.empty?

  case values
  when nil
    # One leading tab per (empty) field
    ("\t" * fields.length) + "\n"
  when Array
    cells = values.collect { |cell| Array === cell ? cell.join("|") : cell }
    "\t#{cells.join("\t")}\n"
  else
    "\t#{values}\n"
  end
end

#with_monitor(value = true) ⇒ Object



94
95
96
97
98
99
100
# File 'lib/rbbt/tsv/accessor.rb', line 94

# Runs the block with @monitor temporarily set to +value+.
#
# value:: setting to use while the block runs; nil is stored as false
#
# The previous @monitor is put back when the block finishes, including when
# it raises or exits early (e.g. with +break+), so a failure inside the
# block cannot leave the temporary setting behind.
#
# Returns the value of the block.
def with_monitor(value = true)
  saved_monitor = @monitor
  @monitor = value.nil? ? false : value
  begin
    yield
  ensure
    @monitor = saved_monitor
  end
end

#with_unnamed ⇒ Object



86
87
88
89
90
91
92
# File 'lib/rbbt/tsv/accessor.rb', line 86

# Runs the block with @unnamed temporarily set to true.
#
# The previous @unnamed is put back when the block finishes, including when
# it raises or exits early (e.g. with +break+), so a failure inside the
# block cannot leave the object stuck in unnamed mode.
#
# Returns the value of the block.
def with_unnamed
  saved_unnamed = @unnamed
  @unnamed = true
  begin
    yield
  ensure
    @unnamed = saved_unnamed
  end
end

#write(force = false) ⇒ Object



120
121
122
123
124
125
126
127
# File 'lib/rbbt/tsv/accessor.rb', line 120

# Switches the object to write mode.
#
# The call is first handed to +super+ with the same arguments, and its
# result is returned. If that raises anything at all (for instance because
# no parent implementation exists), the object is simply flagged as writable
# and +self+ is returned.
#
# NOTE(review): the rescue covers every Exception, not only StandardError;
# kept as is because callers may rely on it.
def write(force = false)
  super
rescue Exception
  @writable = true
  self
end

#write? ⇒ Boolean

Returns:

  • (Boolean)


129
130
131
# File 'lib/rbbt/tsv/accessor.rb', line 129

# Tells whether the object has been flagged as writable.
#
# @writable is initialized to false the first time it is found unset, so
# the answer is false until something sets the flag.
def write?
  @writable = false unless @writable
  @writable
end

#xls(filename, options = {}) ⇒ Object



250
251
252
# File 'lib/rbbt/tsv/excel.rb', line 250

# Exports this object through TSV::XLS.write.
#
# filename:: passed on unchanged as the second argument
# options::  passed on unchanged as the third argument (defaults to {})
#
# Returns whatever TSV::XLS.write returns.
def xls(filename, options = {})
  exporter = TSV::XLS
  exporter.write(self, filename, options)
end

#xlsx(filename, options = {}) ⇒ Object



254
255
256
# File 'lib/rbbt/tsv/excel.rb', line 254

# Exports this object through TSV::XLSX.write.
#
# filename:: passed on unchanged as the second argument
# options::  passed on unchanged as the third argument (defaults to {})
#
# Returns whatever TSV::XLSX.write returns.
def xlsx(filename, options = {})
  exporter = TSV::XLSX
  exporter.write(self, filename, options)
end

#zip(merge = false, field = "New Field", sep = ":") ⇒ Object



798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
# File 'lib/rbbt/tsv/accessor.rb', line 798

# Folds the second part of each key back into the values as a new field.
#
# Every key is split on +sep+: the first part becomes the new key and the
# second part is appended to the values as an extra, last field.
#
# merge:: when true, entries that share the same new key are combined
#         position by position and the result is typed :double; otherwise a
#         later entry replaces an earlier one with the same new key
# field:: name given to the appended field
# sep::   separator used to split the keys and the key field name
#
# Returns a new Hash annotated from this object. The entries yielded by
# +through+ are never modified.
def zip(merge = false, field = "New Field", sep = ":")
  new = {}
  self.annotate new

  new.type = :double if merge

  new.with_unnamed do
    if merge
      self.through do |key,values|
        new_key, new_value = key.split(sep)
        # Repeat the key part once per value so all fields stay aligned
        new_values = values + [[new_value] * values.first.length]
        if new.include? new_key
          current = new[new_key]
          current.each_with_index do |v,i|
            v.concat(new_values[i])
          end
        else
          # Store copies of the inner arrays: later entries are appended to
          # them in place, which must not reach the arrays owned by the
          # source entry.
          new[new_key] = new_values.collect{|v| v.dup}
        end
      end
    else
      self.through do |key,values|
        new_key, new_value = key.split(sep)
        new_values = values + [new_value]
        new[new_key] = new_values
      end
    end
  end

  if self.key_field and self.fields
    new.key_field = self.key_field.partition(sep).first
    new.fields = new.fields + [field]
  end

  new
end

#zip_new(key, values) ⇒ Object



253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
# File 'lib/rbbt/tsv/accessor.rb', line 253

# Adds +values+ to the entry stored under +key+, creating it if missing.
#
# A non-Array +values+ is first wrapped in an Array. Then, by type:
#
# :double:: for an existing key each stored field array is extended with the
#           value at the same position (concatenated when that value is an
#           Array, appended otherwise); for a new key the values are stored
#           as given when the first one is an Array, or each wrapped in its
#           own Array otherwise
# :flat::   for an existing key the values are appended and duplicates
#           removed; for a new key they are stored as given
#
# Returns the value that was stored. Raises a RuntimeError for other types.
def zip_new(key, values)
  values = [values] unless Array === values

  case type
  when :double
    self[key] =
      if include?(key)
        self[key, true].each_with_index.collect do |current, position|
          addition = values[position]
          Array === addition ? current + addition : current << addition
        end
      elsif Array === values.first
        values.dup
      else
        values.collect { |value| [value] }
      end
  when :flat
    self[key] = include?(key) ? (self[key] + values).uniq : values
  else
    raise "Cannot zip_new for type: #{type}"
  end
end