Module: TSV

Extended by:
Annotation
Defined in:
lib/scout/tsv.rb,
lib/scout/tsv/csv.rb,
lib/scout/tsv/open.rb,
lib/scout/tsv/util.rb,
lib/scout/tsv/index.rb,
lib/scout/tsv/attach.rb,
lib/scout/tsv/dumper.rb,
lib/scout/tsv/parser.rb,
lib/scout/tsv/stream.rb,
lib/scout/tsv/traverse.rb,
lib/scout/tsv/change_id.rb,
lib/scout/tsv/util/melt.rb,
lib/scout/tsv/util/sort.rb,
lib/scout/tsv/util/unzip.rb,
lib/scout/tsv/transformer.rb,
lib/scout/tsv/util/filter.rb,
lib/scout/tsv/util/select.rb,
lib/scout/association/item.rb,
lib/scout/tsv/util/process.rb,
lib/scout/tsv/util/reorder.rb,
lib/scout/tsv/change_id/translate.rb

Defined Under Namespace

Classes: Dumper, Parser, Transformer

Constant Summary collapse

# Names of the optional keyword parameters declared by +parse_line+ and
# +parse_stream+, in declaration order (a name declared by both appears twice).
KEY_PARAMETERS =
begin
  signature = method(:parse_line).parameters + method(:parse_stream).parameters
  signature.select { |kind, _name| kind == :key }.collect { |_kind, name| name }
end

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Annotation

list_tsv_values, load_info, load_tsv, load_tsv_values, obj_tsv_values, resolve_tsv_array, tsv

Class Method Details

.all_fields(file) ⇒ Object



152
153
154
155
156
157
158
# File 'lib/scout/tsv/util.rb', line 152

# Returns the complete field list of +file+.
#
# Objects that respond to +all_fields+ are asked directly; anything else is
# handed to TSV.parse_header and its "all_fields" entry is returned.
def self.all_fields(file)
  return file.all_fields if file.respond_to?(:all_fields)

  TSV.parse_header(file)["all_fields"]
end

.attach(source, other, target: nil, fields: nil, index: nil, identifiers: nil, match_key: nil, other_key: nil, one2one: true, complete: false, insitu: nil, persist_input: false, bar: nil) ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# File 'lib/scout/tsv/attach.rb', line 45

# Attaches fields from +other+ onto the entries of +source+.
#
# source:: a TSV or TSV::Parser; anything else is wrapped in a TSV::Transformer
# other:: a TSV or TSV::Parser; anything else is wrapped in a TSV::Parser. A
#         non-TSV +other+ is loaded with TSV.open before attaching
# target:: only used when +source+ is a Transformer: +:stream+ dumps through a
#          TSV::Dumper, +nil+ collects into a fresh TSV, any other value is
#          used as the dumper itself
# fields:: field name(s) of +other+ to attach; defaults to all fields of
#          +other+ except its matching field and the key field of +source+
# index:: optional index used to translate source keys into keys of +other+;
#         built with TSV.translation_index when it is nil and +source+ cannot
#         identify the matching field of +other+
# identifiers:: extra identifier file(s) offered to TSV.translation_index
# match_key, other_key:: fields to join on, resolved through TSV.match_keys
# one2one:: forwarded to +other.reorder+ when +other+ has to be re-keyed
# complete:: when true and matching on the source key, entries of +other+
#            that are missing from +source+ are added as well
# insitu:: update the traversed values in place; defaults to true for a TSV
#          source and is forced to false for :single sources
# persist_input:: forwarded as +persist+ when +other+ is opened
# bar:: progress bar option for the traversal; +true+ uses the log message
#
# Returns +source+ (the Transformer wrapping it when one was created).
def self.attach(source, other, target: nil, fields: nil, index: nil, identifiers: nil, match_key: nil, other_key: nil, one2one: true, complete: false, insitu: nil, persist_input: false, bar: nil)
  source = TSV::Transformer.new source unless TSV === source || TSV::Parser === source
  other = TSV::Parser.new other unless TSV === other || TSV::Parser === other

  fields = [fields] if String === fields

  match_key, other_key = TSV.match_keys(source, other, match_key: match_key, other_key: other_key)

  # A parser is loaded into a TSV keyed by the field we are matching on
  if ! (TSV === other)
    other_key_name = other_key == :key ? other.key_field : other.fields[other_key]
    other = TSV.open other, key_field: other_key_name, fields: fields, one2one: true, persist: persist_input
    other_key = :key if other.key_field == source.key_field
  end

  if TSV::Transformer === source
    source.dumper = case target
                    when :stream
                      TSV::Dumper.new(source.options.merge(sep: "\t"))
                    when nil
                      TSV.setup({}, **source.options.dup)
                    else
                      target
                    end
  end

  other.with_unnamed do
    source.with_unnamed do

      other_key_name = other_key == :key ? other.key_field : other_key
      other_key_name = other.fields[other_key_name] if Integer === other_key
      fields = other.all_fields - [other_key_name, source.key_field] if fields.nil?

      # Name of the source field used for matching. The previous code read
      # match_key_name from itself, which always produced nil for non-key
      # matches; resolve it the same way as other_key_name above.
      match_key_name = match_key == :key ? source.key_field : match_key
      match_key_name = source.fields[match_key_name] if Integer === match_key

      # No index given and the source does not know the other key: translate
      # through the available identifier files
      if index.nil? && ! source.identify_field(other_key_name)
        identifier_files = []
        identifier_files << identifiers if identifiers
        identifier_files << source
        identifier_files << TSV.identifier_files(source)
        identifier_files << TSV.identifier_files(other)
        identifier_files << other

        index = TSV.translation_index(identifier_files.flatten, match_key_name, other_key_name)
      end

      if other_key != :key 
        other = other.reorder other_key, fields, one2one: one2one, merge: true, type: :double
      end

      other_field_positions = other.identify_field(fields.dup) 

      log_message = "Attach #{Log.fingerprint fields - source.fields} to #{Log.fingerprint source} (#{[match_key, other_key] * "=~"})"
      Log.debug log_message
      bar = log_message if TrueClass === bar

      source.fields = (source.fields + fields).uniq

      # Position in the (extended) source fields of every attached field
      overlaps = source.identify_field(fields)

      # :single sources become :list since they gain extra fields
      type = source.type == :single ? :list : source.type

      # Placeholder used when a source entry has no match in other
      empty_other_values = case type
                           when :list
                             [nil] * other.fields.length
                           when :flat
                             []
                           when :double
                             [[]] * other.fields.length
                           end

      empty_other_values = nil if other.type == :single

      insitu = TSV === source ? true : false if insitu.nil?
      insitu = false if source.type == :single

      match_key_pos = source.identify_field(match_key)
      source.traverse bar: bar, unnamed: true do |orig_key,current_values|
        current_values = [current_values] if source.type == :single

        keys = (match_key == :key || match_key_pos == :key) ? [orig_key] : current_values[match_key_pos]
        keys = [keys].compact unless Array === keys

        keys = index.chunked_values_at(keys).flatten if index

        current_values = current_values.dup unless insitu
        keys = [nil] if keys.empty?
        keys.each do |current_key|
          other_values = current_key.nil? ? empty_other_values : other[current_key]

          # Bring the values of other to the shape of the source type
          if other_values.nil?
            other_values = empty_other_values
          elsif other.type == :flat 
            other_values = [other_values]
          elsif other.type == :list && source.type == :double
            other_values = other_values.collect{|v| [v] }
          elsif other.type == :double && source.type == :list
            other_values = other_values.collect{|v| v.first }
          end

          other_values = other_field_positions.collect do |pos|
            if pos == :key
              current_key
            else
              other.type == :single ? other_values : other_values[pos]
            end
          end

          other_values.zip(overlaps).each do |v,overlap|
            if type == :list
              # Only fill in values that are missing or empty
              current_values[overlap] = v if current_values[overlap].nil? || (String === current_values[overlap] && current_values[overlap].empty?)
            elsif type == :flat
              next if v.nil?
              v = [v] unless Array === v
              current_values.concat v
            else
              current_values[overlap] ||= []
              next if v.nil?
              v = [v] unless Array === v
              # Append only values not already present
              current_values[overlap].concat (v - current_values[overlap])
            end
          end
        end
        source[orig_key] = current_values unless insitu
        nil
      end

      if complete && match_key == :key
        other.each do |other_key,other_values|
          next if source.include?(other_key)
          if other.type == :flat 
            other_values = [other_values]
          elsif other.type == :single 
            other_values = [other_values]
          elsif other.type == :list && type == :double
            other_values = other_values.collect{|v| [v] }
          elsif other.type == :double && type == :list
            other_values = other_values.collect{|v| v.first }
          end

          new_values = case type
                       when :list
                         [nil] * source.fields.length
                       when :flat
                         []
                       when :double
                         source.fields.length.times.collect{ [] }
                       end

          other_values.zip(overlaps).each do |v,overlap|
            next if v.nil?
            if overlap == :key
              # NOTE(review): this ternary looks inverted (calls +first+ on
              # non-Array values) -- confirm the intended behaviour
              other_key = Array === v ? v : v.first
            elsif type == :list
              new_values[overlap] = v if new_values[overlap].nil? || (String === new_values[overlap] && new_values[overlap].empty?)
            else
              v = [v] unless Array === v
              new_values[overlap].concat v
            end
          end
          source[other_key] = new_values
        end
      end
      source.type = type
    end
  end

  source
end

.cast_value(value, cast) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
# File 'lib/scout/tsv/parser.rb', line 3

# Applies +cast+ to +value+, descending into (nested) arrays.
#
# cast:: a Proc, which is called with the value, or a method name that is
#        sent to the value
def self.cast_value(value, cast)
  return value.map { |element| cast_value(element, cast) } if Array === value

  Proc === cast ? cast.call(value) : value.send(cast)
end

.change_id(source, source_id, new_id, identifiers: nil, one2one: false, insitu: false) ⇒ Object



30
31
32
33
34
35
36
37
38
# File 'lib/scout/tsv/change_id.rb', line 30

# Replaces the field +source_id+ of +source+ with +new_id+.
#
# The new field is attached from +identifiers+ (by default
# +source.identifiers+) and the result is sliced to the original field list
# with +source_id+ swapped for +new_id+. A String +source+ is wrapped in a
# TSV::Parser first.
def self.change_id(source, source_id, new_id, identifiers: nil, one2one: false, insitu: false)
  source = TSV::Parser.new source if String === source
  identifiers = source.identifiers if identifiers.nil?

  renamed_fields = source.fields.dup
  renamed_fields[renamed_fields.index(source_id)] = new_id

  source.attach(identifiers, fields: [new_id], insitu: insitu).slice(renamed_fields)
end

.change_key(source, new_key_field, identifiers: nil, one2one: false, merge: true, stream: false, keep: false, persist_identifiers: nil) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# File 'lib/scout/tsv/change_id.rb', line 4

# Re-keys +source+ by +new_key_field+.
#
# When identifiers are available and +source+ cannot strictly identify
# +new_key_field+, the field is first attached from the identifiers and the
# key change is delegated to the resulting object. Otherwise the source is
# traversed through a TSV::Transformer keyed by the new field.
#
# keep:: also keep the old key field as the first field
# stream:: return the transformer itself instead of building a TSV from it
def self.change_key(source, new_key_field, identifiers: nil, one2one: false, merge: true, stream: false, keep: false, persist_identifiers: nil)
  source = TSV::Parser.new source if String === source

  if identifiers.nil? && source.respond_to?(:identifiers)
    identifiers = source.identifiers
  end

  if identifiers && source.identify_field(new_key_field, strict: true).nil?
    attached = source.attach(identifiers, fields: [new_key_field], insitu: false, one2one: true, persist_input: persist_identifiers)
    return attached.change_key(new_key_field, keep: keep, stream: stream, one2one: one2one, merge: merge)
  end

  remaining = source.fields.dup - [new_key_field]
  remaining.unshift source.key_field if keep

  transformer = TSV::Transformer.new source
  transformer.key_field = new_key_field
  transformer.fields = remaining
  transformer.traverse key_field: new_key_field, fields: remaining, one2one: one2one, unnamed: true do |key, values|
    [key, values]
  end

  return transformer if stream

  transformer.tsv(merge: merge, one2one: one2one)
end

.collapse_stream(stream, *args, **kwargs, &block) ⇒ Object



209
210
211
212
213
214
215
# File 'lib/scout/tsv/open.rb', line 209

# Runs +stream+ (or +stream.stream+ when the object exposes one) through
# +process_stream+, writing the output of Open.collapse_stream into the sink.
# The extra positional, keyword and block arguments are accepted but not used.
def self.collapse_stream(stream, *args, **kwargs, &block)
  input = stream.respond_to?(:stream) ? stream.stream : stream

  self.process_stream(input) do |sin, line|
    Open.consume_stream(Open.collapse_stream(input, line: line), false, sin)
  end
end

.concat_streams(streams) ⇒ Object



212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# File 'lib/scout/tsv/stream.rb', line 212

# Merges several TSV streams into a single pipe.
#
# Each element of +streams+ is first turned into something readable: Step and
# TSV::Dumper objects through +stream+, Path objects through +open+, TSV
# objects through +dumper_stream+; anything else is used as is.
#
# The header lines (starting with "#") and the first data line are copied
# from the first stream. The streams are then read round-robin, one line each
# per pass, skipping any line that starts with "#"; streams are dropped once
# they reach EOF.
#
# Returns whatever Open.open_pipe returns for the writing block.
def self.concat_streams(streams)
  streams = streams.collect do |stream|
    case stream
    when (defined?(Step) && Step)
      stream.stream
    when Path
      stream.open
    when TSV::Dumper
      stream.stream
    when TSV
      stream.dumper_stream
    else
      stream
    end
  end.compact

  Open.open_pipe do |sin|
    first_stream = streams.first

    # Nothing to copy when no stream was given
    while first_stream && (line = first_stream.gets)
      sin.write line
      break unless line[0] == "#"
    end

    while streams.any?
      streams.each do |stream|
        line = stream.gets
        # An exhausted stream is only removed at the end of the pass, so gets
        # may return nil here
        next if line.nil?
        sin.write line unless line[0] == "#"
      end
      streams.delete_if{|stream| stream.eof? }
    end
  end
end

.csv(obj, options = {}) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/scout/tsv/csv.rb', line 4

# Loads CSV content into a TSV.
#
# obj:: a Path, a String (remote location, filename or raw CSV text) or any
#       other object accepted by CSV.new
# options:: options for the CSV library, plus the following keys, which are
#           removed before the CSV call:
#           :type (default :list):: type of the resulting TSV
#           :cast:: method name sent to every value
#           :merge:: removed from the options (forced to true when re-keying)
#           :key_field, :fields:: when given, the content is first loaded as
#                                 :double, reordered, and converted back to
#                                 the requested type
#           :headers (default true) tells whether the first row holds the
#           field names; without headers rows are keyed "row-<index>"
#
# Returns the TSV.
def self.csv(obj, options = {}) 
  options = IndiferentHash.add_defaults options, :headers => true, :type => :list
  headers = options[:headers]

  noheaders = ! headers

  type = options.delete :type
  cast = options.delete :cast
  merge = options.delete :merge
  key_field = options.delete :key_field
  fields = options.delete :fields
  
  # Re-keying needs the values as :double; the requested type is restored later
  if key_field || fields
    orig_type = type
    type = :double
    merge = true
  end

  # The header row is consumed by hand below
  options[:headers] = false

  csv = case obj
        when Path
          CSV.read obj.find.open, **options
        when String
          if Open.remote?(obj)
            CSV.read Open.open(obj), **options
          elsif Path.is_filename?(obj)
            CSV.read obj, **options
          else
            CSV.new obj, **options
          end
        else
          CSV.new obj, **options
        end

  tsv = if noheaders
          TSV.setup({}, :key_field => nil, :fields => nil, :type => type)
        else
          key, *csv_fields = csv.shift
          TSV.setup({}, :key_field => key, :fields => csv_fields, :type => type)
        end

  csv.each_with_index do |row,i|
    if noheaders
      key, values = ["row-#{i}", row]
    else
      key, *values = row
    end
    
    if cast
      values = values.collect{|v| v.send cast }
    end

    case type
    when :double, :flat
      tsv.zip_new(key, values)
    when :single
      tsv[key] = values.first
    when :list
      tsv[key] = values
    end
  end

  if key_field || fields
    tsv = tsv.reorder(key_field, fields, :one2one => true, :merge => true)
    if tsv.type != orig_type
      tsv = case orig_type
            when :list
              tsv.to_list
            when :single
              tsv.to_single
            when :flat
              tsv.to_flat
            end
    end
  end

  tsv
end

.field_match_counts(file, values, options = {}) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/scout/tsv/util.rb', line 13

# Counts, per field of +file+, the distinct entries that match any of +values+.
#
# file:: a TSV, or something TSV.open can load
# values:: values to look for (deduplicated with +uniq+)
# options:: options for TSV.open; keys pulled with the +persist+ prefix are
#           passed to Persist.persist (default prefix "Field_Matches")
#
# Returns a :single TSV read from the output of a shell pipeline (field name
# as key, count cast with +to_i+). If that fails the exception is logged and
# an empty :single TSV is returned.
def self.field_match_counts(file, values, options = {})
  options = IndiferentHash.add_defaults options, :persist_prefix => "Field_Matches"
  persist_options = IndiferentHash.pull_keys options, :persist

  filename = TSV === file ? file.filename : file
  # NOTE(review): with :no_load the persisted result is presumably handed back
  # as a file path (it is used as one in the command below) -- confirm
  path = Persist.persist filename, :string, persist_options.merge(:no_load => true) do
    tsv = TSV === file ? file : TSV.open(file, options)

    # One "<value>\t<field name>" line per value, plus one line per key
    text = ""
    fields = nil
    tsv.tap{|e| e.unnamed =  true; fields = e.fields}.through do |gene, names|
      names.zip(fields).each do |list, format|
        list = [list] unless Array === list
        # NOTE(review): delete_if modifies the list in place; when it is the
        # array stored in the TSV this may alter the TSV itself -- confirm
        list.delete_if do |name| name.empty? end
        next if list.empty?
        text << list.collect{|name| [name, format] * "\t"} * "\n" << "\n"
      end
      text << [gene, tsv.key_field] * "\t" << "\n"
    end
    text
  end

  TmpFile.with_file(values.uniq * "\n", false) do |value_file|
    # HEADERNOMATCH is glued to the start of the second column (the field
    # name) before grep and removed afterwards, presumably so that the
    # whole-word match only hits the value column. The remaining lines are
    # deduplicated and counted per field name, giving "<count>\t<field>" lines.
    cmd = "cat '#{ path }' | sed 's/\\t/\\tHEADERNOMATCH/' | grep -w -F -f '#{ value_file }' | sed 's/HEADERNOMATCH//' |sort -u|cut -f 2  |sort|uniq -c|sed 's/^ *//;s/ /\t/'"
    begin
      # Column 1 (field name) becomes the key, column 0 (count) the value
      TSV.open(CMD.cmd(cmd), :key_field => 1, :fields => [0], :type => :single, :cast => :to_i)
    rescue
      Log.exception $!
      TSV.setup({}, :type => :single, :cast => :to_i)
    end
  end
end

.identifier_files(obj) ⇒ Object



252
253
254
255
256
257
258
259
260
# File 'lib/scout/tsv/attach.rb', line 252

# Returns the identifier files associated with +obj+: +obj.identifier_files+
# for a TSV, the +identifiers+ of the containing directory for a Path, and
# nil for anything else.
def self.identifier_files(obj)
  case obj
  when TSV then obj.identifier_files
  when Path then obj.dirname.identifiers
  end
end

.identify_field(key_field, fields, name, strict: nil) ⇒ Object



46
47
48
49
50
# File 'lib/scout/tsv/util.rb', line 46

# Resolves +name+ against a key field and a list of fields.
#
# Returns :key when +name+ is :key or, unless +strict+, when it matches
# +key_field+ according to NamedArray.field_match. Otherwise the lookup is
# delegated to NamedArray.identify_name. When +name+ is an Array it is
# modified in place: entries matching the key field are replaced by :key.
def self.identify_field(key_field, fields, name, strict: nil)
  return :key if name == :key
  return :key if !strict && NamedArray.field_match(key_field, name)

  if Array === name
    name.map! { |candidate| NamedArray.field_match(key_field, candidate) ? :key : candidate }
  end

  NamedArray.identify_name(fields, name, strict: strict)
end

.identify_field_in_obj(obj, field) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
# File 'lib/scout/tsv/change_id/translate.rb', line 3

# Identifies +field+ in different kinds of objects.
#
# A TSV is asked directly; parsers and dumpers are resolved from their
# +key_field+ and +fields+; a Path or String has its header parsed and the
# resulting "all_fields" list is resolved recursively; an Array is read as
# the key field followed by the fields. Any other object yields nil.
def self.identify_field_in_obj(obj, field)
  if TSV === obj
    obj.identify_field(field)
  elsif TSV::Parser === obj || TSV::Dumper === obj
    TSV.identify_field(obj.key_field, obj.fields, field)
  elsif Path === obj || String === obj
    header = TSV.parse_header(obj)
    identify_field_in_obj(header["all_fields"], field)
  elsif Array === obj
    key_field, *fields = obj
    TSV.identify_field(key_field, fields, field)
  end
end

.incidence(tsv, **kwargs) ⇒ Object



224
225
226
# File 'lib/scout/association/item.rb', line 224

# Builds an association index for +tsv+ (forwarding +kwargs+ to
# Association.index) and returns the incidence of its keys as computed by
# AssociationItem.incidence.
def self.incidence(tsv, **kwargs)
  association_index = Association.index(tsv, **kwargs)
  AssociationItem.incidence(association_index.keys)
end

.index(tsv_file, target: :key, fields: nil, order: true, bar: nil, **kwargs) ⇒ Object



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/scout/tsv/index.rb', line 40

# Builds an index that maps the values found in +fields+ to the +target+
# field of +tsv_file+.
#
# tsv_file:: a TSV or something TSV.open can load
# target:: field whose values become the values of the index (default :key)
# fields:: fields whose values become the keys of the index (default all)
# order:: when true, a value found under several fields resolves to the
#         target coming from the earliest field; otherwise the first target
#         seen during the traversal wins
# bar:: progress bar option; +true+ uses the log message
# kwargs:: +engine+ selects the storage engine; +persist+-prefixed keys
#          configure persistence; +data+-prefixed keys are used to open
#          +tsv_file+; the rest is forwarded to Open.traverse
#
# Returns a :single TSV (backed by a ScoutCabinet when a persistence file is
# provided).
def self.index(tsv_file, target: :key, fields: nil, order: true, bar: nil, **kwargs)
  kwargs = IndiferentHash.add_defaults kwargs, unnamed: true
  engine = IndiferentHash.process_options kwargs, :engine

  fields = :all if fields.nil?

  prefix = case fields
           when :all
             "Index[#{target}]"
           else
             "Index[#{Log.fingerprint(fields)}->#{target}]"
           end

  prefix += select_prefix_str(kwargs[:select])

  persist_options = IndiferentHash.pull_keys kwargs, :persist
  persist_options = IndiferentHash.add_defaults persist_options, :prefix => prefix, :engine => :HDB, :persist => false

  data_options = IndiferentHash.pull_keys kwargs, :data

  Persist.persist(tsv_file, persist_options[:engine], persist_options.merge(other_options: kwargs.merge(target: target, fields: fields, order: order, data_options: data_options))) do |filename|
    if filename
      index = ScoutCabinet.open(filename, true, engine)
      TSV.setup(index, :type => :single)
      index.extend TSVAdapter 
    else
      index = TSV.setup({}, :type => :single)
    end

    # `! TSV === tsv_file` parsed as `(!TSV) === tsv_file`, which is never
    # true, so the file was never opened here
    tsv_file = TSV.open(tsv_file, **data_options) unless TSV === tsv_file

    log_msg = "Index #{Log.fingerprint tsv_file} target #{Log.fingerprint target}"
    Log.low log_msg
    bar = log_msg if TrueClass === bar

    if order
      # tmp_index[value][field position] collects the targets seen for that
      # value under that field, so earlier fields take precedence below
      tmp_index = {}
      include_self = fields == :all || (Array === fields) && fields.include?(target)
      target_key_field, source_field_names = Open.traverse tsv_file, type: :double, key_field: target, fields: fields, bar: bar, **kwargs do |k,values|
        tmp_index[k] ||= [[k]] if include_self
        values.each_with_index do |list,i|
          i += 1 if include_self
          list.each do |e|
            tmp_index[e] ||= []
            tmp_index[e][i] ||= []
            tmp_index[e][i] << k
          end
        end
      end
      tmp_index.each do |e,list|
        index[e] = list.flatten.compact.uniq.first
      end

      # Drop the reference to the intermediate table
      tmp_index = {}

    else
      target_key_field, source_field_names =  Open.traverse tsv_file, key_field: target, fields: fields, type: :flat, unnamed: true, bar: bar, **kwargs do |k,values|
        values.each do |e|
          index[e] = k unless index.include?(e)
        end
      end
    end

    index.key_field = source_field_names * ","
    index.fields = [target_key_field]

    index
  end
end

.match_keys(source, other, match_key: nil, other_key: nil) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/scout/tsv/attach.rb', line 3

# Works out which field of +source+ and which field of +other+ should be
# used to join them.
#
# When +match_key+ is not given it is looked for in this order: the first
# field of +other+ that NamedArray.identify_name locates among the source
# fields, the first source field that +other+ can identify, the first field
# of +other+ that +source+ can identify, and finally the source key field.
# When +other_key+ is still unknown it is what +other+ identifies for
# +match_key+, falling back to the key field of +other+.
#
# Returns [match_key, other_key]; either is :key when it matches the
# corresponding key field.
def self.match_keys(source, other, match_key: nil, other_key: nil)
  if match_key.nil?
    shared_pos = NamedArray.identify_name(source.all_fields, other.all_fields).first
    match_key = source.all_fields[shared_pos] if shared_pos
  end

  if match_key.nil?
    source.all_fields.each do |field|
      other_key = other.identify_field(field)
      next unless other_key

      other_key = other.key_field if other_key == :key
      match_key = field
      break
    end
  end

  if match_key.nil?
    other.all_fields.each do |field|
      match_key = source.identify_field(field)
      next unless match_key

      other_key = field
      break
    end
  end

  match_key = source.key_field if match_key.nil?

  other_key = other.identify_field(match_key) if other_key.nil?
  other_key = other.key_field if other_key.nil?

  match_key = :key if NamedArray.field_match(match_key, source.key_field)
  other_key = :key if NamedArray.field_match(other_key, other.key_field)

  [match_key, other_key]
end

.open(file, options = {}) ⇒ Object



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/scout/tsv.rb', line 76

# Opens +file+ and parses it into a TSV, going through Persist.tsv.
#
# file:: a filename or Path, a String holding the content itself (detected
#        by the presence of a newline), a TSV::Parser, or another object
#        accepted by Open.open
# options:: parsing options handed to TSV.parse. The keys :grep,
#           :invert_grep, :nocache, :monitor and :entity_options are taken
#           out through IndiferentHash.process_options, and the keys pulled
#           with the +persist+ prefix configure persistence (defaults:
#           prefix "TSV", type :HDB, persist false)
#
# Returns the value of the Persist.tsv call, whose block produces the TSV.
def self.open(file, options = {})
  grep, invert_grep, nocache, monitor, entity_options = IndiferentHash.process_options options, :grep, :invert_grep, :nocache, :monitor, :entity_options

  persist_options = IndiferentHash.pull_keys options, :persist
  persist_options = IndiferentHash.add_defaults persist_options, prefix: "TSV", type: :HDB, persist: false
  persist_options[:data] ||= options[:data]

  # A plain String containing a newline is taken as the content, not a filename
  file = StringIO.new file if String === file && ! (Path === file) && file.index("\n")

  # For a parser, its own options replace the ones given by the caller
  source_name, options = 
    case file
    when StringIO
      [file.inspect, options]
    when TSV::Parser
      [file.options[:filename], file.options]
    else
      [file, options]
    end

  Persist.tsv(source_name, options, persist_options: persist_options) do |data|
    # Parse into the container offered by the persistence layer, if any
    options[:data] = data if data
    options[:filename] ||= if TSV::Parser === file
                           file.options[:filename]
                         elsif Path === file
                           file
                         elsif file.respond_to?(:filename)
                           file.filename
                         elsif Path.is_filename?(file)
                           file
                         else
                           nil
                         end

    if data
      Log.debug "TSV open #{Log.fingerprint file} into #{Log.fingerprint data}"
    else
      Log.debug "TSV open #{Log.fingerprint file}"
    end

    tsv = if TSV::Parser === file
      TSV.parse(file, **options)
    else
      options[:tsv_invert_grep] ||= invert_grep if invert_grep
      Open.open(file, grep: grep, invert_grep: invert_grep, nocache: nocache) do |f|
        TSV.parse(f, **options)
      end
    end

    tsv
  end
end

.original_setup ⇒ Object



34
# File 'lib/scout/tsv.rb', line 34

# Keeps the current +setup+ method reachable under the name +original_setup+
# (presumably because +setup+ is redefined afterwards -- confirm in tsv.rb).
alias original_setup setup

.parse(stream, fix: true, header_hash: "#", sep: "\t", filename: nil, namespace: nil, unnamed: false, serializer: nil, **kwargs, &block) ⇒ Object



438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
# File 'lib/scout/tsv/parser.rb', line 438

# Parses +stream+ into a TSV.
#
# stream:: a TSV::Parser, or a source from which one is built using +fix+,
#          +header_hash+ and +sep+
# filename, namespace:: stored in the result, defaulting to the parser options
# unnamed:: stored in the result
# serializer:: serializer for persistent containers; when nil it is derived
#              from the cast and the type
# kwargs:: forwarded to +parser.traverse+. :cast and :type default to the
#          parser options (type finally to :double); :identifiers is removed
#          and stored in the result; :data is the container to fill and
#          defaults to a new Hash
#
# Returns the object produced by the traversal, annotated with type, cast,
# filename, namespace, identifiers and unnamed.
def self.parse(stream, fix: true, header_hash: "#", sep: "\t", filename: nil, namespace: nil, unnamed: false, serializer: nil, **kwargs, &block)
  parser = if TSV::Parser === stream
             stream
           else
             TSV::Parser.new(stream, fix: fix, header_hash: header_hash, sep: sep)
           end

  cast = kwargs[:cast]
  cast = parser.options[:cast] if cast.nil?
  identifiers = kwargs.delete(:identifiers)
  type = kwargs[:type] ||=  parser.options[:type] ||= :double

  container = kwargs[:data]
  if container && container.respond_to?(:persistence_class)
    TSV.setup(container, type: type)
    container.extend TSVAdapter

    # Numeric casts get a dedicated serializer; anything else uses the type
    serializer ||= {
      [:to_i, :single] => :integer,
      [:to_i, :list]   => :integer_array,
      [:to_i, :flat]   => :integer_array,
      [:to_f, :single] => :float,
      [:to_f, :list]   => :float_array,
      [:to_f, :flat]   => :float_array,
      [:to_f, :double] => :marshal,
      [:to_i, :double] => :marshal
    }.fetch([cast, type], type)

    container.serializer = TSVAdapter::SERIALIZER_ALIAS[serializer] || serializer
  end

  kwargs[:data] = {} if kwargs[:data].nil?

  result = parser.traverse(**kwargs, &block)
  result.type = type
  result.cast = cast
  result.filename = filename || parser.options[:filename]
  result.namespace = namespace || parser.options[:namespace]
  result.identifiers = identifiers
  result.unnamed = unnamed
  result.save_annotation_hash if result.respond_to?(:save_annotation_hash)
  result
end

.parse_header(stream, fix: true, header_hash: '#', sep: "\t") ⇒ Object



230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
# File 'lib/scout/tsv/parser.rb', line 230

# Reads the header of a TSV stream.
#
# stream:: a Path or filename (opened with Open.open and parsed recursively)
#          or an object responding to +gets+
# fix:: when truthy, lines are chomped and passed through Misc.fixutf8; a
#       Proc is called on the line following the options line instead
# header_hash:: prefix marking header lines; +true+ or "" make the first
#               line be read as the field line
# sep:: field separator (tab when nil); overridden by a :sep entry in the
#       options line
#
# Returns a NamedArray with the entries options, key_field, fields,
# first_line, preamble and all_fields; returns {} when the stream has no
# lines. Raises "Closed stream" for a closed IO. On any exception the
# stream's own +stream_exception+ is raised when present; otherwise the
# stream is aborted (if it supports it) and the exception re-raised.
def self.parse_header(stream, fix: true, header_hash: '#', sep: "\t")
  sep = "\t" if sep.nil?
  if (Path === stream) || ((String === stream) && Path.is_filename?(stream))
    Open.open(stream) do |f|
      return parse_header(f, fix: fix, header_hash: header_hash, sep: sep)
    end
  end

  if IO === stream && stream.closed?
    stream.join if stream.respond_to?(:join)
    raise "Closed stream" 
  end

  opts = {}
  preamble = []

  # Get line

  begin
    #Thread.pass while IO.select([stream], nil, nil, 1).nil? if IO === stream
    line = stream.gets
    return {} if line.nil?
    # NOTE(review): a Proc given as +fix+ is not called for this first line,
    # unlike the line read after the options line below -- confirm intent
    line = Misc.fixutf8 line.chomp if fix

    # Process options line
    # An options line has the form "<header_hash>: <options string>"
    if line and (String === header_hash && m = line.match(/^#{header_hash}: (.*)/))
      opts = IndiferentHash.string2hash m.captures.first.chomp
      line = stream.gets
      if line && fix
        if Proc === fix
          line = fix.call line
        else
          line = Misc.fixutf8 line.chomp if line && fix
        end
      end
    end

    # Determine separator
    sep = opts[:sep] if opts[:sep]

    # Process fields line
    preamble << line if line
    # Every consecutive header line overwrites key_field and fields, so the
    # last one wins; the lines read along the way are kept for the preamble
    while line && (TrueClass === header_hash || (String === header_hash && line.start_with?(header_hash)))
      fields = line.split(sep, -1)
      key_field = fields.shift
      key_field = key_field.sub(header_hash, '') if String === header_hash && ! header_hash.empty?

      line = (header_hash != "" ?  stream.gets : nil)
      # NOTE(review): this line is fixed regardless of the +fix+ argument
      line = Misc.fixutf8 line.chomp if line
      preamble << line if line
      break if TrueClass === header_hash || header_hash == ""
    end

    # Drops the last two collected lines (presumably the field line and the
    # first data line -- confirm) and joins the rest
    preamble = preamble[0..-3] * "\n"

    line ||= stream.gets

    first_line = line

    opts[:type] = opts[:type].to_sym if opts[:type]
    opts[:cast] = opts[:cast].to_sym if opts[:cast]

    all_fields = [key_field] + fields if key_field && fields
    NamedArray.setup([opts, key_field, fields, first_line, preamble, all_fields], %w(options key_field fields first_line preamble all_fields))
  rescue Exception
    # Prefer the exception recorded by the stream itself, if any
    raise stream.stream_exception if stream.respond_to?(:stream_exception) && stream.stream_exception
    stream.abort($!) if stream.respond_to?(:abort)
    raise $!
  end
end

.parse_line(line, type: :list, key: 0, positions: nil, sep: "\t", sep2: "|", cast: nil, select: nil, field_names: nil) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/scout/tsv/parser.rb', line 15

# Splits a TSV line into its key and values.
#
# line:: the line, without trailing newline
# type:: shape of the values: :list (as is), :single (first value), :flat
#        (all values split by +sep2+ and flattened) or :double (each value
#        split by +sep2+)
# key:: position of the key column
# positions:: positions of the value columns; when nil all the columns other
#             than the key are used (for :flat with a non-zero +key+ the
#             first column becomes the value and the rest form the key)
# sep, sep2:: column separator and in-cell separator
# cast:: optional cast applied to the values through +cast_value+
# select:: optional criteria checked with TSV.select; lines that do not
#          pass return nil
# field_names:: field names handed to TSV.select
#
# Returns [key, values], or nil when the line is filtered out.
def self.parse_line(line, type: :list, key: 0, positions: nil, sep: "\t", sep2: "|", cast: nil, select: nil, field_names: nil)
  cells = line.split(sep, -1)

  if select
    selected = TSV.select(cells[0], cells[1..-1], select, fields: field_names, type: type, sep: sep2)
    return nil unless selected
  end

  if !positions.nil?
    key_value = cells[key]
    cells = cells.values_at(*positions)
    key_value = key_value.split(sep2) if type == :double || type == :flat
  elsif key == 0
    key_value = cells.shift
  elsif type == :flat
    key_value = cells[1..-1].collect { |cell| cell.split(sep2, -1) }.flatten
    cells = cells.slice(0, 1)
  else
    key_value = cells.delete_at(key)
    key_value = key_value.split(sep2) if type == :double
  end

  values = case type
           when :list   then cells
           when :single then cells.first
           when :flat   then cells.collect { |cell| cell.split(sep2, -1) }.flatten
           when :double then cells.collect { |cell| cell.nil? ? [] : cell.split(sep2, -1) }
           end

  values = cast_value(values, cast) if cast

  [key_value, values]
end

.parse_stream(stream, data: nil, source_type: nil, type: :list, merge: true, one2one: false, fix: true, bar: false, first_line: nil, field_names: nil, head: nil, **kwargs, &block) ⇒ Object



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# File 'lib/scout/tsv/parser.rb', line 54

# Read +stream+ line by line, parse each line with `parse_line` and store the
# result in +data+ (a new Hash when none is given).
#
# stream      - object responding to `gets` and `closed?`.
# data        - destination; when it responds to `load_stream` and no
#               per-line processing is requested, the stream is handed over
#               to it directly.
# source_type - type used to parse lines (defaults to +type+).
# type        - type of the stored values; the parsed values are converted
#               from source_type to type. :array / :line just yield each
#               raw line to the block.
# merge       - for :double and :flat, combine values of repeated keys;
#               :concat appends in place, any other truthy value builds a
#               new array. Forced to false for other types.
# one2one     - when a line has several keys, give each key the values at
#               its own position.
# fix         - Proc applied to each line, or truthy to run `Misc.fixutf8`.
# bar         - progress bar (true builds a default description).
# first_line  - a line already read from the stream, processed first.
# field_names - passed to `parse_line` and to the block.
# head        - maximum number of lines to process.
# kwargs      - forwarded to `parse_line`.
#
# When a block is given it receives `(key, values, field_names)` and its
# non-nil result is what gets stored.
#
# Returns +data+.
def self.parse_stream(stream, data: nil, source_type: nil, type: :list, merge: true, one2one: false, fix: true, bar: false, first_line: nil, field_names: nil, head: nil, **kwargs, &block)
  begin
    bar = "Parsing #{Log.fingerprint stream}" if TrueClass === bar
    bar = Log::ProgressBar.get_obj_bar(stream, bar) if bar
    bar.init if bar

    source_type = type if source_type.nil?

    type_swap_key = [source_type.to_s, type.to_s] * "_"

    same_type = source_type.to_s == type.to_s

    # Fast path: nothing needs to be done per line, so let the destination
    # load the raw stream itself.
    if data && data.respond_to?(:load_stream) && 
        data.serializer.to_s.include?("String") &&
        same_type && 
        ! (head || kwargs[:cast] || kwargs[:positions] || (kwargs[:key] && kwargs[:key] != 0) || Proc === fix ) &&
        (kwargs[:sep].nil? || kwargs[:sep] == "\t")


      Log.debug "Loading #{Log.fingerprint stream} directly into #{Log.fingerprint data}"
      if first_line
        # Re-prepend the already consumed first line.
        full_stream = Open.open_pipe do |sin|
          sin.puts first_line
          Open.consume_stream(stream, false, sin)
        end
        data.load_stream(full_stream)
      else
        data.load_stream(stream)
      end

      return data
    end


    data = {} if data.nil?
    merge = false if type != :double && type != :flat
    line = first_line || stream.gets
    while line 
      break if head && head <= 0
      begin
        line.chomp!
        if Proc === fix
          line = fix.call line
        elsif fix
          line = Misc.fixutf8(line)
        end
        bar.tick if bar
        if type == :array || type == :line
          block.call line
          next
        end

        key, items = parse_line(line, type: source_type, field_names: field_names, **kwargs)

        next if key.nil?

        if Array === key
          keys = key
          if one2one
            # One value set per key, falling back to the first value when a
            # field has fewer values than there are keys.
            key_items = keys.length.times.collect{|i| items.collect{|list| [list[i] || list[0]] } }
          else
            key_items = false
          end
        else
          keys = [key]
          key_items = false
        end

        keys.each_with_index do |key,i|
          if key_items
            these_items = key_items[i]
          else
            these_items = items
          end

          # Convert from the parsed shape (source_type) to the stored one.
          these_items = 
            case type_swap_key
            when "single_single"
              these_items
            when "list_single"
              these_items.first
            when "flat_single"
              these_items.first
            when "double_single"
              these_items.first.first
            when "single_list"
              [these_items]
            when "list_list"
              these_items
            when "flat_list"
              these_items
            when "double_list"
              these_items.collect{|l| l.first }
            when "single_flat"
              [these_items]
            when "list_flat"
              these_items
            when "flat_flat"
              these_items
            when "double_flat"
              these_items.flatten
            when "single_double"
              [[these_items]]
            when "list_double"
              these_items.collect{|l| l.nil? ? [] : [l] }
            when "flat_double"
              [these_items]
            when "double_double"
              these_items
            end

          if block_given?
            res = block.call(key, these_items, field_names)
            data[key] = res unless res.nil? || FalseClass === data
            next
          end

          if ! merge || ! data.include?(key)
            these_items = these_items.collect{|i| i.empty? ? [nil] : i } if type == :double && one2one
            data[key] = these_items
          elsif type == :double
            current = data[key]
            if merge == :concat
              these_items.each_with_index do |new,i|
                new = [nil] if new.empty?
                current[i].concat(new)
              end
            else
              merged = []
              these_items.each_with_index do |new,i|
                new = [nil] if new.empty?
                merged[i] = current[i] + new
              end
              data[key] = merged
            end
          elsif type == :flat
            current = data[key]
            if merge == :concat
              # A flat entry is a single array of values: append to the
              # array itself, not to one of its elements.
              current.concat these_items
            else
              data[key] = current + these_items
            end
          end
        end
      rescue Exception
        # Prefer the exception recorded by the stream, and make sure the
        # stream is aborted before re-raising.
        raise stream.stream_exception if stream.respond_to?(:stream_exception) && stream.stream_exception
        stream.abort($!) if stream.respond_to?(:abort)
        raise $!
      ensure
        # Runs on normal completion and on `next`: advance to the next line.
        head = head - 1 if head
        if stream.closed?
          line = nil
        else
          line = stream.gets 
        end
      end
    end
    data
  ensure
    if stream.respond_to?(:stream_exception) && stream.stream_exception
      bar.remove(stream.stream_exception)
    else
      bar.remove
    end if bar

    if stream.respond_to?(:join)
      eof = begin
              stream.eof?
            rescue IOError
              true
            end
      stream.join if eof
    end
  end
end

.paste_streams(streams, type: nil, sort: nil, sort_memory: nil, sep: nil, preamble: nil, header: nil, same_fields: nil, fix_flat: nil, all_match: nil, field_prefix: nil) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# File 'lib/scout/tsv/stream.rb', line 2

# Merge several key-ordered TSV streams side by side into a single stream.
#
# Each input is parsed for its header with TSV::Parser; a background thread
# then repeatedly takes the smallest current key among the inputs and emits
# one row made of the values of every input positioned at that key (inputs
# positioned at another key contribute nil placeholders).
#
# streams      - list of Step, Path, TSV::Dumper, TSV or stream objects.
# type         - type for the parsers and the output (defaults to :double).
# sort         - when truthy each input goes through Open.sort_stream first.
# sort_memory  - forwarded to Open.sort_stream as `memory`.
# sep          - separator handed to the parsers and used to split lines.
# preamble     - true to join the inputs' preambles, a String to use it as
#                preamble (a leading '+' appends it to the joined ones).
# same_fields  - all inputs share the same fields; values are grouped per
#                field instead of being concatenated.
# all_match    - only emit keys for which every input is at that same key.
# field_prefix - per-input prefixes, joined to each field name with ":".
# header, fix_flat - not referenced in this method body.
#
# Returns the dumper's stream, set up as a ConcurrentStream on the thread.
def self.paste_streams(streams, type: nil, sort: nil, sort_memory: nil, sep: nil, preamble: nil, header: nil, same_fields: nil, fix_flat: nil, all_match: nil, field_prefix: nil)

  # Resolve every input to the object lines will be read from.
  streams = streams.collect do |stream|
    case stream
    when(defined? Step and Step)
      stream.stream
    when Path
      stream.open
    when TSV::Dumper
      stream.stream
    when TSV
      stream.dumper_stream
    else
      stream
    end
  end.compact

  num_streams = streams.length

  streams = streams.collect do |stream|
    Open.sort_stream(stream, memory: sort_memory)
  end if sort

  begin

    # Per-input state, indexed by position in `streams`.
    lines         =[]
    fields        =[]
    sizes         =[]
    key_fields    =[]
    input_options =[]
    empty         =[]
    preambles     =[]
    parser_types  =[]

    type ||= :double

    streams = streams.collect do |stream|

      parser = TSV::Parser.new stream, type: type, sep: sep

      sfields = parser.fields

      if field_prefix
        index = streams.index stream
        prefix = field_prefix[index]

        sfields = sfields.collect{|f|[prefix, f]* ":"}
      end

      first_line = parser.first_line
      first_line = nil if first_line == ""

      lines         << first_line
      key_fields    << parser.key_field
      fields        << sfields
      # NOTE(review): nothing is appended when sfields is nil, so `sizes`
      # may not stay aligned with the other per-input arrays — confirm.
      sizes         << sfields.length if sfields
      input_options << parser.options
      preambles     << parser.preamble      if preamble and not parser.preamble.empty?
      parser_types  << parser.type

      empty         << stream               if parser.first_line.nil? || parser.first_line.empty?

      stream
    end


    all_fields = fields.dup

    key_field = key_fields.compact.first

    if same_fields
      fields = fields.first
    else
      fields = fields.compact.flatten
    end

    options = input_options.first 
    # NOTE(review): `type` was already defaulted to :double above, so none
    # of the three `||=` below can change it — confirm the intent.
    type ||= options[:type]
    type ||= :list if type == :single
    type ||= :double if type == :flat

    preamble_txt = case preamble
                   when TrueClass
                     preambles * "\n"
                   when String
                     if preamble[0]== '+'
                       preambles * "\n" + "\n" + preamble[1..-1]
                     else
                       preamble
                     end
                   else
                     nil
                   end

    empty_pos = empty.collect{|stream| streams.index stream}

    # Split the first line of every input into its key and value parts.
    keys =[]
    parts =[]
    lines.each_with_index do |line,i|
      if line.nil? || line.empty?
        keys[i]= nil
        parts[i]= nil
      else
        vs = line.chomp.split(sep, -1)
        key, *p = vs
        keys[i]= key
        parts[i]= p
      end
      sizes[i] ||= parts[i].length unless parts[i].nil?
    end
    done_streams =[]

    fields = nil if fields && fields.empty?
    dumper = TSV::Dumper.new key_field: key_field, fields: fields, type: type
    dumper.init(preamble: preamble_txt || !!key_field)

    t = Thread.new do
      # NOTE(review): this is the class-level setter, which applies to
      # threads created afterwards rather than to this thread — confirm.
      Thread.report_on_exception = false
      Thread.current["name"] = "Paste streams"

      last_min = nil
      # Keep going while at least one input still has a pending line.
      while lines.reject{|line| line.nil?}.any?
        min = keys.compact.sort.first
        break if min.nil?
        new_values =[]

        skip = all_match && keys.uniq !=[min]

        keys.each_with_index do |key,i|
          case key
          when min
            # This input is at the current key: take its values and advance.
            new_values << parts[i]

            begin
              line = lines[i]= begin
                                 streams[i].gets
                             rescue
                               Log.exception $!
                               nil
                             end
            if line.nil?
              keys[i]= nil
              parts[i]= nil
            else
              k, *p = line.chomp.split(sep, -1)
              p = p.collect{|e| e.nil? ? "" : e }

              # The same key appears again in this input: fold the new
              # values into the collected ones with "|" and read on.
              if k == keys[i]
                new_values = NamedArray.zip_fields(new_values).zip(p).collect{|p| [p.flatten * "|"] }
                raise TryAgain 
              end
              keys[i]= k
              parts[i]= p
            end
          rescue TryAgain
            keys[i]= nil
            parts[i]= nil
            Log.debug "Skipping repeated key in stream #{i}: #{key} - #{min}"
            retry
          end
        else
          # Input positioned at a different key: pad with nils.
          p = [nil] * sizes[i]
          new_values << p
        end
      end

      next if skip

      if same_fields
        # Group the values of all inputs by field position.
        new_values_same = []
        new_values.each do |list|
          list.each_with_index do |l,i|
            new_values_same[i] ||= []
            new_values_same[i] << l
          end
        end
        new_values = new_values_same
      else
        new_values = new_values.inject([]){|acc,l| acc.concat l }
      end

      dumper.add min, new_values
    end

    dumper.close

    streams.each do |stream|
      stream.close if stream.respond_to?(:close) && ! stream.closed?
      stream.join if stream.respond_to? :join
    end
    end
  rescue Aborted
    Log.error "Aborted pasting streams #{streams.inspect}: #{$!.message}"
    streams.each do |stream|
      stream.abort if stream.respond_to? :abort
    end
    raise $!
  rescue Exception
    Log.error "Exception pasting streams #{streams.inspect}: #{$!.message}"
    streams.each do |stream|
      stream.abort if stream.respond_to? :abort
    end
    raise $!
  end

  # Wait until the thread has started (it sets its "name" entry first thing).
  Thread.pass until t["name"]

  ConcurrentStream.setup(dumper.stream, threads: [t])
end

.pos_index(tsv_file, pos_field = nil, key_field: :key, bar: nil, **kwargs) ⇒ Object



161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# File 'lib/scout/tsv/index.rb', line 161

# Build a position index over +pos_field+ of +tsv_file+, persisted through
# Persist.persist as a FixWidthTable.
#
# tsv_file  - TSV object or anything TSV.traverse / TSV.open accept.
# pos_field - field holding the positions (cast with :to_i on traversal).
# key_field - field used as key of the traversal.
# bar       - progress bar; true builds a default description.
# kwargs    - `persist_*` keys configure persistence, `data_*` keys are
#             options for opening the file, the rest goes to TSV.traverse.
#
# Returns the FixWidthTable index.
def self.pos_index(tsv_file, pos_field = nil, key_field: :key, bar: nil, **kwargs)
  kwargs = IndiferentHash.add_defaults kwargs, unnamed: true
  # Called for its effect on kwargs; the extracted :type value is not used
  # here (presumably the key is removed so it is not forwarded — confirm).
  IndiferentHash.process_options kwargs, :type

  prefix = "PositionIndex[#{pos_field}]"

  prefix += select_prefix_str(kwargs[:select])

  persist_options = IndiferentHash.pull_keys kwargs, :persist
  persist_options = IndiferentHash.add_defaults persist_options, :prefix => prefix, :type => :fwt, :persist => true

  data_options = IndiferentHash.pull_keys kwargs, :data

  Persist.persist(tsv_file, persist_options[:type], persist_options.merge(other_options: kwargs.merge(pos_field: pos_field, key_field: key_field))) do |filename|
    # `! TSV === x` parses as `(!TSV) === x`, which is always false for a
    # real input, so the test needs explicit parentheses. The options are a
    # hash and are passed as such rather than splatted into pairs.
    tsv_file = TSV.open(tsv_file, **data_options) if data_options[:persist] && !(TSV === tsv_file)

    log_msg = "PositionIndex #{Log.fingerprint tsv_file} #{pos_field}"
    Log.low log_msg
    bar = log_msg if TrueClass === bar

    max_key_size = 0
    index_data = []
    TSV.traverse tsv_file, key_field: key_field, fields: [pos_field], type: :flat, cast: :to_i, bar: bar, **kwargs do |key, pos|
      # The table has fixed-width keys, so track the longest one.
      key_size = key.length
      max_key_size = key_size if key_size > max_key_size

      if Array === pos
        pos.each do |p|
          index_data << [key, p]
        end
      else
        index_data << [key, pos]
      end
    end

    filename = :memory if filename.nil?
    index = FixWidthTable.get(filename, max_key_size, false)
    index.add_point index_data
    index.read
    index
  end
end

.process_stream(stream, header_hash: "#", &block) ⇒ Object



199
200
201
202
203
204
205
206
207
# File 'lib/scout/tsv/open.rb', line 199

# Copy the leading header lines of +stream+ (those starting with
# +header_hash+) into a new pipe and then hand control to the block.
#
# The block receives the pipe's write end and the first line that is not a
# header (nil when the stream ran out). Returns what Open.open_pipe returns.
def self.process_stream(stream, header_hash: "#", &block)
  Open.open_pipe do |sin|
    line = stream.gets
    while line && line.start_with?(header_hash)
      sin.puts line
      line = stream.gets
    end
    yield sin, line
  end
end

.range_index(tsv_file, start_field = nil, end_field = nil, key_field: :key, bar: nil, **kwargs) ⇒ Object



117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/scout/tsv/index.rb', line 117

# Build a range index over the +start_field+ / +end_field+ pair of
# +tsv_file+, persisted through Persist.persist as a FixWidthTable.
#
# tsv_file    - TSV object or anything TSV.traverse / TSV.open accept.
# start_field - field holding the start of each range.
# end_field   - field holding the end of each range.
# key_field   - field used as key of the traversal.
# bar         - progress bar; true builds a default description.
# kwargs      - `persist_*` keys configure persistence, `data_*` keys are
#               options for opening the file, the rest goes to TSV.traverse.
#
# Returns the FixWidthTable index.
def self.range_index(tsv_file, start_field = nil, end_field = nil, key_field: :key, bar: nil, **kwargs)
  kwargs = IndiferentHash.add_defaults kwargs, unnamed: true
  # Called for its effect on kwargs; the extracted values are not used.
  # NOTE(review): :data_persist is consumed here, so `data_options[:persist]`
  # below may never be set for this method — confirm against process_options.
  IndiferentHash.process_options kwargs, :type, :data_persist

  prefix = "RangeIndex[#{start_field}-#{end_field}]"

  prefix += select_prefix_str(kwargs[:select])

  persist_options = IndiferentHash.pull_keys kwargs, :persist
  persist_options = IndiferentHash.add_defaults persist_options, :prefix => prefix, :type => :fwt, :persist => true

  data_options = IndiferentHash.pull_keys kwargs, :data

  Persist.persist(tsv_file, persist_options[:type], persist_options.merge(other_options: kwargs.merge(start_field: start_field, end_field: end_field, key_field: key_field))) do |filename|
    # `! TSV === x` parses as `(!TSV) === x`, which is always false for a
    # real input, so the test needs explicit parentheses. The options are a
    # hash and are passed as such rather than splatted into pairs.
    tsv_file = TSV.open(tsv_file, **data_options) if data_options[:persist] && !(TSV === tsv_file)

    log_msg = "RangeIndex #{Log.fingerprint tsv_file} #{[start_field, end_field]*"-"}"
    Log.low log_msg
    bar = log_msg if TrueClass === bar

    max_key_size = 0
    index_data = []
    TSV.traverse tsv_file, key_field: key_field, fields: [start_field, end_field], bar: bar, unnamed: true, **kwargs do |key, values|
      # The table has fixed-width keys, so track the longest one.
      key_size = key.length
      max_key_size = key_size if key_size > max_key_size

      start_pos, end_pos = values
      if Array === start_pos
        # Several ranges for one key: pair starts and ends by position.
        start_pos.zip(end_pos).each do |s,e|
          index_data << [key, [s.to_i, e.to_i]]
        end
      else
        index_data << [key, [start_pos.to_i, end_pos.to_i]]
      end
    end

    filename = :memory if filename.nil?
    index = FixWidthTable.get(filename, max_key_size, true)
    index.add_range index_data
    index.read
    index
  end
end

.select(key, values, method, fields: nil, field: nil, invert: false, type: nil, sep: nil, &block) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/scout/tsv/util/select.rb', line 2

# Decide whether an entry (+key+, +values+) matches the criterion +method+.
#
# method - the criterion:
#          nil      with a block: the block decides from (key, values).
#          Hash     `{field => criterion}`; an :invert entry negates.
#          Array    true when any candidate value is in the array.
#          Regexp   true when any candidate value matches.
#          Symbol   sent as a message to the first candidate value.
#          Numeric  true when there are more candidates than the number.
#          String   with a block: names the field ("key" for the key when
#                   no field has that name) whose value is yielded to the
#                   block; otherwise ">n" / "<n" / ">=n" / "<=n" compare
#                   numerically and anything else must equal a candidate.
#          Proc     true when it returns truthy for any candidate value.
# fields - field names, used to resolve fields given by name.
# field  - restricts the candidates to one field (:key for the key); by
#          default the key and every value are candidates.
# invert - negate the result.
# type   - with :double, values are split by +sep+ before matching.
#
# Returns a truthy / falsy value.
def self.select(key, values, method, fields: nil, field: nil, invert: false, type: nil, sep: nil, &block)
  # `fields` must travel with the recursive call, otherwise fields given by
  # name can no longer be resolved when inverting.
  return ! select(key, values, method, fields: fields, field: field, invert: false, type: type, sep: sep, &block) if invert

  return yield(key, values) if method.nil? && block_given?

  if Hash === method
    if method.include?(:invert)
      method = method.dup
      invert = method.delete(:invert)
      return select(key, values, method, fields: fields, field: field, invert: invert, type: type, sep: sep, &block)
    end
    field = method.keys.first
    value = method[field]
    return select(key, values, value, fields: fields, field: field, invert: invert, type: type, sep: sep, &block)
  end

  if field
    field = NamedArray.identify_name(fields, field) if fields && String === field
    set = field == :key ? [key] : (type == :double ? values[field].split(sep) : values[field])
  else
    set = [key, (type == :double ? values.collect{|v| v.split(sep) } : values)]
  end

  if Array === set
    set.flatten!
  else
    set = [set]
  end

  case method
  when Array
    (method & set).any?
  when Regexp
    set.select{|v| v =~ method }.any?
  when Symbol
    set.first.send(method)
  when Numeric
    set.size > method
  when String
    if block_given?
      # Resolve the named field to a position when field names are known;
      # otherwise fall back to indexing the values by name.
      pos = fields.index(method) if fields
      target = if pos
                 values[pos]
               elsif method == "key"
                 key
               else
                 values[method]
               end

      case block.arity
      when 1
        yield(target)
      when 2
        yield(key, target)
      end
    elsif m = method.match(/^([<>]=?)(.*)/)
      operator, threshold = m[1], m[2].to_f
      set.select{|v| v.to_f.send(operator, threshold) }.any?
    else
      set.select{|v| v == method }.any?
    end
  when Proc
    set.select{|v| method.call(v) }.any?
  end
end

.select_prefix_str(select) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/scout/tsv/index.rb', line 6

# Render a `select` criterion as a short tag for persistence prefixes.
#
# Arrays of pairs become "a=b,c=d", flat arrays are joined with "=", hashes
# become "key=value,...". Anything else, an empty collection, or an error
# while rendering gives no tag.
#
# Returns "[select:...]" or "" when there is nothing to render.
def self.select_prefix_str(select)
  str = begin
          if Array === select
            head = select.first
            if head.nil?
              nil
            elsif Array === head
              select.collect{|p| p * "="}*","
            else
              select.collect{|p| p.to_s }*"="
            end
          elsif Hash === select
            unless select.empty?
              pairs = select.collect{|key,value| [key.to_s, value.to_s] * "=" }
              pairs * ","
            end
          end
        rescue
          Log.warn "Error in select_prefix_str: #{Log.fingerprint(select)}: #{$!.message}"
          nil
        end

  return "" if str.nil?

  "[select:#{str}]"
end

.setup(obj, *rest, &block) ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/scout/tsv.rb', line 36

# Set up +obj+ through `original_setup`, accepting two conveniences:
#
# * a single String argument is expanded with TSV.str2options;
# * an Array +obj+ is first converted with Misc.array2hash, using [] as the
#   default value for :double, :flat, :list or unspecified types and nil
#   otherwise.
#
# Afterwards `save_annotation_hash` is called when +obj+ responds to it.
# Returns +obj+ (the converted hash when an Array was given).
def setup(obj, *rest, &block)
  from_string = rest.length == 1 && String === rest.first
  options = TSV.str2options(rest.first) if from_string

  if Array === obj
    unless from_string
      options = Hash === rest.first ? rest.first : {}
    end
    array_default = [:double, :flat, :list, nil].include?(options[:type])
    obj = Misc.array2hash(obj, array_default ? [] : nil)
  end

  if from_string
    original_setup(obj, options, &block)
  else
    original_setup(obj, *rest, &block)
  end

  obj.save_annotation_hash if obj.respond_to?(:save_annotation_hash)

  obj
end

.str2options(str) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
# File 'lib/scout/tsv.rb', line 21

# Turn a compact description such as "Key~Field1, Field2#:type=double" into
# an options hash.
#
# The part before "#" gives the key field and, after "~", the comma
# separated fields. The part after "#" is parsed with
# IndiferentHash.string2hash; a bare word there is taken as the type.
#
# Returns a Hash with :key_field, :fields and the parsed extra options.
def self.str2options(str)
  field_part, _sep, extra = str.partition("#")
  key, field_list = field_part.split("~")

  fields = field_list.nil? ? [] : field_list.split(/,\s*/)

  # A lone word (optionally prefixed by ":") is shorthand for the type.
  extra = ":type=" + extra if extra =~ /^:?\w+$/
  extra_options = extra.nil? ? {} : IndiferentHash.string2hash(extra)

  {:key_field => key, :fields => fields}.merge(extra_options)
end

.str_setup(option_str, obj) ⇒ Object



71
72
73
74
# File 'lib/scout/tsv.rb', line 71

# Set up +obj+ using the options described by +option_str+ (parsed with
# TSV.str2options and passed to `setup` as keyword arguments).
def self.str_setup(option_str, obj)
  setup(obj, **TSV.str2options(option_str))
end

.translate(tsv, field, format, identifiers: nil, one2one: false, merge: true, stream: false, keep: false, persist_index: true) ⇒ Object



108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# File 'lib/scout/tsv/change_id/translate.rb', line 108

# Replace the identifiers found in +field+ of +tsv+ by their equivalents in
# +format+, using an index built with `translation_index`.
#
# identifiers   - identifier files (defaults to tsv.identifier_files).
# one2one       - forwarded to the traversal and to the final `tsv` call.
# merge         - forwarded to the final `tsv` call.
# stream        - when truthy the transformer is returned instead of a TSV.
# persist_index - passed to `translation_index` as its :persist option.
# keep          - not referenced in this method body.
#
# Returns the transformer, or the TSV it produces.
def self.translate(tsv, field, format, identifiers: nil, one2one: false, merge: true, stream: false, keep: false, persist_index: true)

  identifiers ||= tsv.identifier_files
  sources = [tsv, identifiers].flatten.compact
  index = translation_index(sources, field, format, persist: persist_index)

  key_field, *fields = TSV.all_fields(tsv)
  translating_key = field == key_field
  new_key_field = translating_key ? format : key_field
  new_fields = translating_key ? fields : fields.collect{|f| f == field ? format : f }

  # :key when the key itself is translated, otherwise the position of the
  # translated field among the values.
  field_pos = new_key_field == key_field ? new_fields.index(format) : :key

  transformer = TSV::Transformer.new tsv
  transformer.key_field = new_key_field
  transformer.fields = new_fields
  transformer.traverse one2one: one2one, unnamed: true do |k,v|
    next [index[k], v] if field_pos == :key

    v = v.dup
    current = v[field_pos]
    v[field_pos] = Array === current ? index.values_at(*current).compact : index[current]
    [k, v]
  end

  return transformer if stream

  transformer.tsv(merge: merge, one2one: one2one)
end

.translation_index(files, source, target, persist_options = {}) ⇒ Object



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/scout/tsv/change_id/translate.rb', line 49

# Build (through Persist.persist, as an "HDB") an index translating values
# of the +source+ field into the +target+ field, chaining the given files
# along the path found by `translation_path`.
#
# files           - one or several files / TSV objects.
# source          - source field; nil indexes every field against the target.
# target          - target field.
# persist_options - persistence options; :persist and :prefix get defaults.
#
# Returns nil when source and target are the same, otherwise the index.
def self.translation_index(files, source, target, persist_options = {})
  return nil if source == target
  persist_options = IndiferentHash.add_defaults persist_options.dup, :persist => true, :prefix => "Translation index"

  file_fields = {}

  files = [files] unless Array === files

  # Record the fields of every file, ignoring Paths that do not exist.
  files.each do |file|
    next if Path === file && ! Open.exist?(file)
    file = file.find if Path === file
    file_fields[file] = all_fields(file)
  end

  begin
    path = translation_path(file_fields, source, target)
  rescue
    # NOTE(review): the retry repeats the same call with the same arguments;
    # when it fails again the first exception is the one raised.
    exception = $!
    begin
      path = translation_path(file_fields, source, target)
    rescue
      raise exception
    end
  end

  name = [source || "all", target] * "->" + " (#{files.length} files - #{Misc.digest(files)})"
  # With more than one file in the path, the first file is indexed against
  # the first field it has in common with the second file.
  second_target = if path.length == 1
                    target
                  else
                    file1, file2 = path.values_at 0, 1
                    pos = NamedArray.identify_name(TSV.all_fields(file1), TSV.all_fields(file2))
                    TSV.all_fields(file1)[pos.compact.first]
                  end
  Persist.persist(name, "HDB", persist_options) do 
    index = path.inject(nil) do |acc,file|
      if acc.nil?
        # First file of the path: the starting table.
        if source.nil?
          if TSV === file
            acc = file.index target: second_target
          else
            acc = TSV.index(file, target: second_target)
          end
        else
          if TSV === file
            acc = (file.key_field == source || source.nil?) ? file.annotate(file.dup) : file.reorder(source)
          else
            acc = TSV.open(file, key_field: source)
          end
        end
      else
        # Following files are attached to what has been built so far.
        acc = acc.attach file, insitu: false
      end

      acc
    end
    index.slice([target]).to_single
  end
end

.translation_path(file_fields, source, target) ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/scout/tsv/change_id/translate.rb', line 18

# Find which files to chain in order to go from the +source+ field to the
# +target+ field.
#
# file_fields - Hash of file => list of its fields.
# source      - source field; nil means any file may be the starting point.
# target      - target field.
#
# Returns one file when a single file has both fields, two files when a
# source file and a target file share a field, or three files when a third
# one bridges them. Raises when no such path exists.
def self.translation_path(file_fields, source, target)
  files_with = lambda do |name|
    file_fields.select{|_file, fields| identify_field_in_obj(fields, name) }.keys
  end

  target_files = files_with.call(target)
  source_files = source.nil? ? file_fields.keys : files_with.call(source)

  if source
    direct = target_files & source_files
    return [direct.first] if direct.any?
  end

  source_fields = file_fields.values_at(*source_files).flatten
  target_fields = file_fields.values_at(*target_files).flatten

  # First of the given files having any of the given field names.
  first_sharing = lambda do |files, names|
    files.find{|file| (file_fields[file] & names).any? }
  end

  common_fields = source_fields & target_fields
  if common_fields.any?
    return [first_sharing.call(source_files, common_fields), first_sharing.call(target_files, common_fields)]
  end

  middle_file, middle_fields = file_fields.find do |_file, fields|
    (fields & source_fields).any? && (fields & target_fields).any?
  end

  unless middle_file
    raise "Could not traverse identifier path from #{Log.fingerprint source} to #{Log.fingerprint target} in #{Log.fingerprint file_fields}"
  end

  [first_sharing.call(source_files, middle_fields), middle_file, first_sharing.call(target_files, middle_fields)]
end

.traverse(*args, **kwargs, &block) ⇒ Object



195
196
197
# File 'lib/scout/tsv/open.rb', line 195

# Convenience entry point: hands every argument, keyword and the block over
# to Open.traverse unchanged and returns its result.
def self.traverse(*args, **kwargs, &block)
  Open.traverse(*args, **kwargs, &block)
end

.unzip(source, field, target: nil, sep: ":", delete: true, type: :list, merge: false, one2one: true, bar: nil) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/scout/tsv/util/unzip.rb', line 3

# Move the values of +field+ into the key: every entry is re-keyed as
# "<key><sep><field value>", producing one entry per value when the source
# is :double.
#
# source  - a TSV-like object, or a String that is wrapped in a TSV::Parser.
# field   - the field to fold into the key.
# target  - :stream to dump through a TSV::Dumper, nil for a new TSV, or an
#           object to fill in.
# sep     - separator between the original key and the field value.
# delete  - remove the field from the values and from the field list.
# type    - type of the result (forced to :double when +merge+ is set).
# merge   - forwarded to the final `tsv` call.
# one2one - for :double sources, pair each field value with the values at
#           the same position instead of with all the values.
# bar     - progress bar; true builds a default description.
#
# Returns the transformer when target is :stream, otherwise the built TSV.
def self.unzip(source, field, target: nil, sep: ":", delete: true, type: :list, merge: false, one2one: true, bar: nil)
  source = TSV::Parser.new source if String === source

  field_pos = source.identify_field(field)
  new_fields = source.fields.dup
  field_name = new_fields[field_pos]
  new_fields.delete_at(field_pos) if delete
  new_key_field = [source.key_field, field_name] * sep
  type = :double if merge

  # Remember the request before `target` is replaced by the actual object.
  stream = target == :stream

  target = case target
           when :stream
             TSV::Dumper.new(source.options.merge(sep: "\t"))
           when nil
             TSV.setup({})
           else
             target
           end

  target.fields = new_fields
  target.key_field = new_key_field
  target.type = type

  transformer = TSV::Transformer.new source, target, unnamed: true

  bar = "Unzip #{new_key_field}" if TrueClass === bar

  transformer.traverse unnamed: true, one2one: one2one, bar: bar do |k,v|
    if source.type == :double
      if one2one
        # One new entry per position across the fields.
        res = NamedArray.zip_fields(v).collect do |_v|
          field_value = _v[field_pos]

          if delete
            new_values = _v.dup
            new_values.delete_at field_pos
          else
            new_values = _v
          end

          new_key = [k,field_value] * sep
          new_values = new_values.collect{|e| [e] } if transformer.type == :double
          [new_key, new_values]
        end
      else
        # Every new entry gets the complete set of values.
        all_values = v.collect{|e| e.dup }
        all_values.delete_at field_pos if delete
        res = NamedArray.zip_fields(v).collect do |_v|
          field_value = _v[field_pos]

          new_key = [k,field_value] * sep
          # NOTE(review): new_values is nil here unless the transformer type
          # is :double — confirm that is intended for other types.
          new_values = all_values if transformer.type == :double
          [new_key, new_values]
        end
      end

      MultipleResult.setup(res)
    else
      field_value = v[field_pos]

      if delete
        new_values = v.dup
        new_values.delete_at field_pos
      else
        new_values = v
      end

      new_key = [k,field_value] * sep

      new_values = new_values.collect{|e| [e] } if transformer.type == :double

      [new_key, new_values]
    end
  end

  stream ? transformer : transformer.tsv(merge: merge)
end

Instance Method Details

#[](key, *rest) ⇒ Object



56
57
58
59
60
# File 'lib/scout/tsv/util.rb', line 56

# Fetch the entry for +key+ through the parent implementation and, for
# Array values of named non-flat tables, set it up as a NamedArray with the
# table's fields and the key.
def [](key, *rest)
  v = super(key, *rest)
  named = !@unnamed && @type != :flat
  NamedArray.setup(v, @fields, key) if named && Array === v
  v
end

#add_field(name = nil) ⇒ Object



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/scout/tsv/util/process.rb', line 46

# Append a new field whose value for each entry is computed by the block
# from (key, values). For :double tables a non-Array result is wrapped in an
# array (nil becomes an empty one). Entries without values are padded with
# nils for the existing fields.
#
# name - name of the new field; only recorded when the table has fields.
#
# Returns self.
def add_field(name = nil)
  through do |key, values|
    addition = yield(key, values)
    addition = [addition].compact if type == :double && !(Array === addition)

    if values.nil?
      values = if fields.nil? || fields.empty?
                 [addition]
               else
                 [nil] * fields.length + [addition]
               end
    elsif Array === values
      values = values + [addition]
    else
      values << addition
    end

    self[key] = values
  end

  unless fields.nil? || name.nil?
    self.fields = self.fields + [name]
  end

  self
end

#all_fields ⇒ Object



147
148
149
150
# File 'lib/scout/tsv/util.rb', line 147

# The key field followed by the value fields, or an empty list when the
# table has no fields defined.
def all_fields
  if @fields.nil?
    []
  else
    [@key_field].concat(@fields)
  end
end

#attach(*args, **kwargs) ⇒ Object



225
226
227
# File 'lib/scout/tsv/attach.rb', line 225

# Instance-level shortcut for TSV.attach: this object is passed as the
# source, followed by the given arguments and keywords.
def attach(*args, **kwargs)
  TSV.attach(self, *args, **kwargs)
end

#change_id(*args, **kwargs) ⇒ Object



40
41
42
# File 'lib/scout/tsv/change_id.rb', line 40

# Instance-level shortcut for TSV.change_id: this object is passed as the
# first argument, followed by the given arguments and keywords.
def change_id(*args, **kwargs)
  TSV.change_id(self, *args, **kwargs)
end

#change_key(*args, **kwargs) ⇒ Object



26
27
28
# File 'lib/scout/tsv/change_id.rb', line 26

# Instance-level shortcut for TSV.change_key: this object is passed as the
# first argument, followed by the given arguments and keywords.
def change_key(*args, **kwargs)
  TSV.change_key(self, *args, **kwargs)
end

#chunked_values_at(keys, max = 5000) ⇒ Object



266
267
268
269
270
271
272
# File 'lib/scout/tsv/util/select.rb', line 266

# Like `values_at`, but looks the keys up in chunks produced by
# Misc.ordered_divide(keys, max) and concatenates the results in order.
# The first chunk's result, when it responds to `annotate`, is used to
# annotate the accumulated array.
def chunked_values_at(keys, max = 5000)
  Misc.ordered_divide(keys, max).each_with_object([]) do |chunk, found|
    batch = self.values_at(*chunk)
    batch.annotate found if batch.respond_to?(:annotate) && found.empty?
    found.concat(batch)
  end
end

#collapse_stream(*args, **kwargs, &block) ⇒ Object



217
218
219
# File 'lib/scout/tsv/open.rb', line 217

# Instance-level shortcut for TSV.collapse_stream: the stream passed on is
# this object's `dumper_stream`, followed by the given arguments, keywords
# and block.
def collapse_stream(*args, **kwargs, &block)
  TSV.collapse_stream(self.dumper_stream, *args, **kwargs, &block)
end

#collect(*args, &block) ⇒ Object



96
97
98
99
100
101
102
103
104
105
106
# File 'lib/scout/tsv/util.rb', line 96

# With a block, gather the block's result for every (key, value) pair
# visited by `each`; without one, defer to the parent implementation.
def collect(*args, &block)
  return super(*args) unless block_given?

  results = []
  each { |k, v| results.push(yield(k, v)) }
  results
end

#column(field, **kwargs) ⇒ Object



47
48
49
50
51
52
53
54
55
56
57
# File 'lib/scout/tsv/util/reorder.rb', line 47

# Extract a single field with `slice`, forcing the result type: :flat for
# :double and :flat tables, :single for the rest. Other keywords are passed
# on to `slice`.
def column(field, **kwargs)
  multi_valued = [:double, :flat].include?(type)
  kwargs[:type] = multi_valued ? :flat : :single
  slice(field, **kwargs)
end

#digest_str ⇒ Object



168
169
170
# File 'lib/scout/tsv/util.rb', line 168

# Text used to digest this table: the fingerprints of its fields, keys and
# values, in that order, wrapped as "TSV:{...;...;...}".
def digest_str
  fields_print = Log.fingerprint(self.all_fields || [])
  keys_print = Log.fingerprint(self.keys)
  values_print = Log.fingerprint(self.values)
  "TSV:{#{fields_print};#{keys_print};#{values_print}}"
end

#dumper_stream(options = {}) ⇒ Object Also known as: stream



147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# File 'lib/scout/tsv/dumper.rb', line 147

# Dumps this TSV through a TSV::Dumper.
#
# Options consumed here (everything in +options+ is also merged into the
# dumper's settings):
# * :preamble - passed to dumper.init (defaults to true)
# * :unmerge  - for :double TSVs, emit one line per zipped value tuple;
#               with :expand, single-valued columns are repeated to match
#               the longest column (defaults to false)
# * :keys     - when given, only these keys are dumped, in this order
# * :stream   - when given, the dump is written synchronously into it and
#               it is returned; otherwise a background thread feeds the
#               dumper's own stream, which is returned
def dumper_stream(options = {})
  preamble, unmerge, keys, stream = IndiferentHash.process_options options, 
    :preamble, :unmerge, :keys, :stream,
    :preamble => true, :unmerge => false
  # Unmerging only makes sense when values are lists of lists
  unmerge = false unless @type === :double
  dumper = TSV::Dumper.new self.annotation_hash.merge(options)

  # Writes one entry, splitting it into several lines when unmerging
  dump_entry = Proc.new do |k,value_list|
    if unmerge
      max = value_list.collect{|v| v.length}.max

      if unmerge == :expand and max > 1
        value_list = value_list.collect do |values|
          if values.length == 1
            [values.first] * max
          else
            values
          end
        end
      end

      NamedArray.zip_fields(value_list).each do |values|
        dumper.add k, values
      end
    else
      dumper.add k, value_list
    end
  end

  if stream.nil?
    # Asynchronous mode: the thread produces, the caller consumes the stream
    t = Thread.new do 
      begin
        Thread.current.report_on_exception = true
        Thread.current["name"] = "Dumper thread"
        dumper.init(preamble: preamble)

        if keys
          keys.each do |k|
            dump_entry.call k, self[k]
          end
        else
          self.each &dump_entry
        end

        dumper.close
      rescue
        # Hand the error over to the dumper instead of killing the thread silently
        dumper.abort($!)
      end
    end
    # Wait until the thread has started (it sets its "name" first thing)
    Thread.pass until t["name"]
    stream = dumper.stream
    ConcurrentStream.setup(stream, :threads => [t])
    stream
  else
    # Synchronous mode: write straight into the caller-provided stream
    dumper.set_stream stream
    begin
      dumper.init(preamble: preamble)
      if keys
        keys.each do |k|
          dump_entry.call k, self[k]
        end
      else
        self.each &dump_entry
      end

      dumper.close
    rescue
      dumper.abort($!)
    end
    stream
  end
end

#each(*args, &block) ⇒ Object



85
86
87
88
89
90
91
92
93
94
# File 'lib/scout/tsv/util.rb', line 85

# Iterates over the entries through the parent #each.
#
# With a block, Array values are first passed to NamedArray.setup with this
# TSV's fields, unless the TSV is unnamed or of type :flat. Without a
# block it simply defers to the parent implementation.
def each(*args, &block)
  return super(*args) unless block_given?

  super(*args) do |k, v|
    NamedArray.setup(v, @fields) if !@unnamed && @type != :flat && Array === v
    block.call(k, v)
  end
end

#filter(filter_dir = nil) ⇒ Object



285
286
287
288
289
290
# File 'lib/scout/tsv/util/filter.rb', line 285

# Turns this TSV into a filtered one: extends it with Filtered, records
# +filter_dir+, starts with an empty list of filters and returns self.
def filter(filter_dir = nil)
  self.extend Filtered
  self.filter_dir = filter_dir
  self.filters = []
  self
end

#fingerprint ⇒ Object



164
165
166
# File 'lib/scout/tsv/util.rb', line 164

# Short identifying string: the Log.fingerprint of all field names and of
# the keys, separated by ";" inside "TSV:{...}" (values are not included).
def fingerprint
  "TSV:{#{Log.fingerprint(self.all_fields || [])};#{Log.fingerprint(self.keys)}}"
end

#identifier_files ⇒ Object



229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# File 'lib/scout/tsv/attach.rb', line 229

# Lists the identifier sources that apply to this TSV.
#
# When +identifiers+ is set it wins: a TSV is wrapped in an array; an array
# is returned as is when empty or when its first element is a TSV, otherwise
# its elements are turned into Paths; anything else becomes a one-element
# array holding a Path. Without +identifiers+ the directory of +filename+
# is asked for its identifiers; with neither, the result is empty.
#
# @return [Array]
def identifier_files
  ids = identifiers
  if ids
    return [ids] if TSV === ids
    return [Path === ids ? ids : Path.setup(ids)] unless Array === ids
    return ids if TSV === ids.first || ids.empty?
    return ids.collect { |f| Path === f ? f : Path.setup(f) }
  end

  file = filename
  if Path === file
    # Only keep the identifier files that actually exist
    [file.dirname.identifiers].flatten.compact.select { |f| f.exists? }
  elsif file
    [Path.setup(file.dup).dirname.identifiers]
  else
    []
  end
end

#identify_field(name, strict: nil) ⇒ Object



52
53
54
# File 'lib/scout/tsv/util.rb', line 52

# Resolves +name+ against this TSV's key field and fields by delegating to
# TSV.identify_field; +strict+ is forwarded untouched.
def identify_field(name, strict: nil)
  TSV.identify_field(@key_field, @fields, name, strict: strict)
end

#index(*args, **kwargs, &block) ⇒ Object



113
114
115
# File 'lib/scout/tsv/index.rb', line 113

# Instance-level shortcut for TSV.index: forwards all arguments and the
# block, passing this TSV as the first argument.
def index(*args, **kwargs, &block)
   TSV.index(self, *args, **kwargs, &block)
end

#inspect ⇒ Object



172
173
174
# File 'lib/scout/tsv/util.rb', line 172

# Uses #fingerprint as the inspect string.
def inspect
  fingerprint
end

#melt_columns(value_field, column_field) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
# File 'lib/scout/tsv/util/melt.rb', line 2

# Melts the TSV into long format: one row per (key, column) pair.
#
# The result is a :list TSV keyed by "<key>:<column index>" whose fields
# are the original key field, +value_field+ and +column_field+; each row
# holds the original key, the cell value and the name of its column.
def melt_columns(value_field, column_field)
  melted = TSV.setup({}, :key_field => "ID", :fields => [key_field, value_field, column_field], :type => :list, :cast => cast)
  each do |k, values|
    values.zip(fields).each_with_index do |(v, f), i|
      melted["#{k}:#{i}"] = [k, v, f]
    end
  end
  melted
end

#merge(other) ⇒ Object



176
177
178
# File 'lib/scout/tsv/util.rb', line 176

# Merges +other+ through the parent implementation and passes the result to
# #annotate (presumably so the merged hash carries this TSV's annotations —
# confirm against Annotation#annotate).
def merge(other)
  self.annotate(super(other))
end

#options ⇒ Object



62
63
64
# File 'lib/scout/tsv/util.rb', line 62

# Alias-like accessor: returns this TSV's annotation_hash.
def options
  annotation_hash
end

#page(pnum, psize, field = nil, just_keys = false, reverse = false, &block) ⇒ Object



148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/scout/tsv/util/sort.rb', line 148

# Returns one page of the TSV after sorting it with #sort_by.
#
# @param pnum [Integer] 1-based page number
# @param psize [Integer] entries per page
# @param field field to sort by; nil and "key" both mean the key
# @param just_keys when true return only the page's keys, otherwise the
#   result of selecting those keys
# @param reverse when true the sorted keys are reversed before paging
def page(pnum, psize, field = nil, just_keys = false, reverse = false, &block)
  window = (psize * (pnum - 1))..(psize * pnum - 1)
  field = :key if field == "key"
  keys = sort_by(field || :key, true, &block)
  keys.reverse! if reverse

  page_keys = keys[window]
  just_keys ? page_keys : select(:key => page_keys)
end

#pos_index(*args, **kwargs, &block) ⇒ Object



208
209
210
# File 'lib/scout/tsv/index.rb', line 208

# Instance-level shortcut for TSV.pos_index: forwards all arguments and the
# block, passing this TSV as the first argument.
def pos_index(*args, **kwargs, &block)
  TSV.pos_index(self, *args, **kwargs, &block)
end

#process(field, &block) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/scout/tsv/util/process.rb', line 2

# Rewrites the values of +field+ in place using the block's result.
#
# The block receives, depending on its arity, (field_values),
# (field_values, key) or (field_values, key, values). For :single and :flat
# TSVs the whole entry is the field value and is replaced by the result;
# otherwise only the position of +field+ is updated and the entry is stored
# back. Entries with nil values are skipped for non :single/:flat types.
#
# @return [self]
# @raise [RuntimeError] when the block arity is not 1, 2 or 3
def process(field, &block)
  field_pos = identify_field field

  through do |key, values|
    whole_entry = [:single, :flat].include?(type)
    next if !whole_entry && values.nil?

    field_values = whole_entry ? values : values[field_pos]

    new_values = case block.arity
                 when 1 then yield(field_values)
                 when 2 then yield(field_values, key)
                 when 3 then yield(field_values, key, values)
                 else
                   raise "Unexpected arity in block, must be 1, 2 or 3: #{block.arity}"
                 end

    if [:single, :flat].include?(type)
      self[key] = new_values
    else
      previous = values[field_pos]
      same_kind = (String === previous && String === new_values) ||
                  (Array === previous && Array === new_values)
      # Reuse the existing object when possible so references stay valid
      if !previous.frozen? && same_kind
        previous.replace new_values
      else
        values[field_pos] = new_values
      end
      self[key] = values
    end
  end

  self
end

#range_index(*args, **kwargs, &block) ⇒ Object



204
205
206
# File 'lib/scout/tsv/index.rb', line 204

# Instance-level shortcut for TSV.range_index: forwards all arguments and
# the block, passing this TSV as the first argument.
def range_index(*args, **kwargs, &block)
  TSV.range_index(self, *args, **kwargs, &block)
end

#remove_duplicates(pivot = 0) ⇒ Object



73
74
75
76
77
78
79
# File 'lib/scout/tsv/util/process.rb', line 73

# Builds a copy of the TSV where repeated value tuples are removed.
#
# For every entry the values are zipped into tuples with
# NamedArray.zip_fields, de-duplicated, and zipped back into columns.
#
# @param pivot accepted for interface compatibility; not used here
def remove_duplicates(pivot = 0)
  deduplicated = annotate({})
  through do |k, values|
    unique_tuples = NamedArray.zip_fields(values).uniq
    deduplicated[k] = NamedArray.zip_fields(unique_tuples)
  end
  deduplicated
end

#reorder(key_field = nil, fields = nil, merge: true, one2one: true, **kwargs) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/scout/tsv/util/reorder.rb', line 4

# Builds a new TSV keyed by +key_field+ and holding +fields+, by traversing
# this one.
#
# When the result is :double and a key appears more than once, +merge+
# decides what happens: falsy overwrites the entry, :concat appends the new
# values to the stored arrays in place, and any other truthy value stores
# fresh arrays with the new values appended. Columns for which the new row
# is empty keep the values already stored. For :flat results values are
# always accumulated.
#
# @param key_field field to use as the new key (passed to #traverse)
# @param fields fields to keep (passed to #traverse)
# @param merge [Boolean, Symbol] see above
# @param one2one forwarded to #traverse as :one2one
# @param kwargs extra #traverse options; :type also sets the result type
# @return the new TSV, with key_field and fields taken from #traverse
def reorder(key_field = nil, fields = nil, merge: true, one2one: true, **kwargs)
  res = self.annotate({})
  res.type = kwargs[:type] if kwargs.include?(:type)
  kwargs[:one2one] = one2one
  key_field_name, field_names = with_unnamed do
    traverse key_field, fields, **kwargs do |k,v|
      if res.type == :double && merge && res.include?(k)
        current = res[k]
        if merge == :concat
          v.each_with_index do |new,i|
            next if new.empty?
            current[i].concat(new)
          end
        else
          # An empty incoming column must not wipe what is already stored
          res[k] = v.each_with_index.map do |new, i|
            new.empty? ? current[i] : current[i] + new
          end
        end
      elsif res.type == :flat
        res[k] ||= []
        if merge == :concat
          res[k].concat v
        else
          res[k] += v
        end
      else
        res[k] = v
      end
    end
  end

  res.key_field = key_field_name
  res.fields = field_names
  res
end

#reset_filters ⇒ Object



292
293
294
295
296
297
298
299
300
301
# File 'lib/scout/tsv/util/filter.rb', line 292

# Clears the state of the filters.
#
# Without a filter directory (nil or empty) every filter in @filters is
# reset, provided @filters is an Array, and nil is returned. With a filter
# directory, every '*.filter' file directly inside it is removed.
def reset_filters
  if @filter_dir.nil? || @filter_dir.empty?
    @filters.each(&:reset) if Array === @filters
    return
  end

  Dir.glob(File.join(@filter_dir, '*.filter')).each { |f| FileUtils.rm f }
end

#select(method = nil, invert = false, &block) ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# File 'lib/scout/tsv/util/select.rb', line 68

# Builds a new TSV with the entries that match a criterion.
#
# +method+ selects the matching strategy:
# * nil with a block: keep entries for which the block, called with
#   (key, values), is truthy.
# * Array: keep entries whose key or any value is in the array.
# * Regexp: keep entries whose key or any value matches.
# * String/Symbol with a block: the block receives the key (when +method+
#   names the key field or is :key) or the value(s) of that field; a
#   two-argument block receives the key first. Only arities 1 and 2 are
#   handled.
# * String/Symbol without a block: keep entries whose key or any value
#   equals +method+.
# * Hash (first pair only) field => criterion, where the criterion can be
#   an Array (membership), a Regexp, a "name:..." String (matched against
#   the values' #name, optionally as "/regexp/"), a comparison String such
#   as ">2" or "<=0.05" (numeric comparison of the first value), any other
#   String (equality), a Numeric (minimum number of values) or a Proc.
#
# @param method the criterion, see above
# @param invert when truthy the complement of the matching entries is kept
# @return a new TSV carrying this TSV's annotations
def select(method = nil, invert = false, &block)
  new = TSV.setup({}, :key_field => key_field, :fields => fields, :type => type, :filename => filename, :identifiers => identifiers)

  self.annotate(new)

  case
  when (method.nil? and block_given?)
    through do |key, values|
      new[key] = values if invert ^ (yield key, values)
    end
  when Array === method
    method = Set.new method
    with_unnamed do
      case type
      when :single
        through do |key, value|
          new[key] = value if invert ^ (method.include? key or method.include? value)
        end
      when :list, :flat
        through do |key, values|
          new[key] = values if invert ^ (method.include? key or (method & values).length > 0)
        end
      else
        through do |key, values|
          new[key] = values if invert ^ (method.include? key or (method & values.flatten).length > 0)
        end
      end
    end
  when Regexp === method
    with_unnamed do
      through do |key, values|
        new[key] = values if invert ^ ([key,values].flatten.select{|v| v =~ method}.any?)
      end
    end
  when ((String === method) || (Symbol === method))
    if block_given?
      case 
      when block.arity == 1
        with_unnamed do
          case
          when (method == key_field or method == :key)
            through do |key, values|
              new[key] = values if invert ^ (yield(key))
            end
          when (type == :single or type == :flat)
            through do |key, value|
              new[key] = value if invert ^ (yield(value))
            end
          else
            pos = identify_field method
            raise "Field #{ method } not identified. Available: #{ fields * ", " }" if pos.nil?

            through do |key, values|
              new[key] = values if invert ^ (yield(values[pos]))
            end
          end
        end
      when block.arity == 2
        with_unnamed do
          case
          when (method == key_field or method == :key)
            through do |key, values|
              new[key] = values if invert ^ (yield(key, key))
            end
          when (type == :single or type == :flat)
            through do |key, value|
              new[key] = value if invert ^ (yield(key, value))
            end
          else
            pos = identify_field method
            through do |key, values|
              new[key] = values if invert ^ (yield(key, values[pos]))
            end
          end

        end
      end

    else
      with_unnamed do
        through do |key, values|
          new[key] = values if invert ^ ([key,values].flatten.select{|v| v == method}.any?)
        end
      end
    end
  when Hash === method
    # Only the first field => criterion pair is considered
    key  = method.keys.first
    method = method.values.first
    case
    when ((Array === method) and (key == :key or key_field == key))
      with_unnamed do
        keys.each do |key|
          new[key] = self[key] if invert ^ (method.include? key)
        end
      end
    when Array === method
      with_unnamed do
        method = Set.new method unless Set === method
        case type
        when :single
          through :key, key do |key, value|
            new[key] = self[key] if invert ^ (method.include? value)
          end
        when :list
          through :key, key do |key, values|
            new[key] = self[key] if invert ^ (method.include? values.first)
          end
        when :flat #untested
          through :key, key do |key, values|
            new[key] = self[key] if invert ^ ((method & values.flatten).any?)
          end
        else
          through :key, key do |key, values|
            new[key] = self[key] if invert ^ ((method & values.flatten).any?)
          end
        end
      end

    when Regexp === method
      with_unnamed do
        through :key, key do |key, values|
          values = [values] if type == :single
          new[key] = self[key] if invert ^ (values.flatten.select{|v| v =~ method}.any?)
        end
      end

    when ((String === method) and (method =~ /name:(.*)/))
      name = $1
      # Names live on the annotated values, so the traversal must be named
      old_unnamed = self.unnamed
      self.unnamed = false
      if name.strip =~ /^\/(.*)\/$/
        regexp = Regexp.new $1
        through :key, key do |key, values|
          case type
          when :single
            values = values.annotate([values])
          when :double
            values = values[0]
          end
          new[key] = self[key] if invert ^ (values.select{|v| v.name =~ regexp}.any?)
        end
      else
        through :key, key do |key, values|
          case type
          when :single
            values = values.annotate([values])
          when :double
            values = values[0]
          end
          new[key] = self[key] if invert ^ (values.select{|v| v.name == name}.any?)
        end
      end
      self.unnamed = old_unnamed

    when String === method
      if method =~ /^([<>]=?)(.*)/
        # Capture the match now: the operator and threshold are loop invariant
        comparator = $1
        threshold = $2.to_f
        with_unnamed do
          through :key, key do |key, values|
            value = Array === values ? values.flatten.first : values
            # Honour +invert+ here as in every other branch
            new[key] = self[key] if invert ^ value.to_f.send(comparator, threshold)
          end
        end
      else
        with_unnamed do
          through :key, key do |key, values|
            values = [values] if type == :single
            new[key] = self[key] if invert ^ (values.flatten.select{|v| v == method}.length > 0)
          end
        end
      end
    when Numeric === method
      with_unnamed do
        through :key, key do |key, values|
          new[key] = self[key] if invert ^ (values.flatten.length >= method)
        end
      end
    when Proc === method
      with_unnamed do
        through :key, key do |key, values|
          values = [values] if type == :single
          new[key] = self[key] if invert ^ (values.flatten.select{|v| method.call(v)}.length > 0)
        end
      end
    end
  end
  new
end

#slice(fields, **kwargs) ⇒ Object



43
44
45
# File 'lib/scout/tsv/util/reorder.rb', line 43

# Keeps only +fields+ while preserving the current key: a #reorder with
# :key as the key field. Extra keyword arguments are forwarded.
def slice(fields, **kwargs)
  reorder :key, fields, **kwargs
end

#sort(field = nil, just_keys = false, &block) ⇒ Object



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# File 'lib/scout/tsv/util/sort.rb', line 75

# Sorts the entries of the TSV.
#
# With a block, the [key, value] pairs are sorted with it as a comparator
# (Enumerable#sort). Without a block, field :all (the default) sorts by
# key; any other field sorts by that field's first value, placing nil and
# empty values first, comparing one-element arrays by their element and
# other arrays by length.
#
# @param field field to sort by; nil or :all means the whole entry
# @param just_keys when true only the sorted keys are returned (prepared as
#   entities unless the TSV is unnamed); otherwise [key, values] pairs
# @return [Array]
def sort(field = nil, just_keys = false, &block)
  field = :all if field.nil?

  if field == :all
    elems = collect
  else
    elems = []
    case type
    when :single
      through :key, field do |key, value|
        elems << [key, value]
      end
    when :list, :flat, :double
      through :key, field do |key, values|
        elems << [key, values.first]
      end
    end
  end

  # Turns sorted pairs into the list of keys handed back for just_keys
  keys_of = lambda do |pairs|
    keys = pairs.collect { |key, _value| key }
    keys = prepare_entity(keys, key_field, (entity_options || {}).merge(:dup_array => true)) unless @unnamed
    keys
  end

  if block_given?
    sorted = elems.sort(&block)
    return just_keys ? keys_of.call(sorted) : sorted.collect { |key, _value| [key, self[key]] }
  end

  if field == :all
    sorted = elems.sort_by { |key, _value| key }
    return just_keys ? keys_of.call(sorted) : sorted
  end

  # `||`/`&&` are required here: with `or` the assignment would only
  # capture the nil? check and empty values would not count as blank.
  blank = lambda { |value| value.nil? || (value.respond_to?(:empty?) && value.empty?) }

  sorted = elems.sort do |a, b|
    a_value = a.last
    b_value = b.last
    a_empty = blank.call(a_value)
    b_empty = blank.call(b_value)
    if a_empty && b_empty
      0
    elsif a_empty
      -1
    elsif b_empty
      1
    elsif Array === a_value
      if a_value.length == 1 && b_value.length == 1
        a_value.first <=> b_value.first
      else
        a_value.length <=> b_value.length
      end
    else
      a_value <=> b_value
    end
  end

  just_keys ? keys_of.call(sorted) : sorted.collect { |key, _value| [key, self[key]] }
end

#sort_by(field = nil, just_keys = false, &block) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/scout/tsv/util/sort.rb', line 2

# Sorts the entries of the TSV, using the block as a sort key.
#
# With a block, the [key, value] pairs are ordered with Enumerable#sort_by.
# Without a block, field :all (the default) sorts by key; any other field
# sorts by that field's first value, placing nil and empty values first,
# comparing one-element arrays by their element and other arrays by length.
#
# @param field field to sort by; nil or :all means the whole entry
# @param just_keys when true only the sorted keys are returned (prepared as
#   entities unless the TSV is unnamed); otherwise [key, values] pairs
# @return [Array]
def sort_by(field = nil, just_keys = false, &block)
  field = :all if field.nil?

  if field == :all
    elems = collect
  else
    elems = []
    case type
    when :single
      through :key, field do |key, value|
        elems << [key, value]
      end
    when :list, :flat, :double
      through :key, field do |key, values|
        elems << [key, values.first]
      end
    end
  end

  # Turns sorted pairs into the list of keys handed back for just_keys;
  # always returns the keys, also when the TSV is unnamed.
  keys_of = lambda do |pairs|
    keys = pairs.collect { |key, _value| key }
    keys = prepare_entity(keys, key_field, (entity_options || {}).merge(:dup_array => true)) unless @unnamed
    keys
  end

  if block_given?
    sorted = elems.sort_by(&block)
    return just_keys ? keys_of.call(sorted) : sorted.collect { |key, _value| [key, self[key]] }
  end

  if field == :all
    sorted = elems.sort_by { |key, _value| key }
    return just_keys ? keys_of.call(sorted) : sorted
  end

  # `||`/`&&` are required here: with `or` the assignment would only
  # capture the nil? check and empty values would not count as blank.
  blank = lambda { |value| value.nil? || (value.respond_to?(:empty?) && value.empty?) }

  sorted = elems.sort do |a, b|
    a_value = a.last
    b_value = b.last
    a_empty = blank.call(a_value)
    b_empty = blank.call(b_value)
    if a_empty && b_empty
      0
    elsif a_empty
      -1
    elsif b_empty
      1
    elsif Array === a_value
      if a_value.length == 1 && b_value.length == 1
        a_value.first <=> b_value.first
      else
        a_value.length <=> b_value.length
      end
    else
      a_value <=> b_value
    end
  end

  just_keys ? keys_of.call(sorted) : sorted.collect { |key, _value| [key, self[key]] }
end

#subset(keys) ⇒ Object



256
257
258
259
260
261
262
263
264
# File 'lib/scout/tsv/util/select.rb', line 256

# Builds a new TSV restricted to +keys+.
#
# Keys that are not present in this TSV are ignored; the rest are copied,
# in the order given, while the TSV is temporarily unnamed.
def subset(keys)
  picked = annotate({})
  with_unnamed do
    keys.each do |k|
      next unless include?(k)
      picked[k] = self[k]
    end
  end
  picked
end

#summary ⇒ Object



118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# File 'lib/scout/tsv/util.rb', line 118

# Human-readable multi-line description of the TSV: filename, key field,
# fields, type, size, namespace, identifiers and the first entry as an
# example.
#
# @return [String]
def summary
  key = nil
  values = nil
  # Grab just the first entry to use as the example
  self.each do |k, v|
    key = k
    values = v
    break
  end

  filename = @filename
  filename = "No filename" if filename.nil? || String === filename && filename.empty?
  # NOTE(review): the result of #find is discarded; if Path#find returns a
  # new object rather than mutating the receiver this line has no effect.
  filename.find if Path === filename 
  # For persisted TSVs show both the source and the persistence file names
  filename = File.basename(filename) + " [" + File.basename(persistence_path) + "]" if respond_to?(:persistence_path) and persistence_path

  with_unnamed do
    <<-EOF
Filename = #{filename}
Key field = #{key_field || "*No key field*"}
Fields = #{fields ? Log.fingerprint(fields) : "*No field info*"}
Type = #{type}
Size = #{size}
namespace = #{Log.fingerprint namespace}
identifiers = #{Log.fingerprint identifiers}
Example:
- #{key} -- #{Log.fingerprint values }
    EOF
  end
end

#to_double ⇒ Object



135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# File 'lib/scout/tsv/transformer.rb', line 135

# Converts the TSV to type :double through a Transformer.
#
# A :double TSV is returned as is. Otherwise every value is wrapped so
# that each field holds a list: :single values become [[v]], :list values
# get each element wrapped, and :flat values become a single list field.
def to_double
  return self if self.type == :double

  doubled = self.annotate({})
  self.with_unnamed do
    transformer = Transformer.new self, doubled
    transformer.type = :double
    transformer.traverse do |k, v|
      case self.type
      when :flat then [k, [v]]
      when :list then [k, v.collect { |item| [item] }]
      when :single then [k, [[v]]]
      end
    end
  end
  doubled
end

#to_flat ⇒ Object



168
169
170
171
172
173
174
175
176
177
# File 'lib/scout/tsv/transformer.rb', line 168

# Converts the TSV to type :flat through a Transformer: Array values are
# flattened into one list, anything else becomes a one-element list.
def to_flat
  flattened = self.annotate({})
  transformer = Transformer.new self, flattened
  transformer.type = :flat
  transformer.traverse do |k, v|
    [k, (Array === v ? v.flatten : [v])]
  end
  flattened
end

#to_hash ⇒ Object



128
129
130
# File 'lib/scout/tsv.rb', line 128

# Returns a duplicate of this object (via #dup).
def to_hash
  self.dup
end

#to_list ⇒ Object



116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# File 'lib/scout/tsv/transformer.rb', line 116

# Converts the TSV to type :list through a Transformer.
#
# :single values are wrapped in a one-element list, :double values keep
# only the first element of each field, and :flat values keep only their
# first element.
def to_list
  listed = self.annotate({})
  self.with_unnamed do
    transformer = Transformer.new self, listed
    transformer.type = :list
    transformer.traverse do |k, v|
      case self.type
      when :flat then [k, v.slice(0, 1)]
      when :double then [k, v.collect { |field_values| field_values.first }]
      when :single then [k, [v]]
      end
    end
  end
  listed
end

#to_s(options = {}) ⇒ Object



220
221
222
# File 'lib/scout/tsv/dumper.rb', line 220

# Dumps the TSV as text: calls #dumper_stream with an empty String as the
# :stream (unless +options+ overrides it) and returns what it returns.
def to_s(options = {})
  dumper_stream({stream: ''}.merge(options))
end

#to_single ⇒ Object



156
157
158
159
160
161
162
163
164
165
166
# File 'lib/scout/tsv/transformer.rb', line 156

# Converts the TSV to type :single through an unnamed Transformer: each
# value is reduced to its first element until it is no longer an Array.
def to_single
  singled = self.annotate({})
  transformer = Transformer.new self, singled
  transformer.type = :single
  transformer.unnamed = true
  transformer.traverse do |k, v|
    scalar = v
    scalar = scalar.first while Array === scalar
    [k, scalar]
  end
  singled
end

#translate(*args, **kwargs) ⇒ Object



144
145
146
# File 'lib/scout/tsv/change_id/translate.rb', line 144

# Instance-level shortcut for TSV.translate: forwards every positional and
# keyword argument, passing this TSV as the first argument.
def translate(*args, **kwargs)
  TSV.translate(self, *args, **kwargs)
end

#transpose(key_field = "Unkown ID") ⇒ Object



81
82
83
84
85
86
87
88
89
90
# File 'lib/scout/tsv/util/reorder.rb', line 81

# Transposes the TSV, dispatching on its type.
#
# :list TSVs use #transpose_list and :double TSVs use #transpose_double;
# :single and :flat TSVs are first converted with #to_list. Any other type
# yields nil.
#
# @param key_field name given to the key field of the transposed TSV
def transpose(key_field = "Unkown ID")
  current_type = type
  if :list == current_type
    transpose_list key_field
  elsif :double == current_type
    transpose_double key_field
  elsif :single == current_type || :flat == current_type
    to_list.transpose_list key_field
  end
end

#transpose_double(key_field = "Unkown ID") ⇒ Object



74
75
76
77
78
79
# File 'lib/scout/tsv/util/reorder.rb', line 74

# Transposes a :double TSV by going through a :list representation:
# converts with #to_list, transposes with #transpose_list and converts back
# with #to_double.
#
# NOTE(review): the blocks given to #to_list and #to_double look intended
# to join/split multiple values with a random separator, but the #to_list
# and #to_double definitions shown in this file never yield, so the blocks
# appear to be ignored and only the first value of each field survives —
# confirm.
#
# @param key_field name given to the key field of the transposed TSV
def transpose_double(key_field = "Unkown ID")
  sep = "-!SEP--#{rand 10000}!-"
  tmp = self.to_list{|v| v * sep}
  new = tmp.transpose_list(key_field)
  new.to_double{|v| v.split(sep)}
end

#transpose_list(key_field = "Unkown ID") ⇒ Object



59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/scout/tsv/util/reorder.rb', line 59

# Transposes a :list TSV: the current keys become the fields and each
# current field becomes a key whose row holds that column's values.
#
# The transposition itself is done with Matrix, so all rows must have the
# same length.
#
# @param key_field name given to the key field of the transposed TSV
def transpose_list(key_field="Unkown ID")
  column_names = keys.dup
  transposed = self.annotate({})
  TSV.setup(transposed, :key_field => key_field, :fields => column_names, :type => type, :filename => filename, :identifiers => identifiers)

  columns = Matrix.rows(values).transpose.to_a

  fields.zip(columns).each { |name, column| transposed[name] = column }

  transposed
end

#traverse(key_field_pos = :key, fields_pos = nil, type: nil, one2one: false, unnamed: false, key_field: nil, fields: nil, bar: false, cast: nil, select: nil, uniq: false, &block) ⇒ Object Also known as: through



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# File 'lib/scout/tsv/traverse.rb', line 3

# Walks the entries of the TSV, optionally re-keyed by another field and
# restricted to some fields, yielding (key, values) to the block.
#
# @param key_field_pos field to use as key (:key keeps the current key)
# @param fields_pos fields to yield (nil keeps all of them)
# @param type type the yielded values should have (defaults to this TSV's)
# @param one2one for :double TSVs re-keyed by a multi-valued field, pair
#   the i-th key with the i-th value of each field; :strict does not fall
#   back to the first value when the i-th one is missing
# @param unnamed passed to #with_unnamed for the duration of the traversal
# @param key_field keyword alternative to +key_field_pos+
# @param fields keyword alternative to +fields_pos+
# @param bar progress bar specification; true uses the log message
# @param cast when set, values go through TSV.cast_value
# @param select accepted but not used in this method
# @param uniq remove duplicated keys when the new key is multi-valued
# @return [Array] [key_name, field_names] describing what was yielded
def traverse(key_field_pos = :key, fields_pos = nil, type: nil, one2one: false, unnamed: false, key_field: nil, fields: nil, bar: false, cast: nil, select: nil, uniq: false, &block)
  # Keyword versions take precedence over the positional ones
  key_field = key_field_pos if key_field.nil?
  fields = fields_pos.dup if fields.nil?
  type = @type if type.nil?
  key_pos = self.identify_field(key_field)
  fields = self.all_fields if fields == :all
  fields = [fields] unless fields.nil? || Array === fields
  positions = (fields.nil? || fields == :all) ? nil : self.identify_field(fields)
  # Asking for exactly the current fields is the same as not restricting
  positions = nil if fields == self.fields

  if key_pos == :key
    key_name = @key_field
  else
    key_name = @fields[key_pos]
    if positions.nil?
      # Re-keying without explicit fields: the old key becomes the first
      # field and the new key is removed from the fields
      positions = (0..@fields.length-1).to_a
      positions.delete_at key_pos
      positions.unshift :key
    end
  end

  fields = positions.collect{|p| p == :key ? self.key_field : self.fields[p] } if positions

  # Work out the names of the fields that will be yielded
  if positions.nil? && key_pos == :key
    field_names = @fields.dup
  elsif positions.nil? && key_pos != :key
    field_names = @fields.dup
    field_names.delete_at key_pos unless fields == :all
  elsif positions.include?(:key)
    field_names = positions.collect{|p| p == :key ? @key_field : @fields[p] }
  else
    field_names = @fields.values_at *positions
  end

  # Remember where the original key must be re-inserted among the values
  key_index = positions.index :key if positions
  positions.delete :key if positions

  log_message = "Traverse #{Log.fingerprint self}"
  Log.debug log_message
  bar = log_message if TrueClass === bar

  # "<requested type>_<own type>", used to pick the conversion below
  type_swap_tag = [type.to_s, @type.to_s] * "_"
  Log::ProgressBar.with_obj_bar(self, bar) do |bar|
    with_unnamed unnamed do
      each do |key,values|
        bar.tick if bar
        values = [values] if @type == :single
        if positions.nil?
          if key_pos != :key
            values = values.dup
            if @type == :flat
              key = values
            else
              key = values.delete_at(key_pos)
            end
          end
        else 
          orig_key = key
          key = @type == :flat ? values : values[key_pos] if key_pos != :key 

          values = values.values_at(*positions)
          NamedArray.setup(values, fields)
          if key_index
            if @type == :double
              values.insert key_index, [orig_key]
            else
              values.insert key_index, orig_key
            end
          end
        end

        values = TSV.cast_value(values, cast) if cast

        if Array === key 
          # Multi-valued new key: yield once per key value
          key = key.uniq if uniq
          if @type == :double && one2one
            if one2one == :strict
              key.each_with_index do |key_i,i|
                if type == :double
                  v_i = values.collect{|v| [v[i]] }
                else
                  v_i = values.collect{|v| v[i] }
                end
                yield key_i, v_i
              end
            else
              key.each_with_index do |key_i,i|
                if type == :double
                  v_i = values.collect{|v| [v[i] || v.first] }
                else
                  v_i = values.collect{|v| v[i] || v.first }
                end
                # NOTE(review): unlike every other branch this one yields
                # @fields as a third argument
                yield key_i, v_i, @fields
              end
            end
          else
            key.each_with_index do |key_i, i|
              if type == :double
                yield key_i, values
              elsif type == :list
                yield key_i, values.collect{|v| v[i] }
              elsif type == :flat
                yield key_i, values.flatten
              elsif type == :single
                yield key_i, values.first
              end
            end
          end
        else
          if type == @type
            if type == :single
              # Undo the wrapping done at the top of the loop
              yield key, values.first
            else
              yield key, values
            end
          else
            # Convert from this TSV's type to the requested one
            case type_swap_tag
            when "double_list"
              yield key, values.collect{|v| [v] }
            when "double_flat"
              yield key, [values]
            when "double_single"
              yield key, [values]
            when "list_double"
              yield key, values.collect{|v| v.first }
            when "list_flat"
              yield key, [values.first]
            when "list_single"
              yield key, values
            when "flat_double"
              yield key, values.flatten
            when "flat_list"
              yield key, values.flatten
            when "flat_single"
              yield key, values
            when "single_double"
              yield key, values.flatten.first
            when "single_list"
              yield key, values.first
            when "single_flat"
              yield key, values.first
            end
          end
        end
      end
    end
  end
  

  [key_name, field_names]
end

#unzip(*args, **kwargs) ⇒ Object



83
84
85
# File 'lib/scout/tsv/util/unzip.rb', line 83

# Instance-level shortcut for TSV.unzip: forwards every positional and
# keyword argument, passing this TSV as the first argument.
def unzip(*args, **kwargs)
  TSV.unzip(self, *args, **kwargs)
end

#unzip_replicates ⇒ Object



87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/scout/tsv/util/unzip.rb', line 87

# Splits every entry of a :double TSV into one :list entry per replicate.
#
# The values of each entry are zipped with NamedArray.zip_fields and the
# i-th tuple is stored under the key "<key>(<i>)".
#
# @raise [RuntimeError] unless the TSV is of type :double
def unzip_replicates
  raise "Can only unzip replicates in :double TSVs" unless type == :double

  replicates = {}
  with_unnamed do
    through do |k, vs|
      NamedArray.zip_fields(vs).each_with_index do |replicate, i|
        replicates[k + "(#{i})"] = replicate
      end
    end
  end

  annotate(replicates)
  replicates.type = :list

  replicates
end

#with_filters(filters, &block) ⇒ Object



303
304
305
306
307
308
309
310
# File 'lib/scout/tsv/util/filter.rb', line 303

# Runs the block with +filters+ applied and resets the filters afterwards.
#
# The TSV is set up for filtering with #filter, every field/value pair in
# +filters+ is added with add_filter, the block (if any) is run, and
# #reset_filters is always called on the way out, even on error.
#
# @param filters [Enumerable] pairs of field and value
# @return the block's result, or +filters+ when no block is given
def with_filters(filters, &block)
  filter
  begin
    added = filters.each { |field, value| add_filter field, value }
    # Without this the filters would be reset before anyone could use them
    block_given? ? yield : added
  ensure
    reset_filters
  end
end

#with_unnamed(unnamed = true) ⇒ Object



108
109
110
111
112
113
114
115
116
# File 'lib/scout/tsv/util.rb', line 108

# Runs the block with @unnamed temporarily set to +unnamed+, restoring the
# previous value afterwards even if the block raises.
#
# @param unnamed value to use while the block runs
# @return the block's result
def with_unnamed(unnamed = true)
  previous = @unnamed
  @unnamed = unnamed
  yield
ensure
  @unnamed = previous
end

#write_file(file) ⇒ Object



226
227
228
229
230
# File 'lib/scout/tsv/dumper.rb', line 226

# Writes the TSV to +file+: opens it in write mode through Open.open and
# dumps into the resulting handle with #dumper_stream.
def write_file(file)
  Open.open(file, mode: 'w') do |f|
    dumper_stream(stream: f)
  end
end

#zip(merge = false, field = "New Field", sep = ":") ⇒ Object



105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# File 'lib/scout/tsv/util/unzip.rb', line 105

# Reverses an unzip: keys of the form "<key><sep><value>" are split, the
# first part becomes the key and the second is appended as a new field.
#
# With +merge+ the result is :double and entries sharing the same new key
# are accumulated column by column; without it later entries overwrite
# earlier ones.
#
# @param merge accumulate entries with the same key into a :double TSV
# @param field name of the field added for the key suffix
# @param sep separator used to split the keys and the key field name
def zip(merge = false, field = "New Field", sep = ":")
  zipped = {}
  self.annotate zipped

  zipped.type = :double if merge

  zipped.with_unnamed do
    self.through do |key, values|
      target_key, suffix = key.split(sep)
      if merge
        # The suffix is repeated so the new column lines up with the others
        extended = values + [[suffix] * values.first.length]
        if zipped.include? target_key
          zipped[target_key].each_with_index do |column, i|
            column.concat(extended[i])
          end
        else
          zipped[target_key] = extended
        end
      else
        zipped[target_key] = values + [suffix]
      end
    end
  end

  if self.key_field and self.fields
    zipped.key_field = self.key_field.partition(sep).first
    zipped.fields = zipped.fields + [field]
  end

  zipped
end

#zip_new(key, values, insitu: :lax) ⇒ Object



66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/scout/tsv/util.rb', line 66

# Adds +values+ to the entry for +key+, zipping them onto what is already
# stored with NamedArray.add_zipped.
#
# +values+ is normalized to a list of lists unless its first element is
# already an Array. +insitu+ controls mutation:
# * :lax (default) - zip into the stored object and store it back
# * other truthy   - zip into the stored object without storing it back;
#                    a new entry is stored as a copy of +values+
# * falsy          - zip into a copy of the stored object and store that
def zip_new(key, values, insitu: :lax)
  values = values.collect { |v| Array === v ? v : [v] } unless Array === values.first
  current_values = self[key]
  strict_insitu = insitu && insitu != :lax

  unless current_values
    return self[key] = (strict_insitu ? values.dup : values)
  end

  return NamedArray.add_zipped(current_values, values) if strict_insitu

  target = insitu == :lax ? current_values : current_values.dup
  self[key] = NamedArray.add_zipped(target, values)
end