Module: Association

Defined in:
lib/rbbt/association.rb,
lib/rbbt/association/open.rb,
lib/rbbt/association/util.rb,
lib/rbbt/association/index.rb,
lib/rbbt/association/database.rb

Defined Under Namespace

Modules: Index

Class Method Summary collapse

Class Method Details

.add_reciprocal(tsv) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/rbbt/association/database.rb', line 6

# Returns a copy of +tsv+ (re-opened from its dumper stream) to which the
# reverse of every association has been added.
#
# Only :double TSVs are expanded: each zipped value tuple is read as
# [target, *rest] and stored in the copy under +target+ with the original key
# placed in front of +rest+. Tuples whose target equals their own key are
# skipped. Any other TSV type yields the plain copy.
#
# @param tsv [TSV] the association table to expand
# @return [TSV] the new table, carrying the annotations of +tsv+
def self.add_reciprocal(tsv)
  reciprocal = TSV.open(tsv.dumper_stream)

  tsv.with_unnamed do
    if tsv.type == :double
      tsv.through do |source, values|
        Misc.zip_fields(values).each do |info|
          target, *rest = info
          next if target == source

          reciprocal.zip_new target, [source, *rest]
        end
      end
    end
  end

  tsv.annotate(reciprocal)
  reciprocal
end

.database(file, options = {}) ⇒ Object



191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# File 'lib/rbbt/association/database.rb', line 191

# Builds an association database from +file+, dispatching on its class:
#
# * Step (only when that constant is defined): the step is cleaned if it is in
#   error, aborted or dirty, run if it has neither finished nor started,
#   joined, and its stream is handed to open_stream.
# * TSV: converted to :double if needed and passed through reorder_tsv; when
#   options[:data] is given every entry is also copied into it.
# * IO: handed to open_stream directly.
# * anything else: resolved with TSV.get_stream and handed to open_stream.
#
# @param file [Step, TSV, IO, Object] where the associations come from
# @param options [Hash] forwarded (as a copy) to open_stream / reorder_tsv;
#   :data and :entity_options are also read here
# @return [TSV] the resulting database
def self.database(file, options = {})
  db =
    case file
    when (defined? Step and Step)
      needs_cleaning = file.error? || file.aborted? || file.dirty?
      file.clean if needs_cleaning
      file.run(true) unless file.done? || file.started?
      file.join unless file.done?
      open_stream(TSV.get_stream(file), options.dup)
    when TSV
      file = file.to_double unless file.type == :double
      reordered = reorder_tsv(file, options.dup)
      store = options[:data]
      if store
        reordered.with_unnamed do
          reordered.with_monitor("Saving database #{Misc.fingerprint file}") do
            reordered.through { |key, values| store[key] = values }
          end
        end
      end
      reordered
    when IO
      open_stream(file, options.dup)
    else
      open_stream(TSV.get_stream(file), options.dup)
    end

  # Only a Path that actually has identifier files is recorded as filename.
  db.filename = file if Path === file && file.identifier_files.any?

  entity_options = options[:entity_options]
  db.entity_options = entity_options if entity_options

  db
end

.extract_specs(all_fields = nil, options = {}) ⇒ Object



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/rbbt/association/util.rb', line 44

# Works out which fields play the source and target roles of an association.
#
# Each role is described by a three element array [field, header, format].
# The :source and :target options are parsed with normalize_specs;
# :source_format / :target_format override the third element. A spec that
# names an Entity format instead of an existing field is replaced by the first
# field in +all_fields+ that shares that format. Missing roles default to the
# key field and the first value field, avoiding the field already taken by
# the other role.
#
# @param all_fields [Array, nil] key field followed by the value fields
# @param options [Hash] :source, :source_format, :target, :target_format and
#   :format are taken from it through Misc.process_options
# @return [Hash] {:source => [field, header, format], :target => [field, header, format]}
# @raise [RuntimeError] when a spec resolves to an Entity format that no field
#   in +all_fields+ shares
def self.extract_specs(all_fields=nil, options = {})
  source, source_format, target, target_format, format = Misc.process_options options, :source, :source_format, :target, :target_format, :format

  key_field, *fields = all_fields.nil? ? [nil] : all_fields

  source_specs = normalize_specs  source, all_fields
  target_specs = normalize_specs  target, all_fields

  source_specs = [nil, nil, nil] if source_specs.nil?
  target_specs = [nil, nil, nil] if target_specs.nil?

  source_specs[2] = source_format if source_format
  target_specs[2] = target_format if target_format

  # Entity-format lookup needs the field list; with all_fields == nil there is
  # nothing to search, so the spec is kept as given instead of calling
  # include? on nil.
  if all_fields and source_specs.first and not all_fields.include? source_specs.first and defined? Entity and (_format = Entity.formats[source_specs.first.to_s])
    _source = all_fields.select{|f| Entity.formats[f.to_s] == _format }.first
    raise "Source not found #{source_specs}. Options: #{Misc.fingerprint all_fields}" if _source.nil?
    source_specs[0] = _source
  end

  if all_fields and target_specs.first and not all_fields.include? target_specs.first and defined? Entity and (_format = Entity.formats[target_specs.first.to_s])
    _target = all_fields.select{|f| Entity.formats[f.to_s].to_s == _format.to_s }.first
    raise "Target not found #{target_specs}. Options: #{Misc.fingerprint all_fields}" if _target.nil?
    target_specs[0] = _target
  end

  # Fill in whichever role was not specified.
  if source_specs[0].nil? and target_specs[0].nil?
    source_specs[0] = key_field
    target_specs[0] = fields[0]
  elsif source_specs[0].nil?
    if target_specs[0] == :key or target_specs[0] == key_field
      source_specs[0] = fields[0]
    else
      source_specs[0] = key_field
    end
  elsif target_specs[0].nil?
    if source_specs[0] == fields.first
      target_specs[0] = key_field
    else
      target_specs[0] = fields.first
    end
  end

  # If :format is specified and no explicit final format was given, look up the
  # final format for the entity type of the field (or of its header). A final
  # format equal to the field or header itself means no translation is needed.
  if target_specs[2].nil?
    target_type = Entity.formats[target_specs[1] || target_specs[0]]
    target_specs[2] = format[target_type.to_s] if format
    target_specs[2] = nil if target_specs[2] == target_specs[0] or target_specs[2] == target_specs[1]
  end

  if source_specs[2].nil?
    source_type = Entity.formats[source_specs[1] || source_specs[0]]
    source_specs[2] = format[source_type.to_s] if format
    source_specs[2] = nil if source_specs[2] == source_specs[0] or source_specs[2] == source_specs[1]
  end

  {:source => source_specs, :target => target_specs}
end

.headers(all_fields, info_fields = nil, options = {}) ⇒ Object



112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# File 'lib/rbbt/association/util.rb', line 112

# Computes column positions and header names for reading an association table
# with the source as key and the target as first value field.
#
# Positions are indexes into +all_fields+ (0 is the key field). The source
# and target fields are removed from the information fields and the target is
# put first; the source position is removed from the returned positions.
#
# @param all_fields [Array] key field followed by the value fields
# @param info_fields [Array, nil] fields to keep besides source and target;
#   defaults to every field. The array passed in is not modified.
# @param options [Hash] handed to extract_specs; :format is read afterwards
# @return [Array] [source_pos, field_pos, source_header, field_headers,
#   source_format, target_format]
def self.headers(all_fields, info_fields = nil, options = {})
  specs = extract_specs all_fields, options

  source_field = specs[:source][0]
  target_field = specs[:target][0]

  source_pos = TSV.identify_field all_fields.first, all_fields[1..-1], source_field
  target_pos = TSV.identify_field all_fields.first, all_fields[1..-1], target_field

  # identify_field answers :key or an index into the value fields; shift it
  # so that it indexes all_fields instead.
  source_pos = source_pos == :key ? 0 : source_pos + 1
  target_pos = target_pos == :key ? 0 : target_pos + 1

  source_header = specs[:source][1] || specs[:source][0]
  target_header = specs[:target][1] || specs[:target][0]

  # Always work on a private copy: the deletes and unshift below must not leak
  # into the caller's array (typically the :fields option).
  info_fields = (info_fields.nil? ? all_fields : info_fields).dup
  info_fields.delete source_field
  info_fields.delete target_field
  info_fields.unshift target_field

  field_headers = [target_header]
  info_fields[1..-1].each do |field|
    header = case field
             when String
               field
             when Numeric
               all_fields[field]
             when :key
               all_fields.first
             end

    field_headers << header
  end

  field_pos = info_fields.collect do |f|
    p = TSV.identify_field all_fields.first, all_fields[1..-1], f
    p == :key ? 0 : p + 1
  end

  field_pos.delete source_pos

  source_format = specs[:source][2]
  target_format = specs[:target][2]

  # NOTE(review): extract_specs takes :format out of options through
  # Misc.process_options; if that helper removes the key from the hash this
  # branch never runs -- confirm.
  if format = options[:format]
    source_format ||= process_formats(specs[:source][1] || specs[:source][0], format)
    target_format ||= process_formats(specs[:target][1] || specs[:target][0], format)
  end

  res = [source_pos, field_pos, source_header, field_headers, source_format, target_format]
  Log.low "Headers -- #{res}"
  res
end

.identify_entity_format(format, fields) ⇒ Object



4
5
6
7
8
9
10
# File 'lib/rbbt/association/util.rb', line 4

# Finds the field in +fields+ that has the same Entity format as +format+.
#
# @param format [String] name looked up in Entity.formats
# @param fields [Array] candidate field names
# @return [Array] [matching_field, nil, format]
# @raise [RuntimeError] when +format+ is unknown to Entity.formats or when no
#   field shares its entity type
def self.identify_entity_format(format, fields)
  entity_type = Entity.formats[format]
  if entity_type.nil?
    raise "Field #{ format } could not be resolved: #{fields}"
  end

  matches = fields.select { |candidate| Entity.formats[candidate] == entity_type }
  main_field = matches.first
  if main_field.nil?
    raise "Field #{ format } not present, options: #{Misc.fingerprint fields}"
  end

  [main_field, nil, format]
end

.index(file, options = nil, persist_options = nil) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/rbbt/association/index.rb', line 6

# Builds (through Persist.persist_tsv) a :list index of the associations in
# +file+ whose keys are "source~target" strings and whose values are the
# remaining fields of the association database.
#
# The association database itself is obtained with Association.open. Any '~'
# inside a source or target is written as '-..-' so it cannot clash with the
# key separator. When the same pair shows up more than once, the values are
# merged position by position, joined with ";;". For undirected indexes every
# entry is additionally stored under the reversed key.
#
# @param file [Object] passed to version_file (Strings only) and to open
# @param options [Hash, nil] copied; :namespace, :serializer, :recycle,
#   :undirected, :monitor and :entity_options are read here
# @param persist_options [Hash, nil] copied; when nil it is derived from
#   +options+ with Misc.pull_keys(options, :persist)
# @return [Object] the persisted data, set up with Association::Index
def self.index(file, options = nil, persist_options = nil)
  options = options.nil? ? {} : options.dup
  persist_options = persist_options.nil? ?  Misc.pull_keys(options, :persist)  : persist_options.dup 
  persist_options[:serializer] ||= options[:serializer] if options.include?(:serializer)

  persist_options = Misc.add_defaults persist_options.dup, :persist => true, :dir => Rbbt.var.associations
  # NOTE(review): `persist` is assigned but not read anywhere in this method.
  persist = persist_options[:persist]

  file = version_file(file, options[:namespace]) if options[:namespace] and String === file
  Persist.persist_tsv(file, nil, options, persist_options.merge(:engine => "BDB", :prefix => "Association Index")) do |data|
    # This reassigns the method-level `options`, so the tap block below sees
    # the version with the :monitor default added.
    options = Misc.add_defaults options.dup, :monitor => "Building index for #{Misc.fingerprint file}"
    recycle = options[:recycle]
    undirected = options[:undirected]

    serializer = persist_options[:serializer] || :list

    # The underlying database is persisted next to the index under a
    # '.database' suffix when an explicit persistence file was requested.
    persist_options[:file] = persist_options[:file] + '.database' if persist_options[:file]

    database = open(file, options, persist_options.dup.merge(:engine => "HDB"))

    source_field = database.key_field

    fields = database.fields
    # Only the part after the last ':' of the first field name is used as target name.
    target_field = fields.first.split(":").last

    # Same field on both ends and no explicit choice: treat as undirected.
    undirected = true if undirected.nil? and source_field == target_field

    key_field = [source_field, target_field, undirected ? "undirected" : nil].compact * "~"

    TSV.setup(data, :key_field => key_field, :fields => fields[1..-1], :type => :list, :serializer => serializer, :namespace => database.namespace)

    data.key_field = key_field
    data.fields = fields[1..-1]
    data.type = :list
    data.serializer ||= serializer
    data.filename ||= file if String === file

    database.with_unnamed do
      database.with_monitor(options[:monitor]) do
        database.through do |source, values|
          # Wrap the values of non-:double databases in extra arrays --
          # presumably so they share the list-of-lists shape of :double
          # values before zipping; confirm.
          case database.type
          when :single
            values = [[values]]
          when :list
            values = values.collect{|v| [v] }
          when :flat
            values = [values]
          end
          next if values.empty?
          next if source.nil? or source.empty?
          next if values.empty?

          #targets, *rest = Misc.zip_fields(Misc.zip_fields(values).uniq)
          
          next if values.first.empty?
          # Zip, drop duplicated tuples, and zip back.
          values =  Misc.zip_fields(Misc.zip_fields(values).uniq)
          targets, *rest = values

          size = targets ? targets.length : 0

          # With :recycle, a field holding a single value is repeated so that
          # it has one value per target.
          rest.each_with_index do |list,i|
            list.replace [list.first] * size if list.length == 1
          end if recycle and size > 1

          rest = Misc.zip_fields rest

          annotations = (Array === rest.first and rest.first.length > 1) ?
            targets.zip(rest) :
            targets.zip(rest * targets.length) 

          # '~' separates source and target inside the key, so escape it.
          source = source.gsub('~','-..-')
          annotations.each do |target, info|
            next if target.nil? or target.empty?
            target = target.gsub('~','-..-')
            key = [source, target] * "~"

            if data[key].nil? or info.nil?
              data[key] = info
            else
              # Pair already present: join old and new values per position.
              old_info = data[key]
              info = old_info.zip(info).collect{|p| p * ";;" }
              data[key] = info
            end
          end
        end

        if undirected
          # Reversed entries are collected first and written afterwards, so
          # `data` is not modified while it is being traversed.
          new_data = {}

          data.through do |key,values|
            reverse_key = key.split("~").reverse * "~"
            new_data[reverse_key] = values
          end 

          new_data.each do |key,values|
            data[key] = values
          end
        end

      end
    end

    data
  end.tap do |data|
    data.read if not Hash === data and data.respond_to? :read
    Association::Index.setup data
    data.entity_options = options[:entity_options] if options[:entity_options]
    data
  end
end

.normalize_specs(spec, all_fields = nil) ⇒ Object



24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/rbbt/association/util.rb', line 24

# Parses a source/target specification into [field, header, format].
#
# @param spec [String, Array, Numeric, nil] specification understood by
#   parse_field_specification
# @param all_fields [Array, nil] accepted for interface compatibility; it does
#   not influence the result
# @return [Array, nil] [field, header, format], or nil when +spec+ is nil
def self.normalize_specs(spec, all_fields = nil)
  return nil if spec.nil?

  # NOTE(review): earlier code carried a branch calling identify_entity_format
  # that was guarded by `all_fields.nil?` inside the non-nil arm, so it could
  # never run and every path produced this same triple. The guard may have
  # been inverted by mistake -- confirm before re-enabling any entity lookup.
  field, header, format = parse_field_specification spec
  [field, header, format]
end

.open(file, options = nil, persist_options = nil) ⇒ Object



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/rbbt/association/open.rb', line 10

# Opens the association database for +file+, persisting it through
# Persist.persist_tsv under the "Association Database" prefix.
#
# Defaults added to +options+: :zipped => true, :merge => true and a :monitor
# description; :zipped is forced to false when :merge is disabled. Defaults
# added to +persist_options+: :persist => true and the Rbbt.var.associations
# directory.
#
# @param file [Object] source of the associations; a Proc is called inside
#   the persistence block to obtain the actual source
# @param options [Hash, nil] copied before use
# @param persist_options [Hash, nil] copied; when nil it is derived from
#   +options+ with Misc.pull_keys(options, :persist)
# @return [Object] the persisted database
def self.open(file, options = nil, persist_options = nil)
  options = options.nil? ? {} : options.dup
  persist_options =
    if persist_options.nil?
      Misc.pull_keys(options, :persist)
    else
      persist_options.dup
    end

  monitor = {:desc => "Opening database #{Misc.fingerprint file}"}
  options = Misc.add_defaults options, :zipped => true, :merge => true, :monitor => monitor
  options[:zipped] = false unless options[:merge]
  persist_options = Misc.add_defaults persist_options.dup, :persist => true, :dir => Rbbt.var.associations

  if options[:namespace] && String === file
    file = version_file(file, options[:namespace])
  end

  database = Persist.persist_tsv(file, nil, options, persist_options.merge(:prefix => "Association Database")) do |store|
    file = file.call if Proc === file

    options = options.dup
    store.serializer = :double if store.respond_to? :serializer

    # Association.database fills `store` through the :data option; its own
    # return value is not needed here.
    Association.database(file, options.merge(:unnamed => true, :data => store, :type => :double))

    store
  end

  entity_options = options[:entity_options]
  database.entity_options = entity_options if entity_options
  database
end

.open_stream(stream, options = {}) ⇒ Object



115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# File 'lib/rbbt/association/database.rb', line 115

# Parses a TSV +stream+ into a :double association table keyed by the source
# field, with the target as first value field, and copies the result into the
# :data option (a plain Hash when none is given).
#
# @param stream [Object] handed to TSV::Parser.new
# @param options [Hash] :fields, :persist and :data are taken from it through
#   Misc.process_options; the rest is passed to the parser, to headers and to
#   TSV.parse
# @return [TSV] the parsed (and, when formats require it, translated) table
def self.open_stream(stream, options = {})
  # NOTE(review): `persist` is taken out of options but not read afterwards.
  fields, persist, data = Misc.process_options options, :fields, :persist, :data

  # The parser is created with :fields and :key_field blanked; the actual
  # key/field positions are assigned below once headers has chosen them.
  parser = TSV::Parser.new stream, options.merge(:fields => nil, :key_field => nil)
  options = options.merge(parser.options)
  options = Misc.add_defaults options, :type => :double, :merge => true

  # NOTE(review): key_field, _fields and all_fields are not read afterwards.
  key_field, *_fields = all_fields = parser.all_fields

  source_pos, field_pos, source_header, field_headers, source_format, target_format = headers parser.all_fields, fields, options

  # Point the parser at the source column as key and at the selected columns
  # as fields.
  parser.key_field = source_pos
  parser.fields = field_pos
  parser.field_positions = field_pos
  parser.key_position = source_pos

  # Disabled: per-type overrides of the parser's get_values.
  #case parser.type
  #when :single
  #  class << parser
  #    def get_values(parts)
  #      [parts[@key_field], parts.values_at(*@fields).first]
  #    end
  #  end
  #when :list
  #  class << parser
  #    def get_values(parts)
  #      [parts[@key_field], parts.values_at(*@fields)]
  #    end
  #  end
  #when :__double
  #  class << parser
  #    def get_values(parts)
  #      [parts[@key_field].split(@sep2,-1), parts.values_at(*@fields).collect{|v| v.nil? ? [] : v.split(@sep2,-1) }]
  #    end
  #  end
  #when :flat
  #  class << parser
  #    def get_values(parts)
  #      fields = (0..parts.length-1).to_a - [@key_field]
  #      values = parts.values_at(*fields).compact.collect{|v| v.split(@sep2,-1) }.flatten
  #      [parts[@key_field].split(@sep2,-1), values]
  #    end
  #  end
  #end

  open_options = options.merge(parser.options).merge(:parser => parser)
  open_options = Misc.add_defaults open_options, :monitor => {:desc => "Parsing #{ Misc.fingerprint stream }"}

  data ||= {}
  tsv = nil
  # Presumably re-opens a persistence backend in write mode before it is
  # filled -- confirm against the Persist adapters.
  if data.respond_to?(:close) && data.respond_to?(:write)
    data.close
    data.write
  end
  # The stream is first parsed into a throw-away HDB database living in a
  # temporary file, and only then copied/translated into `data`.
  TmpFile.with_file do |tmpfile|
    tmp_data = Persist.open_database(tmpfile, true, open_options[:type], "HDB")

    tsv = TSV.parse parser.stream, tmp_data, open_options
    tsv = tsv.to_double
    tsv.key_field = source_header
    tsv.fields = field_headers

    if source_format or target_format
      tsv = translate tsv, source_format, target_format, :persist => true, :data => data
    else
      tsv.through do |k,v|
        data[k] = v
      end
      # NOTE(review): the annotated `data` is not assigned back to `tsv`, so
      # in this branch the method returns the table produced by to_double
      # above rather than `data`.
      tsv.annotate data
    end

  end

  tsv
end

.parse_field_specification(spec) ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
# File 'lib/rbbt/association/util.rb', line 12

# Splits a field specification of the form "field=~header=>final_format"
# into its three parts. Every part is optional.
#
# @param spec [String, Array, Numeric, Symbol] a specification string, an
#   already split [field_part, final_format] pair, or a field reference
#   (a position or a symbol such as :key) that is returned untouched
# @return [Array] [field, header, final_format]; absent parts are nil
def self.parse_field_specification(spec)
  # Positions and symbols already identify a field and cannot be split.
  return [spec, nil, nil] if Numeric === spec or Symbol === spec

  spec = spec.split "=>" unless Array === spec
  field_part, final_format = spec

  # "" and "=>" split into an empty array, leaving field_part nil; to_s keeps
  # that case from failing and yields an all-nil result instead.
  field, format = field_part.to_s.split "=~", -1

  field = nil if field.nil? or field.empty?

  [field, format, final_format]
end

.process_formats(field, default_format = {}) ⇒ Object



103
104
105
106
107
108
109
110
# File 'lib/rbbt/association/util.rb', line 103

# Picks the final format for +field+ out of a type => format mapping.
#
# For each pair the field's entry in Entity.formats (falling back to the
# pair's own format when there is none) is compared, as a String, against
# the pair's type; the format of the first matching pair is returned.
#
# @param field [Object] key looked up in Entity.formats
# @param default_format [Hash, nil] type => format pairs
# @return [Object, nil] the matching format, or nil when the mapping is nil,
#   empty or has no matching pair
def self.process_formats(field, default_format = {})
  return nil if default_format.nil? || default_format.empty?

  default_format.each do |type, format|
    entity_type = Entity.formats[field]
    entity_type = format unless entity_type
    return format if entity_type.to_s === type
  end

  nil
end

.reorder_tsv(tsv, options = {}) ⇒ Object



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/rbbt/association/database.rb', line 79

# Reorders an in-memory TSV so that the source field becomes the key and the
# target the first value field, then copies it (translated when source or
# target formats were requested) into the :data option, or into a plain Hash
# when none is given.
#
# @param tsv [TSV] the table to reorder
# @param options [Hash] :fields and :persist are taken from it through
#   Misc.process_options; :data and :monitor are read; the rest goes to headers
# @return [TSV] the reordered (and possibly translated) table
def self.reorder_tsv(tsv, options = {})
  fields, persist = Misc.process_options options, :fields, :persist
  all_fields = tsv.all_fields

  source_pos, field_pos, source_header, field_headers, source_format, target_format =
    headers(all_fields, fields, options)

  source_field = (source_pos == :key) ? :key : all_fields[source_pos]
  info_fields = field_pos.map do |pos|
    pos == :key ? :key : all_fields[pos]
  end
  options = options.merge(:key_field => source_field, :fields => info_fields)

  fields = field_headers if fields.nil?

  store = options[:data] || {}
  TmpFile.with_file do |tmpfile|
    scratch = Persist.open_database(tmpfile, true, :double, "HDB")

    tsv.with_monitor(options[:monitor]) do
      # The leading `true` makes this guard always pass, so the table is
      # reordered on every call; the remaining clauses are never evaluated.
      if true or source_field != tsv.key_field or (fields and tsv.fields != fields)
        tsv = tsv.reorder source_field, tsv.all_fields.values_at(*field_pos), :persist => persist, :persist_data => scratch
      end
    end

    tsv.key_field = source_header
    tsv.fields = field_headers

    if source_format || target_format
      tsv = translate tsv, source_format, target_format, :persist => true, :data => store
    else
      tsv.through { |key, values| store[key] = values }
      tsv.annotate store
    end
  end

  tsv
end

.translate(tsv, source_final_format, target_final_format, options = {}) ⇒ Object



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/rbbt/association/database.rb', line 28

# Translates the key (source) and/or the first field (target) of +tsv+ into
# the requested final formats using TSV.translate, then copies the result
# into the :data option (a plain Hash when none is given) and annotates it.
#
# Identifier files come from the TSV itself, extended with the Entity
# identifier files for the final format when Entity is defined. For the
# source translation only, Organism.identifiers("NAMESPACE") is used when the
# TSV has none. The NAMESPACE placeholder is filled in from the TSV namespace;
# files still containing it afterwards are discarded.
#
# @param tsv [TSV] table whose key is the source and first field the target
# @param source_final_format [String, nil] desired key format
# @param target_final_format [String, nil] desired target format
# @param options [Hash] :data is taken out; the rest is passed to TSV.translate
# @return [Object] the value of the outer TmpFile.with_file block, whose last
#   expression is tsv.annotate(data)
def self.translate(tsv, source_final_format, target_final_format, options = {})
  source_field = tsv.key_field
  target_field = tsv.fields.first
  namespace = tsv.namespace

  data = Misc.process_options(options, :data)
  data = {} unless data

  TmpFile.with_file do |source_tmp|
    TmpFile.with_file do |target_tmp|
      source_store = Persist.open_database(source_tmp, true, :double, "HDB")
      target_store = Persist.open_database(target_tmp, true, :double, "HDB")

      if source_final_format && source_field != source_final_format
        Log.debug("Changing source format from #{tsv.key_field} to #{source_final_format}")

        id_files = tsv.identifier_files.dup
        id_files = [Organism.identifiers("NAMESPACE")] if id_files.empty?
        id_files.concat Entity.identifier_files(source_final_format) if defined? Entity
        id_files.uniq!
        if namespace
          id_files.map! { |f| f.annotate(f.gsub(/\bNAMESPACE\b/, namespace)) }
        end
        id_files.reject! { |f| f.match(/\bNAMESPACE\b/) }

        translate_options = options.merge(:identifier_files => id_files, :persist_data => source_store)
        tsv = TSV.translate(tsv, source_field, source_final_format, translate_options)
      end

      if target_final_format && target_field != target_final_format
        Log.debug("Changing target format from #{target_field} to #{target_final_format}")

        # The key field name is swapped for "MASK" while the target is
        # translated and restored right after.
        original_key_field = tsv.key_field
        tsv.key_field = "MASK"

        id_files = tsv.identifier_files.dup
        id_files.concat Entity.identifier_files(target_final_format) if defined? Entity
        id_files.uniq!
        if namespace
          id_files.map! { |f| f.annotate(f.gsub(/\bNAMESPACE\b/, namespace)) }
        end
        id_files.reject! { |f| f.match(/\bNAMESPACE\b/) }

        translate_options = options.merge(:identifier_files => id_files, :persist_data => target_store)
        tsv = TSV.translate(tsv, target_field, target_final_format, translate_options)
        tsv.key_field = original_key_field
      end

      tsv.through { |key, values| data[key] = values }

      tsv.annotate data
    end
  end
end

.version_file(file, namespace) ⇒ Object



4
5
6
7
8
# File 'lib/rbbt/association/open.rb', line 4

# Replaces the first 'NAMESPACE' in +file+ with +namespace+.
#
# Nothing is replaced unless +namespace+ is given and +file+ is a String.
# When the original file is a Path, the new string is annotated from it.
#
# @param file [Object] file name, possibly containing 'NAMESPACE'
# @param namespace [String, nil] replacement text
# @return [Object] the versioned file name, or +file+ itself when unchanged
def self.version_file(file, namespace)
  original = nil

  if namespace && String === file
    original = file
    file = original.sub('NAMESPACE', namespace)
  end

  original.annotate(file) if Path === original
  file
end