Module: Association
Defined in:
- lib/rbbt/association.rb
- lib/rbbt/association/open.rb
- lib/rbbt/association/util.rb
- lib/rbbt/association/index.rb
- lib/rbbt/association/database.rb
Defined Under Namespace
Modules: Index
Class Method Summary
- .add_reciprocal(tsv) ⇒ Object
- .database(file, options = {}) ⇒ Object
- .extract_specs(all_fields = nil, options = {}) ⇒ Object
- .headers(all_fields, info_fields = nil, options = {}) ⇒ Object
- .identify_entity_format(format, fields) ⇒ Object
- .index(file, options = nil, persist_options = nil) ⇒ Object
- .normalize_specs(spec, all_fields = nil) ⇒ Object
- .open(file, options = nil, persist_options = nil) ⇒ Object
- .open_stream(stream, options = {}) ⇒ Object
- .parse_field_specification(spec) ⇒ Object
- .process_formats(field, default_format = {}) ⇒ Object
- .reorder_tsv(tsv, options = {}) ⇒ Object
- .translate(tsv, source_final_format, target_final_format, options = {}) ⇒ Object
- .version_file(file, namespace) ⇒ Object
Class Method Details
.add_reciprocal(tsv) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# File 'lib/rbbt/association/database.rb', line 6
#
# Returns a copy of +tsv+ extended with the reverse of every association.
#
# The copy is opened from +tsv.dumper_stream+. For a +:double+ TSV each
# (source, target, *info) row contributes a (target, source, *info) entry to
# the copy through +zip_new+; self-associations (target == source) are
# skipped. For any other TSV type the copy is returned without additions.
# Finally +tsv.annotate+ is called on the copy.
#
# @param tsv the association table to extend
# @return the copy with the reciprocal entries added
def self.add_reciprocal(tsv)
  reciprocal = TSV.open(tsv.dumper_stream)

  tsv.with_unnamed do
    if tsv.type == :double
      tsv.through do |source, values|
        Misc.zip_fields(values).each do |info|
          target, *rest = info
          next if target == source

          # The former source takes the place of the target column.
          rest.unshift source
          reciprocal.zip_new target, rest
        end
      end
    end
  end

  tsv.annotate(reciprocal)

  reciprocal
end
.database(file, options = {}) ⇒ Object
191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 |
# File 'lib/rbbt/association/database.rb', line 191
#
# Builds an association database from +file+, dispatching on its class:
#
# * a Step (only when the Step constant is defined): the step is cleaned if it
#   is in error, aborted or dirty, run and joined if needed, and its stream is
#   handed to +open_stream+;
# * a TSV: converted to +:double+ if needed, passed through +reorder_tsv+ and,
#   when <tt>options[:data]</tt> is given, copied into it entry by entry;
# * an IO: handed to +open_stream+ directly;
# * anything else: resolved with +TSV.get_stream+ and handed to +open_stream+.
#
# Each helper receives a copy of +options+, so the caller's hash is not
# modified by this method itself.
#
# @param file [Step, TSV, IO, Object] where the associations come from
# @param options [Hash] forwarded to +open_stream+ / +reorder_tsv+; +:data+
#   and +:entity_options+ are also read here
# @return the resulting database
def self.database(file, options = {})
  database = case file
             when (defined? Step and Step)
               file.clean if file.error? or file.aborted? or file.dirty?
               file.run(true) unless file.done? or file.started?
               file.join unless file.done?
               open_stream(TSV.get_stream(file), options.dup)
             when TSV
               file = file.to_double unless file.type == :double
               tsv = reorder_tsv(file, options.dup)
               if options[:data]
                 data = options[:data]
                 tsv.with_unnamed do
                   tsv.with_monitor("Saving database #{Misc.fingerprint file}") do
                     tsv.through do |k,v|
                       data[k] = v
                     end
                   end
                 end
               end
               tsv
             when IO
               open_stream(file, options.dup)
             else
               stream = TSV.get_stream(file)
               open_stream(stream, options.dup)
             end

  # Only a Path that has identifier files is recorded as the filename.
  database.filename = file if Path === file && file.identifier_files.any?
  database.entity_options = options[:entity_options] if options[:entity_options]

  database
end
.extract_specs(all_fields = nil, options = {}) ⇒ Object
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
# File 'lib/rbbt/association/util.rb', line 44
#
# Works out the source and target field specifications of an association
# file from its field names and the user options.
#
# Each specification is a three element array <tt>[field, header, format]</tt>
# as produced by +normalize_specs+. The +:source+, +:source_format+,
# +:target+, +:target_format+ and +:format+ keys are extracted from +options+
# with +Misc.process_options+.
# NOTE(review): process_options presumably removes those keys from the hash
# passed in -- confirm against Misc.
#
# Resolution steps:
# 1. explicit +:source_format+ / +:target_format+ override the parsed format;
# 2. a field that is not literally in +all_fields+ but is a known entity
#    format is replaced by the first field of +all_fields+ with that format;
# 3. a missing source/target defaults to the key field or the first
#    non-key field, whichever the other side is not using;
# 4. a missing final format is looked up in +:format+ by entity type and
#    dropped again if it equals the field or the header.
#
# @param all_fields [Array, nil] key field followed by the remaining fields
# @param options [Hash] see above
# @return [Hash] <tt>{:source => specs, :target => specs}</tt>
# @raise [RuntimeError] when an entity format cannot be matched to a field
def self.extract_specs(all_fields=nil, options = {})
  source, source_format, target, target_format, format = Misc.process_options options, :source, :source_format, :target, :target_format, :format

  key_field, *fields = all_fields.nil? ? [nil] : all_fields

  source_specs = normalize_specs source, all_fields
  target_specs = normalize_specs target, all_fields

  source_specs = [nil, nil, nil] if source_specs.nil?
  target_specs = [nil, nil, nil] if target_specs.nil?

  source_specs[2] = source_format if source_format
  target_specs[2] = target_format if target_format

  # `all_fields` defaults to nil; without the explicit guard the `include?`
  # call below would raise NoMethodError whenever a source is given.
  if source_specs.first && all_fields && !all_fields.include?(source_specs.first) && defined?(Entity) && (_format = Entity.formats[source_specs.first.to_s])
    _source = all_fields.select{|f| Entity.formats[f.to_s] == _format }.first
    raise "Source not found #{source_specs}. Options: #{Misc.fingerprint all_fields}" if _source.nil?
    source_specs[0] = _source
  end

  if target_specs.first && all_fields && !all_fields.include?(target_specs.first) && defined?(Entity) && (_format = Entity.formats[target_specs.first.to_s])
    _target = all_fields.select{|f| Entity.formats[f.to_s].to_s == _format.to_s }.first
    raise "Target not found #{target_specs}. Options: #{Misc.fingerprint all_fields}" if _target.nil?
    target_specs[0] = _target
  end

  if source_specs[0].nil? and target_specs[0].nil?
    source_specs[0] = key_field
    target_specs[0] = fields[0]
  elsif source_specs[0].nil?
    if target_specs[0] == :key or target_specs[0] == key_field
      source_specs[0] = fields[0]
    else
      source_specs[0] = key_field
    end
  elsif target_specs[0].nil?
    if source_specs[0] == fields.first
      target_specs[0] = key_field
    else
      target_specs[0] = fields.first
    end
  end

  # If a :format map is given and no final format was requested explicitly,
  # take the final format registered for the entity type of the field.
  if target_specs[2].nil?
    target_type = Entity.formats[target_specs[1] || target_specs[0]]
    target_specs[2] = format[target_type.to_s] if format
    target_specs[2] = nil if target_specs[2] == target_specs[0] or target_specs[2] == target_specs[1]
  end

  if source_specs[2].nil?
    source_type = Entity.formats[source_specs[1] || source_specs[0]]
    source_specs[2] = format[source_type.to_s] if format
    source_specs[2] = nil if source_specs[2] == source_specs[0] or source_specs[2] == source_specs[1]
  end

  {:source => source_specs, :target => target_specs}
end
.headers(all_fields, info_fields = nil, options = {}) ⇒ Object
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
# File 'lib/rbbt/association/util.rb', line 112
#
# Computes the column layout of an association table.
#
# The source and target fields are resolved with +extract_specs+ and located
# in +all_fields+ with +TSV.identify_field+. Positions are indexes into
# +all_fields+ (0 is the key field). The target field always comes first in
# the resulting field list, and the source position is removed from it.
#
# @param all_fields [Array] key field followed by the remaining fields
# @param info_fields [Array, nil] fields to keep besides source and target;
#   defaults to all of +all_fields+. The array passed in is not modified.
# @param options [Hash] forwarded to +extract_specs+; +:format+ is read here
# @return [Array] <tt>[source_pos, field_pos, source_header, field_headers,
#   source_format, target_format]</tt>
def self.headers(all_fields, info_fields = nil, options = {})
  specs = extract_specs all_fields, options

  source_field = specs[:source][0]
  target_field = specs[:target][0]

  source_pos = TSV.identify_field all_fields.first, all_fields[1..-1], source_field
  target_pos = TSV.identify_field all_fields.first, all_fields[1..-1], target_field

  # identify_field answers :key or a position among the non-key fields;
  # shift by one to index into all_fields.
  source_pos = source_pos == :key ? 0 : source_pos + 1
  target_pos = target_pos == :key ? 0 : target_pos + 1

  source_header = specs[:source][1] || specs[:source][0]
  target_header = specs[:target][1] || specs[:target][0]

  # Work on a copy so the caller's array is never altered by the
  # delete/unshift calls below.
  info_fields = (info_fields || all_fields).dup
  info_fields.delete source_field
  info_fields.delete target_field
  info_fields.unshift target_field

  field_headers = [target_header]
  info_fields[1..-1].each do |field|
    header = case field
             when String
               field
             when Numeric
               all_fields[field]
             when :key
               all_fields.first
             end

    field_headers << header
  end

  field_pos = info_fields.collect do |f|
    pos = TSV.identify_field all_fields.first, all_fields[1..-1], f
    pos == :key ? 0 : pos + 1
  end

  field_pos.delete source_pos

  source_format = specs[:source][2]
  target_format = specs[:target][2]

  if format = options[:format]
    source_format ||= process_formats(specs[:source][1] || specs[:source][0], format)
    target_format ||= process_formats(specs[:target][1] || specs[:target][0], format)
  end

  res = [source_pos, field_pos, source_header, field_headers, source_format, target_format]
  Log.low "Headers -- #{res}"
  res
end
.identify_entity_format(format, fields) ⇒ Object
4 5 6 7 8 9 10 |
# File 'lib/rbbt/association/util.rb', line 4
#
# Finds the field in +fields+ that holds the same entity type as +format+.
#
# @param format the format name to look up in +Entity.formats+
# @param fields [Array] candidate field names
# @return [Array] <tt>[main_field, nil, format]</tt>, where +main_field+ is
#   the first element of +fields+ registered with the same entity type
# @raise [RuntimeError] when +format+ is not a known entity format, or when
#   no field shares its entity type
def self.identify_entity_format(format, fields)
  entity_type = Entity.formats[format]
  raise "Field #{ format } could not be resolved: #{fields}" if entity_type.nil?

  main_field = fields.find { |f| Entity.formats[f] == entity_type }
  raise "Field #{ format } not present, options: #{Misc.fingerprint fields}" if main_field.nil?

  [main_field, nil, format]
end
.index(file, options = nil, persist_options = nil) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
# File 'lib/rbbt/association/index.rb', line 6
#
# Builds an association index for +file+ through +Persist.persist_tsv+.
#
# The index is a +:list+ TSV whose keys are "source~target" pairs and whose
# values are the remaining fields of the association database opened with
# +open+. Any "~" inside a source or target name is replaced by "-..-" so it
# cannot be confused with the separator. When the same pair appears more than
# once, the values of each field are joined with ";;".
#
# With +:undirected+ (assumed automatically when source and target fields
# have the same name and the option is not given) every pair is also stored
# under the reversed key.
#
# @param file the association source, as accepted by +open+
# @param options [Hash, nil] copied before use; +:namespace+, +:recycle+,
#   +:undirected+, +:serializer+, +:monitor+ and +:entity_options+ are read
# @param persist_options [Hash, nil] copied before use; when nil they are
#   taken from +options+ with <tt>Misc.pull_keys(options, :persist)</tt>
# @return the index, set up with +Association::Index.setup+
def self.index(file, options = nil, persist_options = nil)
  options = options.nil? ? {} : options.dup
  persist_options = persist_options.nil? ? Misc.pull_keys(options, :persist) : persist_options.dup
  persist_options[:serializer] ||= options[:serializer] if options.include?(:serializer)

  persist_options = Misc.add_defaults persist_options.dup, :persist => true, :dir => Rbbt.var.associations

  file = version_file(file, options[:namespace]) if options[:namespace] and String === file
  Persist.persist_tsv(file, nil, options, persist_options.merge(:engine => "BDB", :prefix => "Association Index")) do |data|
    options = Misc.add_defaults options.dup, :monitor => "Building index for #{Misc.fingerprint file}"
    recycle = options[:recycle]
    undirected = options[:undirected]

    serializer = options[:serializer] || :list

    # The underlying database is persisted next to the index.
    persist_options[:file] = persist_options[:file] + '.database' if persist_options[:file]

    database = open(file, options, persist_options.dup.merge(:engine => "HDB"))

    source_field = database.key_field
    fields = database.fields
    target_field = fields.first.split(":").last

    undirected = true if undirected.nil? and source_field == target_field

    key_field = [source_field, target_field, undirected ? "undirected" : nil].compact * "~"

    TSV.setup(data, :key_field => key_field, :fields => fields[1..-1], :type => :list, :serializer => serializer, :namespace => database.namespace)

    data.key_field = key_field
    data.fields = fields[1..-1]
    data.type = :list
    data.serializer ||= serializer
    data.filename ||= file if String === file

    database.with_unnamed do
      database.with_monitor(options[:monitor]) do
        database.through do |source, values|
          # Bring every TSV type to the :double layout (list of lists).
          case database.type
          when :single
            values = [[values]]
          when :list
            values = values.collect{|v| [v] }
          when :flat
            values = [values]
          end

          next if values.empty?
          next if source.nil? or source.empty?
          next if values.first.empty?

          # Drop duplicated (target, *info) rows.
          values = Misc.zip_fields(Misc.zip_fields(values).uniq)
          targets, *rest = values

          size = targets ? targets.length : 0

          if recycle and size > 1
            # A field with a single value is repeated for every target.
            rest.each do |list|
              list.replace [list.first] * size if list.length == 1
            end
          end

          rest = Misc.zip_fields rest

          annotations = (Array === rest.first and rest.first.length > 1) ?
            targets.zip(rest) :
            targets.zip(rest * targets.length)

          source = source.gsub('~','-..-')
          annotations.each do |target, info|
            next if target.nil? or target.empty?
            target = target.gsub('~','-..-')
            key = [source, target] * "~"

            if data[key].nil? or info.nil?
              data[key] = info
            else
              old_info = data[key]
              info = old_info.zip(info).collect{|p| p * ";;" }
              data[key] = info
            end
          end
        end

        if undirected
          # Collect first, then write: data is not modified while traversed.
          new_data = {}

          data.through do |key,values|
            reverse_key = key.split("~").reverse * "~"
            new_data[reverse_key] = values
          end

          new_data.each do |key,values|
            data[key] = values
          end
        end
      end
    end

    data
  end.tap do |data|
    data.read if not Hash === data and data.respond_to? :read
    Association::Index.setup data
    data.entity_options = options[:entity_options] if options[:entity_options]
    data
  end
end
.normalize_specs(spec, all_fields = nil) ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
# File 'lib/rbbt/association/util.rb', line 24
#
# Parses a field specification into its <tt>[field, header, format]</tt>
# parts using +parse_field_specification+.
#
# NOTE(review): the original listing contained a branch that resolved the
# field through +identify_entity_format+, but it sat behind a condition that
# could never be true, so the parsed triple was returned on every path. That
# behaviour is kept here and +all_fields+ is therefore not consulted.
#
# @param spec [String, Array, Numeric, nil] the specification to parse
# @param all_fields [Array, nil] accepted for interface compatibility
# @return [Array, nil] the parsed triple, or nil when +spec+ is nil
def self.normalize_specs(spec, all_fields = nil)
  return nil if spec.nil?

  field, header, format = parse_field_specification spec

  [field, header, format]
end
.open(file, options = nil, persist_options = nil) ⇒ Object
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
# File 'lib/rbbt/association/open.rb', line 10
#
# Opens (building it on first use) the persisted association database for
# +file+.
#
# Defaults added to +options+: <tt>:zipped => true</tt>,
# <tt>:merge => true</tt> and a +:monitor+ description; +:zipped+ is turned
# off when +:merge+ is false. Defaults added to +persist_options+:
# <tt>:persist => true</tt> and <tt>:dir => Rbbt.var.associations</tt>.
#
# @param file the association source; a String is versioned by namespace
#   through +version_file+ when +:namespace+ is given, and a Proc is called
#   to obtain the actual source when the database has to be built
# @param options [Hash, nil] copied before use
# @param persist_options [Hash, nil] copied before use; when nil they are
#   taken from +options+ with <tt>Misc.pull_keys(options, :persist)</tt>
# @return the persisted database
def self.open(file, options = nil, persist_options = nil)
  options = options.nil? ? {} : options.dup
  persist_options = persist_options.nil? ? Misc.pull_keys(options, :persist) : persist_options.dup

  options = Misc.add_defaults options, :zipped => true, :merge => true, :monitor => {:desc => "Opening database #{Misc.fingerprint file}"}
  options[:zipped] = false unless options[:merge]
  persist_options = Misc.add_defaults persist_options.dup, :persist => true, :dir => Rbbt.var.associations

  file = version_file(file, options[:namespace]) if options[:namespace] and String === file

  data = Persist.persist_tsv(file, nil, options, persist_options.merge(:prefix => "Association Database")) do |store|
    file = file.call if Proc === file

    options = options.dup
    store.serializer = :double if store.respond_to? :serializer

    # The database is written straight into the persistence store.
    Association.database(file, options.merge(:unnamed => true, :data => store, :type => :double))

    store
  end

  data.entity_options = options[:entity_options] if options[:entity_options]

  data
end
.open_stream(stream, options = {}) ⇒ Object
115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |
# File 'lib/rbbt/association/database.rb', line 115
#
# Parses an association table from +stream+ into +:data+.
#
# A +TSV::Parser+ reads the header; +headers+ decides which column is the
# source (key) and which columns follow it, and the parser is reconfigured
# accordingly. The stream is parsed into a temporary "HDB" database, turned
# into a +:double+ TSV and then either translated with +translate+ (when a
# final source or target format was requested) or copied into +:data+.
#
# @param stream the input to parse
# @param options [Hash] +:fields+, +:persist+ and +:data+ are extracted with
#   +Misc.process_options+; the rest is forwarded to the parser
#   (NOTE(review): presumably those keys are removed from the hash passed in
#   -- callers in this file pass a copy)
# @return the parsed TSV
def self.open_stream(stream, options = {})
  fields, _persist, data = Misc.process_options options, :fields, :persist, :data

  parser = TSV::Parser.new stream, options.merge(:fields => nil, :key_field => nil)
  options = options.merge(parser.options)
  options = Misc.add_defaults options, :type => :double, :merge => true

  source_pos, field_pos, source_header, field_headers, source_format, target_format = headers parser.all_fields, fields, options

  parser.key_field = source_pos
  parser.fields = field_pos
  parser.field_positions = field_pos
  parser.key_position = source_pos

  #case parser.type
  #when :single
  #  class << parser
  #    def get_values(parts)
  #      [parts[@key_field], parts.values_at(*@fields).first]
  #    end
  #  end
  #when :list
  #  class << parser
  #    def get_values(parts)
  #      [parts[@key_field], parts.values_at(*@fields)]
  #    end
  #  end
  #when :__double
  #  class << parser
  #    def get_values(parts)
  #      [parts[@key_field].split(@sep2,-1), parts.values_at(*@fields).collect{|v| v.nil? ? [] : v.split(@sep2,-1) }]
  #    end
  #  end
  #when :flat
  #  class << parser
  #    def get_values(parts)
  #      fields = (0..parts.length-1).to_a - [@key_field]
  #      values = parts.values_at(*fields).compact.collect{|v| v.split(@sep2,-1) }.flatten
  #      [parts[@key_field].split(@sep2,-1), values]
  #    end
  #  end
  #end

  open_options = options.merge(parser.options).merge(:parser => parser)
  open_options = Misc.add_defaults open_options, :monitor => {:desc => "Parsing #{ Misc.fingerprint stream }"}

  data ||= {}
  tsv = nil

  # Persistence stores are reopened through close + write before filling.
  if data.respond_to?(:close) && data.respond_to?(:write)
    data.close
    data.write
  end

  TmpFile.with_file do |tmpfile|
    tmp_data = Persist.open_database(tmpfile, true, open_options[:type], "HDB")

    tsv = TSV.parse parser.stream, tmp_data, open_options
    tsv = tsv.to_double
    tsv.key_field = source_header
    tsv.fields = field_headers

    if source_format or target_format
      tsv = translate tsv, source_format, target_format, :persist => true, :data => data
    else
      tsv.through do |k,v|
        data[k] = v
      end
      tsv.annotate data
    end
  end

  tsv
end
.parse_field_specification(spec) ⇒ Object
12 13 14 15 16 17 18 19 20 21 22 |
# File 'lib/rbbt/association/util.rb', line 12
#
# Splits a field specification of the form "field=~header=>final_format"
# into its three parts.
#
# * A Numeric +spec+ is returned as the field, with no header or format.
# * An Array +spec+ is taken as <tt>[field_part, final_format]</tt>.
# * A String is split on "=>" first and its left part on "=~".
#
# An empty field is reported as nil; an empty +spec+ yields three nils
# instead of failing on the missing field part.
#
# @param spec [String, Array, Numeric]
# @return [Array] <tt>[field, header, final_format]</tt>
def self.parse_field_specification(spec)
  return [spec, nil, nil] if Numeric === spec

  spec = spec.split "=>" unless Array === spec
  field_part, final_format = spec

  # "".split("=>") is empty, which leaves no field part to split.
  field, format = field_part.nil? ? [] : field_part.split("=~", -1)

  field = nil if field.nil? or field.empty?

  [field, format, final_format]
end
.process_formats(field, default_format = {}) ⇒ Object
103 104 105 106 107 108 109 110 |
# File 'lib/rbbt/association/util.rb', line 103
#
# Picks the final format configured for +field+.
#
# +default_format+ maps entity type names to format names. The entity type of
# +field+ is looked up in +Entity.formats+; when the field is not registered
# there, each candidate format stands in for the type, so an entry only
# matches if its format name equals its key.
#
# @param field the field name to look up
# @param default_format [Hash, nil] entity type name => format name
# @return the format of the first matching entry, or nil when there is none
#   or +default_format+ is nil or empty
def self.process_formats(field, default_format = {})
  return nil if default_format.nil? || default_format.empty?

  # The lookup does not depend on the entry being examined.
  known_type = Entity.formats[field]

  default_format.each do |type, format|
    entity_type = known_type || format
    return format if entity_type.to_s === type
  end

  nil
end
.reorder_tsv(tsv, options = {}) ⇒ Object
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
# File 'lib/rbbt/association/database.rb', line 79
#
# Reorders an in-memory association TSV so that the source field becomes the
# key and the target field the first column.
#
# The layout is computed by +headers+. The TSV is always reordered into a
# temporary "HDB" database (the original listing guarded this with a
# condition starting with `true or`, i.e. unconditionally), renamed with the
# resolved headers, and then either translated with +translate+ (when a final
# source or target format was requested) or copied into
# <tt>options[:data]</tt>.
#
# @param tsv the table to reorder
# @param options [Hash] +:fields+ and +:persist+ are extracted with
#   +Misc.process_options+; +:data+ and +:monitor+ are read; the rest is
#   forwarded to +headers+
# @return the reordered TSV
def self.reorder_tsv(tsv, options = {})
  fields, persist = Misc.process_options options, :fields, :persist
  all_fields = tsv.all_fields

  source_pos, field_pos, source_header, field_headers, source_format, target_format = headers(all_fields, fields, options)

  source_field = source_pos == :key ? :key : all_fields[source_pos]
  info_fields = field_pos.collect{|f| f == :key ? :key : all_fields[f]}
  options = options.merge({:key_field => source_field, :fields => info_fields})

  data = options[:data] || {}
  TmpFile.with_file do |tmpfile|
    tmp_data = Persist.open_database(tmpfile, true, :double, "HDB")

    tsv.with_monitor(options[:monitor]) do
      tsv = tsv.reorder source_field, tsv.all_fields.values_at(*field_pos), :persist => persist, :persist_data => tmp_data
    end

    tsv.key_field = source_header
    tsv.fields = field_headers

    if source_format or target_format
      tsv = translate tsv, source_format, target_format, :persist => true, :data => data
    else
      tsv.through do |k,v|
        data[k] = v
      end
      tsv.annotate data
    end
  end

  tsv
end
.translate(tsv, source_final_format, target_final_format, options = {}) ⇒ Object
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
# File 'lib/rbbt/association/database.rb', line 28
#
# Translates the key (source) and the first field (target) of +tsv+ into the
# requested identifier formats and stores the result in +:data+.
#
# Each side is translated with +TSV.translate+ only when a final format is
# given and differs from the current field name. Identifier files come from
# the TSV itself plus +Entity.identifier_files+ (when Entity is defined);
# the "NAMESPACE" placeholder in their names is replaced by the TSV
# namespace, and files that still contain the placeholder are discarded.
# For the source side only, +Organism.identifiers("NAMESPACE")+ is used when
# the TSV has no identifier files of its own.
#
# @param tsv the association table
# @param source_final_format the format wanted for the key, or nil
# @param target_final_format the format wanted for the first field, or nil
# @param options [Hash] +:data+ is extracted with +Misc.process_options+;
#   the rest is forwarded to +TSV.translate+
# @return the result of annotating +:data+ with the translated TSV
def self.translate(tsv, source_final_format, target_final_format, options = {})
  source_field = tsv.key_field
  target_field = tsv.fields.first
  namespace = tsv.namespace

  data = Misc.process_options options, :data

  data ||= {}
  TmpFile.with_file do |tmpfile1|
    TmpFile.with_file do |tmpfile2|
      tmp_data1 = Persist.open_database(tmpfile1, true, :double, "HDB")
      tmp_data2 = Persist.open_database(tmpfile2, true, :double, "HDB")

      # Translate source
      if source_final_format and source_field != source_final_format
        Log.debug("Changing source format from #{tsv.key_field} to #{source_final_format}")

        identifier_files = tsv.identifier_files.dup
        identifier_files = [Organism.identifiers("NAMESPACE")] if identifier_files.empty?
        identifier_files.concat Entity.identifier_files(source_final_format) if defined? Entity
        identifier_files.uniq!
        identifier_files.collect!{|f| f.annotate(f.gsub(/\bNAMESPACE\b/, namespace))} if namespace
        identifier_files.reject!{|f| f.match(/\bNAMESPACE\b/)}

        tsv = TSV.translate(tsv, source_field, source_final_format, options.merge(:identifier_files => identifier_files, :persist_data => tmp_data1))
      end

      # Translate target
      if target_final_format and target_field != target_final_format
        Log.debug("Changing target format from #{target_field} to #{target_final_format}")

        # NOTE(review): the key field is renamed for the duration of the
        # target translation, presumably so it cannot be picked up as the
        # field to translate -- confirm against TSV.translate.
        old_key_field = tsv.key_field
        tsv.key_field = "MASK"

        identifier_files = tsv.identifier_files.dup
        identifier_files.concat Entity.identifier_files(target_final_format) if defined? Entity
        identifier_files.uniq!
        identifier_files.collect!{|f| f.annotate(f.gsub(/\bNAMESPACE\b/, namespace))} if namespace
        identifier_files.reject!{|f| f.match(/\bNAMESPACE\b/)}

        tsv = TSV.translate(tsv, target_field, target_final_format, options.merge(:identifier_files => identifier_files, :persist_data => tmp_data2))

        tsv.key_field = old_key_field
      end

      tsv.through do |k,v|
        data[k] = v
      end

      tsv.annotate data
    end
  end
end