Class: MiGA::RemoteDataset

Inherits:
MiGA
  • Object
show all
Defined in:
lib/miga/remote_dataset.rb

Overview

MiGA representation of datasets with data in remote locations.

Constant Summary collapse

# Class-level

# Base URL of the NCBI E-utilities web API. It is interpolated into the URL
# patterns of the NCBI-backed universes.
@@_EUTILS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
# Registry of the supported universes (remote data sources). Keys are universe
# names and values are Hashes with the keys:
# - dbs: databases available in the universe, each with a Hash of properties
#   (stage, format, and/or map_to).
# - url: sprintf pattern of the request URL. MiGA::RemoteDataset.download
#   fills it with (1) the database, (2) the comma-joined IDs, (3) the format,
#   and (4) the map_to property of the database (nil if it has none).
# - method: how the URL is queried (:rest or :net).
# - map_to_universe: universe where mapped results live (see FIXME below).
@@UNIVERSE =
{
  # Generic web location: the URL pattern is just the ID itself (%2$s).
  web:{
    dbs: {
      assembly:{stage: :assembly, format: :fasta},
      assembly_gz:{stage: :assembly, format: :fasta_gz}
    },
    url: "%2$s",
    method: :net
  },
  # EBI dbfetch service.
  ebi:{
    dbs: { embl:{stage: :assembly, format: :fasta} },
    url: "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/%1$s/%2$s/%3$s",
    method: :rest
  },
  # NCBI E-utilities efetch endpoint.
  ncbi:{
    dbs: { nuccore:{stage: :assembly, format: :fasta} },
    url: "#{@@_EUTILS}efetch.fcgi?db=%1$s&id=%2$s&rettype=%3$s&retmode=text",
    method: :rest
  },
  # NCBI E-utilities elink endpoint.
  ncbi_map:{
    dbs: { assembly:{map_to: :nuccore, format: :text} },
      # FIXME ncbi_map is intended to do internal NCBI mapping between
      # databases.
    # NOTE(review): download passes map_to as the 4th sprintf argument, but
    # this pattern fills db= from %3$s (the format), and the trailing
    # " - - - - -" looks unintended -- confirm before enabling ncbi_map.
    url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%3$s - - - - -",
    method: :rest,
    map_to_universe: :ncbi
  }
}

Constants included from MiGA

CITATION, VERSION, VERSION_DATE, VERSION_NAME

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from MiGA

CITATION, DEBUG, DEBUG_OFF, DEBUG_ON, DEBUG_TRACE_OFF, DEBUG_TRACE_ON, FULL_VERSION, LONG_VERSION, VERSION, VERSION_DATE, clean_fasta_file, initialized?, #result_files_exist?, root_path, tabulate

Constructor Details

#initialize(ids, db, universe) ⇒ RemoteDataset

Initialize MiGA::RemoteDataset with ids in database db from universe.



102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/miga/remote_dataset.rb', line 102

# Initialize MiGA::RemoteDataset with +ids+ in database +db+ from +universe+.
#
# +ids+ may be a single ID or an Array of IDs; a single ID is wrapped in an
# Array. +db+ and +universe+ are converted with +to_sym+.
#
# Raises RuntimeError if +universe+ is not a key of @@UNIVERSE, or if +db+ is
# not a database registered under that universe.
def initialize(ids, db, universe)
  @ids = ids.is_a?(Array) ? ids : [ids]
  @db = db.to_sym
  @universe = universe.to_sym
  raise "Unknown Universe: #{@universe}. Try one of: "+
    "#{@@UNIVERSE.keys}" unless @@UNIVERSE.key? @universe
  # List only the database names, mirroring the universe message above.
  raise "Unknown Database: #{@db}. Try one of: "+
    "#{@@UNIVERSE[@universe][:dbs].keys}" unless
    @@UNIVERSE[@universe][:dbs].key? @db
  # FIXME Part of the +map_to+ support:
  #unless @@UNIVERSE[@universe][:dbs][@db][:map_to].nil?
  #  MiGA::RemoteDataset.download
  #end
end

Instance Attribute Details

#db ⇒ Object (readonly)

Database storing the dataset.



96
97
98
# File 'lib/miga/remote_dataset.rb', line 96

# Reader for +@db+, the database storing the dataset.
def db ; @db ; end

#ids ⇒ Object (readonly)

IDs of the entries composing the dataset.



98
99
100
# File 'lib/miga/remote_dataset.rb', line 98

# Reader for +@ids+, the IDs of the entries composing the dataset.
def ids ; @ids ; end

#universe ⇒ Object (readonly)

Universe of the dataset.



94
95
96
# File 'lib/miga/remote_dataset.rb', line 94

# Reader for +@universe+, the universe of the dataset.
def universe ; @universe ; end

Class Method Details

.download(universe, db, ids, format, file = nil) ⇒ Object

Download data from the universe, in the database db, for the IDs ids (a single ID or an Array of IDs), in the given format. If file is passed, the result is also saved to that file. Returns the downloaded data as a String.



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/miga/remote_dataset.rb', line 57

# Download data from +universe+ in the database +db+ with IDs +ids+ (a single
# ID or an Array of IDs) and in +format+. If +file+ is passed, the downloaded
# document is also written to that path. Returns the document as String.
#
# Raises RuntimeError if a :rest universe answers with a status code other
# than 200, and Net::ReadTimeout if a :net universe still times out after
# three retries.
def self.download(universe, db, ids, format, file=nil)
  ids = [ids] unless ids.is_a? Array
  conf = @@UNIVERSE[universe]
  # +db+ may be absent from the universe's registry; then there is no map_to.
  db_conf = conf[:dbs][db]
  map_to = db_conf.nil? ? nil : db_conf[:map_to]
  # Pattern arguments: %1$s database, %2$s IDs, %3$s format, %4$s map_to.
  url = sprintf conf[:url], db, ids.join(","), format, map_to
  doc = nil
  case conf[:method]
  when :rest
    response = RestClient::Request.execute(method: :get, url:url, timeout:600)
    raise "Unable to reach #{universe} client, error code " +
      "#{response.code}." unless response.code == 200
    doc = response.to_s
  when :net
    doc = ""
    # Keep the retry counter local so concurrent or nested downloads do not
    # share (and clobber) each other's state.
    timeout_try = 0
    begin
      # NOTE(review): Kernel#open only fetches URLs while open-uri patches it
      # (no longer the case in Ruby >= 3.0) -- confirm supported Ruby versions.
      open(url) { |f| doc = f.read }
    rescue Net::ReadTimeout
      timeout_try += 1
      # Give up after three retries, re-raising the timeout that was caught.
      raise if timeout_try > 3
      retry
    end
  end
  # The block form closes the file handle even if writing fails.
  File.open(file, "w") { |ofh| ofh.print doc } unless file.nil?
  doc
end

.UNIVERSE ⇒ Object

Structure of the different database Universes or containers. The structure is a Hash whose keys are the universe names (as Symbol) and whose values are Hashes with the following supported keys (as Symbol):

  • :dbs => Hash with keys being the database name and the values a Hash of properties such as stage, format, and map_to.

  • :url => Pattern of the URL where the data can be obtained, where %1$s is the name of the database, %2$s is the IDs (comma-separated), %3$s is the format, and %4$s is the map_to property of the database (if any).

  • :method => Method used to query the URL. The download code handles :rest and :net.

  • :map_to_universe => Universe where results map to. Currently unsupported.



24
# File 'lib/miga/remote_dataset.rb', line 24

# Class-level reader returning the @@UNIVERSE Hash itself (not a copy).
def self.UNIVERSE
  @@UNIVERSE
end

Instance Method Details

#download(file) ⇒ Object

Download data into file.



168
169
170
171
# File 'lib/miga/remote_dataset.rb', line 168

# Download the data of this dataset into +file+, using the format that
# @@UNIVERSE registers for this dataset's universe and database. Delegates to
# MiGA::RemoteDataset.download and returns its result.
def download(file)
  fmt = @@UNIVERSE[universe][:dbs][db][:format]
  MiGA::RemoteDataset.download(universe, db, ids, fmt, file)
end

#get_metadata(metadata = {}) ⇒ Object

Get metadata from the remote location.



157
158
159
160
161
162
163
164
# File 'lib/miga/remote_dataset.rb', line 157

# Get metadata from the remote location.
#
# +metadata+ is a Hash that is updated in place and returned. For the :ebi and
# :ncbi universes, the key :tax is set to the result of get_ncbi_taxonomy;
# for any other universe the Hash is returned unchanged.
def get_metadata(metadata={})
  case universe
  when :ebi, :ncbi
    # Get taxonomy
    metadata[:tax] = get_ncbi_taxonomy
  end
  metadata
end

#get_ncbi_taxid ⇒ Object

Get NCBI Taxonomy ID.



175
176
177
# File 'lib/miga/remote_dataset.rb', line 175

# Get NCBI Taxonomy ID by dispatching to the universe-specific method named
# get_ncbi_taxid_from_<universe>, and return whatever that method returns.
def get_ncbi_taxid
  resolver = "get_ncbi_taxid_from_#{universe}"
  send(resolver)
end

#get_ncbi_taxonomy ⇒ Object

Get NCBI taxonomy as MiGA::Taxonomy.



181
182
183
184
185
186
187
188
189
190
191
192
193
# File 'lib/miga/remote_dataset.rb', line 181

# Get NCBI taxonomy as MiGA::Taxonomy.
#
# Starting at the ID returned by get_ncbi_taxid, downloads each taxonomy entry
# with MiGA::RemoteDataset.download(:ebi, :taxonomy, id, "") and follows its
# "PARENT ID" field until the ID is nil, "0", or "1". Each entry contributes
# rank => scientific name; entries without a rank are skipped, and a first
# entry ranked "no rank" is stored under the rank "dataset".
def get_ncbi_taxonomy
  ranks = {}
  current_id = get_ncbi_taxid
  until current_id.nil? || %w{0 1}.include?(current_id)
    entry = MiGA::RemoteDataset.download(:ebi, :taxonomy, current_id, "")
    # String#[] with a capture index yields the first match's group, or nil.
    sci_name = entry[/SCIENTIFIC NAME\s+:\s+(.+)/, 1]
    level = entry[/RANK\s+:\s+(.+)/, 1]
    level = "dataset" if ranks.empty? && level == "no rank"
    ranks[level] = sci_name unless level.nil?
    current_id = entry[/PARENT ID\s+:\s+(.+)/, 1]
  end
  MiGA::Taxonomy.new(ranks)
end

#save_to(project, name = nil, is_ref = true, metadata = {}) ⇒ Object

Save dataset to the MiGA::Project project identified with name. is_ref indicates if it should be a reference dataset, and metadata contains the metadata of the dataset.



121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# File 'lib/miga/remote_dataset.rb', line 121

# Save dataset to the MiGA::Project +project+ identified with +name+.
#
# +project+ may be a MiGA::Project or a String, in which case it is passed to
# MiGA::Project.new. If +name+ is nil, it is derived from the IDs joined by
# "_" (via +miga_name+). +is_ref+ indicates if it should be a reference
# dataset, and +metadata+ is a Hash of metadata that is completed with
# get_metadata before the dataset is created.
#
# Returns the new MiGA::Dataset. Raises RuntimeError if the dataset already
# exists in the project, if the stage registered for the database is not
# supported, or if the seed result could not be added.
def save_to(project, name=nil, is_ref=true, metadata={})
  name = ids.join("_").miga_name if name.nil?
  project = MiGA::Project.new(project) if project.is_a? String
  raise "Dataset #{name} exists in the project, aborting..." if
    MiGA::Dataset.exist?(project, name)
  metadata = get_metadata(metadata)
  case @@UNIVERSE[universe][:dbs][db][:stage]
  when :assembly
    dir = MiGA::Dataset.RESULT_DIRS[:assembly]
    base = "#{project.path}/data/#{dir}/#{name}"
    # Mark the result as started before downloading.
    File.open("#{base}.start", "w") { |ofh| ofh.puts Time.now.to_s }
    if @@UNIVERSE[universe][:dbs][db][:format] == :fasta_gz
      download("#{base}.LargeContigs.fna.gz")
      # Argument-list form: the path is never interpreted by a shell, so
      # spaces or metacharacters in it cannot break or alter the command.
      system("gzip", "-d", "#{base}.LargeContigs.fna.gz")
    else
      download("#{base}.LargeContigs.fna")
    end
    # AllContigs is a relative symlink to LargeContigs in the same folder.
    File.symlink(
      File.basename("#{base}.LargeContigs.fna"), "#{base}.AllContigs.fna")
    File.open("#{base}.done", "w") { |ofh| ofh.puts Time.now.to_s }
  else
    raise "Unexpected error: Unsupported result for database #{db}."
  end
  dataset = MiGA::Dataset.new(project, name, is_ref, metadata)
  project.add_dataset(dataset.name)
  result = dataset.add_result(@@UNIVERSE[universe][:dbs][db][:stage],
    true, is_clean:true)
  raise "Empty dataset created: seed result was not added due to "+
    "incomplete files." if result.nil?
  result.clean!
  result.save
  dataset
end