Class: EuPathDBSpeciesData

Inherits:
Object
  • Object
show all
Defined in:
lib/eupathdb_species_data.rb

Overview

A class dedicated to recording ‘administrative’ data about the databases, answering questions such as “which species are recorded in ToxoDB?” for instance.

It is also meant for dealing with locally cached versions of the files, where all the data is stored in a base directory with a specified structure.

TODO: functions for the info and the local caching should probably be separated into separate classes, and the directory structure of the local versions shouldn’t be forced on the user.

Constant Summary collapse

# Release of each EuPathDB member database that is assumed when no explicit
# database version is given.
SOURCE_VERSIONS = {
  'PlasmoDB'     => '8.2',
  'ToxoDB'       => '7.2',
  'CryptoDB'     => '4.6',
  'PiroplasmaDB' => '1.1',
  'FungiDB'      => '1.0',
  'TriTrypDB'    => '4.0'
}

# Names of all known databases, in the order they are declared above.
DATABASES = SOURCE_VERSIONS.keys
# Per-species records, keyed by the full binomial name of the species.
#
# Keys that are read by the methods of this class:
#   :name, :source              - species name and the database hosting it
#   :database_download_folder   - replaces the abbreviated species name in
#                                 download URLs (see one_word_name)
#   :representative_strain_name - replaces the prefix of the default file names
#   :behind_usage_policy        - when true, '/data' is appended to the remote
#                                 fasta/gff/txt download directories
#   :*_filename                 - lambdas taking the database version and
#                                 returning a non-default file name
#   :fasta_file_species_name    - handed to the FASTA parser
#
# NOTE(review): :sequencing_centre_abbreviation and :directory are not read by
# any code visible here - presumably they are consumed elsewhere; confirm.
@@data =
{
  ## PlasmoDB
  'Plasmodium falciparum' => {
    :name => 'Plasmodium falciparum',
    :source => 'PlasmoDB',
    :fasta_file_species_name => 'Plasmodium_falciparum_3D7',
    :sequencing_centre_abbreviation => 'psu',
    :behind_usage_policy => true,
  },
  'Plasmodium yoelii' => {
    :directory => 'yoelii',
    :name => 'Plasmodium yoelii',
    :sequencing_centre_abbreviation => 'TIGR',
    :fasta_file_species_name => 'Plasmodium_yoelii_yoelii_str._17XNL',
    :proteins_fasta_filename => lambda {|version| "PyoeliiAnnotatedProteins_PlasmoDB-#{version}.fasta"},
    #:transcripts_fasta_filename => lambda {|version| "PyoeliiAllTranscripts_PlasmoDB-#{version}.fasta"},
    :source => 'PlasmoDB'
  },
  'Plasmodium vivax' => {
    :name => 'Plasmodium vivax',
    :sequencing_centre_abbreviation => 'gb',
    :fasta_file_species_name => 'Plasmodium_vivax_SaI-1',
    :proteins_fasta_filename => lambda {|version| "PvivaxAnnotatedProteins_PlasmoDB-#{version}.fasta"},
    :source => 'PlasmoDB'
  },
  'Plasmodium berghei' => {
    :name => 'Plasmodium berghei',
    :sequencing_centre_abbreviation => 'psu',
    :fasta_file_species_name => 'Plasmodium_berghei_str._ANKA',
    :proteins_fasta_filename => lambda {|version| "PbergheiAnnotatedProteins_PlasmoDB-#{version}.fasta"},
    #:transcripts_fasta_filename => lambda {|version| "PbergheiAllTranscripts_PlasmoDB-#{version}.fasta"},
    :source => 'PlasmoDB'
  },
  'Plasmodium chabaudi' => {
    :name => 'Plasmodium chabaudi',
    :sequencing_centre_abbreviation => 'psu',
    :fasta_file_species_name => 'Plasmodium_chabaudi_chabaudi',
    :proteins_fasta_filename => lambda {|version| "PchabaudiAnnotatedProteins_PlasmoDB-#{version}.fasta"},
    :source => 'PlasmoDB',
    :behind_usage_policy => true,
  },
  'Plasmodium knowlesi' => {
    :name => 'Plasmodium knowlesi',
    :sequencing_centre_abbreviation => 'psu',
    :fasta_file_species_name => 'Plasmodium_knowlesi_strain_H',
    :source => 'PlasmoDB',
    :behind_usage_policy => true,
  },
  ## ToxoDB
  'Neospora caninum' => {
    :name => 'Neospora caninum',
    :sequencing_centre_abbreviation => 'psu',
    :fasta_file_species_name => 'Neospora_caninum',
    :database_download_folder => 'NeosporaCaninum',
    :representative_strain_name => 'NeosporaCaninum',
    :proteins_fasta_filename => lambda {|version| "NeosporaCaninumAnnotatedProteins_ToxoDB-#{version}.fasta"},
    :transcripts_fasta_filename => lambda {|version| "NeosporaCaninumAnnotatedTranscripts_ToxoDB-#{version}.fasta"},
    :source => 'ToxoDB',
    :behind_usage_policy => true,
  },
  'Eimeria tenella' => {
    :name => 'Eimeria tenella',
    :sequencing_centre_abbreviation => 'GeneDB',
    # This key used to be listed twice in this literal; only the later value
    # ever took effect, so that value is the one kept here.
    :fasta_file_species_name => 'Eimeria_tenella_str._Houghton',
    :source => 'ToxoDB',
    :database_download_folder => 'EtenellaHoughton',
    :behind_usage_policy => true,
  },
  'Toxoplasma gondii' => {
    :name => 'Toxoplasma gondii',
    :sequencing_centre_abbreviation => 'gb',
    :fasta_file_species_name => 'Toxoplasma_gondii_ME49',
    :database_download_folder => 'TgondiiME49',
    :gene_information_filename => lambda {|version| "TgondiiME49Gene_ToxoDB-#{version}.txt"},
    :proteins_fasta_filename => lambda {|version| "TgondiiME49AnnotatedProteins_ToxoDB-#{version}.fasta"},
    :transcripts_fasta_filename => lambda {|version| "TgondiiME49AnnotatedTranscripts_ToxoDB-#{version}.fasta"},
    :gff_filename => lambda {|version| "TgondiiME49_ToxoDB-#{version}.gff"},
    :genomic_fasta_filename => lambda {|version| "TgondiiME49Genomic_ToxoDB-#{version}.fasta"},
    :source => 'ToxoDB'
  },
  ## CryptoDB
  'Cryptosporidium parvum' => {
    :name => 'Cryptosporidium parvum',
    :sequencing_centre_abbreviation => 'gb',
    :fasta_file_species_name => 'Cryptosporidium_parvum',
    :proteins_fasta_filename => lambda {|version| "CparvumAnnotatedProteins_CryptoDB-#{version}.fasta"},
    :transcripts_fasta_filename => lambda {|version| "CparvumAnnotatedTranscripts_CryptoDB-#{version}.fasta"},
    #:gff_filename => lambda {|version| "c_parvum_iowa_ii.gff"}, #changed as of version 4.3
    :source => 'CryptoDB'
  },
  'Cryptosporidium hominis' => {
    :name => 'Cryptosporidium hominis',
    :sequencing_centre_abbreviation => 'gb',
    :fasta_file_species_name => 'Cryptosporidium_hominis',
    :proteins_fasta_filename => lambda {|version| "ChominisAnnotatedProteins_CryptoDB-#{version}.fasta"},
    :transcripts_fasta_filename => lambda {|version| "ChominisAnnotatedTranscripts_CryptoDB-#{version}.fasta"},
    #:gff_filename => lambda {|version| "c_hominis_tu502.gff"}, #changed as of version 4.3
    :source => 'CryptoDB'
  },
  'Cryptosporidium muris' => {
    :name => 'Cryptosporidium muris',
    :sequencing_centre_abbreviation => 'gb',
    :fasta_file_species_name => 'Cryptosporidium_muris',
    :proteins_fasta_filename => lambda {|version| "CmurisAnnotatedProteins_CryptoDB-#{version}.fasta"},
    :transcripts_fasta_filename => lambda {|version| "CmurisAnnotatedTranscripts_CryptoDB-#{version}.fasta"},
    #:gff_filename => lambda {|version| "c_muris.gff"}, #changed as of version 4.3
    :source => 'CryptoDB'
  },
  ## PiroplasmaDB
  'Theileria annulata' => {
    :name => 'Theileria annulata',
    :database_download_folder => 'TannulataAnkara',
    :sequencing_centre_abbreviation => 'Genbank',
    :fasta_file_species_name => 'Theileria_annulata_strain_Ankara',
    :source => 'PiroplasmaDB',
  },
  'Theileria parva' => {
    :name => 'Theileria parva',
    :database_download_folder => 'TparvaMuguga',
    :sequencing_centre_abbreviation => 'Genbank',
    :fasta_file_species_name => 'Theileria_parva_strain_Muguga',
    :source => 'PiroplasmaDB',
  },
  'Babesia bovis' => {
    :name => 'Babesia bovis',
    :database_download_folder => 'BbovisT2Bo',
    :representative_strain_name => 'BbovisT2Bo',
    :sequencing_centre_abbreviation => 'Genbank',
    :fasta_file_species_name => 'Babesia_bovis_T2Bo',
    :source => 'PiroplasmaDB',
  },
  ## FungiDB
  'Candida albicans' => {
    :name => 'Candida albicans',
    :database_download_folder => 'Candida_albicans_SC5314',
    :sequencing_centre_abbreviation => 'CGD',
    :fasta_file_species_name => 'Candida_albicans_SC5314',
    :source => 'FungiDB',
  },
  ## TriTrypDB
  'Trypanosoma brucei' => {
    :name => 'Trypanosoma brucei',
    :sequencing_centre_abbreviation => 'GeneDB',
    :source => 'TriTrypDB',
    :representative_strain_name => 'TbruceiTreu927',
    :fasta_file_species_name => 'Trypanosoma_brucei_TREU927',
  },
}

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(nickname, base_data_directory = nil, database_version = nil) ⇒ EuPathDBSpeciesData

Create a new object about one particular species. The species can be specified by a nickname, which is either the full binomial name of the species e.g. “Plasmodium falciparum”, or simply the second part (the species name without the genus name) e.g. ‘falciparum’.

base_data_directory is the directory where locally cached versions of the downloaded files are stored.

Raises:

  • (Exception)


190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# File 'lib/eupathdb_species_data.rb', line 190

# Create a new object describing one particular species.
#
# nickname:: identifies the species. Accepted forms are the full binomial
#            name ("Plasmodium falciparum"), the same name with underscores
#            and/or different capitalisation ("plasmodium_falciparum"), or
#            just the species epithet ("falciparum").
# base_data_directory:: directory holding the locally cached copies of the
#                       downloaded files (may be nil).
# database_version:: release of the source database; when nil the entry of
#                    SOURCE_VERSIONS for the species' database is used.
#
# Raises Exception when the nickname cannot be resolved to a species.
def initialize(nickname, base_data_directory=nil, database_version=nil)
  @species_data = @@data[nickname] # try the full name
  @species_data ||= @@data[nickname.capitalize.gsub('_',' ')] # try replacing underscores
  if @species_data.nil? # try using just the species epithet
    words = nickname.tr('_', ' ').split(' ')
    if words.length == 1 || words.length == 2
      epithet = words.last
      @species_data = @@data[epithet]
      if @species_data.nil?
        # No record is keyed by the bare epithet, so compare it with the
        # second word of every full name instead. Only an unambiguous hit is
        # accepted, otherwise the lookup fails as before.
        candidates = @@data.values.select do |info|
          info[:name].to_s.split(' ')[1].to_s.downcase == epithet.downcase
        end
        candidates = candidates.uniq {|info| info[:name]}
        @species_data = candidates.first if candidates.length == 1
      end
    end
  end
  raise Exception, "Couldn't find species data for #{nickname}" unless @species_data

  @base_data_directory = base_data_directory

  # record what version of the db we are looking at, otherwise default
  @database_version = database_version
  @database_version ||= SOURCE_VERSIONS[@species_data[:source]]
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(symbol) ⇒ Object



208
209
210
211
212
# File 'lib/eupathdb_species_data.rb', line 208

# Expose the entries of the species data as reader methods, e.g.
# fasta_file_species_name returns @species_data[:fasta_file_species_name].
#
# Only calls without arguments are treated as lookups. Anything else, and
# any key that is absent (or nil) in the species data, is passed on to the
# default handler, which raises NoMethodError.
def method_missing(symbol, *args, &block)
  if args.empty? && @species_data
    answer = @species_data[symbol]
    return answer unless answer.nil?
  end
  super
end

# Keep respond_to? consistent with method_missing: true for every key that
# has a non-nil value in the species data.
def respond_to_missing?(symbol, include_private = false)
  (!@species_data.nil? && !@species_data[symbol].nil?) || super
end

Class Method Details

.download(base_download_directory, database_name = nil) ⇒ Object

Download all the data files from all the EuPathDB databases, or just one single database. Requires wget to be available on the command line



370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
# File 'lib/eupathdb_species_data.rb', line 370

# Download the data files of every species in one EuPathDB database, or in
# all databases when database_name is nil. Requires wget on the PATH.
#
# base_download_directory:: root of the local cache; the per-species
#                           directories are created below it as needed.
# database_name:: name of a single database (e.g. 'ToxoDB'), or nil for all
#                 of DATABASES.
#
# Files that already exist locally are not downloaded again. A download that
# wget reports as failed is ignored, exactly as before; only a wget that
# cannot be started at all raises Errno::ENOENT.
def self.download(base_download_directory, database_name=nil)
  # by default, download everything
  if database_name.nil?
    EuPathDBSpeciesData::DATABASES.each do |d|
      download base_download_directory, d
    end
  else
    # Download the new files from the relevant database
    EuPathDBSpeciesData.species_data_from_database(database_name, base_download_directory).each do |spd|
      # Parents come before children in this list, so plain mkdir suffices.
      spd.directories_for_mkdir.each do |directory|
        Dir.mkdir(directory) unless File.exist?(directory)
      end

      Dir.chdir(spd.local_download_directory) do
        fasta_directory = spd.eu_path_db_fasta_download_directory
        p fasta_directory

        # [remote directory, file name], in the order they are fetched
        wanted = [
          [fasta_directory, spd.protein_fasta_filename],                        # protein
          [spd.eu_path_db_gff_download_directory, spd.gff_filename],            # gff
          [fasta_directory, spd.transcript_fasta_filename],                     # transcripts
          [spd.eu_path_db_txt_download_directory, spd.gene_information_filename], # gene information table
          [fasta_directory, spd.genomic_fasta_filename],                        # genomic
        ]

        wanted.each do |remote_directory, filename|
          next if File.exist?(filename)
          # The multi-argument form bypasses the shell, so no quoting of the
          # URL is needed and nothing in it can be interpreted as a command.
          started = system('wget', "#{remote_directory}/#{filename}")
          raise Errno::ENOENT, 'wget' if started.nil?
        end
      end
    end
  end
end

.species_data_from_database(database_name, base_download_directory = nil) ⇒ Object

Return a list of EuPathDBSpeciesData objects, one for each species that is included in the given EuPathDB database



358
359
360
361
362
363
364
365
366
# File 'lib/eupathdb_species_data.rb', line 358

# Build one EuPathDBSpeciesData object for every species hosted by the given
# database.
#
# database_name:: name of the database, compared case-insensitively.
# base_download_directory:: passed on to each new object as its base data
#                           directory.
#
# Returns an array of EuPathDBSpeciesData objects.
def self.species_data_from_database(database_name, base_download_directory=nil)
  full_names = @@data.keys.select do |key|
    record = @@data[key]
    # records stored under a key other than their own :name are shortcuts
    # and are left out
    record[:source].downcase == database_name.downcase && key == record[:name]
  end
  full_names.map {|key| EuPathDBSpeciesData.new(key, base_download_directory)}
end

Instance Method Details

#databaseObject



296
297
298
# File 'lib/eupathdb_species_data.rb', line 296

# Name of the database hosting this species, i.e. the :source entry of the
# species data (for example 'ToxoDB').
def database
  @species_data[:source]
end

#directories_for_mkdirObject

An array of directory names, ordered from the outermost directory to the innermost. mkdir is called on each of them in order, because mkdir raises an error when the parent of the directory being created does not exist yet.



338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
# File 'lib/eupathdb_species_data.rb', line 338

# List every directory on the way down to the local download directory,
# outermost first, so that each one can be created with a plain mkdir.
#
# Raises Exception when no base data directory has been set.
def directories_for_mkdir
  raise Exception, "Unable to generate directories when @base_data_directory is not set" if @base_data_directory.nil?

  parts = [
    @base_data_directory,
    @species_data[:name],
    'genome',
    @species_data[:source],
    @database_version,
  ]

  # one path per depth: the first part, the first two parts, and so on
  (1..parts.length).map {|depth| parts.first(depth).join('/')}
end

#eu_path_db_download_directoryObject



300
301
302
# File 'lib/eupathdb_species_data.rb', line 300

# URL of the remote download directory for this species and database release.
def eu_path_db_download_directory
  format(
    'http://%s.org/common/downloads/release-%s/%s',
    database.downcase, @database_version, one_word_name
  )
end

#eu_path_db_fasta_download_directoryObject



304
305
306
307
308
# File 'lib/eupathdb_species_data.rb', line 304

# URL of the remote directory holding the FASTA files. Species flagged with
# :behind_usage_policy keep their files one level deeper, in 'data'.
def eu_path_db_fasta_download_directory
  fasta_directory = "#{eu_path_db_download_directory}/fasta"
  @species_data[:behind_usage_policy] ? "#{fasta_directory}/data" : fasta_directory
end

#eu_path_db_gff_download_directoryObject



310
311
312
313
314
# File 'lib/eupathdb_species_data.rb', line 310

# URL of the remote directory holding the GFF file. Species flagged with
# :behind_usage_policy keep their files one level deeper, in 'data'.
def eu_path_db_gff_download_directory
  segments = [eu_path_db_download_directory, 'gff']
  segments << 'data' if @species_data[:behind_usage_policy]
  segments.join('/')
end

#eu_path_db_txt_download_directoryObject



316
317
318
319
320
# File 'lib/eupathdb_species_data.rb', line 316

# URL of the remote directory holding the text tables. Species flagged with
# :behind_usage_policy keep their files one level deeper, in 'data'.
def eu_path_db_txt_download_directory
  suffix = @species_data[:behind_usage_policy] ? '/data' : ''
  "#{eu_path_db_download_directory}/txt#{suffix}"
end

#gene_information_filenameObject



233
234
235
236
237
238
239
240
241
# File 'lib/eupathdb_species_data.rb', line 233

# File name (without '.gz') of the gene information table. A
# :gene_information_filename lambda in the species data takes precedence over
# the default naming scheme.
def gene_information_filename
  custom = @species_data[:gene_information_filename]
  return custom.call(version).to_s if custom

  # The gzipped files are named like
  #   TgondiiME49Gene_ToxoDB-5.2.txt.gz
  #   PfalciparumGene_PlasmoDB-6.1.txt.gz
  [representative_strain_name, 'Gene_', database, '-', version, '.txt'].join
end

#gene_information_gzfile_filenameObject

The file name of the EuPathDB gene information table when stored as a gzip, i.e. the plain file name with “.gz” appended



220
221
222
# File 'lib/eupathdb_species_data.rb', line 220

# File name of the gzipped gene information table: the plain file name with
# '.gz' appended.
def gene_information_gzfile_filename
  [gene_information_filename, 'gz'].join('.')
end

#gene_information_gzfile_pathObject

The path to the EuPathDB gene information table (stored as a gzip)



215
216
217
# File 'lib/eupathdb_species_data.rb', line 215

# Location of the gzipped gene information table inside the local download
# directory.
def gene_information_gzfile_path
  [local_download_directory, gene_information_gzfile_filename].join('/')
end

#gene_information_pathObject



224
225
226
# File 'lib/eupathdb_species_data.rb', line 224

# Location of the (uncompressed) gene information table inside the local
# download directory.
def gene_information_path
  [local_download_directory, gene_information_filename].join('/')
end

#genomic_fasta_filenameObject



275
276
277
278
279
280
281
282
# File 'lib/eupathdb_species_data.rb', line 275

# File name of the genomic sequence FASTA file. A :genomic_fasta_filename
# lambda in the species data takes precedence over the default naming scheme.
def genomic_fasta_filename
  override = @species_data[:genomic_fasta_filename]
  if override
    override.call(version).to_s
  else
    [representative_strain_name, 'Genomic_', database, '-', version, '.fasta'].join
  end
end

#gff_filenameObject



284
285
286
287
288
289
290
# File 'lib/eupathdb_species_data.rb', line 284

# File name of the GFF annotation file. A :gff_filename lambda in the species
# data takes precedence over the default naming scheme; its result is
# returned untouched.
def gff_filename
  builder = @species_data[:gff_filename]
  return builder.call(version) if builder

  format('%s_%s-%s.gff', representative_strain_name, database, version)
end

#gff_pathObject



292
293
294
# File 'lib/eupathdb_species_data.rb', line 292

# Location of the GFF annotation file inside the local download directory.
def gff_path
  directory = local_download_directory
  filename = gff_filename
  File.join(directory, filename)
end

#local_download_directoryObject



330
331
332
333
# File 'lib/eupathdb_species_data.rb', line 330

# Directory of the local cache that holds the files of this species and
# database release:
#   <base directory>/<species name>/genome/<database>/<version>
def local_download_directory
  [
    @base_data_directory,
    @species_data[:name],
    'genome',
    @species_data[:source],
    @database_version,
  ].join('/')
end

#one_word_nameObject

Plasmodium chabaudi => Pchabaudi



323
324
325
326
327
328
# File 'lib/eupathdb_species_data.rb', line 323

# Abbreviated species name as used in the download URLs, for instance
# 'Plasmodium chabaudi' => 'Pchabaudi'. A :database_download_folder entry in
# the species data overrides the abbreviation.
#
# Raises RuntimeError when the species name does not consist of exactly two
# words and no override is given.
def one_word_name
  folder = @species_data[:database_download_folder]
  return folder unless folder.nil?

  name = @species_data[:name]
  words = name.split(' ')
  unless words.length == 2
    raise "Unable to abbreviate species name #{name.inspect}: expected a two word binomial name"
  end

  genus, epithet = words
  # first letter of the genus followed by the complete epithet
  "#{genus[0..0]}#{epithet}"
end

#protein_blast_database_pathObject



259
260
261
# File 'lib/eupathdb_species_data.rb', line 259

# Path of the protein BLAST database: the protein FASTA file name below the
# fixed '/blastdb' directory.
def protein_blast_database_path
  format('/blastdb/%s', protein_fasta_filename)
end

#protein_fasta_file_iteratorObject



413
414
415
# File 'lib/eupathdb_species_data.rb', line 413

# Build a Bio::EuPathDB::FastaParser for the locally cached protein FASTA
# file of this species.
#
# NOTE(review): fasta_file_species_name is not defined as a method in the
# code visible here - presumably it is answered by method_missing from the
# species data; confirm.
def protein_fasta_file_iterator
  parser_class = Bio::EuPathDB::FastaParser
  species_label = fasta_file_species_name
  fasta_path = protein_fasta_path
  parser_class.new(species_label, fasta_path)
end

#protein_fasta_filenameObject



247
248
249
250
251
252
253
# File 'lib/eupathdb_species_data.rb', line 247

# File name of the annotated proteins FASTA file. A :proteins_fasta_filename
# lambda in the species data takes precedence over the default naming scheme.
def protein_fasta_filename
  custom = @species_data[:proteins_fasta_filename]
  return custom.call(version).to_s if custom

  format('%sAnnotatedProteins_%s-%s.fasta', representative_strain_name, database, version)
end

#protein_fasta_pathObject



255
256
257
# File 'lib/eupathdb_species_data.rb', line 255

# Location of the annotated proteins FASTA file inside the local download
# directory.
def protein_fasta_path
  directory = local_download_directory
  filename = protein_fasta_filename
  File.join(directory, filename)
end

#representative_strain_nameObject



228
229
230
231
# File 'lib/eupathdb_species_data.rb', line 228

# Prefix used when building the default file names: the
# :representative_strain_name entry of the species data when one is present,
# otherwise the abbreviated species name.
def representative_strain_name
  strain = @species_data[:representative_strain_name]
  strain.nil? ? one_word_name : strain
end

#transcript_fasta_filenameObject



263
264
265
266
267
268
269
# File 'lib/eupathdb_species_data.rb', line 263

# File name of the annotated transcripts FASTA file. A
# :transcripts_fasta_filename lambda in the species data takes precedence
# over the default naming scheme.
def transcript_fasta_filename
  namer = @species_data[:transcripts_fasta_filename]
  if namer
    namer.call(version).to_s
  else
    [representative_strain_name, 'AnnotatedTranscripts_', database, '-', version, '.fasta'].join
  end
end

#transcript_fasta_pathObject



271
272
273
# File 'lib/eupathdb_species_data.rb', line 271

# Location of the annotated transcripts FASTA file inside the local download
# directory.
def transcript_fasta_path
  directory = local_download_directory
  filename = transcript_fasta_filename
  File.join(directory, filename)
end

#versionObject



243
244
245
# File 'lib/eupathdb_species_data.rb', line 243

# Release of the source database that this object refers to (the value held
# in @database_version).
def version
  @database_version
end