Class: EuPathDBSpeciesData
- Inherits:
-
Object
- Object
- EuPathDBSpeciesData
- Defined in:
- lib/eupathdb_species_data.rb
Overview
A class dedicated to recording ‘administrative’ data about the databases, answering questions such as “which species are recorded in ToxoDB?” for instance.
It is also meant for dealing with locally cached versions of the files, where all the data is stored in a base directory with a specified structure.
TODO: functions for the info and the local caching should probably be separated into separate classes, and the directory structure of the local versions shouldn’t be forced on the user.
Constant Summary collapse
- SOURCE_VERSIONS =
{ 'PlasmoDB' => '8.2', 'ToxoDB' => '7.2', 'CryptoDB' => '4.6', 'PiroplasmaDB' => '1.1', 'FungiDB' => '1.0', 'TriTrypDB' => '4.0', }
- DATABASES =
SOURCE_VERSIONS.keys
- @@data =
{ ## PlasmoDB 'Plasmodium falciparum' => { :name => 'Plasmodium falciparum', :source => 'PlasmoDB', :fasta_file_species_name => 'Plasmodium_falciparum_3D7', :sequencing_centre_abbreviation => 'psu', :behind_usage_policy => true, }, 'Plasmodium yoelii' => { :directory => 'yoelii', :name => 'Plasmodium yoelii', :sequencing_centre_abbreviation => 'TIGR', :fasta_file_species_name => 'Plasmodium_yoelii_yoelii_str._17XNL', :proteins_fasta_filename => lambda {|version| "PyoeliiAnnotatedProteins_PlasmoDB-#{version}.fasta"}, #:transcripts_fasta_filename => lambda {|version| "PyoeliiAllTranscripts_PlasmoDB-#{version}.fasta"}, :source => 'PlasmoDB' }, 'Plasmodium vivax' => { :name => 'Plasmodium vivax', :sequencing_centre_abbreviation => 'gb', :fasta_file_species_name => 'Plasmodium_vivax_SaI-1', :proteins_fasta_filename => lambda {|version| "PvivaxAnnotatedProteins_PlasmoDB-#{version}.fasta"}, :source => 'PlasmoDB' }, 'Plasmodium berghei' => { :name => 'Plasmodium berghei', :sequencing_centre_abbreviation => 'psu', :fasta_file_species_name => 'Plasmodium_berghei_str._ANKA', :proteins_fasta_filename => lambda {|version| "PbergheiAnnotatedProteins_PlasmoDB-#{version}.fasta"}, #:transcripts_fasta_filename => lambda {|version| "PbergheiAllTranscripts_PlasmoDB-#{version}.fasta"}, :source => 'PlasmoDB' }, 'Plasmodium chabaudi' => { :name => 'Plasmodium chabaudi', :sequencing_centre_abbreviation => 'psu', :fasta_file_species_name => 'Plasmodium_chabaudi_chabaudi', :proteins_fasta_filename => lambda {|version| "PchabaudiAnnotatedProteins_PlasmoDB-#{version}.fasta"}, :source => 'PlasmoDB', :behind_usage_policy => true, }, 'Plasmodium knowlesi' => { :name => 'Plasmodium knowlesi', :sequencing_centre_abbreviation => 'psu', :fasta_file_species_name => 'Plasmodium_knowlesi_strain_H', :source => 'PlasmoDB', :behind_usage_policy => true, }, ## ToxoDB 'Neospora caninum' => { :name => 'Neospora caninum', :sequencing_centre_abbreviation => 'psu', :fasta_file_species_name => 
'Neospora_caninum', :database_download_folder => 'NeosporaCaninum', :representative_strain_name => 'NeosporaCaninum', :proteins_fasta_filename => lambda {|version| "NeosporaCaninumAnnotatedProteins_ToxoDB-#{version}.fasta"}, :transcripts_fasta_filename => lambda {|version| "NeosporaCaninumAnnotatedTranscripts_ToxoDB-#{version}.fasta"}, :source => 'ToxoDB', :behind_usage_policy => true, }, 'Eimeria tenella' => { :name => 'Eimeria tenella', :sequencing_centre_abbreviation => 'GeneDB', :fasta_file_species_name => 'EtenellaHoughton', :source => 'ToxoDB', :database_download_folder => 'EtenellaHoughton', :behind_usage_policy => true, :fasta_file_species_name => 'Eimeria_tenella_str._Houghton', }, 'Toxoplasma gondii' => { :name => 'Toxoplasma gondii', :sequencing_centre_abbreviation => 'gb', :fasta_file_species_name => 'Toxoplasma_gondii_ME49', :database_download_folder => 'TgondiiME49', :gene_information_filename => lambda {|version| "TgondiiME49Gene_ToxoDB-#{version}.txt"}, :proteins_fasta_filename => lambda {|version| "TgondiiME49AnnotatedProteins_ToxoDB-#{version}.fasta"}, :transcripts_fasta_filename => lambda {|version| "TgondiiME49AnnotatedTranscripts_ToxoDB-#{version}.fasta"}, :gff_filename => lambda {|version| "TgondiiME49_ToxoDB-#{version}.gff"}, :genomic_fasta_filename => lambda {|version| "TgondiiME49Genomic_ToxoDB-#{version}.fasta"}, :source => 'ToxoDB' }, ## CryptoDB 'Cryptosporidium parvum' => { :name => 'Cryptosporidium parvum', :sequencing_centre_abbreviation => 'gb', :fasta_file_species_name => 'Cryptosporidium_parvum', :proteins_fasta_filename => lambda {|version| "CparvumAnnotatedProteins_CryptoDB-#{version}.fasta"}, :transcripts_fasta_filename => lambda {|version| "CparvumAnnotatedTranscripts_CryptoDB-#{version}.fasta"}, #:gff_filename => lambda {|version| "c_parvum_iowa_ii.gff"}, #changed as of version 4.3 :source => 'CryptoDB' }, 'Cryptosporidium hominis' => { :name => 'Cryptosporidium hominis', :sequencing_centre_abbreviation => 'gb', 
:fasta_file_species_name => 'Cryptosporidium_hominis', :proteins_fasta_filename => lambda {|version| "ChominisAnnotatedProteins_CryptoDB-#{version}.fasta"}, :transcripts_fasta_filename => lambda {|version| "ChominisAnnotatedTranscripts_CryptoDB-#{version}.fasta"}, #:gff_filename => lambda {|version| "c_hominis_tu502.gff"}, #changed as of version 4.3 :source => 'CryptoDB' }, 'Cryptosporidium muris' => { :name => 'Cryptosporidium muris', :sequencing_centre_abbreviation => 'gb', :fasta_file_species_name => 'Cryptosporidium_muris', :proteins_fasta_filename => lambda {|version| "CmurisAnnotatedProteins_CryptoDB-#{version}.fasta"}, :transcripts_fasta_filename => lambda {|version| "CmurisAnnotatedTranscripts_CryptoDB-#{version}.fasta"}, #:gff_filename => lambda {|version| "c_muris.gff"}, #changed as of version 4.3 :source => 'CryptoDB' }, ## PiroplasmaDB 'Theileria annulata' => { :name => 'Theileria annulata', :database_download_folder => 'TannulataAnkara', :sequencing_centre_abbreviation => 'Genbank', :fasta_file_species_name => 'Theileria_annulata_strain_Ankara', :source => 'PiroplasmaDB', }, 'Theileria parva' => { :name => 'Theileria parva', :database_download_folder => 'TparvaMuguga', :sequencing_centre_abbreviation => 'Genbank', :fasta_file_species_name => 'Theileria_parva_strain_Muguga', :source => 'PiroplasmaDB', }, 'Babesia bovis' => { :name => 'Babesia bovis', :database_download_folder => 'BbovisT2Bo', :representative_strain_name => 'BbovisT2Bo', :sequencing_centre_abbreviation => 'Genbank', :fasta_file_species_name => 'Babesia_bovis_T2Bo', :source => 'PiroplasmaDB', }, ## FungiDB 'Candida albicans' => { :name => 'Candida albicans', :database_download_folder => 'Candida_albicans_SC5314', :sequencing_centre_abbreviation => 'CGD', :fasta_file_species_name => 'Candida_albicans_SC5314', :source => 'FungiDB', }, ## TriTrypDB 'Trypanosoma brucei' => { :name => 'Trypanosoma brucei', :sequencing_centre_abbreviation => 'GeneDB', :source => 'TriTrypDB', 
:representative_strain_name => 'TbruceiTreu927', :fasta_file_species_name => 'Trypanosoma_brucei_TREU927', }, }
Class Method Summary collapse
-
.download(base_download_directory, database_name = nil) ⇒ Object
Download all the data files from all the EuPathDB databases, or just one single database.
-
.species_data_from_database(database_name, base_download_directory = nil) ⇒ Object
Return an array of EuPathDBSpeciesData objects, one for each fully named species recorded for the given EuPathDB database.
Instance Method Summary collapse
- #database ⇒ Object
-
#directories_for_mkdir ⇒ Object
an array of directory names.
- #eu_path_db_download_directory ⇒ Object
- #eu_path_db_fasta_download_directory ⇒ Object
- #eu_path_db_gff_download_directory ⇒ Object
- #eu_path_db_txt_download_directory ⇒ Object
- #gene_information_filename ⇒ Object
-
#gene_information_gzfile_filename ⇒ Object
The filename of the EuPathDB gene information table (stored as a gzip).
-
#gene_information_gzfile_path ⇒ Object
The path to the EuPathDB gene information table (stored as a gzip).
- #gene_information_path ⇒ Object
- #genomic_fasta_filename ⇒ Object
- #gff_filename ⇒ Object
- #gff_path ⇒ Object
-
#initialize(nickname, base_data_directory = nil, database_version = nil) ⇒ EuPathDBSpeciesData
constructor
Create a new object about one particular species.
- #local_download_directory ⇒ Object
- #method_missing(symbol) ⇒ Object
-
#one_word_name ⇒ Object
Plasmodium chabaudi => Pchabaudi.
- #protein_blast_database_path ⇒ Object
- #protein_fasta_file_iterator ⇒ Object
- #protein_fasta_filename ⇒ Object
- #protein_fasta_path ⇒ Object
- #representative_strain_name ⇒ Object
- #transcript_fasta_filename ⇒ Object
- #transcript_fasta_path ⇒ Object
- #version ⇒ Object
Constructor Details
#initialize(nickname, base_data_directory = nil, database_version = nil) ⇒ EuPathDBSpeciesData
Create a new object about one particular species. The species can be specified by a nickname, which is either the full binomial name of the species, e.g. “Plasmodium falciparum”, or simply the second part (the species name without the genus name), e.g. ‘falciparum’.
base_data_directory is the directory where locally cached versions of the downloaded files are stored.
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
# File 'lib/eupathdb_species_data.rb', line 190 def initialize(nickname, base_data_directory=nil, database_version=nil) @species_data = @@data[nickname] # try the full name @species_data ||= @@data[nickname.capitalize.gsub('_',' ')] #try replacing underscores if @species_data.nil? # try using just the second word splits = nickname.split(' ') if splits.length == 2 @species_data = @@data[splits[1]] end end raise Exception, "Couldn't find species data for #{nickname}" unless @species_data @base_data_directory = base_data_directory # record out what version of the db we are looking at, otherwise default @database_version = database_version @database_version ||= SOURCE_VERSIONS[@species_data[:source]] end |
Dynamic Method Handling
This class handles dynamic methods through the method_missing method
#method_missing(symbol) ⇒ Object
208 209 210 211 212 |
# File 'lib/eupathdb_species_data.rb', line 208 def method_missing(symbol) answer = @species_data[symbol] return answer unless answer.nil? super end |
Class Method Details
.download(base_download_directory, database_name = nil) ⇒ Object
Download all the data files from all the EuPathDB databases, or just one single database. Requires wget to be available on the command line
370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 |
# File 'lib/eupathdb_species_data.rb', line 370 def self.download(base_download_directory, database_name=nil) # by default, download everything if database_name.nil? EuPathDBSpeciesData::DATABASES.each do |d| download base_download_directory, d end else # Download the new files from the relevant database EuPathDBSpeciesData.species_data_from_database(database_name, base_download_directory).each do |spd| spd.directories_for_mkdir.each do |directory| unless File.exists?(directory) Dir.mkdir(directory) end end Dir.chdir(spd.local_download_directory) do p spd.eu_path_db_fasta_download_directory # protein unless File.exists?(spd.protein_fasta_filename) `wget #{spd.eu_path_db_fasta_download_directory}/#{spd.protein_fasta_filename}` end # gff unless File.exists?(spd.gff_filename) `wget #{spd.eu_path_db_gff_download_directory}/#{spd.gff_filename}` end # transcripts unless File.exists?(spd.transcript_fasta_filename) `wget #{spd.eu_path_db_fasta_download_directory}/#{spd.transcript_fasta_filename}` end # gene information table unless File.exists?(spd.gene_information_filename) `wget '#{spd.eu_path_db_txt_download_directory}/#{spd.gene_information_filename}'` end # genomic unless File.exists?(spd.genomic_fasta_filename) `wget '#{spd.eu_path_db_fasta_download_directory}/#{spd.genomic_fasta_filename}'` end end end end end |
.species_data_from_database(database_name, base_download_directory = nil) ⇒ Object
Return an array of EuPathDBSpeciesData objects, one for each fully named species recorded for the given EuPathDB database
358 359 360 361 362 363 364 365 366 |
# File 'lib/eupathdb_species_data.rb', line 358 def self.species_data_from_database(database_name, base_download_directory=nil) species = @@data.select {|name, info| info[:source].downcase == database_name.downcase and name == info[:name] #only allow ones that are fully specified - not shortcut ones } species.collect do |name_info| EuPathDBSpeciesData.new(name_info[0], base_download_directory) end end |
Instance Method Details
#database ⇒ Object
296 297 298 |
# File 'lib/eupathdb_species_data.rb', line 296 def database @species_data[:source] end |
#directories_for_mkdir ⇒ Object
An array of directory names, ordered from the base directory down to the most deeply nested one. mkdir is called on each of them in order, because mkdir raises an error when a directory’s parent folder does not exist yet.
338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 |
# File 'lib/eupathdb_species_data.rb', line 338 def directories_for_mkdir if @base_data_directory.nil? raise Exception, "Unable to generate directories when @base_data_directory is not set" end s = @species_data components = [ @base_data_directory, s[:name], 'genome', s[:source], @database_version, ] (0..components.length-1).collect do |i| components[0..i].join('/') end end |
#eu_path_db_download_directory ⇒ Object
300 301 302 |
# File 'lib/eupathdb_species_data.rb', line 300 def eu_path_db_download_directory "http://#{database.downcase}.org/common/downloads/release-#{@database_version}/#{one_word_name}" end |
#eu_path_db_fasta_download_directory ⇒ Object
304 305 306 307 308 |
# File 'lib/eupathdb_species_data.rb', line 304 def eu_path_db_fasta_download_directory path = "#{eu_path_db_download_directory}/fasta" path = "#{path}/data" if @species_data[:behind_usage_policy] path end |
#eu_path_db_gff_download_directory ⇒ Object
310 311 312 313 314 |
# File 'lib/eupathdb_species_data.rb', line 310 def eu_path_db_gff_download_directory path = "#{eu_path_db_download_directory}/gff" path = "#{path}/data" if @species_data[:behind_usage_policy] path end |
#eu_path_db_txt_download_directory ⇒ Object
316 317 318 319 320 |
# File 'lib/eupathdb_species_data.rb', line 316 def eu_path_db_txt_download_directory path = "#{eu_path_db_download_directory}/txt" path = "#{path}/data" if @species_data[:behind_usage_policy] path end |
#gene_information_filename ⇒ Object
233 234 235 236 237 238 239 240 241 |
# File 'lib/eupathdb_species_data.rb', line 233 def gene_information_filename f = @species_data[:gene_information_filename] if f "#{f.call(version)}" else # TgondiiME49Gene_ToxoDB-5.2.txt.gz # PfalciparumGene_PlasmoDB-6.1.txt.gz "#{representative_strain_name}Gene_#{database}-#{version}.txt" end end |
#gene_information_gzfile_filename ⇒ Object
The filename of the EuPathDB gene information table (stored as a gzip)
220 221 222 |
# File 'lib/eupathdb_species_data.rb', line 220 def gene_information_gzfile_filename "#{gene_information_filename}.gz" end |
#gene_information_gzfile_path ⇒ Object
The path to the EuPathDB gene information table (stored as a gzip)
215 216 217 |
# File 'lib/eupathdb_species_data.rb', line 215 def gene_information_gzfile_path "#{local_download_directory}/#{gene_information_gzfile_filename}" end |
#gene_information_path ⇒ Object
224 225 226 |
# File 'lib/eupathdb_species_data.rb', line 224 def gene_information_path "#{local_download_directory}/#{gene_information_filename}" end |
#genomic_fasta_filename ⇒ Object
275 276 277 278 279 280 281 282 |
# File 'lib/eupathdb_species_data.rb', line 275 def genomic_fasta_filename genomic = @species_data[:genomic_fasta_filename] if genomic return "#{genomic.call(version)}" else return "#{representative_strain_name}Genomic_#{database}-#{version}.fasta" end end |
#gff_filename ⇒ Object
284 285 286 287 288 289 290 |
# File 'lib/eupathdb_species_data.rb', line 284 def gff_filename if @species_data[:gff_filename] return @species_data[:gff_filename].call(version) else return "#{representative_strain_name}_#{database}-#{version}.gff" end end |
#gff_path ⇒ Object
292 293 294 |
# File 'lib/eupathdb_species_data.rb', line 292 def gff_path File.join(local_download_directory,gff_filename) end |
#local_download_directory ⇒ Object
330 331 332 333 |
# File 'lib/eupathdb_species_data.rb', line 330 def local_download_directory s = @species_data "#{@base_data_directory}/#{s[:name]}/genome/#{s[:source]}/#{@database_version}" end |
#one_word_name ⇒ Object
Plasmodium chabaudi => Pchabaudi
323 324 325 326 327 328 |
# File 'lib/eupathdb_species_data.rb', line 323 def one_word_name return @species_data[:database_download_folder] unless @species_data[:database_download_folder].nil? splits = @species_data[:name].split(' ') raise unless splits.length == 2 return "#{splits[0][0..0]}#{splits[1]}" end |
#protein_blast_database_path ⇒ Object
259 260 261 |
# File 'lib/eupathdb_species_data.rb', line 259 def protein_blast_database_path "/blastdb/#{protein_fasta_filename}" end |
#protein_fasta_file_iterator ⇒ Object
413 414 415 |
# File 'lib/eupathdb_species_data.rb', line 413 def protein_fasta_file_iterator Bio::EuPathDB::FastaParser.new(fasta_file_species_name, protein_fasta_path) end |
#protein_fasta_filename ⇒ Object
247 248 249 250 251 252 253 |
# File 'lib/eupathdb_species_data.rb', line 247 def protein_fasta_filename if @species_data[:proteins_fasta_filename] return "#{@species_data[:proteins_fasta_filename].call(version)}" else return "#{representative_strain_name}AnnotatedProteins_#{database}-#{version}.fasta" end end |
#protein_fasta_path ⇒ Object
255 256 257 |
# File 'lib/eupathdb_species_data.rb', line 255 def protein_fasta_path return File.join(local_download_directory,protein_fasta_filename) end |
#representative_strain_name ⇒ Object
228 229 230 231 |
# File 'lib/eupathdb_species_data.rb', line 228 def representative_strain_name return @species_data[:representative_strain_name] unless @species_data[:representative_strain_name].nil? return one_word_name end |
#transcript_fasta_filename ⇒ Object
263 264 265 266 267 268 269 |
# File 'lib/eupathdb_species_data.rb', line 263 def transcript_fasta_filename if @species_data[:transcripts_fasta_filename] return "#{@species_data[:transcripts_fasta_filename].call(version)}" else return "#{representative_strain_name}AnnotatedTranscripts_#{database}-#{version}.fasta" end end |
#transcript_fasta_path ⇒ Object
271 272 273 |
# File 'lib/eupathdb_species_data.rb', line 271 def transcript_fasta_path File.join(local_download_directory,transcript_fasta_filename) end |
#version ⇒ Object
243 244 245 |
# File 'lib/eupathdb_species_data.rb', line 243 def version @database_version end |