Class: Bio::EMBL

Inherits:
EMBLDB show all
Includes:
Bio::EMBLDB::Common
Defined in:
lib/bio/db/embl/embl.rb

Constant Summary

Constants included from Bio::EMBLDB::Common

Bio::EMBLDB::Common::DELIMITER, Bio::EMBLDB::Common::RS, Bio::EMBLDB::Common::TAGSIZE

Instance Method Summary collapse

Methods included from Bio::EMBLDB::Common

#ac, #accession, #de, #dr, #initialize, #kw, #oc, #og, #ref, #references

Methods inherited from EMBLDB

#initialize

Methods inherited from DB

#exists?, #fetch, #get, open, #tags

Instance Method Details

#ccObject Also known as: comment

returns comment text in the comments (CC) line.

CC Line; comments or notes (>=0)



401
402
403
# File 'lib/bio/db/embl/embl.rb', line 401

# Returns the text of the comment (CC) lines as a String, with the
# leading "CC   " tag removed from the start of every line.
# A missing CC field (nil from get) yields an empty String.
def cc
  raw_comment = get('CC').to_s
  raw_comment.gsub(/^CC   /, '')
end

#data_classObject



130
131
132
# File 'lib/bio/db/embl/embl.rb', line 130

# Returns the DATA_CLASS value parsed from the ID line.
def data_class
  id_line['DATA_CLASS']
end

#date_createdObject

created date. Returns Date object, String or nil.



461
462
463
# File 'lib/bio/db/embl/embl.rb', line 461

# Returns the creation date: the 'created' entry of the DT hash passed
# through parse_date.
def date_created
  created_field = dt['created']
  parse_date(created_field)
end

#date_modifiedObject

modified date. Returns Date object, String or nil.



456
457
458
# File 'lib/bio/db/embl/embl.rb', line 456

# Returns the last-modified date: the 'updated' entry of the DT hash
# passed through parse_date.
def date_modified
  updated_field = dt['updated']
  parse_date(updated_field)
end

database references (DR). Returns an array of Bio::Sequence::DBLink objects.



511
512
513
514
515
# File 'lib/bio/db/embl/embl.rb', line 511

# Returns the database references (DR lines) as an Array, one element
# per line, each produced by Bio::Sequence::DBLink.parse_embl_DR_line.
def dblinks
  dr_lines = get('DR').split(/\n/)
  dr_lines.map { |dr_line| Bio::Sequence::DBLink.parse_embl_DR_line(dr_line) }
end

#divisionObject

returns DIVISION in the ID line.

  • Bio::EMBL#division -> String



140
141
142
# File 'lib/bio/db/embl/embl.rb', line 140

# Returns the DIVISION value parsed from the ID line.
def division
  id_line['DIVISION']
end

#dt(key = nil) ⇒ Object

returns contents in the date (DT) line.

  • Bio::EMBL#dt -> <DT Hash>

where <DT Hash> is:

{'created' => String, 'updated' => String}
  • Bio::EMBL#dt(key) -> String

keys: ‘created’ and ‘updated’

DT Line; date (2/entry)



182
183
184
185
186
187
188
189
190
191
192
193
194
195
# File 'lib/bio/db/embl/embl.rb', line 182

# Returns the contents of the date (DT) lines.
#
# On first use the first two DT lines are parsed into a Hash with the
# keys 'created' and 'updated' (line tag removed, whitespace stripped)
# and cached in @data['DT'].
#
# key:: optional Hash key ('created' or 'updated')
#
# Returns the whole Hash when key is omitted, otherwise the value
# stored under key.
def dt(key=nil)
  @data['DT'] ||= begin
    created_line, updated_line = get('DT').split(/\n/)
    untag = lambda { |line| line.sub(/\w{2}   /,'').strip }
    parsed = {}
    parsed['created'] = untag.call(created_line)
    parsed['updated'] = untag.call(updated_line)
    parsed
  end

  key ? @data['DT'][key] : @data['DT']
end

#each_cdsObject

iterates on CDS features in the FT lines.



380
381
382
383
384
385
386
# File 'lib/bio/db/embl/embl.rb', line 380

# Iterates over the features returned by ft and yields only those whose
# feature key is 'CDS'. Returns whatever ft.each returns.
def each_cds
  ft.each { |feature| yield feature if feature.feature == 'CDS' }
end

#each_geneObject

iterates on gene features in the FT lines.



389
390
391
392
393
394
395
# File 'lib/bio/db/embl/embl.rb', line 389

# Iterates over the features returned by ft and yields only those whose
# feature key is 'gene'. Returns whatever ft.each returns.
def each_gene
  ft.each do |feature|
    next unless feature.feature == 'gene'
    yield feature
  end
end

#entryObject Also known as: entry_name, entry_id

returns ENTRY_NAME in the ID line.

  • Bio::EMBL#entry -> String



117
118
119
# File 'lib/bio/db/embl/embl.rb', line 117

# Returns the ENTRY_NAME value parsed from the ID line.
def entry
  id_line['ENTRY_NAME']
end

#entry_versionObject

entry version number numbered by EMBL



476
477
478
# File 'lib/bio/db/embl/embl.rb', line 476

# Returns the entry version: element [1] of what parse_release_version
# returns for the 'updated' DT entry.
def entry_version
  release_and_version = parse_release_version(dt['updated'])
  release_and_version[1]
end

#fhObject

returns feature table header (String) in the feature header (FH) line.

FH Line; feature table header (0 or 2)



325
326
327
# File 'lib/bio/db/embl/embl.rb', line 325

# Returns the feature table header: the value fetch gives for the 'FH' tag.
def fh
  feature_header = fetch('FH')
  feature_header
end

#ftObject Also known as: features

returns contents in the feature table (FT) lines.

  • Bio::EMBL#ft -> Bio::Features

  • Bio::EMBL#ft {} -> {|Bio::Feature| }

same as features method in bio/db/genbank.rb

FT Line; feature table data (>=0)



336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
# File 'lib/bio/db/embl/embl.rb', line 336

# Parses the feature table (FT) lines held in @orig['FT'] and caches the
# result in @data['FT'].
#
# Without a block, returns the cached feature collection. With a block,
# yields every element of the cached collection in turn.
#
# NOTE(review): parse_qualifiers is defined outside this listing;
# presumably it converts [feature, position, qualifier, ...] into a
# feature object -- confirm against its definition.
def ft
  unless @data['FT']
    # Accumulates one sub-array per feature: [key, position, qualifiers...].
    ary = Array.new
    # True while a quoted qualifier value continues over following lines.
    in_quote = false
    @orig['FT'].each_line do |line|
      # Skip any line beginning with "FEATURES".
      next if line =~ /^FEATURES/

      # NOTE(review): head is assigned but never read below.
      head = line[0,20].strip  # feature key (source, CDS, ...)
      # body is at most 60 characters starting at index 20 of the line.
      body = line[20,60].chomp # feature value (position, /qualifier=)
      if line =~ /^FT {3}(\S+)/
        # "FT" + three spaces + a non-blank token starts a new feature.
        ary.push([ $1, body ]) # [ feature, position, /q="data", ... ]
      elsif body =~ /^ \// and not in_quote
        # A body starting with " /" outside a quoted value starts a new
        # qualifier of the current feature.
        ary.last.push(body)    # /q="data..., /q=data, /q

        # NOTE(review): this pattern requires a space right after the
        # opening quote (=" ) -- confirm that is intended.
        if body =~ /=" / and body !~ /"$/
          in_quote = true
        end

      else
        # Continuation line: append in place to the most recent element
        # (position or qualifier) of the current feature.
        ary.last.last << body # ...data..., ...data..."

        # A trailing double quote ends the quoted value.
        if body =~ /"$/
          in_quote = false
        end
      end
    end

    # Replace each raw sub-array with the result of parse_qualifiers.
    ary.map! do |subary|
      parse_qualifiers(subary)
    end

    # The plain Array is extended with the backward-compatibility module
    # before being cached.
    @data['FT'] = ary.extend(Bio::Features::BackwardCompatibility)
  end
  if block_given?
    @data['FT'].each do |feature|
      yield feature
    end
  else
    @data['FT']
  end
end

#id_line(key = nil) ⇒ Object

returns contents in the ID line.

  • Bio::EMBL#id_line -> <ID Hash>

where <ID Hash> is:

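{'ENTRY_NAME' => String, 'DATA_CLASS' => String, 'MOLECULE_TYPE' => String,
 'DIVISION' => String, 'SEQUENCE_LENGTH' => Int,
 'SEQUENCE_VERSION' => String, 'TOPOLOGY' => String}

('SEQUENCE_VERSION' and 'TOPOLOGY' are set only for ID lines that carry an SV field, i.e. the Rel 89- format shown below.)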

ID Line

"ID  ENTRY_NAME DATA_CLASS; MOLECULE_TYPE; DIVISION; SEQUENCE_LENGTH BP."

DATA_CLASS = [‘standard’]

MOLECULE_TYPE: DNA RNA XXX

Code ( DIVISION )

EST (ESTs)
PHG (Bacteriophage)
FUN (Fungi)
GSS (Genome survey)
HTC (High Throughput cDNAs) 
HTG (HTGs)
HUM (Human)
INV (Invertebrates)
ORG (Organelles)
MAM (Other Mammals)
VRT (Other Vertebrates)
PLN (Plants)
PRO (Prokaryotes)
ROD (Rodents)
SYN (Synthetic)
STS (STSs)
UNC (Unclassified)
VRL (Viruses)

Rel 89- ID CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP. ID <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP.

  1. Primary accession number

  2. Sequence version number

  3. Topology: ‘circular’ or ‘linear’

  4. Molecule type (see note 1 below)

  5. Data class (see section 3.1)

  6. Taxonomic division (see section 3.2)

  7. Sequence length (see note 2 below)



89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/bio/db/embl/embl.rb', line 89

# Returns the contents of the ID line.
#
# On first use the ID line is split on "; " and parsed into a Hash that
# is cached in @data['ID']. Two layouts are handled:
#
# * with an "SV n" field right after the entry name: ENTRY_NAME,
#   SEQUENCE_VERSION, TOPOLOGY, MOLECULE_TYPE, DATA_CLASS, DIVISION and
#   SEQUENCE_LENGTH are set;
# * without it: ENTRY_NAME and DATA_CLASS come from the first field,
#   followed by MOLECULE_TYPE, DIVISION and SEQUENCE_LENGTH.
#
# SEQUENCE_LENGTH is stored as an Integer; all other values are Strings.
#
# key:: optional Hash key (e.g. 'DIVISION')
#
# Returns the whole Hash when key is omitted, otherwise the value
# stored under key.
def id_line(key=nil)
  @data['ID'] ||= begin
    fields = fetch('ID').split(/; +/)
    parsed = {}
    parsed['ENTRY_NAME'], parsed['DATA_CLASS'] = fields.shift.split(/ +/)

    if fields.first =~ /^SV/
      parsed['SEQUENCE_VERSION'] = fields.shift.split(' ').last
      # The next three fields are topology, molecule type and data class.
      parsed['TOPOLOGY'], parsed['MOLECULE_TYPE'], parsed['DATA_CLASS'] = fields.shift(3)
    else
      parsed['MOLECULE_TYPE'] = fields.shift
    end

    parsed['DIVISION'] = fields.shift
    length_field = fields.shift
    parsed['SEQUENCE_LENGTH'] = length_field.strip.split(' ').first.to_i
    parsed
  end

  key ? @data['ID'][key] : @data['ID']
end

#moleculeObject Also known as: molecule_type

returns MOLECULE_TYPE in the ID line.

  • Bio::EMBL#molecule -> String



125
126
127
# File 'lib/bio/db/embl/embl.rb', line 125

# Returns the MOLECULE_TYPE value parsed from the ID line.
def molecule
  id_line['MOLECULE_TYPE']
end

#os(num = nil) ⇒ Object

returns contents in the OS line.

  • Bio::EMBL#os -> Array of <OS Hash>

where <OS Hash> is:

[{'name'=>'Human', 'os'=>'Homo sapiens'}, 
 {'name'=>'Rat', 'os'=>'Rattus norvegicus'}]

  • Bio::EMBL#os(0) => “Homo sapiens (Human)”

++

OS Line; organism species (>=1)

OS   Trifolium repens (white clover)

Typically, OS line shows “Genus species (name)” style:

OS   Genus species (name)

Other examples:

OS   uncultured bacterium
OS   xxxxxx metagenome
OS   Cloning vector xxxxxxxx

Complicated examples:

OS   Poeciliopsis gracilis (Poeciliopsis gracilis (Heckel, 1848))
OS   Etmopterus sp. B Last & Stevens, 1994 (bristled lanternshark)
OS   Galaxias sp. D (Allibone et al., 1996) (Pool Burn galaxias)
OS   Sicydiinae sp. 'Keith et al., 2010'
OS   Acanthopagrus sp. 'Jean & Lee, 2008'
OS   Gaussia princeps (T. Scott, 1894)
OS   Rana sp. 8 Hillis & Wilcox, 2005
OS   Contracaecum rudolphii C D'Amelio et al., 2007
OS   Partula sp. 'Mt. Marau, Tahiti'
OS   Leptocephalus sp. 'type II larva' (Smith, 1989)
OS   Tayloria grandis (D.G.Long) Goffinet & A.J.Shaw, 2002
OS   Non-A, non-B hepatitis virus
OS   Canidae (dog, coyote, wolf, fox)
OS   Salmonella enterica subsp. enterica serovar 4,[5],12:i:-
OS   Yersinia enterocolitica (type O:5,27)
OS   Influenza A virus (A/green-winged teal/OH/72/99(H6N1,4))
OS   Influenza A virus (A/Beijing/352/1989,(highgrowth reassortant NIB26)(H3N2))
OS   Recombinant Hepatitis C virus H77(5'UTR-NS2)/JFH1_V787A,Q1247L


266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
# File 'lib/bio/db/embl/embl.rb', line 266

# Returns the contents of the organism species (OS) line.
#
# On first use the OS text is parsed and cached in @data['OS'] as a
# one-element Array holding a Hash with the keys 'os' (the species
# text) and 'name' (the trailing parenthesised name, or nil when the
# line does not have the "Genus species (name)" shape).
#
# num:: optional index into the cached Array
#
# Without num, returns the Array of Hashes. With num, returns the
# entry at that index formatted as "os (name)" -- or just the 'os' text
# when no name was recognised -- and nil when there is no such entry.
#
# Previously the num branch built a malformed string and discarded it,
# so os(0) returned the whole Array instead of the documented String.
def os(num = nil)
  unless @data['OS']
    tmp = fetch('OS')
    organism =
      if /([A-Z][a-z]* *[\w\d \:\'\+\-]+[\w\d]) *\(([\w\d ]+)\)\s*\z/ =~ tmp
        {'name' => $2, 'os' => $1}
      else
        {'name' => nil, 'os' => tmp}
      end
    @data['OS'] = [organism]
  end
  return @data['OS'] unless num

  selected = @data['OS'][num]
  return nil unless selected

  # EX. "Trifolium repens (white clover)"
  name = selected['name']
  name ? "#{selected['os']} (#{name})" : selected['os']
end

#release_createdObject

release number when created



471
472
473
# File 'lib/bio/db/embl/embl.rb', line 471

# Returns the release in which the entry was created: element [0] of
# what parse_release_version returns for the 'created' DT entry.
def release_created
  release_and_version = parse_release_version(dt['created'])
  release_and_version[0]
end

#release_modifiedObject

release number when last updated



466
467
468
# File 'lib/bio/db/embl/embl.rb', line 466

# Returns the release in which the entry was last updated: element [0]
# of what parse_release_version returns for the 'updated' DT entry.
def release_modified
  release_and_version = parse_release_version(dt['updated'])
  release_and_version[0]
end

#seqObject Also known as: naseq, ntseq

returns the nucleotide sequence in this entry.

  • Bio::EMBL#seq -> Bio::Sequence::NA

The sequence is read from the field stored under the empty tag (@orig['']).

bb Line; (blanks) sequence data (>=1)



445
446
447
# File 'lib/bio/db/embl/embl.rb', line 445

# Returns the sequence of this entry as a Bio::Sequence::NA.
#
# The text fetched under the empty tag has all space characters and all
# runs of digits removed before being wrapped.
def seq
  raw_sequence = fetch('')
  residues = raw_sequence.delete(' ').gsub(/\d+/,'')
  Bio::Sequence::NA.new(residues)
end

#sequence_lengthObject Also known as: seqlen

returns SEQUENCE_LENGTH in the ID line.

  • Bio::EMBL#sequence_length -> Integer



146
147
148
# File 'lib/bio/db/embl/embl.rb', line 146

# Returns the SEQUENCE_LENGTH value parsed from the ID line.
def sequence_length
  id_line['SEQUENCE_LENGTH']
end

#speciesObject

species



518
519
520
# File 'lib/bio/db/embl/embl.rb', line 518

# Returns the species: the value fetch gives for the 'OS' tag.
def species
  fetch('OS')
end

#sq(base = nil) ⇒ Object

returns sequence header information in the sequence header (SQ) line.

  • Bio::EMBL#sq -> <SQ Hash>

where <SQ Hash> is:

{'ntlen' => Int, 'other' => Int,
 'a' => Int, 'c' => Int, 'g' => Int, 't' => Int}
  • Bio::EMBL#sq(base) -> <base content in Int>

  • Bio::EMBL#sq[base] -> <base content in Int>

SQ Line; sequence header (1/entry)

SQ   Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;


422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
# File 'lib/bio/db/embl/embl.rb', line 422

# Returns the sequence header (SQ line) information.
#
# On first use the SQ line is matched against the
# "<n> BP; <n> A; <n> C; <n> G; <n> T; <n> other;" pattern and the six
# counts are cached in @data['SQ'] as Integers under the keys 'ntlen',
# 'other', 'a', 'c', 'g' and 't'. When the line does not match, every
# count is 0.
#
# base:: optional key; it is downcased before the lookup
#
# Returns the whole Hash when base is omitted, otherwise the count
# stored under base.downcase.
def sq(base = nil)
  unless @data['SQ']
    pattern = /(\d+) BP\; (\d+) A; (\d+) C; (\d+) G; (\d+) T; (\d+) other;/
    matched = pattern.match(fetch('SQ'))
    # Unmatched input yields zeros, as nil.to_i did for each capture.
    ntlen, a, c, g, t, other = matched ? matched.captures.map(&:to_i) : Array.new(6, 0)
    @data['SQ'] = {'ntlen' => ntlen, 'other' => other,
                   'a' => a, 'c' => c, 'g' => g, 't' => t}
  end

  base ? @data['SQ'][base.downcase] : @data['SQ']
end

#svObject

returns the version information in the sequence version (SV) line.

  • Bio::EMBL#sv -> Accession.Version in String

  • Bio::EMBL#version -> sequence version number in Int

SV Line; sequence version (1/entry)

SV    Accession.Version


162
163
164
165
166
167
168
# File 'lib/bio/db/embl/embl.rb', line 162

# Returns the sequence version string ("Accession.Version").
#
# The SV field is used with its first ';' removed. When that is empty,
# the value is built from the ID line by joining ENTRY_NAME and
# SEQUENCE_VERSION with a '.'.
def sv
  explicit = field_fetch('SV').sub(/;/,'')
  return explicit unless explicit == ""

  [id_line['ENTRY_NAME'], id_line['SEQUENCE_VERSION']].join('.')
end

#to_biosequenceObject

converts the entry to Bio::Sequence object


Arguments
Returns

Bio::Sequence object



530
531
532
# File 'lib/bio/db/embl/embl.rb', line 530

# Converts the entry to a Bio::Sequence object by handing this entry and
# the EMBL adapter module to Bio::Sequence.adapter.
def to_biosequence
  embl_adapter = Bio::Sequence::Adapter::EMBL
  Bio::Sequence.adapter(self, embl_adapter)
end

#topologyObject



134
135
136
# File 'lib/bio/db/embl/embl.rb', line 134

# Returns the TOPOLOGY value parsed from the ID line; it is only set
# for ID lines that carry an SV field.
def topology
  id_line['TOPOLOGY']
end

#versionObject



169
170
171
# File 'lib/bio/db/embl/embl.rb', line 169

# Returns the sequence version as an Integer.
#
# The part of sv after the first '.' is used; when sv has no such part,
# SEQUENCE_VERSION from the ID line is used instead. A nil value
# converts to 0.
def version
  from_sv = sv.split(".")[1]
  (from_sv || id_line['SEQUENCE_VERSION']).to_i
end