Class: Bio::EMBL

Inherits:
EMBLDB show all
Includes:
Bio::EMBLDB::Common
Defined in:
lib/bio/db/embl/embl.rb

Constant Summary

Constants included from Bio::EMBLDB::Common

Bio::EMBLDB::Common::DELIMITER, Bio::EMBLDB::Common::RS, Bio::EMBLDB::Common::TAGSIZE

Instance Method Summary collapse

Methods included from Bio::EMBLDB::Common

#ac, #accession, #de, #dr, #initialize, #kw, #oc, #og, #os, #ref, #references

Methods inherited from EMBLDB

#initialize

Methods inherited from DB

#exists?, #fetch, #get, open, #tags

Instance Method Details

#ccObject

returns comment text in the comments (CC) line.

CC Line; comments or notes (>=0)



313
314
315
# File 'lib/bio/db/embl/embl.rb', line 313

# Returns the comments (CC) field of this entry.
#
# The value is whatever the +get+ accessor yields for the 'CC' tag.
# NOTE(review): +dt+ strips a two-letter tag prefix from what +get+
# returns, so this value presumably still carries the "CC   " line
# prefixes -- confirm against callers.
def cc
  comment_tag = 'CC'
  get(comment_tag)
end

#divisionObject

returns DIVISION in the ID line.

  • Bio::EMBL#division -> String



125
126
127
# File 'lib/bio/db/embl/embl.rb', line 125

# Returns the value stored under 'DIVISION' in the parsed ID line
# (see +id_line+).
def division
  id_line['DIVISION']
end

#dt(key = nil) ⇒ Object

returns contents in the date (DT) line.

  • Bio::EMBL#dt -> <DT Hash>

where <DT Hash> is:

{'created' => String, 'updated' => String}
  • Bio::EMBL#dt(key) -> String

keys: ‘created’ and ‘updated’

DT Line; date (2/entry)



167
168
169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/bio/db/embl/embl.rb', line 167

# Returns the contents of the date (DT) lines.
#
# The raw DT field is split on newlines. The first line is stored under
# 'created' and the second under 'updated', each with its leading
# two-character tag plus three spaces removed and surrounding whitespace
# stripped. The resulting Hash is cached in @data['DT'].
#
# A missing line (an entry with fewer than two DT lines) yields an empty
# String for the corresponding key instead of raising NoMethodError.
#
# key:: optional Hash key, 'created' or 'updated'.
#
# Returns the String for +key+ when given, otherwise the whole Hash.
def dt(key=nil)
  unless @data['DT']
    dt_lines = get('DT').to_s.split(/\n/)
    # to_s turns an absent line (nil) into "" so sub/strip stay safe.
    @data['DT'] = {
      'created' => dt_lines[0].to_s.sub(/\w{2}   /,'').strip,
      'updated' => dt_lines[1].to_s.sub(/\w{2}   /,'').strip
    }
  end

  key ? @data['DT'][key] : @data['DT']
end

#each_cdsObject

iterates on CDS features in the FT lines.



292
293
294
295
296
297
298
# File 'lib/bio/db/embl/embl.rb', line 292

# Walks the features returned by +ft+ and yields each one whose
# +feature+ attribute equals 'CDS'.
def each_cds
  ft.each do |candidate|
    next unless candidate.feature == 'CDS'

    yield candidate
  end
end

#each_geneObject

iterates on gene features in the FT lines.



301
302
303
304
305
306
307
# File 'lib/bio/db/embl/embl.rb', line 301

# Walks the features returned by +ft+ and yields each one whose
# +feature+ attribute equals 'gene'.
def each_gene
  ft.each do |candidate|
    next unless candidate.feature == 'gene'

    yield candidate
  end
end

#entryObject Also known as: entry_name, entry_id

returns ENTRY_NAME in the ID line.

  • Bio::EMBL#entry -> String



110
111
112
# File 'lib/bio/db/embl/embl.rb', line 110

# Returns the value stored under 'ENTRY_NAME' in the parsed ID line
# (see +id_line+).
def entry
  id_line['ENTRY_NAME']
end

#fhObject

returns feature table header (String) in the feature header (FH) line.

FH Line; feature table header (0 or 2)



236
237
238
# File 'lib/bio/db/embl/embl.rb', line 236

# Returns the feature table header: the result of +fetch+ for the 'FH' tag.
def fh
  header_tag = 'FH'
  fetch(header_tag)
end

#ftObject Also known as: features

returns contents in the feature table (FT) lines.

  • Bio::EMBL#ft -> Bio::Features

  • Bio::EMBL#ft {} -> {|Bio::Feature| }

same as features method in bio/db/genbank.rb

FT Line; feature table data (>=0)



247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
# File 'lib/bio/db/embl/embl.rb', line 247

# Returns the contents of the feature table (FT) lines.
#
# On first use the raw text in @orig['FT'] is walked line by line and
# grouped into one array per feature:
#   [ feature key, position, qualifier, qualifier, ... ]
# Each group is passed through +parse_qualifiers+ and the results are
# wrapped in Features.new, which is cached in @data['FT'].
#
# The cache is only written after parsing succeeds, so a malformed table
# raises on every call instead of leaving a stale empty result behind.
#
# With a block, yields every element of the cached object via +each+;
# without a block, returns the cached object itself.
def ft
  unless @data['FT']
    groups = []
    in_quote = false
    @orig['FT'].each_line do |line|
      next if line =~ /^FEATURES/

      # Up to 60 characters starting at offset 20 hold the feature value
      # (position or /qualifier=). to_s keeps a line shorter than 20
      # characters from raising on nil.
      body = line[20,60].to_s.chomp

      if line =~ /^FT {3}(\S+)/
        # A feature key starts a new group: [ feature, position ]
        groups.push([ $1, body ])
      elsif (body =~ /^ \//) && !in_quote
        # Start of a qualifier: /q="data..., /q=data, /q
        groups.last.push(body)

        # Remember an opening quote that is not closed on this line.
        # NOTE(review): the pattern requires a space right after ="
        # -- confirm that is intended rather than /="/.
        in_quote = true if (body =~ /=" /) && (body !~ /"$/)
      else
        # Continuation of the previous value: ...data..., ...data..."
        groups.last.last << body

        in_quote = false if body =~ /"$/
      end
    end

    @data['FT'] = Features.new(groups.map { |parts| parse_qualifiers(parts) })
  end

  if block_given?
    @data['FT'].each do |feature|
      yield feature
    end
  else
    @data['FT']
  end
end

#id_line(key = nil) ⇒ Object

returns contents in the ID line.

  • Bio::EMBL#id_line -> <ID Hash>

where <ID Hash> is:

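{'ENTRY_NAME' => String, 'DATA_CLASS' => String, 'MOLECULE_TYPE' => String,
 'DIVISION' => String, 'SEQUENCE_LENGTH' => Int,
 'SEQUENCE_VERSION' => String, 'TOPOLOGY' => String}

('SEQUENCE_VERSION' and 'TOPOLOGY' are set only when the ID line carries an SV field.)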

ID Line

"ID  ENTRY_NAME DATA_CLASS; MOLECULE_TYPE; DIVISION; SEQUENCE_LENGTH BP."

DATA_CLASS = [‘standard’]

MOLECULE_TYPE: DNA RNA XXX

Code ( DIVISION )

EST (ESTs)
PHG (Bacteriophage)
FUN (Fungi)
GSS (Genome survey)
HTC (High Throughput cDNAs) 
HTG (HTGs)
HUM (Human)
INV (Invertebrates)
ORG (Organelles)
MAM (Other Mammals)
VRT (Other Vertebrates)
PLN (Plants)
PRO (Prokaryotes)
ROD (Rodents)
SYN (Synthetic)
STS (STSs)
UNC (Unclassified)
VRL (Viruses)

Since Rel. 89 the ID line has the form:

ID CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.

ID <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP.

  1. Primary accession number

  2. Sequence version number

  3. Topology: ‘circular’ or ‘linear’

  4. Molecule type (see note 1 below)

  5. Data class (see section 3.1)

  6. Taxonomic division (see section 3.2)

  7. Sequence length (see note 2 below)



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/bio/db/embl/embl.rb', line 82

# Returns the parsed ID line, or one field of it.
#
# The ID text obtained from +fetch+ is split on "; " and parsed once; the
# resulting Hash is cached in @data['ID']. Two layouts are handled:
# when the second field starts with "SV" (the Rel. 89+ layout) the
# sequence version, topology, molecule type and data class are read in
# that order; otherwise the first field holds "ENTRY_NAME DATA_CLASS" and
# is followed directly by the molecule type. Both layouts end with the
# division and the sequence length, which is converted to an Integer.
#
# key:: optional Hash key such as 'ENTRY_NAME' or 'DIVISION'.
#
# Returns the value for +key+ when given, otherwise the whole Hash.
def id_line(key=nil)
  @data['ID'] ||= begin
    fields = fetch('ID').split(/; +/)
    name, data_class = fields.shift.split(/ +/)
    parsed = { 'ENTRY_NAME' => name, 'DATA_CLASS' => data_class }

    if fields.first =~ /^SV/
      parsed['SEQUENCE_VERSION'] = fields.shift.split(' ').last
      %w[TOPOLOGY MOLECULE_TYPE DATA_CLASS].each do |field_name|
        parsed[field_name] = fields.shift
      end
    else
      parsed['MOLECULE_TYPE'] = fields.shift
    end

    parsed['DIVISION'] = fields.shift
    parsed['SEQUENCE_LENGTH'] = fields.shift.strip.split(' ').first.to_i
    parsed
  end

  key ? @data['ID'][key] : @data['ID']
end

#moleculeObject Also known as: molecule_type

returns MOLECULE_TYPE in the ID line.

  • Bio::EMBL#molecule -> String



118
119
120
# File 'lib/bio/db/embl/embl.rb', line 118

# Returns the value stored under 'MOLECULE_TYPE' in the parsed ID line
# (see +id_line+).
def molecule
  id_line['MOLECULE_TYPE']
end

#seqObject Also known as: naseq, ntseq

returns the nucleotide sequence in this entry.

  • Bio::EMBL#seq -> Bio::Sequence::NA

The sequence is read from the field with the blank ('') tag.

bb Line; (blanks) sequence data (>=1)



357
358
359
# File 'lib/bio/db/embl/embl.rb', line 357

# Builds a Sequence::NA from the field with the empty ('') tag, after
# removing every space character and every run of digits from it.
def seq
  raw = fetch('')
  residues = raw.gsub(/ /,'').gsub(/\d+/,'')
  Sequence::NA.new(residues)
end

#sequence_lengthObject Also known as: seqlen

returns SEQUENCE_LENGTH in the ID line.

  • Bio::EMBL#sequence_length -> Int



131
132
133
# File 'lib/bio/db/embl/embl.rb', line 131

# Returns the value stored under 'SEQUENCE_LENGTH' in the parsed ID line
# (see +id_line+).
def sequence_length
  id_line['SEQUENCE_LENGTH']
end

#sq(base = nil) ⇒ Object

returns sequence header information in the sequence header (SQ) line.

  • Bio::EMBL#sq -> <SQ Hash>

where <SQ Hash> is:

{'ntlen' => Int, 'other' => Int,
 'a' => Int, 'c' => Int, 'g' => Int, 't' => Int}
  • Bio::EMBL#sq(base) -> <base content in Int>

  • Bio::EMBL#sq[base] -> <base content in Int>

SQ Line; sequence header (1/entry)

SQ   Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;


334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
# File 'lib/bio/db/embl/embl.rb', line 334

# Returns the base composition recorded in the sequence header (SQ) line.
#
# The SQ text obtained from +fetch+ is matched once against the
# "<n> BP; <n> A; <n> C; <n> G; <n> T; <n> other;" layout and the six
# numbers are cached in @data['SQ'] as Integers under the keys 'ntlen',
# 'other', 'a', 'c', 'g' and 't'. When the text does not match, every
# count is 0 (nil.to_i).
#
# base:: optional key; it is converted with to_s and downcased, so
#        'A', 'a' and :a all select the same entry.
#
# Returns the Integer for +base+ when given (nil for an unknown key),
# otherwise the whole Hash.
def sq(base = nil)
  unless @data['SQ']
    found = /(\d+) BP\; (\d+) A; (\d+) C; (\d+) G; (\d+) T; (\d+) other;/.match(fetch('SQ'))
    # Destructuring an empty list leaves every local nil, which to_i maps to 0.
    ntlen, a, c, g, t, other = found ? found.captures : []
    @data['SQ'] = {'ntlen' => ntlen.to_i, 'other' => other.to_i,
                   'a' => a.to_i, 'c' => c.to_i, 'g' => g.to_i, 't' => t.to_i}
  end

  base ? @data['SQ'][base.to_s.downcase] : @data['SQ']
end

#svObject

returns the version information in the sequence version (SV) line.

  • Bio::EMBL#sv -> Accession.Version in String

  • Bio::EMBL#version -> version number in Int

SV Line; sequence version (1/entry)

SV    Accession.Version


147
148
149
150
151
152
153
# File 'lib/bio/db/embl/embl.rb', line 147

# Returns the sequence version string.
#
# The SV field (via +field_fetch+) is used with its first ';' removed.
# When that leaves an empty string, the value is assembled from the ID
# line instead as "ENTRY_NAME.SEQUENCE_VERSION".
def sv
  version_field = field_fetch('SV').sub(/;/,'')
  return version_field unless version_field == ""

  [id_line['ENTRY_NAME'], id_line['SEQUENCE_VERSION']].join('.')
end

#versionObject



154
155
156
# File 'lib/bio/db/embl/embl.rb', line 154

# Returns the sequence version as an Integer.
#
# The part of +sv+ after the first '.' is used when present; otherwise
# the 'SEQUENCE_VERSION' value of the ID line is used. Either is
# converted with to_i.
def version
  from_sv = sv.split(".")[1]
  (from_sv || id_line['SEQUENCE_VERSION']).to_i
end