Class: Bio::PROSITE

Inherits:
EMBLDB show all
Defined in:
lib/bio/db/prosite.rb

Constant Summary collapse

DELIMITER =

Delimiter

"\n//\n"
RS =

Delimiter

DELIMITER
TAGSIZE =

Bio::DB API

5

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from DB

#exists?, #fetch, #get, open, #tags

Constructor Details

#initialize(entry) ⇒ PROSITE

Returns a new instance of PROSITE.



27
28
29
# File 'lib/bio/db/prosite.rb', line 27

# Builds a PROSITE entry object by handing +entry+ and the class constant
# TAGSIZE to the superclass constructor.
#
# entry:: passed through unchanged to +super+ (presumably the raw flat-file
#         text of one entry -- confirm against the superclass).
def initialize(entry)
  super(entry, TAGSIZE)
end

Class Method Details

.pa2re(pattern) ⇒ Object

prosite pattern to regular expression

prosite/prosuser.txt:

The PA (PAttern) lines contain the definition of a PROSITE pattern. The patterns are described using the following conventions:

0) The standard IUPAC one-letter codes for the amino acids are used. 0) Ambiguities are indicated by listing the acceptable amino acids for a

given position, between square parentheses `[ ]'. For example: [ALT]
stands for Ala or Leu or Thr.

1) A period ends the pattern. 2) When a pattern is restricted to either the N- or C-terminal of a

sequence, that pattern either starts with a `<' symbol or respectively
ends with a `>' symbol.

3) Ambiguities are also indicated by listing between a pair of curly

brackets `{ }' the amino acids that are not accepted at a given
position. For example: {AM} stands for any amino acid except Ala and
Met.

4) Repetition of an element of the pattern can be indicated by following

that element with a numerical value or a numerical range between
parenthesis. Examples: x(3) corresponds to x-x-x, x(2,4) corresponds to
x-x or x-x-x or x-x-x-x.

5) The symbol ‘x’ is used for a position where any amino acid is accepted. 6) Each element in a pattern is separated from its neighbor by a ‘-’.

Examples:

PA [AC]-x-V-x(4)-{ED}.

This pattern is translated as: [Ala or Cys]-any-Val-any-any-any-any-{any but Glu or Asp}

PA <A-x-[ST](2)-x(0,1)-V.

This pattern, which must be in the N-terminal of the sequence (‘<’), is translated as: Ala-any-[Ser or Thr]-[Ser or Thr]-(any or none)-Val



468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
# File 'lib/bio/db/prosite.rb', line 468

# Translates a PROSITE pattern (PA line syntax) into a case-insensitive
# Regexp. The argument itself is left untouched.
#
# pattern:: String containing the PROSITE pattern.
# Returns:: Regexp created with Regexp::IGNORECASE.
def self.pa2re(pattern)
  # Drop all whitespace, then (1) the terminating period.
  compact = pattern.gsub(/\s/, '').sub(/\.$/, '')
  # (2) '<' / '>' pin the pattern to the N- / C-terminal.
  anchored = compact.sub(/^</, '^').sub(/>$/, '$')
  # (3) {..} lists residues that are NOT accepted at a position.
  negated = anchored.gsub(/\{(\w+)\}/) { "[^#{Regexp.last_match(1)}]" }
  # (4) (n) and (n,m) are repetition counts.
  repeated = negated.gsub(/\(([\d,]+)\)/) { "{#{Regexp.last_match(1)}}" }
  # (5) 'x' accepts any residue; (6) '-' merely separates elements.
  source = repeated.tr('x', '.').delete('-')
  Regexp.new(source, Regexp::IGNORECASE)
end

Instance Method Details

#acObject Also known as: entry_id

AC Accession number (1 per entry)

AC   PSnnnnn;

Returns



58
59
60
61
62
63
# File 'lib/bio/db/prosite.rb', line 58

# Accession number from the AC line with its trailing ';' removed.
# The value is computed on first use and kept in @data['AC'].
def ac
  @data['AC'] ||= fetch('AC').chomp(';')
end

#ccObject Also known as: comment

CC Comments (>=0 per entry)

CC   /QUALIFIER=data; /QUALIFIER=data; .......

/TAXO-RANGE Taxonomic range. /MAX-REPEAT Maximum known number of repetitions of the pattern in a

single protein.

/SITE Indication of an ‘interesting’ site in the pattern. /SKIP-FLAG Indication of an entry that can be, in some cases, ignored

by a program (because it is too unspecific).

Returns



274
275
276
277
278
279
280
281
282
283
# File 'lib/bio/db/prosite.rb', line 274

# Parses the CC line into a Hash mapping each qualifier name (the text
# between '/' and '=') to its value (the text up to the next ';').
# The Hash is built on first use and kept in @data['CC'].
def cc
  return @data['CC'] if @data['CC']

  pairs = fetch('CC').scan(%r{/(\S+)=([^;]+);})
  @data['CC'] = pairs.each_with_object({}) do |(qualifier, value), table|
    table[qualifier] = value
  end
end

#deObject Also known as: definition

DE Short description (1 per entry)

DE   Description.

Returns



85
86
87
# File 'lib/bio/db/prosite.rb', line 85

# Returns the DE (short description) field as delivered by field_fetch('DE').
def de
  field_fetch('DE')
end

#divisionObject

Returns



45
46
47
48
49
50
# File 'lib/bio/db/prosite.rb', line 45

# Returns @data['TYPE']. While that slot is still unset, +name+ is
# invoked first so that it can fill the slot in.
def division
  name unless @data['TYPE']
  @data['TYPE']
end

#drObject Also known as: sp_xref

DR Cross-references to SWISS-PROT (>=0 per entry)

DR   AC_NB, ENTRY_NAME, C; AC_NB, ENTRY_NAME, C; AC_NB, ENTRY_NAME, C;
  • ‘AC_NB’ is the SWISS-PROT primary accession number of the entry to which reference is being made.

  • ‘ENTRY_NAME’ is the SWISS-PROT entry name.

  • ‘C’ is a one character flag that can be one of the following:

T For a true positive. N For a false negative; a sequence which belongs to the set under

consideration, but which has not been picked up by the pattern or
profile.

P For a ‘potential’ hit; a sequence that belongs to the set under

consideration, but which was not picked up because the region(s) that
are used as a 'fingerprint' (pattern or profile) is not yet available
in the data bank (partial sequence).

? For an unknown; a sequence which possibly could belong to the set under

consideration.

F For a false positive; a sequence which does not belong to the set in

consideration.

Returns



350
351
352
353
354
355
356
357
358
359
360
361
# File 'lib/bio/db/prosite.rb', line 350

# Parses the DR line (cross-references to SWISS-PROT) into a Hash of
#
#   accession => [entry_name, flag]
#
# where +flag+ is the single character found before each ';'.
# The Hash is built on first use and kept in @data['DR']; when the field
# is missing (fetch returns nil/false) an empty Hash is stored.
def dr
  return @data['DR'] if @data['DR']

  xrefs = {}
  text = fetch('DR')	# fetched once; used both as guard and as parse input
  if text
    text.scan(/(\w+)\s*, (\w+)\s*, (.);/) do |accession, entry_name, flag|
      xrefs[accession] = [entry_name, flag]
    end
  end
  @data['DR'] = xrefs
end

#dtObject Also known as: date

DT Date (1 per entry)

DT   MMM-YYYY (CREATED); MMM-YYYY (DATA UPDATE); MMM-YYYY (INFO UPDATE).

Returns



73
74
75
# File 'lib/bio/db/prosite.rb', line 73

# Returns the DT (date) field as delivered by field_fetch('DT').
def dt
  field_fetch('DT')
end

#false_negObject Also known as: false_negative_hits

Returns



251
252
253
# File 'lib/bio/db/prosite.rb', line 251

# Returns the 'FALSE_NEG' entry of the +statistics+ Hash
# (nil when that key is absent).
def false_neg
  statistics['FALSE_NEG']
end

#false_posObject

Returns



236
237
238
# File 'lib/bio/db/prosite.rb', line 236

# Returns the 'FALSE_POS' entry of the +statistics+ Hash
# (nil when that key is absent).
def false_pos
  statistics['FALSE_POS']
end

#false_positive_hitsObject

Returns



241
242
243
# File 'lib/bio/db/prosite.rb', line 241

# Returns the first element of +false_pos+.
# Assumes false_pos is a two-element [hits, sequences] Array -- confirm
# that the NR value has the "n(m)" form before relying on this.
def false_positive_hits
  false_pos.first
end

#false_positive_sequencesObject

Returns



246
247
248
# File 'lib/bio/db/prosite.rb', line 246

# Returns the last element of +false_pos+.
# Assumes false_pos is a two-element [hits, sequences] Array -- confirm
# that the NR value has the "n(m)" form before relying on this.
def false_positive_sequences
  false_pos.last
end

#list_falsenegative(by_name = nil) ⇒ Object

Returns



387
388
389
# File 'lib/bio/db/prosite.rb', line 387

# Lists the SWISS-PROT cross-references flagged as false negatives.
#
# The DR line flags are: T (true positive), N (false negative),
# P (potential hit), ? (unknown) and F (false positive). False negatives
# are therefore selected with 'N' (the flag 'F' means false positive).
#
# by_name:: when truthy, entry names are returned instead of accessions.
# Returns:: whatever list_xref returns for the 'N' flag.
def list_falsenegative(by_name = nil)
  list_xref('N', by_name)
end

#list_falsepositive(by_name = nil) ⇒ Object

Returns



392
393
394
# File 'lib/bio/db/prosite.rb', line 392

# Lists the SWISS-PROT cross-references flagged as false positives.
#
# The DR line flags are: T (true positive), N (false negative),
# P (potential hit), ? (unknown) and F (false positive). False positives
# are therefore selected with 'F' (the flag 'P' means potential hit).
#
# by_name:: when truthy, entry names are returned instead of accessions.
# Returns:: whatever list_xref returns for the 'F' flag.
def list_falsepositive(by_name = nil)
  list_xref('F', by_name)
end

#list_potentialhit(by_name = nil) ⇒ Object

Returns



397
398
399
# File 'lib/bio/db/prosite.rb', line 397

# Lists the cross-references whose DR flag is 'P' by delegating to
# list_xref; +by_name+ is forwarded unchanged.
def list_potentialhit(by_name = nil)
  list_xref('P', by_name)
end

#list_truepositive(by_name = nil) ⇒ Object

Returns



382
383
384
# File 'lib/bio/db/prosite.rb', line 382

# Lists the cross-references whose DR flag is 'T' by delegating to
# list_xref; +by_name+ is forwarded unchanged.
def list_truepositive(by_name = nil)
  list_xref('T', by_name)
end

#list_unknown(by_name = nil) ⇒ Object

Returns



402
403
404
# File 'lib/bio/db/prosite.rb', line 402

# Lists the cross-references whose DR flag is '?' by delegating to
# list_xref; +by_name+ is forwarded unchanged.
def list_unknown(by_name = nil)
  list_xref('?', by_name)
end

#list_xref(flag, by_name = nil) ⇒ Object

Returns



366
367
368
369
370
371
372
373
374
375
376
377
378
379
# File 'lib/bio/db/prosite.rb', line 366

# Collects the cross-references from +sp_xref+ whose flag equals +flag+.
#
# flag::    value compared (with ==) against element 1 of each entry.
# by_name:: when truthy, element 0 of the entry is collected;
#           otherwise the Hash key (accession) is collected.
# Returns:: Array in the iteration order of sp_xref.
def list_xref(flag, by_name = nil)
  sp_xref.each_with_object([]) do |(accession, info), found|
    next unless info[1] == flag

    found.push(by_name ? info[0] : accession)
  end
end

#maObject Also known as: profile

MA Matrix/profile (>=0 per entry)

see - ma2re method

Returns



112
113
114
# File 'lib/bio/db/prosite.rb', line 112

# Returns the MA (matrix/profile) field as delivered by field_fetch('MA').
def ma
  field_fetch('MA')
end

#ma2re(matrix) ⇒ Object

prosite profile to regular expression

prosite/profile.txt:

Returns

Raises:

  • (NotImplementedError)


499
500
501
# File 'lib/bio/db/prosite.rb', line 499

# Placeholder for a profile-to-regexp conversion: +matrix+ is ignored and
# NotImplementedError is always raised.
def ma2re(matrix)
  raise NotImplementedError
end

#max_repeatObject

Returns



307
308
309
# File 'lib/bio/db/prosite.rb', line 307

# Returns the 'MAX-REPEAT' comment qualifier converted with to_i.
# A missing qualifier gives 0, because nil.to_i is 0.
def max_repeat
  comment['MAX-REPEAT'].to_i
end

#nameObject

ID Identification (Begins each entry; 1 per entry)

ID   ENTRY_NAME; ENTRY_TYPE.  (ENTRY_TYPE : PATTERN, MATRIX, RULE)

Returns



37
38
39
40
41
42
# File 'lib/bio/db/prosite.rb', line 37

# Entry name from the ID line.
#
# On first use the ID field is stripped of its trailing '.' and split on
# '; '; the first part is kept in @data['ID'] and the second part in
# @data['TYPE']. Later calls return the stored @data['ID'].
def name
  return @data['ID'] if @data['ID']

  entry_name, entry_type = fetch('ID').chomp('.').split('; ')
  @data['ID'] = entry_name
  @data['TYPE'] = entry_type
  @data['ID']
end

#nrObject Also known as: statistics

NR Numerical results (>=0 per entry)

- SWISS-PROT scan statistics of true and false positives/negatives

/RELEASE SWISS-PROT release number and total number of sequence

entries in that release.

/TOTAL Total number of hits in SWISS-PROT. /POSITIVE Number of hits on proteins that are known to belong to the

set in consideration.

/UNKNOWN Number of hits on proteins that could possibly belong to

the set in consideration.

/FALSE_POS Number of false hits (on unrelated proteins). /FALSE_NEG Number of known missed hits. /PARTIAL Number of partial sequences which belong to the set in

consideration, but  which  are  not  hit  by the pattern or
profile because they are partial (fragment) sequences.

Returns



151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# File 'lib/bio/db/prosite.rb', line 151

# Parses the NR line (numerical results) into a Hash keyed by qualifier
# name. Each value is converted according to its shape:
#
#   "n(m)"    -> [n, m]        both Integers (hits, sequences)
#   "rel,num" -> [rel, num]    rel stays a String, num is an Integer
#   otherwise -> value.to_i
#
# The Hash is built on first use and kept in @data['NR'].
def nr
  return @data['NR'] if @data['NR']

  stats = {}
  pairs = fetch('NR').scan(%r{/(\S+)=([^;]+);})
  pairs.each do |key, raw|
    stats[key] =
      case raw
      when /^(\d+)\((\d+)\)$/
        [$1.to_i, $2.to_i]	# hits, sequences
      when /([\d\.]+),(\d+)/
        [$1, $2.to_i]		# SWISS-PROT release, number of sequences
      else
        raw.to_i
      end
  end
  @data['NR'] = stats
end

#paObject Also known as: pattern

PA Pattern (>=0 per entry)

see - pa2re method

Returns



97
98
99
100
101
102
# File 'lib/bio/db/prosite.rb', line 97

# Returns the PA (pattern) field with every run of whitespace removed.
#
# NOTE(review): field_fetch is defined outside this view. If it already
# caches the field in @data['PA'], the explicit fetch below only acts as
# a fallback -- confirm before simplifying.
def pa
  field_fetch('PA')
  # Fall back to a direct fetch while @data['PA'] is still unset.
  @data['PA'] = fetch('PA') unless @data['PA']
  # Strip whitespace in place so the stored value is the compact pattern.
  @data['PA'].gsub!(/\s+/, '') if @data['PA']
  @data['PA']
end

#pa2re(pattern) ⇒ Object



485
486
487
# File 'lib/bio/db/prosite.rb', line 485

# Instance-level convenience wrapper: forwards +pattern+ to the class
# method of the same name and returns its result.
def pa2re(pattern)
  self.class.pa2re(pattern)
end

#partialObject

Returns



257
258
259
# File 'lib/bio/db/prosite.rb', line 257

# Returns the 'PARTIAL' entry of the +statistics+ Hash
# (nil when that key is absent).
def partial
  statistics['PARTIAL']
end

#pdb_xrefObject

3D Cross-references to PDB (>=0 per entry)

3D   name; [name2;...]

Returns



412
413
414
415
416
417
# File 'lib/bio/db/prosite.rb', line 412

# PDB cross-references from the 3D line, split on ';' followed by any
# number of spaces. The Array is built on first use and kept in
# @data['3D'].
def pdb_xref
  @data['3D'] ||= fetch('3D').split(/; */)
end

#pdoc_xrefObject

DO Pointer to the documentation file (1 per entry)

DO   PDOCnnnnn;

Returns



425
426
427
# File 'lib/bio/db/prosite.rb', line 425

# Pointer to the documentation entry from the DO line, with the trailing
# ';' removed.
#
# The value is computed on first use and kept in @data['DO'], matching
# the other accessors of this class; later calls return the stored value
# instead of fetching and trimming the field again.
def pdoc_xref
  @data['DO'] ||= fetch('DO').chomp(';')
end

#positiveObject

Returns



206
207
208
# File 'lib/bio/db/prosite.rb', line 206

# Returns the 'POSITIVE' entry of the +statistics+ Hash
# (nil when that key is absent).
def positive
  statistics['POSITIVE']
end

#positive_hitsObject

Returns



211
212
213
# File 'lib/bio/db/prosite.rb', line 211

# Returns the first element of +positive+.
# Assumes positive is a two-element [hits, sequences] Array -- confirm
# that the NR value has the "n(m)" form before relying on this.
def positive_hits
  positive.first
end

#positive_sequencesObject

Returns



216
217
218
# File 'lib/bio/db/prosite.rb', line 216

# Returns the last element of +positive+.
# Assumes positive is a two-element [hits, sequences] Array -- confirm
# that the NR value has the "n(m)" form before relying on this.
def positive_sequences
  positive.last
end

#reObject



489
490
491
# File 'lib/bio/db/prosite.rb', line 489

# Converts this entry's own pattern (the value of +pa+) with the class
# method pa2re and returns the result.
def re
  self.class.pa2re(self.pa)
end

#releaseObject

Returns



176
177
178
# File 'lib/bio/db/prosite.rb', line 176

# Returns the 'RELEASE' entry of the +statistics+ Hash
# (nil when that key is absent).
def release
  statistics['RELEASE']
end

#ruObject Also known as: rule

RU Rule (>=0 per entry)

RU   Rule_Description.

The rule is described in ordinary English and is free-format.

Returns



126
127
128
# File 'lib/bio/db/prosite.rb', line 126

# Returns the RU (rule) field as delivered by field_fetch('RU').
def ru
  field_fetch('RU')
end

#siteObject

Returns



312
313
314
315
316
317
# File 'lib/bio/db/prosite.rb', line 312

# Splits the 'SITE' comment qualifier on ',' and returns
# [position, description], where position is the first part converted
# with to_i and description is the second part.
# Without a 'SITE' qualifier the result is [0, nil].
def site
  position, description = comment['SITE'].to_s.split(',')
  [position.to_i, description]
end

#skip_flagObject

Returns



320
321
322
323
324
# File 'lib/bio/db/prosite.rb', line 320

# Returns true when the 'SKIP-FLAG' comment qualifier is exactly 'TRUE';
# otherwise returns nil.
def skip_flag
  true if comment['SKIP-FLAG'] == 'TRUE'
end

#swissprot_release_numberObject

Returns



181
182
183
# File 'lib/bio/db/prosite.rb', line 181

# Returns the first element of +release+.
# Assumes release is a two-element [release, sequences] Array -- confirm
# that the NR value has the "rel,num" form before relying on this.
def swissprot_release_number
  release.first
end

#swissprot_release_sequencesObject

Returns



186
187
188
# File 'lib/bio/db/prosite.rb', line 186

# Returns the last element of +release+.
# Assumes release is a two-element [release, sequences] Array -- confirm
# that the NR value has the "rel,num" form before relying on this.
def swissprot_release_sequences
  release.last
end

#taxon_range(expand = nil) ⇒ Object

Returns



288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
# File 'lib/bio/db/prosite.rb', line 288

# Returns the 'TAXO-RANGE' comment qualifier.
#
# expand:: when falsy (default) the raw qualifier string is returned
#          (nil if the qualifier is absent). When truthy, each character
#          is looked up in the table below and the matching names are
#          returned as an Array; characters without an entry are skipped.
def taxon_range(expand = nil)
  codes = comment['TAXO-RANGE']
  return codes unless codes && expand

  kingdoms = {
    'A' => 'archaebacteria',
    'B' => 'bacteriophages',
    'E' => 'eukaryotes',
    'P' => 'prokaryotes',
    'V' => 'eukaryotic viruses'
  }
  codes.scan(/./).map { |code| kingdoms[code] }.compact
end

#totalObject

Returns



191
192
193
# File 'lib/bio/db/prosite.rb', line 191

# Returns the 'TOTAL' entry of the +statistics+ Hash
# (nil when that key is absent).
def total
  statistics['TOTAL']
end

#total_hitsObject

Returns



196
197
198
# File 'lib/bio/db/prosite.rb', line 196

# Returns the first element of +total+.
# Assumes total is a two-element [hits, sequences] Array -- confirm
# that the NR value has the "n(m)" form before relying on this.
def total_hits
  total.first
end

#total_sequencesObject

Returns



201
202
203
# File 'lib/bio/db/prosite.rb', line 201

# Returns the last element of +total+.
# Assumes total is a two-element [hits, sequences] Array -- confirm
# that the NR value has the "n(m)" form before relying on this.
def total_sequences
  total.last
end

#unknownObject

Returns



221
222
223
# File 'lib/bio/db/prosite.rb', line 221

# Returns the 'UNKNOWN' entry of the +statistics+ Hash
# (nil when that key is absent).
def unknown
  statistics['UNKNOWN']
end

#unknown_hitsObject

Returns



226
227
228
# File 'lib/bio/db/prosite.rb', line 226

# Returns the first element of +unknown+.
# Assumes unknown is a two-element [hits, sequences] Array -- confirm
# that the NR value has the "n(m)" form before relying on this.
def unknown_hits
  unknown.first
end

#unknown_sequencesObject

Returns



231
232
233
# File 'lib/bio/db/prosite.rb', line 231

# Returns the last element of +unknown+.
# Assumes unknown is a two-element [hits, sequences] Array -- confirm
# that the NR value has the "n(m)" form before relying on this.
def unknown_sequences
  unknown.last
end