Class: Bio::FastaFormat

Inherits:
DB show all
Defined in:
lib/bio/db/fasta.rb

Overview

Treats a FASTA formatted entry, such as:

>id and/or some comments                    <== comment line
ATGCATGCATGCATGCATGCATGCATGCATGCATGC        <== sequence lines
ATGCATGCATGCATGCATGCATGCATGCATGCATGC
ATGCATGCATGC

The leading ‘>’ can be omitted, and a trailing ‘>’ will be removed automatically.

Examples

f_str = <<END
>sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG
VPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYME
GIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQNLLINKDGNL
KLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGC
IFAEMCNRKPIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFP
QWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES
>sce:YBR274W  CHK1; probable serine/threonine-protein kinase [EC:2.7.1.-] [SP:KB9S_YEAST]
MSLSQVSPLPHIKDVVLGDTVGQGAFACVKNAHLQMDPSIILAVKFIHVP
TCKKMGLSDKDITKEVVLQSKCSKHPNVLRLIDCNVSKEYMWIILEMADG
GDLFDKIEPDVGVDSDVAQFYFQQLVSAINYLHVECGVAHRDIKPENILL
DKNGNLKLADFGLASQFRRKDGTLRVSMDQRGSPPYMAPEVLYSEEGYYA
DRTDIWSIGILLFVLLTGQTPWELPSLENEDFVFFIENDGNLNWGPWSKI
EFTHLNLLRKILQPDPNKRVTLKALKLHPWVLRRASFSGDDGLCNDPELL
AKKLFSHLKVSLSNENYLKFTQDTNSNNRYISTQPIGNELAELEHDSMHF
QTVSNTQRAFTSYDSNTNYNSGTGMTQEAKWTQFISYDIAALQFHSDEND
CNELVKRHLQFNPNKLTKFYTLQPMDVLLPILEKALNLSQIRVKPDLFAN
FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
KTGDPLEWRRLFKKISTICRDIILIPN
END

# Walk through the public readers of one parsed entry.
# f_str holds two entries; per the constructor notes, only the first is used.
f = Bio::FastaFormat.new(f_str)
puts "### FastaFormat"
puts "# entry"
puts f.entry
puts "# entry_id"
p f.entry_id
puts "# definition"
p f.definition
puts "# data"
p f.data
puts "# seq"
p f.seq
# Object#type was removed in Ruby 1.9; Object#class is the replacement.
puts "# seq.class"
p f.seq.class
puts "# length"
p f.length
puts "# aaseq"
p f.aaseq
puts "# aaseq.class"
p f.aaseq.class
puts "# aaseq.composition"
p f.aaseq.composition
puts "# aalen"
p f.aalen

References

Direct Known Subclasses

FastaNumericFormat

Constant Summary collapse

DELIMITER =

Entry delimiter in flatfile text.

RS = "\n>"
DELIMITER_OVERRUN =

(Integer) excess read size included in DELIMITER.

1

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods inherited from DB

#exists?, #fetch, #get, open, #tags

Constructor Details

#initialize(str) ⇒ FastaFormat

Stores the comment and sequence information from one entry of the FASTA format string. If the argument contains more than one entry, only the first entry is used.



155
156
157
158
159
160
# File 'lib/bio/db/fasta.rb', line 155

# Stores the comment line and the sequence text of the first FASTA entry
# found in +str+.
#
# str:: FASTA formatted text; the leading '>' may be missing.
#
# Sets @definition (the first line without a leading '>' and without
# surrounding whitespace), @data (the text after the first line, up to the
# next line that starts with '>') and @entry_overrun (the text that was cut
# off, or nil when nothing followed the first entry).
def initialize(str)
  first_line = str[/.*/]
  @definition = first_line.sub(/^>/, '').strip
  remainder = str[first_line.length..-1]
  # Cut off any following entries and keep the removed text as the overrun.
  @entry_overrun = remainder.slice!(/^>.*/m)
  @data = remainder
end

Instance Attribute Details

#dataObject

The sequence lines in text.



148
149
150
# File 'lib/bio/db/fasta.rb', line 148

# Reader for @data, the sequence text stored by the constructor.
def data
  @data
end

#definitionObject

The comment line of the FASTA formatted data.



145
146
147
# File 'lib/bio/db/fasta.rb', line 145

# Reader for @definition, the comment line stored by the constructor
# (without the leading '>').
def definition
  @definition
end

#entry_overrunObject (readonly)

Returns the value of attribute entry_overrun.



150
151
152
# File 'lib/bio/db/fasta.rb', line 150

# Reader for @entry_overrun: the text the constructor removed after the
# first entry, or nil when nothing was removed.
def entry_overrun
  @entry_overrun
end

Instance Method Details

#aalenObject

Returns the length of the sequence as a Bio::Sequence::AA.



245
246
247
# File 'lib/bio/db/fasta.rb', line 245

# Length of the object returned by #aaseq.
def aalen
  aaseq.length
end

#aaseqObject

Returns the sequence as a Bio::Sequence::AA.



240
241
242
# File 'lib/bio/db/fasta.rb', line 240

# Wraps the value returned by #seq in a new Sequence::AA object.
# A new object is built on every call.
def aaseq
  Sequence::AA.new(seq)
end

#acc_versionObject

Returns accession number with version.



303
304
305
# File 'lib/bio/db/fasta.rb', line 303

# Delegates to +acc_version+ on the object returned by #identifiers.
def acc_version
  identifiers.acc_version
end

#accessionObject

Returns an accession number.



291
292
293
# File 'lib/bio/db/fasta.rb', line 291

# Delegates to +accession+ on the object returned by #identifiers.
def accession
  identifiers.accession
end

#accessionsObject

Parses the FASTA defline (using the #identifiers method) and returns the accession numbers as an array of strings.



298
299
300
# File 'lib/bio/db/fasta.rb', line 298

# Delegates to +accessions+ on the object returned by #identifiers.
def accessions
  identifiers.accessions
end

#commentObject

Returns comments.



219
220
221
222
# File 'lib/bio/db/fasta.rb', line 219

# Returns @comment.
def comment
  # Called only for its side effect: #seq is what assigns @comment, and it
  # does so only when the sequence text starts with a '#' comment line.
  seq
  @comment
end

#entryObject Also known as: to_s

Returns the stored entry in FASTA format (same as to_s).



163
164
165
# File 'lib/bio/db/fasta.rb', line 163

# Rebuilds the FASTA text of this entry: a '>' header line holding
# @definition, followed by @data with surrounding whitespace stripped and a
# final newline. The result is stored in @entry and returned.
def entry
  header = ">#{@definition}\n"
  body = "#{@data.strip}\n"
  @entry = header + body
end

#entry_idObject

Parses the FASTA defline (using the #identifiers method) and returns a possibly unique identifier as a string.



277
278
279
# File 'lib/bio/db/fasta.rb', line 277

# Delegates to +entry_id+ on the object returned by #identifiers.
def entry_id
  identifiers.entry_id
end

#giObject

Parses the FASTA defline (using the #identifiers method) and returns the GI/locus/accession/accession with version number. If an entry has two or more such IDs, only the first ID is returned. It returns a string or nil.



286
287
288
# File 'lib/bio/db/fasta.rb', line 286

# Delegates to +gi+ on the object returned by #identifiers.
def gi
  identifiers.gi
end

#identifiersObject

Parses the FASTA defline and extracts the IDs. IDs are NSIDs (NCBI standard FASTA sequence identifiers) or “:”-separated IDs. It returns a Bio::FastaDefline instance.



267
268
269
270
271
272
# File 'lib/bio/db/fasta.rb', line 267

# Returns the FastaDefline built from @definition.
# The object is created on the first call and kept in @ids; later calls
# return that same object.
def identifiers
  @ids = FastaDefline.new(@definition) unless defined?(@ids)
  @ids
end

#lengthObject

Returns sequence length.



225
226
227
# File 'lib/bio/db/fasta.rb', line 225

# Length of the value returned by #seq.
def length
  seq.length
end

#locusObject

Returns locus.



308
309
310
# File 'lib/bio/db/fasta.rb', line 308

# Delegates to +locus+ on the object returned by #identifiers.
def locus
  identifiers.locus
end

#nalenObject

Returns the length of the sequence as a Bio::Sequence::NA.



235
236
237
# File 'lib/bio/db/fasta.rb', line 235

# Length of the object returned by #naseq.
def nalen
  naseq.length
end

#naseqObject

Returns the sequence as a Bio::Sequence::NA.



230
231
232
# File 'lib/bio/db/fasta.rb', line 230

# Wraps the value returned by #seq in a new Sequence::NA object.
# A new object is built on every call.
def naseq
  Sequence::NA.new(seq)
end

#query(factory) ⇒ Object Also known as: fasta, blast

Executes FASTA/BLAST search by using a Bio::Fasta or a Bio::Blast factory object.

#!/usr/bin/env ruby
require 'bio'

# Search factory that is handed to entry.fasta below.
factory = Bio::Fasta.local('fasta34', 'db/swissprot.f')
# Presumably reads 'queries.f' as a stream of Bio::FastaFormat entries.
flatfile = Bio::FlatFile.open(Bio::FastaFormat, 'queries.f')
flatfile.each do |entry|
  p entry.definition
  # #fasta is an alias of FastaFormat#query.
  result = entry.fasta(factory)
  # Print one line per hit found in the result.
  result.each do |hit|
    print "#{hit.query_id} : #{hit.evalue}\t#{hit.target_id} at "
    p hit.lap_at
  end
end


186
187
188
# File 'lib/bio/db/fasta.rb', line 186

# Runs a FASTA/BLAST search for this entry through +factory+.
#
# factory:: an object responding to +query+ (a Bio::Fasta or Bio::Blast
#           factory); it receives the FASTA text of this entry.
#
# Returns whatever <tt>factory.query</tt> returns.
#
# The text is obtained by calling #entry rather than by reading @entry:
# @entry is only assigned inside #entry, so it is still nil when neither
# #entry nor #to_s has been called before the search.
def query(factory)
  factory.query(entry)
end

#seqObject

Returns a joined sequence line as a String.



193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
# File 'lib/bio/db/fasta.rb', line 193

# Returns the sequence lines of @data joined into a single
# Sequence::Generic. The result is cached in @seq.
#
# Spaces, tabs, CR, LF and digits are removed from the sequence text.
#
# When @data begins (after optional whitespace) with a line starting with
# '#', every '#' line is treated as a comment: it is left out of the
# sequence and collected into @comment, a Hash that maps the number of
# sequence characters seen so far to the comment text (several comment
# lines at the same position are joined with "\n"). Otherwise @comment is
# not assigned.
def seq
  return @seq if defined?(@seq)

  if /\A\s*^\#/ =~ @data
    position = 0
    notes = {}
    residues = []
    @data.split(/(^\#.*$)/).each do |chunk|
      if /^# ?(.*)$/ =~ chunk
        text = Regexp.last_match(1)
        if notes[position]
          notes[position] << "\n" << text
        else
          notes[position] = text
        end
      else
        cleaned = chunk.tr(" \t\r\n0-9", '') # lazy clean up
        position += cleaned.length
        residues << cleaned
      end
    end
    @comment = notes
    @seq = Bio::Sequence::Generic.new(residues.join(''))
  else
    @seq = Sequence::Generic.new(@data.tr(" \t\r\n0-9", '')) # lazy clean up
  end
  @seq
end

#to_seqObject

Returns sequence as a Bio::Sequence object.

Note: If you modify the returned Bio::Sequence object, the sequence or definition in this FastaFormat object might also change (though not always), because the data may be shared for efficiency.



256
257
258
259
260
261
# File 'lib/bio/db/fasta.rb', line 256

# Builds a Bio::Sequence from this entry.
#
# #seq is called first so that @seq is populated; the new Bio::Sequence is
# created from @seq and receives this entry's definition.
def to_seq
  seq
  sequence = Bio::Sequence.new(@seq)
  sequence.definition = definition
  sequence
end