Class: Bio::FastaDefline

Inherits:
Object show all
Defined in:
lib/bio/db/fasta.rb

Overview

Parses a FASTA defline and extracts IDs and other information. IDs are NSIDs (NCBI standard FASTA sequence identifiers) or “:”-separated IDs.

The specifications are described in: ftp.ncbi.nih.gov/blast/documents/README.formatdb blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers

Examples

rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
rub.entry_id       ==> 'gi|671595'
rub.get('emb')     ==> 'CAA85678.1'
rub.emb            ==> 'CAA85678.1'
rub.gi             ==> '671595'
rub.accession      ==> 'CAA85678'
rub.accessions     ==> [ 'CAA85678' ]
rub.acc_version    ==> 'CAA85678.1'
rub.locus          ==> nil
rub.list_ids       ==> [["gi", "671595"],
                        ["emb", "CAA85678.1", nil],
                        ["Perovskia abrotanoides"]]

ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
ckr.entry_id      ==> "gi|2495000"
ckr.sp            ==> "CCKR_CAVPO"
ckr.pir           ==> "I51898"
ckr.gb            ==> "AAB29504.1"
ckr.gi            ==> "2495000"
ckr.accession     ==> "AAB29504"
ckr.accessions    ==> ["Q63931", "AAB29504"]
ckr.acc_version   ==> "AAB29504.1"
ckr.locus         ==> nil
ckr.description   ==>
  "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
ckr.descriptions  ==>
  ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
   "cholecystokinin A receptor - guinea pig",
   "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
ckr.words         ==> 
  ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
   "receptor", "type"]
ckr.id_strings    ==>
  ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
   "544724", "AAB29504.1", "Cavia"]
ckr.list_ids      ==>
  [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
   ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
   ["gb", "AAB29504.1", nil], ["Cavia"]]

References

Constant Summary collapse

# Maps each NSID database tag to the ordered names of the fields that follow
# the tag in a defline. get, get_by_type and get_all_by_type use the position
# of a field name in the list (plus one) to index a parsed ID entry.
NSIDs = {
  # NCBI and WU-BLAST
  'gi'  => %w[gi],                         # NCBI GI
  'gb'  => %w[acc_version locus],          # GenBank
  'emb' => %w[acc_version locus],          # EMBL
  'dbj' => %w[acc_version locus],          # DDBJ
  'sp'  => %w[accession entry_id],         # SWISS-PROT
  'pdb' => %w[entry_id chain],             # PDB
  'bbs' => %w[number],                     # GenInfo Backbone Id
  'gnl' => %w[database entry_id],          # General database identifier
  'ref' => %w[acc_version locus],          # NCBI Reference Sequence
  'lcl' => %w[entry_id],                   # Local Sequence identifier

  # WU-BLAST and NCBI
  'pir' => %w[accession entry_id],         # PIR
  'prf' => %w[accession entry_id],         # Protein Research Foundation
  'pat' => %w[country number serial],      # Patents

  # WU-BLAST only
  'bbm' => %w[number],                     # NCBI GenInfo Backbone database identifier
  'gim' => %w[number],                     # NCBI GenInfo Import identifier
  'gp'  => %w[acc_version locus],          # GenPept
  'oth' => %w[accession name release],     # Other (user-definable) identifier
  'tpd' => %w[accession name],             # Third party annotation, DDBJ
  'tpe' => %w[accession name],             # Third party annotation, EMBL
  'tpg' => %w[accession name],             # Third party annotation, GenBank

  # Original
  'ri'  => %w[entry_id rearray_id len],    # RIKEN FANTOM DB
}
# List of common, uninformative words (articles, verbs, prepositions and
# generic sequence-annotation terms), all in lower case.
# NOTE(review): presumably used to populate KillWordsHash, whose entries
# #words filters out; the code that does so is not visible here -- confirm.
KillWords = %w[
  an the this that
  is are were was be can may might
  as at by for in of on to with
  from and or not
  dna rna mrna cdna orf
  aa nt pct id ec sp subsp
  similar involved identical identity
  cds clone library contig contigs
  homolog homologue homologs homologous
  protein proteins gene genes
  product products sequence sequences
  strain strains region regions
]
# Default value of the kwhash argument of #words, which drops every word
# whose lower-cased form has a truthy entry in this hash.
# Shown here as an empty hash.
# NOTE(review): presumably filled from KillWords elsewhere in the file --
# confirm against lib/bio/db/fasta.rb.
KillWordsHash =
{}
# Default value of the kill_regexp argument of #words: a word matching any
# of these patterns is dropped from the word list.
KillRegexpArray =
[
  # one to three digits, optionally followed by '%' (e.g. "95%")
  /\A\d{1,3}\%?\z/,
  # starts with an uppercase letter, consists of letters, digits and
  # underscores, and contains a digit followed by at least one more character
  /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
  # uppercase letters/digits containing an underscore (e.g. "CCKR_CAVPO")
  /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
]

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(str) ⇒ FastaDefline

Parses given string.



469
470
471
472
473
474
475
476
477
478
479
480
# File 'lib/bio/db/fasta.rb', line 469

# Builds the object from a raw defline string.
#
# str:: one or more deflines joined by the "\x01" character; each piece is
#       handed to add_defline in order of appearance.
def initialize(str)
  # Start from an empty state; add_defline fills these in.
  @entry_id = nil
  @list_ids = []
  @info     = {}
  @deflines = []

  str.split("\x01").each { |defline| add_defline(defline) }
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(name, *args) ⇒ Object



811
812
813
814
815
816
817
818
819
# File 'lib/bio/db/fasta.rb', line 811

# Treats an unknown method name as a database name and looks it up with get
# (e.g. +defline.emb+ behaves like <tt>defline.get('emb')</tt>).
#
# Returns whatever get returns. When get finds nothing, the falsy result is
# still returned if the name is a key of NSIDs; otherwise a RuntimeError
# whose message starts with "NameError:" is raised.
def method_missing(name, *args)
  found = get(name, *args)
  return found if found
  # Known database tag without a value in this defline: not an error.
  return found if self.class::NSIDs[name.to_s]

  raise "NameError: undefined method `#{name.inspect}'"
end

Instance Attribute Details

#entry_idObject (readonly)

Shows a possibly unique identifier. Returns a string.



466
467
468
# File 'lib/bio/db/fasta.rb', line 466

# Reader for @entry_id. The value is nil after initialization and is set by
# add_defline from the first defline it parses (later deflines do not
# overwrite it).
def entry_id
  @entry_id
end

#list_idsObject (readonly)

Shows array that contains IDs (or ID-like strings). Returns an array of arrays of strings.



462
463
464
# File 'lib/bio/db/fasta.rb', line 462

# Reader for @list_ids, the array of parsed ID entries (each entry is itself
# an array such as ["gi", "671595"]). Returns the internal array, not a copy.
def list_ids
  @list_ids
end

Instance Method Details

#acc_versionObject

Shows accession with version number. If the entry has more than one such ID, only the first ID is shown. Returns a string or nil.



782
783
784
785
786
787
# File 'lib/bio/db/fasta.rb', line 782

# Returns the 'acc_version' field found by get_by_type, or nil.
# The result (including nil) is computed once and then reused.
def acc_version
  return @acc_version if defined?(@acc_version)

  @acc_version = get_by_type('acc_version')
end

#accessionObject

Shows an accession number.



800
801
802
803
804
805
806
807
808
809
# File 'lib/bio/db/fasta.rb', line 800

# Returns a single accession number without version suffix.
#
# When acc_version is available, the part before its first '.' is used;
# otherwise the first element of accessions. The result is computed once
# and then reused.
def accession
  return @accession if defined?(@accession)

  versioned = acc_version
  @accession = versioned ? versioned.split('.')[0] : accessions[0]
end

#accessionsObject

Shows accession numbers. Returns an array of strings.



791
792
793
794
795
796
797
# File 'lib/bio/db/fasta.rb', line 791

# Returns all 'accession' and 'acc_version' fields (as collected by
# get_all_by_type), each with everything from the first '.' onwards removed.
# The array is built once and then reused.
def accessions
  return @accessions if defined?(@accessions)

  @accessions = get_all_by_type('accession', 'acc_version')
  # Strip version suffixes in place; collect! returns the same array.
  @accessions.collect! { |id| id.sub(/\..*\z/, '') }
end

#add_defline(str) ⇒ Object

Parses given string and adds parsed data.



483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
# File 'lib/bio/db/fasta.rb', line 483

# Parses a single defline and records the result.
#
# str is matched against three patterns in turn: '|'-separated NSIDs,
# a "db:id" colon-separated ID, and a plain leading token; anything else
# falls into the final else branch.
#
# Effects visible in this method: one entry is appended to @deflines
# ([separator, id array(s)..., description]), entries may be appended to
# @list_ids, @info['organism'] may be set, and @entry_id is set if it is
# still unset. The helpers parse_NSIDs, parse_ColonSepID, match_EC and
# parse_square_brackets are defined elsewhere and may have further effects.
def add_defline(str)
  case str
  when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
    # NSIDs
    # examples:
    # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
    #
    # note: regexp (?:) means grouping without backreferences
    i = $1
    d = $2
    tks = i.split('|')
    # String#split drops a trailing empty field; put it back so that a
    # trailing '|' is represented by an empty token.
    tks << '' if i[-1,1] == '|'
    # NOTE(review): whatever is left in tks afterwards is treated as
    # description text below, so parse_NSIDs presumably consumes the tokens
    # it recognizes -- confirm against its definition.
    a = parse_NSIDs(tks)
    # The first parsed ID becomes the candidate entry ID.
    i = a[0].join('|')
    # The first element of a @deflines entry is the separator (see to_s).
    a.unshift('|')
    d = tks.join('|') + ' ' + d unless tks.empty?
    a << d
    this_line = a
    match_EC(d)
    parse_square_brackets(d).each do |x|
      # A bracketed text that is not an EC match and starts with an
      # uppercase letter is recorded as an ID-like string; the first such
      # text is also stored as the organism.
      if !match_EC(x, false) and x =~ /\A[A-Z]/ then
        di = [  x ]
        @list_ids << di
        @info['organism'] = x unless @info['organism']
      end
    end

  when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
    # examples:
    # >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
    # >emb:CACDC28 [X80034] C.albicans CDC28 gene
    i = $1
    d = $2
    a = parse_ColonSepID(i)
    i = a.join(':')
    this_line = [ ':', a , d ]
    match_EC(d)
    parse_square_brackets(d).each do |x|
      # Bracketed "db:id" texts are parsed as further colon-separated IDs;
      # a bare uppercase token (e.g. [X80034]) is recorded as an ID-like
      # string.
      if !match_EC(x, false) and x =~ /:/ then
        parse_ColonSepID(x)
      elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
        @list_ids << [ $1 ]
      end
    end

  when /^\>?\s*(\S+)(?:\s+(.+))?$/
    # examples:
    # >ABC12345 this is test
    i = $1
    # $2 is nil when there is no description; to_s turns that into ''.
    d = $2.to_s
    # A single trailing '.' is removed from the recorded ID-like string
    # (but not from i, which is used for @deflines and @entry_id).
    @list_ids << [ i.chomp('.') ]
    this_line = [  '', [ i ], d ]
    match_EC(d)
  else
    # No pattern matched: keep the whole string as the ID, with an empty
    # description.
    i = str
    d = ''
    match_EC(i)
    this_line = [ '', [ i ], d ]
  end

  @deflines << this_line
  # Only the first parsed defline determines the entry ID.
  @entry_id = i unless @entry_id
end

#descriptionObject

Shows description.



625
626
627
# File 'lib/bio/db/fasta.rb', line 625

# Returns the description part (last element) of the first parsed defline,
# or nil when no defline has been parsed.
def description
  first_defline = @deflines.first
  # nil.to_a is [], so an empty @deflines yields nil instead of raising.
  first_defline.to_a.last
end

#descriptionsObject

Returns descriptions.



630
631
632
633
634
# File 'lib/bio/db/fasta.rb', line 630

# Returns an array with the description part (last element) of every parsed
# defline, in parsing order.
def descriptions
  @deflines.map { |defline| defline[-1] }
end

#get(dbname) ⇒ Object

Returns an identifier by a database name.



706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
# File 'lib/bio/db/fasta.rb', line 706

# Returns an identifier for the given database name, or nil if none is found.
#
# dbname:: database tag such as 'gi', 'emb' or :sp (converted with to_s).
#
# Lookup order:
# 1. a value already stored in @info under that name;
# 2. the first entry of @list_ids whose first element equals the name.
#    An entry with at most two elements yields its last element. A longer
#    entry is resolved through the field labels in NSIDs, preferring
#    acc_version, entry_id, locus, accession and number (in that order),
#    and falling back to the first non-nil field.
#
# A value found in @list_ids is cached in @info.
def get(dbname)
  db = dbname.to_s
  r = @info[db]
  return r if r

  di = @list_ids.find { |x| x[0] == db }
  return r unless di

  if di.size <= 2 then
    r = di[-1]
  else
    # A tag without an NSIDs entry has no labels; use an empty list so the
    # lookup falls through to the first non-nil field instead of raising
    # NoMethodError on nil.
    labels = self.class::NSIDs[db] || []
    [ 'acc_version', 'entry_id',
      'locus', 'accession', 'number' ].each do |x|
      i = labels.index(x)
      next unless i
      # Field i of the label list is stored at position i + 1 of the entry
      # (position 0 holds the database tag).
      r = di[i + 1]
      break if r
    end
    r = di[1..-1].find { |x| x } unless r
  end
  @info[db] = r if r
  r
end

#get_all_by_type(*type_strarg) ⇒ Object

Returns identifiers by given type.



742
743
744
745
746
747
748
749
750
751
752
753
754
# File 'lib/bio/db/fasta.rb', line 742

# Collects, from every entry of @list_ids whose database tag is a key of
# NSIDs, the non-nil fields labelled with any of the given type names.
#
# type_strarg:: one or more field names as used in NSIDs (e.g. 'accession').
#
# Returns a new array, ordered by entry and then by the order of the
# given type names.
def get_all_by_type(*type_strarg)
  found = []
  @list_ids.each do |ids|
    labels = self.class::NSIDs[ids[0]]
    next unless labels

    type_strarg.each do |type|
      pos = labels.index(type)
      next unless pos

      # Position 0 of an entry is the tag, so field n sits at n + 1.
      value = ids[pos + 1]
      found << value if value
    end
  end
  found
end

#get_by_type(type_str) ⇒ Object

Returns an identifier by given type.



730
731
732
733
734
735
736
737
738
739
# File 'lib/bio/db/fasta.rb', line 730

# Returns the field labelled type_str from the first entry of @list_ids whose
# database tag (a key of NSIDs) has such a label.
#
# Note that the search stops at that first entry even if its field is nil.
# Returns nil when no entry has the label.
def get_by_type(type_str)
  @list_ids.each do |ids|
    labels = self.class::NSIDs[ids[0]]
    next unless labels

    pos = labels.index(type_str)
    # Position 0 of an entry is the tag, so field n sits at n + 1.
    return ids[pos + 1] if pos
  end
  nil
end

#giObject

Shows GI. If the entry has more than one such ID, only the first ID is shown. Returns a string or nil.



771
772
773
774
775
776
# File 'lib/bio/db/fasta.rb', line 771

# Returns the 'gi' field found by get_by_type, or nil.
# The result (including nil) is computed once and then reused.
def gi
  return @gi if defined?(@gi)

  @gi = get_by_type('gi')
end

#id_stringsObject

Shows ID-like strings. Returns an array of strings.



638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
# File 'lib/bio/db/fasta.rb', line 638

# Returns an array of ID-like strings gathered from two sources:
#
# * @list_ids: for entries with two or more elements, every non-nil element
#   after the first; for shorter entries, the first element if it is a
#   non-empty run of letters, digits, '.', '-' and '_';
# * the case-sensitive word list (words(true, [])): the words that look
#   like identifiers according to the two patterns below.
def id_strings
  ids = []
  @list_ids.each do |entry|
    if entry.size >= 2
      ids.concat(entry[1..-1].select { |field| field })
    elsif entry[0].to_s.size > 0 && entry[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
      ids << entry[0]
    end
  end

  id_like_words = words(true, []).select do |word|
    word =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ ||
      word =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  end
  ids.concat(id_like_words)
end

#locusObject

Shows locus. If the entry has more than one such ID, only the first ID is shown. Returns a string or nil.



760
761
762
763
764
765
# File 'lib/bio/db/fasta.rb', line 760

# Returns the 'locus' field found by get_by_type, or nil.
# The result (including nil) is computed once and then reused.
def locus
  return @locus if defined?(@locus)

  @locus = get_by_type('locus')
end

#to_sObject

Shows original string. Note that the result of this method may be different from original string which is given in FastaDefline.new method.



617
618
619
620
621
622
# File 'lib/bio/db/fasta.rb', line 617

# Rebuilds a defline string from the parsed data.
#
# For each entry of @deflines, the ID arrays (all elements between the first
# and the last) are joined with the entry's separator (its first element),
# followed by a space and the description (its last element), with
# surrounding whitespace stripped. The entries are joined with "\x01".
def to_s
  rebuilt = @deflines.map do |defline|
    sep = defline[0]
    id_part = defline[1..-2].map { |ids| ids.join(sep) }.join(sep)
    (id_part + ' ' + defline[-1]).strip
  end
  rebuilt.join("\x01")
end

#words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray, kwhash = self.class::KillWordsHash) ⇒ Object

Shows words used in the defline. Returns an Array.



680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
# File 'lib/bio/db/fasta.rb', line 680

# Returns the sorted, de-duplicated list of words found in all descriptions.
#
# case_sensitive:: when falsy (default), the words are lower-cased before
#                  sorting and de-duplication.
# kill_regexp::    patterns; a word matching any of them is dropped.
# kwhash::         hash; a word whose lower-cased form has a truthy entry
#                  is dropped.
#
# Words are obtained by splitting the joined descriptions on punctuation,
# whitespace and control characters. Leading runs of '$', '*', '-', '+' and
# trailing runs of '$', '*', '-', '=' are removed, and words of one character
# or less are dropped.
def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
          kwhash = self.class::KillWordsHash)
  tokens = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
  kept = []
  tokens.each do |token|
    token.sub!(/\A[\$\*\-\+]+/, '')
    token.sub!(/[\$\*\-\=]+\z/, '')
    next if token.size <= 1
    next if kwhash[token.downcase]
    next if kill_regexp.find { |expr| expr =~ token }

    kept << token
  end
  kept.collect! { |word| word.downcase } unless case_sensitive
  kept.sort!
  kept.uniq!
  kept
end