Class: Bio::FlatFile::AutoDetect

Inherits:
Object
  • Object
show all
Includes:
TSort
Defined in:
lib/bio/io/flatfile/autodetection.rb

Overview

AutoDetect automatically determines the database class of the given data.

Defined Under Namespace

Classes: RuleDebug, RuleProc, RuleRegexp, RuleRegexp2, RuleSpecial, RuleTemplate, RulesArray

Constant Summary collapse

TopRule =

Special element that is always top priority.

RuleSpecial.new('top')
BottomRule =

Special element that is always bottom priority.

RuleSpecial.new('bottom')

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ AutoDetect

Creates a new AutoDetect object.



226
227
228
229
230
231
232
233
# File 'lib/bio/io/flatfile/autodetection.rb', line 226

# Creates a new AutoDetect object that starts out holding only the two
# special rules, TopRule and BottomRule.
def initialize
  # cache of elements; nil until it is built
  @elements = nil
  # autodetection rules, filled through add
  @rules = {}
  [TopRule, BottomRule].each { |special| add(special) }
end

Class Method Details

.[](*arg) ⇒ Object

make a new autodetect object



361
362
363
364
365
# File 'lib/bio/io/flatfile/autodetection.rb', line 361

# Makes a new autodetect object and adds every given argument to it,
# in the order given.
#
# arg:: elements handed one by one to add
#
# Returns the new object.
def self.[](*arg)
  self.new.tap do |detector|
    arg.each { |rule| detector.add(rule) }
  end
end

.default ⇒ Object

returns the default autodetect object



348
349
350
351
352
353
# File 'lib/bio/io/flatfile/autodetection.rb', line 348

# Returns the default autodetect object.
# It is built with make_default the first time it is needed and the
# same object is handed back afterwards.
def self.default
  @default ||= self.make_default
end

.default=(ad) ⇒ Object

sets the default autodetect object.



356
357
358
# File 'lib/bio/io/flatfile/autodetection.rb', line 356

# Sets the default autodetect object.
#
# ad:: object stored in @default, replacing whatever was stored before
def self.default=(ad)
  @default = ad
end

.make_default ⇒ Object

Builds the autodetect object that is used as the initial default.



368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
# File 'lib/bio/io/flatfile/autodetection.rb', line 368

# Builds a fresh autodetect object pre-loaded with the built-in rules.
#
# Every rule is created inline as an argument of self[...] and, at the same
# time, assigned to a local variable so that the "dependencies" section
# below can refer to it by name.
#
# Three kinds of rule are used here:
# * RuleRegexp[name, regexp]              -- one pattern
# * RuleRegexp2[name, regexp, regexp]     -- two patterns
# * RuleProc.new(name, ...) { |text| }    -- a block that inspects the text
#   and evaluates to a class, false or nil
# NOTE(review): how each rule class consumes these arguments is defined
# outside this method -- confirm there before changing a rule.
#
# Returns the new autodetect object, after rehash has been called on it.
def self.make_default
  a = self[
    genbank  = RuleRegexp[ 'Bio::GenBank',
      /^LOCUS       .+ bp .*[a-z]*[DR]?NA/ ],
    genpept  = RuleRegexp[ 'Bio::GenPept',
      /^LOCUS       .+ aa .+/ ],
    medline  = RuleRegexp[ 'Bio::MEDLINE',
      /^PMID\- [0-9]+$/ ],
    embl     = RuleRegexp[ 'Bio::EMBL',
      /^ID   .+\; .*(DNA|RNA|XXX)\;/ ],
    sptr     = RuleRegexp2[ 'Bio::SPTR',
      /^ID   .+\; *PRT\;/,
      /^ID   [-A-Za-z0-9_\.]+ .+\; *[0-9]+ *AA\./ ],
    prosite  = RuleRegexp[ 'Bio::PROSITE',
      /^ID   [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/ ],
    transfac = RuleRegexp[ 'Bio::TRANSFAC',
      /^AC  [-A-Za-z0-9_\.]+$/ ],

    # The block evaluates to nil when no "H ..." line is present, and to
    # false when that line is present but neither the "M " nor the "I "
    # marker line is found.
    aaindex  = RuleProc.new('Bio::AAindex1', 'Bio::AAindex2') do |text|
      if /^H [-A-Z0-9_\.]+$/ =~ text then
        if text =~ /^M [rc]/ then
          Bio::AAindex2
        elsif text =~ /^I    A\/L/ then
          Bio::AAindex1
        else
          false #fail to determine
        end
      else
        nil
      end
    end,

    litdb    = RuleRegexp[ 'Bio::LITDB',
      /^CODE        [0-9]+$/ ],
    pathway_module = RuleRegexp[ 'Bio::KEGG::MODULE',
      /^ENTRY       .+ Pathway\s+Module\s*/ ],
    pathway  = RuleRegexp[ 'Bio::KEGG::PATHWAY',
      /^ENTRY       .+ Pathway\s*/ ],
    brite    = RuleRegexp[ 'Bio::KEGG::BRITE',
      /^Entry           [A-Z0-9]+/ ],
    orthology = RuleRegexp[ 'Bio::KEGG::ORTHOLOGY',
      /^ENTRY       .+ KO\s*/ ],
    drug     = RuleRegexp[ 'Bio::KEGG::DRUG',
      /^ENTRY       .+ Drug\s*/ ],
    glycan   = RuleRegexp[ 'Bio::KEGG::GLYCAN',
      /^ENTRY       .+ Glycan\s*/ ],
    enzyme   = RuleRegexp2[ 'Bio::KEGG::ENZYME',
      /^ENTRY       EC [0-9\.]+$/,
      /^ENTRY       .+ Enzyme\s*/
    ],
    compound = RuleRegexp2[ 'Bio::KEGG::COMPOUND',
      /^ENTRY       C[A-Za-z0-9\._]+$/,
      /^ENTRY       .+ Compound\s*/
    ],
    reaction = RuleRegexp2[ 'Bio::KEGG::REACTION',
      /^ENTRY       R[A-Za-z0-9\._]+$/,
      /^ENTRY       .+ Reaction\s*/
    ],
    genes    = RuleRegexp[ 'Bio::KEGG::GENES',
      /^ENTRY       .+ (CDS|gene|.*RNA|Contig) / ],
    genome   = RuleRegexp[ 'Bio::KEGG::GENOME',
      /^ENTRY       [a-z]+$/ ],

    # $1 below is the group captured by the DOCTYPE match
    # ("sequences" or "clusters").
    fantom = RuleProc.new('Bio::FANTOM::MaXML::Cluster',
                          'Bio::FANTOM::MaXML::Sequence') do |text|
      if /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/ =~ text
        case $1
        when 'clusters'
          Bio::FANTOM::MaXML::Cluster
        when 'sequences'
          Bio::FANTOM::MaXML::Sequence
        else
          nil #unknown
        end
      else
        nil
      end
    end,

    pdb = RuleRegexp[ 'Bio::PDB',
      /^HEADER    .{40}\d\d\-[A-Z]{3}\-\d\d   [0-9A-Z]{4}/ ],
    het = RuleRegexp[ 'Bio::PDB::ChemicalComponent',
      /^RESIDUE +.+ +\d+\s*$/ ],

    clustal = RuleRegexp2[ 'Bio::ClustalW::Report',
    /^CLUSTAL .*\(.*\).*sequence +alignment/,
    /^CLUSTAL FORMAT for T-COFFEE/ ],

    gcg_msf = RuleRegexp[ 'Bio::GCG::Msf',
    /^!!(N|A)A_MULTIPLE_ALIGNMENT .+/ ],

    gcg_seq = RuleRegexp[ 'Bio::GCG::Seq',
    /^!!(N|A)A_SEQUENCE .+/ ],

    blastxml = RuleRegexp[ 'Bio::Blast::Report',
      /\<\!DOCTYPE BlastOutput PUBLIC / ],
    wublast  = RuleRegexp[ 'Bio::Blast::WU::Report',
      /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
    wutblast = RuleRegexp[ 'Bio::Blast::WU::Report_TBlast',
      /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
    blast    = RuleRegexp[ 'Bio::Blast::Default::Report',
      /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
    tblast   = RuleRegexp[ 'Bio::Blast::Default::Report_TBlast',
      /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
    rpsblast   = RuleRegexp[ 'Bio::Blast::RPSBlast::Report',
      /^RPS\-BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],

    blat   = RuleRegexp[ 'Bio::Blat::Report',
      /^psLayout version \d+/ ],
    spidey = RuleRegexp[ 'Bio::Spidey::Report',
      /^\-\-SPIDEY version .+\-\-$/ ],
    hmmer  = RuleRegexp[ 'Bio::HMMER::Report',
      /^HMMER +\d+\./ ],
    sim4   = RuleRegexp[ 'Bio::Sim4::Report',
      /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/ ],

    fastq  = RuleRegexp[ 'Bio::Fastq',
      /^\@.+(?:\r|\r?\n)(?:[^\@\+].*(?:\r|\r?\n))+\+.*(?:\r|\r?\n).+(?:\r|\r?\n)/ ],

    # The block evaluates to nil when the text has no ">" line, and to false
    # when it has one but none of the three patterns below match.
    fastaformat = RuleProc.new('Bio::FastaFormat',
                               'Bio::NBRF',
                               'Bio::FastaNumericFormat') do |text|
      if /^>.+$/ =~ text
        case text
        when /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/
          Bio::NBRF
        when /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/
            Bio::FastaFormat
        when /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/
          Bio::FastaNumericFormat
        else
          false
        end
      else
        nil
      end
    end
  ]

  # dependencies
  # Each call below records an ordering between two rules.
  # NOTE(review): is_prior_to is defined outside this method; presumably the
  # receiver is tried before the argument -- confirm.
  # NCBI
  genbank.is_prior_to genpept
  # EMBL/UniProt
  embl.is_prior_to sptr
  sptr.is_prior_to prosite
  prosite.is_prior_to transfac
  # KEGG
  #aaindex.is_prior_to litdb
  #litdb.is_prior_to brite
  pathway_module.is_prior_to pathway
  pathway.is_prior_to brite
  brite.is_prior_to orthology
  orthology.is_prior_to drug
  drug.is_prior_to glycan
  glycan.is_prior_to enzyme
  enzyme.is_prior_to compound
  compound.is_prior_to reaction
  reaction.is_prior_to genes
  genes.is_prior_to genome
  # PDB
  pdb.is_prior_to het
  # BLAST
  wublast.is_prior_to wutblast
  wutblast.is_prior_to blast
  blast.is_prior_to tblast
  # Fastq
  BottomRule.is_prior_to(fastq)
  fastq.is_prior_to(fastaformat)
  # FastaFormat
  BottomRule.is_prior_to(fastaformat)

  # for debug
  #debug_first = RuleDebug.new('debug_first')
  #a.add(debug_first)
  #debug_first.is_prior_to(TopRule)

  ## for debug
  #debug_last = RuleDebug.new('debug_last')
  #a.add(debug_last)
  #BottomRule.is_prior_to(debug_last)
  #fastaformat.is_prior_to(debug_last)

  # The relations above were declared after the rules were added, so the
  # object is rehashed before it is handed out.
  a.rehash
  return a
end

Instance Method Details

#add(elem) ⇒ Object

Adds a new element. Returns elem.



237
238
239
240
241
242
# File 'lib/bio/io/flatfile/autodetection.rb', line 237

# Adds a new element under its name and drops the cached element list.
#
# elem:: element to add; it must respond to name
#
# Returns elem.
# Raises RuntimeError ('element name conflicts') when an element is already
# stored under the same name.
def add(elem)
  key = elem.name
  raise 'element name conflicts' if @rules[key]
  # the cached order is stale once the rule set changes
  @elements = nil
  @rules.store(key, elem)
  elem
end

#autodetect(text, meta = {}) ⇒ Object

Autodetect from the text. Returns a database class if succeeded. Returns nil if failed.



305
306
307
308
309
310
311
312
313
# File 'lib/bio/io/flatfile/autodetection.rb', line 305

# Autodetects the database class from the text.
#
# Every element of +elements+ is asked in order through guess(text, meta).
# The first truthy answer wins and the remaining elements are not consulted.
#
# text:: data handed to each element's guess
# meta:: extra information handed to each element's guess
#        (default: an empty hash)
#
# Returns the first truthy guess. Returns nil if failed; a +false+ guess
# from the last element is not passed through, so the failure value is
# always nil, the same as autodetect_flatfile.
def autodetect(text, meta = {})
  elements.each do |rule|
    #$stderr.puts rule.name
    guessed = rule.guess(text, meta)
    return guessed if guessed
  end
  nil
end

#autodetect_flatfile(ff, lines = 31) ⇒ Object

autodetect from the FlatFile object. Returns a database class if succeeded. Returns nil if failed.



318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
# File 'lib/bio/io/flatfile/autodetection.rb', line 318

# Autodetects from the FlatFile object.
#
# ff::    object whose @stream instance variable holds the stream to read
# lines:: maximum number of prefetch_gets calls made on the stream
#         (default: 31)
#
# Returns a database class if succeeded. Returns nil if failed.
def autodetect_flatfile(ff, lines = 31)
  stream = ff.instance_variable_get(:@stream)
  meta = {}
  # a stream that has no path method simply contributes no :path entry
  path = begin
    stream.path
  rescue NameError
    nil
  end
  if path
    meta[:path] = path
    # call autodetect once with meta and without any read action
    found = autodetect(stream.prefetch_buffer, meta)
    return found if found
  end
  # reading stream
  1.upto(lines) do
    line = stream.prefetch_gets
    break unless line
    # a blank line adds nothing new, so detection is not retried for it
    next if line.strip.empty?
    found = autodetect(stream.prefetch_buffer, meta)
    return found if found
  end
  nil
end

#each_rule(&x) ⇒ Object

Iterates over each element.



298
299
300
# File 'lib/bio/io/flatfile/autodetection.rb', line 298

# Iterates over each element, in the order given by +elements+.
# The block is forwarded unchanged to elements.each, and the value of that
# call is returned.
def each_rule(&x) #:yields: elem
  elements.each(&x)
end

#elements ⇒ Object

Returns current elements as an array whose order fulfills all elements’ priorities.



275
276
277
278
279
280
281
282
# File 'lib/bio/io/flatfile/autodetection.rb', line 275

# Returns current elements as an array whose order fulfills all
# elements' priorities.
# The array is the reversed result of tsort; it is computed on first use
# and kept in @elements until that cache is cleared.
def elements
  @elements ||= tsort.reverse!
end

#inspect ⇒ Object

visualizes the object (mainly for debug)



291
292
293
294
295
# File 'lib/bio/io/flatfile/autodetection.rb', line 291

# Visualizes the object (mainly for debug): the class name followed by the
# inspected name of every element, in +elements+ order, separated by spaces.
def inspect
  names = elements.map { |rule| rule.name.inspect }
  "<#{self.class} #{names.join(' ')}>"
end

#rehash ⇒ Object

rebuilds the object and clears internal cache.



285
286
287
288
# File 'lib/bio/io/flatfile/autodetection.rb', line 285

# Rebuilds the object and clears the internal cache.
# Calls rehash on the @rules hash, then resets @elements to nil so that the
# element order is recomputed the next time it is requested.
# Returns nil (the value of the final assignment).
def rehash
  @rules.rehash
  @elements = nil
end

#tsort_each_child(elem) ⇒ Object

(required by TSort.) For a given element, yields each child (= lower priority elements) of the element.



253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
# File 'lib/bio/io/flatfile/autodetection.rb', line 253

# (required by TSort.) For a given element, yields each child
# (= lower priority element) of the element.
#
# * TopRule: every registered rule is a child, except TopRule itself and
#   rules that list TopRule among their lower priority elements.
# * BottomRule: only rules that list BottomRule among their higher
#   priority elements are children.
# * any other rule: its own lower priority elements (BottomRule left out),
#   followed by BottomRule unless the rule lists BottomRule among its
#   higher priority elements.
def tsort_each_child(elem)
  if elem == TopRule
    @rules.each_value do |rule|
      next if rule == TopRule
      next if rule.lower_priority_elements.index(TopRule)
      yield rule
    end
  elsif elem == BottomRule
    @rules.each_value do |rule|
      yield rule if rule.higher_priority_elements.index(BottomRule)
    end
  else
    elem.lower_priority_elements.each do |rule|
      yield rule if rule != BottomRule
    end
    yield BottomRule unless elem.higher_priority_elements.index(BottomRule)
  end
end

#tsort_each_node(&x) ⇒ Object

(required by TSort.) For all elements, yields each element.



246
247
248
# File 'lib/bio/io/flatfile/autodetection.rb', line 246

# (required by TSort.) For all elements, yields each element.
# The block is forwarded unchanged to @rules.each_value, so every stored
# rule is visited once.
def tsort_each_node(&x)
  @rules.each_value(&x)
end