Class: Bio::Fastq

Inherits:
Object show all
Defined in:
lib/bio/db/fastq.rb

Overview

Bio::Fastq is a parser for FASTQ format.

Defined Under Namespace

Classes: Error, FormatData

Constant Summary collapse

FormatNames =

Available format names.

{
  "fastq-sanger"   => FormatData::FASTQ_SANGER,
  "fastq-solexa"   => FormatData::FASTQ_SOLEXA,
  "fastq-illumina" => FormatData::FASTQ_ILLUMINA
}.freeze
Formats =

Available format name symbols.

{
  :fastq_sanger   => FormatData::FASTQ_SANGER,
  :fastq_solexa   => FormatData::FASTQ_SOLEXA,
  :fastq_illumina => FormatData::FASTQ_ILLUMINA
}.freeze
DefaultFormatName =

Default format name

'fastq-sanger'.freeze
FLATFILE_SPLITTER =

Splitter for Bio::FlatFile

Bio::FlatFile::Splitter::LineOriented

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(str = nil) ⇒ Fastq

Creates a new Fastq object from formatted text string.

The format of the quality scores should be specified afterwards, using the format= method.


Arguments:

  • str: Formatted string (String)



383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
# File 'lib/bio/db/fastq.rb', line 383

# Creates a new Fastq object from formatted text.
#
# The text is consumed line by line in two phases: lines are first offered
# to add_header_line until it rejects one, then the rejected line and the
# following ones are offered to add_line until it rejects one.  Whatever
# has not been consumed is stored in @entry_overrun.  When str is nil (or
# false) nothing is parsed and no instance variable is set.
#
# ---
# *Arguments*:
# * _str_: formatted text (String)
def initialize(str = nil)
  return unless str
  scanner = StringScanner.new(str)
  # Phase 1 gathers header lines, phase 2 gathers the entry itself.
  [:add_header_line, :add_line].each do |consumer|
    until scanner.eos?
      text = scanner.scan(/.*(?:\n|\r|\r\n)?/)
      break unless text
      next if __send__(consumer, text)
      # The line was refused: push it back so the next phase (or the
      # overrun) sees it again.
      scanner.unscan
      break
    end
  end
  @entry_overrun = scanner.rest
end

Instance Attribute Details

#definitionObject (readonly)

definition; ID line (begins with @)



402
403
404
# File 'lib/bio/db/fastq.rb', line 402

# Returns @definition, the ID line of the entry.  add_line stores it with
# the line terminator removed and, when the line begins with "@", without
# that leading "@".
def definition
  @definition
end

#entry_overrunObject (readonly)

entry_overrun: the part of the input string left unread after this entry was parsed



373
374
375
# File 'lib/bio/db/fastq.rb', line 373

# Returns @entry_overrun.  The constructor sets it to the part of the input
# string that was left unread once the entry had been parsed.
def entry_overrun
  @entry_overrun
end

#headerObject (readonly)

misc lines before the entry (String or nil)



335
336
337
# File 'lib/bio/db/fastq.rb', line 335

# Returns @header, the text accumulated by add_header_line from the lines
# that precede the first line beginning with "@".
def header
  @header
end

#quality_stringObject (readonly)

quality as a string



405
406
407
# File 'lib/bio/db/fastq.rb', line 405

# Returns @quality_string: the quality lines joined together by add_line,
# with line terminators removed.
def quality_string
  @quality_string
end

#sequence_stringObject (readonly)

raw sequence data as a String object



408
409
410
# File 'lib/bio/db/fastq.rb', line 408

# Returns @sequence_string: the raw sequence lines joined together by
# add_line, with line terminators removed.
def sequence_string
  @sequence_string
end

Instance Method Details

#add_header_line(line) ⇒ Object

Adds a header line if the given line is suitable as a header, i.e. it does not begin with “@”. Returns self if the line was added successfully. Otherwise, returns false (the line is not added).



324
325
326
327
328
329
330
331
332
# File 'lib/bio/db/fastq.rb', line 324

# Appends a line to the header (the text found before the entry).
#
# A line beginning with "@" starts the entry and is therefore refused.
# Note that @header is initialised to an empty string even when the line
# is refused.
#
# ---
# *Arguments*:
# * _line_: a single line of text (String)
# *Returns*:: self when the line was added, false when it was refused
def add_header_line(line)
  # "".dup gives a mutable buffer; appending to a bare string literal fails
  # (or warns) when string literals are frozen.
  @header ||= "".dup
  return false if line[0, 1] == "@"
  @header.concat line
  self
end

#add_line(line) ⇒ Object

Adds a line to the entry if the given line is regarded as a part of the current entry.



339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
# File 'lib/bio/db/fastq.rb', line 339

# Adds a line to the entry if the line is regarded as part of the current
# entry.
#
# The line terminator is removed first.  Lines are then interpreted by
# position:
# * the first line becomes the definition (a missing leading "@" is
#   recorded as Error::No_atmark in @parse_errors);
# * following lines are sequence data until a line beginning with "+",
#   which becomes the second definition;
# * lines after that are quality data.  A quality-section line beginning
#   with "@" is refused once the quality string is at least as long as the
#   sequence, because it is then taken to start the next entry.
#
# ---
# *Arguments*:
# * _line_: a single line of text (String)
# *Returns*:: self when the line was consumed, false when it was refused
def add_line(line)
  line = line.chomp

  unless defined? @definition
    if line[0, 1] == "@"
      @definition = line[1..-1]
    else
      @definition = line
      (@parse_errors ||= []).push Error::No_atmark.new
    end
    return self
  end

  if defined? @definition2
    # "".dup: a mutable buffer that stays appendable even when string
    # literals are frozen.
    @quality_string ||= "".dup
    # "@" is also a valid quality character, so it only ends the entry
    # once enough quality characters have been read.
    if line[0, 1] == "@" && @quality_string.size >= @sequence_string.size
      return false
    end
    @quality_string.concat line
  else
    @sequence_string ||= "".dup
    if line[0, 1] == "+"
      @definition2 = line[1..-1]
    else
      @sequence_string.concat line
    end
  end
  self
end

#entry_idObject

Identifier of the entry. Normally, the first word of the ID line.



446
447
448
449
450
451
452
# File 'lib/bio/db/fastq.rb', line 446

# Identifier of the entry: the first whitespace-delimited word of the
# definition, or the definition itself when it contains no word.
# The value is computed once and cached in @entry_id.
def entry_id
  return @entry_id if defined? @entry_id
  first_word = @definition.strip.split(/\s+/).first
  @entry_id = first_word || @definition
end

#error_probabilitiesObject

Estimated probability of error for each base.


Returns

(Array containing Float) error probability values



529
530
531
532
533
534
535
536
# File 'lib/bio/db/fastq.rb', line 529

# Estimated probability of error for each base.
#
# If no format has been chosen yet, the class's DefaultFormatName is
# selected first.  The values are obtained by passing quality_scores to the
# format object's q2p method and are cached in @error_probabilities.
#
# ---
# *Returns*:: whatever the format object's q2p returns (documented as an
#             Array containing Float)
def error_probabilities
  return @error_probabilities if defined? @error_probabilities
  self.format = self.class::DefaultFormatName unless self.format
  @error_probabilities = @format.q2p(self.quality_scores)
end

#formatObject

Format name. One of “fastq-sanger”, “fastq-solexa”, “fastq-illumina”, or nil (when not specified).


Returns

(String or nil) format name



497
498
499
# File 'lib/bio/db/fastq.rb', line 497

# Name of the currently selected format.
#
# ---
# *Returns*:: the name reported by the format object, or nil when no
#             format object is set
def format
  return nil unless @format
  @format.name
end

#format=(name) ⇒ Object

Specify the format. If the format is not found, raises RuntimeError.

Available formats are:

"fastq-sanger" or :fastq_sanger
"fastq-solexa" or :fastq_solexa
"fastq-illumina" or :fastq_illumina

Arguments:

  • (required) name: format name (String or Symbol).

Returns

(String) format name



476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
# File 'lib/bio/db/fastq.rb', line 476

# Selects the quality-score format.
#
# The name is looked up in FormatNames and then in Formats.  When it is
# found, reset_state is called, the format's instance is stored in @format
# and the format name is returned.  An unknown name raises RuntimeError
# ("unknown format") without touching the current state.
#
# When name is nil (or false) only reset_state is called; @format itself is
# not reassigned here.
# NOTE(review): reset_state is defined elsewhere -- confirm whether it is
# meant to clear @format as well.
#
# ---
# *Arguments*:
# * (required) _name_: format name (String or Symbol), or nil
# *Returns*:: (String) format name, or nil
def format=(name)
  unless name
    reset_state
    return nil
  end

  found = FormatNames[name] || Formats[name]
  raise "unknown format" unless found

  reset_state
  @format = found.instance
  self.format
end

#mask(threshold, mask_char = 'n') ⇒ Object

Masks low quality sequence regions. For each sequence position, if the quality score is smaller than the threshold, the sequence in the position is replaced with mask_char.

Note: This method does not take quality_score_type into account.


Arguments:

  • (required) threshold : (Numeric) threshold

  • (optional) mask_char : (String) character used for masking

Returns

Bio::Sequence object



668
669
670
# File 'lib/bio/db/fastq.rb', line 668

# Masks low quality regions of the sequence.
#
# Converts the entry with to_biosequence and delegates to that object's
# mask_with_quality_score, passing both arguments through unchanged; the
# result of that call is returned as is.
#
# ---
# *Arguments*:
# * (required) _threshold_: (Numeric) threshold
# * (optional) _mask_char_: (String) character used for masking
#   (default 'n')
def mask(threshold, mask_char = 'n')
  to_biosequence.mask_with_quality_score(threshold, mask_char)
end

#nalenObject

length of naseq



433
434
435
# File 'lib/bio/db/fastq.rb', line 433

# Length of the sequence object returned by naseq.
def nalen
  naseq.length
end

#naseqObject

returns Bio::Sequence::NA



425
426
427
428
429
430
# File 'lib/bio/db/fastq.rb', line 425

# Returns the sequence as a Bio::Sequence::NA built from @sequence_string.
# The object is created on first use and cached in @naseq.
def naseq
  return @naseq if defined? @naseq
  @naseq = Bio::Sequence::NA.new(@sequence_string)
end

#quality_score_typeObject

The meaning of the quality scores. It may be one of :phred, :solexa, or nil.



504
505
506
507
# File 'lib/bio/db/fastq.rb', line 504

# The meaning of the quality scores, as reported by the format object's
# quality_score_type method.  If no format has been chosen yet, the class's
# DefaultFormatName is selected first.
def quality_score_type
  self.format = self.class::DefaultFormatName unless self.format
  @format.quality_score_type
end

#quality_scoresObject Also known as: qualities

Quality score for each base. For “fastq-sanger” or “fastq-illumina”, it is PHRED score. For “fastq-solexa”, it is Solexa score.


Returns

(Array containing Integer) quality score values



515
516
517
518
519
520
521
522
# File 'lib/bio/db/fastq.rb', line 515

# Quality score for each base.
#
# If no format has been chosen yet, the class's DefaultFormatName is
# selected first.  The scores are obtained by passing @quality_string to
# the format object's str2scores method and are cached in @quality_scores.
#
# ---
# *Returns*:: whatever the format object's str2scores returns (documented
#             as an Array containing Integer)
def quality_scores
  return @quality_scores if defined? @quality_scores
  self.format = self.class::DefaultFormatName unless self.format
  @quality_scores = @format.str2scores(@quality_string)
end

#seqObject

returns Bio::Sequence::Generic



438
439
440
441
442
443
# File 'lib/bio/db/fastq.rb', line 438

# Returns the sequence as a Bio::Sequence::Generic built from
# @sequence_string.  The object is created on first use and cached in @seq.
def seq
  return @seq if defined? @seq
  @seq = Bio::Sequence::Generic.new(@sequence_string)
end

#to_biosequenceObject

Returns sequence as a Bio::Sequence object.

Note: If you modify the returned Bio::Sequence object, the sequence or definition in this Fastq object might also be changed (but not always), because, for efficiency, the data is not necessarily copied.



653
654
655
# File 'lib/bio/db/fastq.rb', line 653

# Returns the entry wrapped by Bio::Sequence.adapter, using
# Bio::Sequence::Adapter::Fastq as the adapter module.
def to_biosequence
  Bio::Sequence.adapter(self, Bio::Sequence::Adapter::Fastq)
end

#to_sObject

Returns a FASTQ-formatted string constructed from the instance variables. The string always consists of four lines, without wrapping of the sequence and quality strings, and the third line always contains only “+”. This may differ from the original entry.

Note that use of the method may be inefficient and may lose performance because new string object is created every time it is called. For showing an entry as-is, consider using Bio::FlatFile#entry_raw. For output with various options, use Bio::Sequence#output(:fastq).



420
421
422
# File 'lib/bio/db/fastq.rb', line 420

# Returns a FASTQ-formatted string built from the instance variables.
#
# The result always has four lines: "@" followed by the definition, the
# unwrapped sequence, a bare "+", and the unwrapped quality string.  A new
# String is built on every call.
def to_s
  fastq = "@#{@definition}\n"
  fastq << "#{@sequence_string}\n"
  fastq << "+\n"
  fastq << "#{@quality_string}\n"
  fastq
end

#validate_format(errors = nil) ⇒ Object

Format validation.

If an array is given as the argument, error objects are pushed to the array when errors are found. Currently, the following errors may be added to the array. (All errors are under the Bio::Fastq namespace, for example, Bio::Fastq::Error::Diff_ids.)

Error::Diff_ids – the identifiers in the two lines are different
Error::Long_qual – length of quality is longer than the sequence
Error::Short_qual – length of quality is shorter than the sequence
Error::No_qual – no quality characters found
Error::No_seq – no sequence found
Error::Qual_char – invalid character in the quality
Error::Seq_char – invalid character in the sequence
Error::Qual_range – quality score value out of range
Error::No_ids – sequence identifier not found
Error::No_atmark – the first identifier does not begin with “@”
Error::Skipped_unformatted_lines – the parser skipped unformatted lines that could not be recognized as FASTQ format


Arguments:

  • (optional) errors: (Array or nil) an array for pushing error messages. The array should be empty.

Returns

true if no errors were found, false if any error was found.



562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
# File 'lib/bio/db/fastq.rb', line 562

# Format validation.
#
# Checks, in this order: leftover header text, errors recorded while
# parsing, presence and agreement of the two identifiers, presence of the
# sequence and of the quality string, equality of their lengths, invalid
# characters in the sequence, and invalid characters in the quality string
# (the permitted range depends on the selected format, if any).
#
# The character checks work on bytes, and the reported positions are byte
# offsets.  Scanning a binary copy means a non-ASCII character is reported
# as an error instead of raising an encoding exception.
#
# ---
# *Arguments*:
# * (optional) _errors_: (Array or nil) array to which the error objects
#   found are appended
# *Returns*:: true when no error was found, false otherwise
def validate_format(errors = nil)
  err = []

  # Binary copy of a string, so that the fixed-encoding /n regexps below
  # can be matched against it whatever its encoding is.  Strings are used
  # as they are on Rubies without String#b.
  bytes_of = lambda { |s| s.respond_to?(:b) ? s.b : s }

  # if header exists, the format might be broken.
  if defined?(@header) && @header && !@header.strip.empty?
    err.push Error::Skipped_unformatted_lines.new
  end

  # if parse errors exist, adding them
  err.concat @parse_errors if defined?(@parse_errors) && @parse_errors

  # check if identifier exists, and identifier matches
  if !defined?(@definition) || !@definition
    err.push Error::No_ids.new
  elsif defined?(@definition2) &&
      !@definition2.to_s.empty? &&
      @definition != @definition2
    err.push Error::Diff_ids.new
  end

  # check if sequence exists
  has_seq = (defined?(@sequence_string) && @sequence_string) ? true : false
  err.push Error::No_seq.new unless has_seq

  # check if quality exists
  has_qual = (defined?(@quality_string) && @quality_string) ? true : false
  err.push Error::No_qual.new unless has_qual

  # sequence and quality length check
  if has_seq && has_qual
    slen = @sequence_string.length
    qlen = @quality_string.length
    if slen > qlen
      err.push Error::Short_qual.new
    elsif qlen > slen
      err.push Error::Long_qual.new
    end
  end

  # sequence character check
  if has_seq
    sc = StringScanner.new(bytes_of.call(@sequence_string))
    while sc.scan_until(/[ \x00-\x1f\x7f-\xff]/n)
      err.push Error::Seq_char.new(sc.pos - sc.matched_size)
    end
  end

  # quality character check
  if has_qual
    fmt = (defined?(@format) && @format) ? @format.name : nil
    re = case fmt
         when 'fastq-sanger'
           /[^\x21-\x7e]/n
         when 'fastq-solexa'
           /[^\x3b-\x7e]/n
         when 'fastq-illumina'
           /[^\x40-\x7e]/n
         else
           /[ \x00-\x1f\x7f-\xff]/n
         end
    sc = StringScanner.new(bytes_of.call(@quality_string))
    while sc.scan_until(re)
      err.push Error::Qual_char.new(sc.pos - sc.matched_size)
    end
  end

  # if "errors" is given, set errors
  errors.concat err if errors
  # returns true if no error; otherwise, returns false
  err.empty?
end