Class: GeneValidator::DuplicationValidation

Inherits:

ValidationTest

Object
ValidationTest
GeneValidator::DuplicationValidation

show all

Extended by:: Forwardable

Defined in:: lib/genevalidator/validation_duplication.rb

Overview

This class contains the methods necessary for finding duplicated subsequences in the predicted gene

Instance Attribute Summary collapse

#index_file_name ⇒ Object readonly

Returns the value of attribute index_file_name.
#raw_seq_file ⇒ Object readonly

Returns the value of attribute raw_seq_file.
#raw_seq_file_load ⇒ Object readonly

Returns the value of attribute raw_seq_file_load.

Attributes inherited from ValidationTest

#cli_name, #description, #header, #hits, #prediction, #run_time, #short_header, #type, #validation_report

Instance Method Summary collapse

#in_range?(ranges, idx) ⇒ Boolean
#initialize(prediction, hits) ⇒ DuplicationValidation constructor

A new instance of DuplicationValidation.
#run(n = 10) ⇒ Object

Check duplication in the first n hits Output: DuplicationValidationOutput object.
#wilcox_test(averages) ⇒ Object

wilcox test implementation from statsample ruby gem many thanks to Claudio for helping us with the implementation!.

Constructor Details

#initialize(prediction, hits) ⇒ `DuplicationValidation`

Returns a new instance of DuplicationValidation.

# File 'lib/genevalidator/validation_duplication.rb', line 88

def initialize(prediction, hits)
  super
  @short_header      = 'Duplication'
  @header            = 'Duplication'
  @description       = 'Check whether there is a duplicated subsequence' \
                       ' in the predicted gene by counting the hsp' \
                       ' residue coverage of the prediction, for each hit.'
  @cli_name          = 'dup'
  @raw_seq_file      = opt[:raw_sequences]
  @index_file_name   = config[:raw_seq_file_index]
  @raw_seq_file_load = config[:raw_seq_file_load]
  @db                = opt[:db]
  @num_threads       = opt[:num_threads]
  @type              = config[:type]
end

Instance Attribute Details

#index_file_name ⇒ `Object` (readonly)

Returns the value of attribute index_file_name.



85
86
87

# File 'lib/genevalidator/validation_duplication.rb', line 85

def index_file_name
  @index_file_name
end

#raw_seq_file ⇒ `Object` (readonly)

Returns the value of attribute raw_seq_file.



84
85
86

# File 'lib/genevalidator/validation_duplication.rb', line 84

def raw_seq_file
  @raw_seq_file
end

#raw_seq_file_load ⇒ `Object` (readonly)

Returns the value of attribute raw_seq_file_load.



86
87
88

# File 'lib/genevalidator/validation_duplication.rb', line 86

def raw_seq_file_load
  @raw_seq_file_load
end

Instance Method Details

#in_range?(ranges, idx) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/genevalidator/validation_duplication.rb', line 104

def in_range?(ranges, idx)
  ranges.each do |range|
    return (range.member?(idx)) ? true : false
  end
  false
end

#run(n = 10) ⇒ `Object`

Check duplication in the first n hits Output: DuplicationValidationOutput object

# File 'lib/genevalidator/validation_duplication.rb', line 115

def run(n = 10)
  fail NotEnoughHitsError unless hits.length >= 5
  fail unless prediction.is_a?(Query) && !prediction.raw_sequence.nil? &&
              hits[0].is_a?(Query)

  start = Time.new
  # get the first n hits
  less_hits = @hits[0..[n - 1, @hits.length].min]
  useless_hits = []
  # get raw sequences for less_hits
  less_hits.map do |hit|
    next unless hit.raw_sequence.nil?
    hit.raw_sequence = FetchRawSequences.run(hit.identifier,
                                             hit.accession_no)
    useless_hits.push(hit) if hit.raw_sequence.nil?
  end

  useless_hits.each { |hit| less_hits.delete(hit) }

  fail NoInternetError if less_hits.length == 0

  averages = []

  less_hits.each do |hit|
    coverage = Array.new(hit.length_protein, 0)
    # each residue of the protein should be evluated once only
    ranges_prediction = []

    hit.hsp_list.each do |hsp|
      # align subsequences from the hit and prediction that match
      if !hsp.hit_alignment.nil? && !hsp.query_alignment.nil?
        hit_alignment   = hsp.hit_alignment
        query_alignment = hsp.query_alignment
      else
        # indexing in blast starts from 1
        hit_local   = hit.raw_sequence[hsp.hit_from - 1..hsp.hit_to - 1]
        query_local = prediction.raw_sequence[hsp.match_query_from -
                                              1..hsp.match_query_to - 1]

        # in case of nucleotide prediction sequence translate into protein
        # use translate with reading frame 1 because
        # to/from coordinates of the hsp already correspond to the
        # reading frame in which the prediction was read to match this hsp
        if @type == :nucleotide
          s = Bio::Sequence::NA.new(query_local)
          query_local = s.translate
        end

        # local alignment for hit and query
        seqs = [hit_local, query_local]

        begin
          options   = ['--maxiterate', '1000', '--localpair', '--anysymbol',
                       '--quiet', '--thread', "#{@num_threads}"]
          mafft     = Bio::MAFFT.new('mafft', options)

          report    = mafft.query_align(seqs)
          raw_align = report.alignment
          align     = []

          raw_align.each { |seq| align.push(seq.to_s) }
          hit_alignment   = align[0]
          query_alignment = align[1]
        rescue
          raise NoMafftInstallationError
        end
      end

      # check multiple coverage

      # for each hsp of the curent hit
      # iterate through the alignment and count the matching residues
      [*(0..hit_alignment.length - 1)].each do |i|
        residue_hit   = hit_alignment[i]
        residue_query = query_alignment[i]
        next if residue_hit == ' ' || residue_hit == '+' ||
                residue_hit == '-' || residue_hit != residue_query
        # indexing in blast starts from 1
        idx_hit   = i + (hsp.hit_from - 1) -
                    hit_alignment[0..i].scan(/-/).length
        idx_query = i + (hsp.match_query_from - 1) -
                    query_alignment[0..i].scan(/-/).length
        unless in_range?(ranges_prediction, idx_query)
          coverage[idx_hit] += 1
        end
      end

      ranges_prediction.push((hsp.match_query_from..hsp.match_query_to))
    end
    overlap = coverage.reject { |x| x == 0 }
    if overlap != []
      averages.push((overlap.inject(:+) / (overlap.length + 0.0)).round(2))
    end
  end

  # if all hsps match only one time
  if averages.reject { |x| x == 1 } == []
    @validation_report = DuplicationValidationOutput.new(@short_header,
                                                         @header,
                                                         @description, 1,
                                                         averages)
    @validation_report.run_time = Time.now - start
    return @validation_report
  end

  pval = wilcox_test(averages)

  @validation_report = DuplicationValidationOutput.new(@short_header,
                                                       @header,
                                                       @description, pval,
                                                       averages)
  @run_time = Time.now - start
  @validation_report

rescue NotEnoughHitsError
  @validation_report = ValidationReport.new('Not enough evidence', :warning,
                                            @short_header, @header,
                                            @description)
rescue NoMafftInstallationError
  @validation_report = ValidationReport.new('Mafft error', :error,
                                            @short_header, @header,
                                            @description)
  @validation_report.errors.push NoMafftInstallationError
rescue NoInternetError
  @validation_report = ValidationReport.new('Internet error', :error,
                                            @short_header, @header,
                                            @description)
  @validation_report.errors.push NoInternetError
rescue
  @validation_report = ValidationReport.new('Unexpected error', :error,
                                            @short_header, @header,
                                            @description)
  @validation_report.errors.push 'Unexpected Error'
end

#wilcox_test(averages) ⇒ `Object`

wilcox test implementation from statsample ruby gem many thanks to Claudio for helping us with the implementation!

# File 'lib/genevalidator/validation_duplication.rb', line 253

def wilcox_test(averages)
  wilcox = Statsample::Test.wilcoxon_signed_rank(averages.to_scale,
                                                 Array.new(averages.length,
                                                           1).to_scale)

  (averages.length < 15) ? wilcox.probability_exact : wilcox.probability_z
end

Class: GeneValidator::DuplicationValidation

Overview

Instance Attribute Summary collapse

Attributes inherited from ValidationTest

Instance Method Summary collapse

Constructor Details

#initialize(prediction, hits) ⇒ DuplicationValidation

Instance Attribute Details

#index_file_name ⇒ Object (readonly)

#raw_seq_file ⇒ Object (readonly)

#raw_seq_file_load ⇒ Object (readonly)