Class: GeneValidator::DuplicationValidation
- Inherits:
-
ValidationTest
- Object
- ValidationTest
- GeneValidator::DuplicationValidation
- Extended by:
- Forwardable
- Defined in:
- lib/genevalidator/validation_duplication.rb
Overview
This class contains the methods necessary for finding duplicated subsequences in the predicted gene
Instance Attribute Summary collapse
-
#index_file_name ⇒ Object
readonly
Returns the value of attribute index_file_name.
-
#raw_seq_file ⇒ Object
readonly
Returns the value of attribute raw_seq_file.
-
#raw_seq_file_load ⇒ Object
readonly
Returns the value of attribute raw_seq_file_load.
Attributes inherited from ValidationTest
#cli_name, #description, #header, #hits, #prediction, #run_time, #short_header, #type, #validation_report
Instance Method Summary collapse
- #in_range?(ranges, idx) ⇒ Boolean
-
#initialize(prediction, hits) ⇒ DuplicationValidation
constructor
A new instance of DuplicationValidation.
-
#run(n = 10) ⇒ Object
Check duplication in the first n hits Output:
DuplicationValidationOutput
object. -
#wilcox_test(averages) ⇒ Object
wilcox test implementation from statsample ruby gem many thanks to Claudio for helping us with the implementation!.
Constructor Details
#initialize(prediction, hits) ⇒ DuplicationValidation
Returns a new instance of DuplicationValidation.
88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
# File 'lib/genevalidator/validation_duplication.rb', line 88 def initialize(prediction, hits) super @short_header = 'Duplication' @header = 'Duplication' @description = 'Check whether there is a duplicated subsequence' \ ' in the predicted gene by counting the hsp' \ ' residue coverage of the prediction, for each hit.' @cli_name = 'dup' @raw_seq_file = opt[:raw_sequences] @index_file_name = config[:raw_seq_file_index] @raw_seq_file_load = config[:raw_seq_file_load] @db = opt[:db] @num_threads = opt[:num_threads] @type = config[:type] end |
Instance Attribute Details
#index_file_name ⇒ Object (readonly)
Returns the value of attribute index_file_name.
85 86 87 |
# File 'lib/genevalidator/validation_duplication.rb', line 85 def index_file_name @index_file_name end |
#raw_seq_file ⇒ Object (readonly)
Returns the value of attribute raw_seq_file.
84 85 86 |
# File 'lib/genevalidator/validation_duplication.rb', line 84 def raw_seq_file @raw_seq_file end |
#raw_seq_file_load ⇒ Object (readonly)
Returns the value of attribute raw_seq_file_load.
86 87 88 |
# File 'lib/genevalidator/validation_duplication.rb', line 86 def raw_seq_file_load @raw_seq_file_load end |
Instance Method Details
#in_range?(ranges, idx) ⇒ Boolean
104 105 106 107 108 109 |
# File 'lib/genevalidator/validation_duplication.rb', line 104 def in_range?(ranges, idx) ranges.each do |range| return (range.member?(idx)) ? true : false end false end |
#run(n = 10) ⇒ Object
Check duplication in the first n hits Output: DuplicationValidationOutput
object
115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 |
# File 'lib/genevalidator/validation_duplication.rb', line 115 def run(n = 10) fail NotEnoughHitsError unless hits.length >= 5 fail unless prediction.is_a?(Query) && !prediction.raw_sequence.nil? && hits[0].is_a?(Query) start = Time.new # get the first n hits less_hits = @hits[0..[n - 1, @hits.length].min] useless_hits = [] # get raw sequences for less_hits less_hits.map do |hit| next unless hit.raw_sequence.nil? hit.raw_sequence = FetchRawSequences.run(hit.identifier, hit.accession_no) useless_hits.push(hit) if hit.raw_sequence.nil? end useless_hits.each { |hit| less_hits.delete(hit) } fail NoInternetError if less_hits.length == 0 averages = [] less_hits.each do |hit| coverage = Array.new(hit.length_protein, 0) # each residue of the protein should be evluated once only ranges_prediction = [] hit.hsp_list.each do |hsp| # align subsequences from the hit and prediction that match if !hsp.hit_alignment.nil? && !hsp.query_alignment.nil? hit_alignment = hsp.hit_alignment query_alignment = hsp.query_alignment else # indexing in blast starts from 1 hit_local = hit.raw_sequence[hsp.hit_from - 1..hsp.hit_to - 1] query_local = prediction.raw_sequence[hsp.match_query_from - 1..hsp.match_query_to - 1] # in case of nucleotide prediction sequence translate into protein # use translate with reading frame 1 because # to/from coordinates of the hsp already correspond to the # reading frame in which the prediction was read to match this hsp if @type == :nucleotide s = Bio::Sequence::NA.new(query_local) query_local = s.translate end # local alignment for hit and query seqs = [hit_local, query_local] begin = ['--maxiterate', '1000', '--localpair', '--anysymbol', '--quiet', '--thread', "#{@num_threads}"] mafft = Bio::MAFFT.new('mafft', ) report = mafft.query_align(seqs) raw_align = report.alignment align = [] raw_align.each { |seq| align.push(seq.to_s) } hit_alignment = align[0] query_alignment = align[1] rescue raise NoMafftInstallationError end end # check multiple coverage # for each hsp of the curent hit # iterate through the alignment and count the matching residues [*(0..hit_alignment.length - 1)].each do |i| residue_hit = hit_alignment[i] residue_query = query_alignment[i] next if residue_hit == ' ' || residue_hit == '+' || residue_hit == '-' || residue_hit != residue_query # indexing in blast starts from 1 idx_hit = i + (hsp.hit_from - 1) - hit_alignment[0..i].scan(/-/).length idx_query = i + (hsp.match_query_from - 1) - query_alignment[0..i].scan(/-/).length unless in_range?(ranges_prediction, idx_query) coverage[idx_hit] += 1 end end ranges_prediction.push((hsp.match_query_from..hsp.match_query_to)) end overlap = coverage.reject { |x| x == 0 } if overlap != [] averages.push((overlap.inject(:+) / (overlap.length + 0.0)).round(2)) end end # if all hsps match only one time if averages.reject { |x| x == 1 } == [] @validation_report = DuplicationValidationOutput.new(@short_header, @header, @description, 1, averages) @validation_report.run_time = Time.now - start return @validation_report end pval = wilcox_test(averages) @validation_report = DuplicationValidationOutput.new(@short_header, @header, @description, pval, averages) @run_time = Time.now - start @validation_report rescue NotEnoughHitsError @validation_report = ValidationReport.new('Not enough evidence', :warning, @short_header, @header, @description) rescue NoMafftInstallationError @validation_report = ValidationReport.new('Mafft error', :error, @short_header, @header, @description) @validation_report.errors.push NoMafftInstallationError rescue NoInternetError @validation_report = ValidationReport.new('Internet error', :error, @short_header, @header, @description) @validation_report.errors.push NoInternetError rescue @validation_report = ValidationReport.new('Unexpected error', :error, @short_header, @header, @description) @validation_report.errors.push 'Unexpected Error' end |
#wilcox_test(averages) ⇒ Object
wilcox test implementation from statsample ruby gem many thanks to Claudio for helping us with the implementation!
253 254 255 256 257 258 259 |
# File 'lib/genevalidator/validation_duplication.rb', line 253 def wilcox_test(averages) wilcox = Statsample::Test.wilcoxon_signed_rank(averages.to_scale, Array.new(averages.length, 1).to_scale) (averages.length < 15) ? wilcox.probability_exact : wilcox.probability_z end |