Class: GeneValidator::GeneMergeValidation

Inherits:
ValidationTest show all
Defined in:
lib/genevalidator/validation_gene_merge.rb

Overview

This class contains the methods necessary for checking whether there is evidence that the prediction is a merge of multiple genes

Instance Attribute Summary collapse

Attributes inherited from ValidationTest

#cli_name, #description, #header, #run_time, #short_header, #type, #validation_report

Instance Method Summary collapse

Constructor Details

#initialize(prediction, hits, boundary = 10) ⇒ GeneMergeValidation

Initilizes the object Params: prediction: a Sequence object representing the blast query hits: a vector of Sequence objects (representing blast hits) plot_path: name of the input file, used when generatig the plot files boundary: the offset of the hit from which we start analysing the hit



101
102
103
104
105
106
107
108
109
# File 'lib/genevalidator/validation_gene_merge.rb', line 101

def initialize(prediction, hits, boundary = 10)
  super
  @short_header = 'GeneMerge'
  @header       = 'Gene Merge'
  @description  = 'Check whether BLAST hits make evidence about a merge' \
                  ' of two genes that match the predicted gene.'
  @cli_name     = 'merge'
  @boundary     = boundary
end

Instance Attribute Details

#hitsObject (readonly)

Returns the value of attribute hits.



92
93
94
# File 'lib/genevalidator/validation_gene_merge.rb', line 92

def hits
  @hits
end

#predictionObject (readonly)

Returns the value of attribute prediction.



91
92
93
# File 'lib/genevalidator/validation_gene_merge.rb', line 91

def prediction
  @prediction
end

Instance Method Details

#plot_2d_start_from(slope = nil, y_intercept = nil, hits = @hits) ⇒ Object

Generates a json file containing data used for plotting the start/end of the matched region offsets in the prediction Param slope: slope of the linear regression line y_intercept: the ecuation of the line is y= slope*x + y_intercept output: location where the plot will be saved in jped file format hits: array of Sequence objects



220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
# File 'lib/genevalidator/validation_gene_merge.rb', line 220

def plot_2d_start_from(slope = nil, y_intercept = nil, hits = @hits)
  pairs = hits.map do |hit|
    Pair.new(hit.hsp_list.map(&:match_query_from).min,
             hit.hsp_list.map(&:match_query_to).max)
  end

  data = hits.map { |hit| { 'x' => hit.hsp_list.map(&:match_query_from).min,
                            'y' => hit.hsp_list.map(&:match_query_to).max,
                            'color' => 'red' } }

  Plot.new(data,
           :scatter,
           'Gene Merge Validation: Start/end of matching hit coord. on' \
           ' query (1 point/hit)',
           '',
           'Start Offset (most left hsp)',
           'End Offset (most right hsp)',
           y_intercept.to_s,
           slope.to_s)
end

#plot_matched_regions(hits = @hits) ⇒ Object

Generates a json file containing data used for plotting the matched region of the prediction for each hit Param output: location where the plot will be saved in jped file format hits: array of Sequence objects prediction: Sequence objects



184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# File 'lib/genevalidator/validation_gene_merge.rb', line 184

def plot_matched_regions(hits = @hits)
  no_lines = hits.length

  hits_less = hits[0..[no_lines, hits.length - 1].min]

  data = hits_less.each_with_index.map { |hit, i|
    { 'y' => i,
      'start' => hit.hsp_list.map(&:match_query_from).min,
      'stop' => hit.hsp_list.map(&:match_query_to).max,
      'color' => 'black',
      'dotted' => 'true' } }.flatten +
    hits_less.each_with_index.map { |hit, i|
      hit.hsp_list.map { |hsp|
        { 'y' => i,
          'start' => hsp.match_query_from,
          'stop' => hsp.match_query_to,
          'color' => 'orange' } } }.flatten

  Plot.new(data,
           :lines,
           'Gene Merge Validation: Query coord covered by blast hit' \
           ' (1 line/hit)',
           '',
           'Offset in Prediction',
           'Hit Number',
           hits_less.length)
end

#runObject

Validation test for gene merge Output: GeneMergeValidationOutput object



115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# File 'lib/genevalidator/validation_gene_merge.rb', line 115

def run
  fail NotEnoughHitsError unless hits.length >= 5
  fail unless prediction.is_a?(Query) && hits[0].is_a?(Query)

  start = Time.now

  pairs = hits.map { |hit| Pair.new(hit.hsp_list.map{ |hsp| hsp.match_query_from }.min,
                                    hit.hsp_list.map{ |hsp| hsp.match_query_to }.max) }
  xx_0 = pairs.map(&:x)
  yy_0 = pairs.map(&:y)

  # minimum start shoud be at 'boundary' residues
  xx = xx_0.map { |x| (x < @boundary) ? @boundary : x }

  # maximum end should be at length - 'boundary' residues
  yy = yy_0.map do |y|
    if y > @prediction.raw_sequence.length - @boundary
      @prediction.raw_sequence.length - @boundary
    else
      y
    end
  end

  line_slope = slope(xx, yy, (1..hits.length).map { |x| 1 / (x + 0.0) })
  ## YW - what is this weighting?

  unimodality = false
  if unimodality_test(xx, yy)
    unimodality = true
    lm_slope = 0.0
  else
    lm_slope = line_slope[1]
  end

  y_intercept = line_slope[0]

  @validation_report = GeneMergeValidationOutput.new(@short_header, @header,
                                                     @description, lm_slope,
                                                     unimodality)
  if unimodality
    plot1 = plot_2d_start_from
  else
    plot1 = plot_2d_start_from(lm_slope, y_intercept)
  end

  @validation_report.plot_files.push(plot1)
  plot2 = plot_matched_regions
  @validation_report.plot_files.push(plot2)
  @validation_report.run_time = Time.now - start
  @validation_report

rescue NotEnoughHitsError
  @validation_report = ValidationReport.new('Not enough evidence', :warning,
                                            @short_header, @header,
                                            @description)
rescue
  @validation_report = ValidationReport.new('Unexpected error', :error,
                                            @short_header, @header,
                                            @description)
  @validation_report.errors.push 'Unexpected Error'
end

#slope(xx, yy, weights = nil) ⇒ Object

Caclulates the slope of the regression line give a set of 2d coordonates of the start/stop offests of the hits Param xx: Array of integers yy : Array of integers weights: Array of integers Output: The ecuation of the regression line: [y slope]



250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
# File 'lib/genevalidator/validation_gene_merge.rb', line 250

def slope(xx, yy, weights = nil)
  weights = Array.new(hits.length, 1) if weights.nil?

  # calculate the slope
  xx_weighted = xx.each_with_index.map { |x, i| x * weights[i] }
  yy_weighted = yy.each_with_index.map { |y, i| y * weights[i] }

  denominator = weights.reduce(0) { |a, e| a + e }

  x_mean = xx_weighted.reduce(0) { |a, e| a + e } / (denominator + 0.0)
  y_mean = yy_weighted.reduce(0) { |a, e| a + e } / (denominator + 0.0)

  numerator = (0...xx.length).reduce(0) do |a, e|
    a + (weights[e] * (xx[e] - x_mean) * (yy[e] - y_mean))
  end

  denominator = (0...xx.length).reduce(0) do |a, e|
    a + (weights[e] * ((xx[e] - x_mean)**2))
  end

  slope = numerator / (denominator + 0.0)
  y_intercept = y_mean - (slope * x_mean)

  [y_intercept, slope]
end

#slope_statsample(xx, yy) ⇒ Object

Caclulates the slope of the regression line give a set of 2d coordonates of the start/stop offests of the hits Param xx : Array of integers yy : Array of integers Output: The ecuation of the regression line: [y slope]



284
285
286
287
# File 'lib/genevalidator/validation_gene_merge.rb', line 284

def slope_statsample(xx, yy)
  sr = Statsample::Regression.simple(xx.to_scale, yy.to_scale)
  [sr.a, sr.b]
end

#unimodality_test(xx, yy) ⇒ Object

xx and yy are the projections of the 2-d data on the two axes



291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
# File 'lib/genevalidator/validation_gene_merge.rb', line 291

def unimodality_test(xx, yy)
  mean_x = xx.mean
  median_x = xx.median
  mode_x = xx.mode
  sd_x = xx.standard_deviation

  if sd_x == 0
    cond1_x = true
    cond2_x = true
    cond3_x = true
  else
    cond1_x = ((mean_x - median_x).abs / (sd_x + 0.0)) < Math.sqrt(0.6)
    cond2_x = ((mean_x - mode_x).abs / (sd_x + 0.0)) < Math.sqrt(0.3)
    cond3_x = ((median_x - mode_x).abs / (sd_x + 0.0)) < Math.sqrt(0.3)
  end

  mean_y = yy.mean
  median_y = yy.median
  mode_y = yy.mode
  sd_y = yy.standard_deviation

  if sd_y == 0
    cond1_y = true
    cond2_y = true
    cond3_y = true
  else
    cond1_y = ((mean_y - median_y).abs / (sd_y + 0.0)) < Math.sqrt(0.6)
    cond2_y = ((mean_y - mode_y).abs / (sd_y + 0.0)) < Math.sqrt(0.3)
    cond3_y = ((median_y - mode_y).abs / (sd_y + 0.0)) < Math.sqrt(0.3)
  end

  if cond1_x && cond2_x && cond3_x && cond1_y && cond2_y && cond3_y
    true
  else
    false
  end
end