Class: ViralSeq::TcsCore

Inherits:
Object
  • Object
show all
Defined in:
lib/viral_seq/tcs_core.rb

Overview

Core functions for ‘tcs` pipeline

Class Method Summary collapse

Class Method Details

.calculate_cut_off(m, error_rate = 0.02) ⇒ Integer

methods to calculate TCS consensus cut-off based on the maximum numbers of PIDs and platform error rate.

Parameters:

  • m (Integer)

    PID abundance

  • error_rate (Float) (defaults to: 0.02)

    estimated platform error rate.

Returns:

  • (Integer)

    an abundance cut-off (Integer) for offspring Primer IDs.

See Also:



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/viral_seq/tcs_core.rb', line 14

def calculate_cut_off(m, error_rate = 0.02)
  n = 0
  case error_rate
  when 0.005...0.015
    if m <= 10
      n = 2
    else
      n = 1.09*10**-26*m**6 + 7.82*10**-22*m**5 - 1.93*10**-16*m**4 + 1.01*10**-11*m**3 - 2.31*10**-7*m**2 + 0.00645*m + 2.872
    end

  when 0...0.005
    if m <= 10
      n = 2
    else
      n = -9.59*10**-27*m**6 + 3.27*10**-21*m**5 - 3.05*10**-16*m**4 + 1.2*10**-11*m**3 - 2.19*10**-7*m**2 + 0.004044*m + 2.273
    end

  else
    if m <= 10
      n = 2
    elsif m <= 8500
      n = -1.24*10**-21*m**6 + 3.53*10**-17*m**5 - 3.90*10**-13*m**4 + 2.12*10**-9*m**3 - 6.06*10**-6*m**2 + 1.80*10**-2*m + 3.15
    else
      n = 0.0079 * m + 9.4869
    end
  end

  n = n.round
  n = 2 if n < 3
  return n
end

.detection_limit(tcs_number) ⇒ Float

lower detection sensitivity for minority mutations given the number of TCS, calculated based on binomial distribution. R required.

Examples:

calculate lower detection limit

ViralSeq::TcsCore.detection_limit(100)
=> 0.0362

Parameters:

  • tcs_number (Integer)

    number of TCS

Returns:

  • (Float)

    lower detection limit



295
296
297
298
299
300
301
302
# File 'lib/viral_seq/tcs_core.rb', line 295

def detection_limit(tcs_number)
  if ViralSeq::DETECT_SEN[tcs_number]
    return ViralSeq::DETECT_SEN[tcs_number]
  else
    dl =  `Rscript -e "library(dplyr); ifelse(#{tcs_number} > 2, (binom.test(0,#{tcs_number})['conf.int'] %>% unlist %>% unname)[2] %>% round(4) %>% cat, 0)" 2>/dev/null`
    dl.to_f
  end
end

.filter_r1(r1_sh, forward_primer) ⇒ Object

filter r1 raw sequences for non-specific primers. input r1_sh, SeqHash obj. return filtered Hash of sequence name and seq pair, in the object { r1_filtered_seq: r1_filtered_seq_pair }



216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
# File 'lib/viral_seq/tcs_core.rb', line 216

def filter_r1(r1_sh, forward_primer)
  if forward_primer.match(/(N+)(\w+)$/)
    forward_n = $1.size
    forward_bio_primer = $2
  else
    forward_n = 0
    forward_bio_primer = forward_primer
  end
  forward_bio_primer_size = forward_bio_primer.size
  forward_starting_number = forward_n + forward_bio_primer_size
  #forward_primer_ref = forward_bio_primer.nt_parser

  r1_passed_seq = {}
  r1_raw = r1_sh.dna_hash

  proc_filter = proc do |name|
    seq = r1_raw[name]
    next unless general_filter seq
    primer_region_seq = seq[forward_n, forward_bio_primer_size]
    if primer_region_seq.nt_diff(forward_bio_primer) < 3
      new_name = remove_tag name
      r1_passed_seq[new_name] = seq
    end
  end

  r1_raw.keys.map do |name|
    proc_filter.call name
  end

  return { r1_passed_seq: r1_passed_seq, forward_starting_number: forward_starting_number }
end

.filter_r2(r2_sh, cdna_primer) ⇒ Object

filter r2 raw sequences for non-specific primers. input r2_sh, SeqHash obj. return filtered Hash of sequence name and seq pair, as well as the length of PID.



251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
# File 'lib/viral_seq/tcs_core.rb', line 251

def filter_r2(r2_sh, cdna_primer)
  r2_raw = r2_sh.dna_hash
  cdna_primer.match(/(N+)(\w+)$/)
  pid_length = $1.size
  cdna_bio_primer = $2
  cdna_bio_primer_size = cdna_bio_primer.size
  reverse_starting_number = pid_length + cdna_bio_primer_size
 # cdna_primer_ref = cdna_bio_primer.nt_to_array
  r2_passed_seq = {}
  proc_filter = proc do |name|
    seq = r2_raw[name]
    next unless general_filter seq
    primer_region_seq = seq[pid_length, cdna_bio_primer_size]
    if primer_region_seq.nt_diff(cdna_bio_primer) < 4
      new_name = remove_tag name
      r2_passed_seq[new_name] = seq
    end
  end

  r2_raw.keys.map do |name|
    proc_filter.call name
  end

  return { r2_passed_seq: r2_passed_seq, pid_length: pid_length, reverse_starting_number: reverse_starting_number }
end

.log_and_abort(log, infor) ⇒ Object

puts error message in the log file handler, and abort with the same infor



281
282
283
284
285
# File 'lib/viral_seq/tcs_core.rb', line 281

def log_and_abort(log, infor)
  log.puts Time.now.to_s + "\t" + infor
  log.close
  abort infor.red.bold
end

.r1r2(directory, unzip = true) ⇒ Object

identify which file in the directory is R1 file, and which is R2 file based on file names input as directory (Dir object or a string of path) by default, .gz files will be unzipped. return as an hash of file1, r1_file: file2



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/viral_seq/tcs_core.rb', line 50

def r1r2(directory, unzip = true)
  files = []
  Dir.chdir(directory) { files = Dir.glob "*" }
  r1_file = ""
  r2_file = ""
  files.each do |f|
    tag = parser_file_name(f)[:tag]

    if tag.include? "R1"
      unzip ? r1_file = unzip_r(directory, f) : r1_file = File.join(directory, f)
    elsif tag.include? "R2"
      unzip ? r2_file = unzip_r(directory, f) : r2_file = File.join(directory, f)
    end
  end
  return { r1_file: r1_file, r2_file: r2_file }
end

.sort_by_lib(directory, out_dir = directory + "_sorted") ⇒ Object

sort directories containing mulitple r1 and r2 files. use the library name (first string before “_”) to seperate libraries out_dir is the Dir object or string of the output directory, by default named as directory + “_sorted” return a hash as { with_both_r1_r2: [lib1, lib2, …], missing_r1: [lib1, lib2, …], missing_r2: [lib1, lib2, …], error: [lib1, lib2, …]}



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/viral_seq/tcs_core.rb', line 72

def sort_by_lib(directory, out_dir = directory + "_sorted")
  Dir.mkdir(out_dir) unless File.directory?(out_dir)
  files = []
  Dir.chdir(directory) {files = Dir.glob("*")}

  files.each do |file|
    path = File.join(directory,file)
    index = file.split("_")[0]
    index_dir = File.join(out_dir, index)
    Dir.mkdir(index_dir) unless File.directory?(index_dir)
    File.rename(path, File.join(index_dir, file))
  end

  return_obj = { with_both_r1_r2: [],
                 missing_r1: [],
                 missing_r2: [],
                 error: []
                }

  libs = []
  Dir.chdir(out_dir) { libs = Dir.glob('*') }
  libs.each do |lib|
    file_check = ViralSeq::TcsCore.r1r2(File.join(out_dir, lib))
    if !file_check[:r1_file].empty? and !file_check[:r2_file].empty?
      return_obj[:with_both_r1_r2] << lib
    elsif file_check[:r1_file].empty? and !file_check[:r2_file].empty?
      return_obj[:missing_r1] << lib
    elsif file_check[:r2_file].empty? and !file_check[:r1_file].empty?
      return_obj[:missing_r2] << lib
    else
      return_obj[:error] << lib
    end
  end
  return return_obj
end

.validate_file_name(name_array) ⇒ hash

sort array of file names to determine if there is potential errors

Parameters:

  • name_array (Array)

    array of file names

Returns:

  • (hash)

    name check results



112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# File 'lib/viral_seq/tcs_core.rb', line 112

def validate_file_name(name_array)
  errors = {
             file_type_error: [] ,
             missing_r1_file: [] ,
             missing_r2_file: [] ,
             extra_r1_r2_file: [],
             no_region_tag: [] ,
             multiple_region_tag: []
           }

  passed_libs = {}

  name_with_r1_r2 = []

  name_array.each do |name|
    tag = parser_file_name(name)[:tag]
    if name !~ /\.fastq\Z|\.fastq\.gz\Z/
      name_array.delete(name)
    elsif tag.count("R1") == 0 and tag.count("R2") == 0
      errors[:no_region_tag] << name
    elsif tag.count("R1") > 0 and tag.count("R2") > 0
      errors[:multiple_region_tag] << name
    elsif tag.count("R1") > 1 or tag.count("R2") > 1
      errors[:multiple_region_tag] << name
    else
      name_with_r1_r2 << name
    end
  end

  libs = {}

  name_with_r1_r2.map do |name|
    libname = parser_file_name(name)[:libname]
    libs[libname] ||= []
    libs[libname] << name
  end

  libs.each do |libname, files|
    count_r1_file = 0
    count_r2_file = 0
    files.each do |name|
      tag = parser_file_name(name)[:tag]
      if tag.include? "R1"
        count_r1_file += 1
      elsif tag.include? "R2"
        count_r2_file += 1
      end
    end

    if count_r1_file > 1 or count_r2_file > 1
      errors[:extra_r1_r2_file] += files
    elsif count_r1_file.zero?
      errors[:missing_r1_file] += files
    elsif count_r2_file.zero?
      errors[:missing_r2_file] += files
    else
      passed_libs[libname] = files
    end
  end

  file_name_with_lib_name = {}
  passed_libs.each do |lib_name, files|
    files.each do |f|
      file_name_with_lib_name[f] = lib_name
    end
  end

  passed_names = []

  passed_libs.values.each { |names| passed_names += names}

  if passed_names.size < name_array.size
    pass = false
  else
    pass = true
  end

  file_name_with_error_type = {}

  errors.each do |type, files|
    files.each do |f|
      file_name_with_error_type[f] ||= []
      file_name_with_error_type[f] << type.to_s.tr("_", "\s")
    end
  end

  file_check = []

  name_array.each do |name|
    file_check_hash = {}
    file_check_hash[:fileName] = name
    file_check_hash[:errors] = file_name_with_error_type[name]
    file_check_hash[:libName] = file_name_with_lib_name[name]

    file_check << file_check_hash
  end

  return { allPass: pass, files: file_check }
end