Module: Bio::Alignment::DNASequenceReads

Defined in:
lib/bio/alignment/dna_sequence.rb

Constant Summary collapse

CLUSTALW_OPTS =
%w(gapopen gapext dnamatrix type)
ALIGN_OPTS =
{
  :type => 'DNA',
  :gapopen => 20,
  :gapext => 20,
  :dnamatrix => 'IUB',  # "IUB" || "CLUSTALW"
  :fidelity_length => 10,
  :consensus_fidelity => true,
}

Class Method Summary collapse

Class Method Details

.align_pairwise(bioseqs, opt = {}) ⇒ Object

returns high quality pairwise alignments based on the fidelity_length option



87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/bio/alignment/dna_sequence.rb', line 87

def align_pairwise(bioseqs, opt={})
  factory = Bio::ClustalW.new
  clustal_opts = hash_opts_to_clustalopts(opt)
  factory.options = clustal_opts
  template = bioseqs.shift
  start_length = []
  pairwise_aligns = bioseqs.map do |bseq|
    clust_al = clustal_align([template, bseq], factory)
    cl_cons = clust_al.consensus
    aligned_string = clust_al[1].to_s
    #(st, len) = find_good_section(aligned_string, opt[:fidelity_length])
    seq_to_use = 
      if opt[:consensus_fidelity]
        cl_cons
      else
        aligned_string
      end
    (st, len) = find_good_section(seq_to_use, opt[:fidelity_length])
    if st
      pristine = aligned_string[st, len].gsub('-','')  # pristine read (ends removed)
      clustal_align([template.to_s, Bio::Sequence::NA.new(pristine)], factory)
    else
      warn "a sequence does not meeting min fidelity! using original alignment" 
      clust_al
    end

  end
end

.clustal_align(bioseqs, factory) ⇒ Object



71
72
73
74
# File 'lib/bio/alignment/dna_sequence.rb', line 71

def clustal_align(bioseqs, factory)
  al = Bio::Alignment.new(bioseqs)
  al.do_align(factory)
end

.consensus_string_and_stats(strings) ⇒ Object

assumes the first is the template



178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# File 'lib/bio/alignment/dna_sequence.rb', line 178

def consensus_string_and_stats(strings)
  as_chars = strings.map {|v| v.split("") }
  stats = Array.new(6, 0)
  consensus_string = as_chars.shift.zip(*as_chars).map do |chrs|
    consensus_bool_ar = Array.new(6)
    symbols = [' '] + %w(^ = . ^ ?) 
    all_gaps = 0
    template_gap = 1
    agreement = 2
    gap_below_template = 3
    all_bad_matches = 4
    non_consensus = 5

    first = chrs.shift
    if [first, *chrs].all? {|v| v.nil? or (v == '-') }
      consensus_bool_ar[all_gaps] = true
    elsif first == '-'
      consensus_bool_ar[template_gap] = true
    elsif chrs.all? {|v| v == '-'}
      consensus_bool_ar[gap_below_template] = true
    elsif chrs.all? {|v| (v == '-') or (v == first) }
      consensus_bool_ar[agreement] = true
    elsif chrs.all? {|v| (v == '-') or (v != first) }
      consensus_bool_ar[all_bad_matches] = true
    else
      consensus_bool_ar[non_consensus] = true
    end
    consensus_bool_ar.each_with_index {|v,i| stats[i] += 1 if v }
    symbols[consensus_bool_ar.index(true)]
  end.join
  [consensus_string, stats]
end

.cs_to_s(ar) ⇒ Object



165
166
167
# File 'lib/bio/alignment/dna_sequence.rb', line 165

def cs_to_s(ar)
  ar.map {|v| v.nil? ? '-' : v.chr }.join
end

.exactly_chars(string, n) ⇒ Object



212
213
214
215
# File 'lib/bio/alignment/dna_sequence.rb', line 212

def exactly_chars(string, n)
  at_least = "%#{n}s" % string
  at_least[0,n]
end

.find_good_section(iupac_concensus_string, min_length) ⇒ Object

returns (start, length) where min_length reads are correct



29
30
31
32
33
34
35
36
37
38
# File 'lib/bio/alignment/dna_sequence.rb', line 29

def find_good_section(iupac_concensus_string, min_length)
  start = find_start_good_section(iupac_concensus_string, min_length)
  from_end = find_start_good_section(iupac_concensus_string.reverse, min_length)
  length = iupac_concensus_string.length - start - from_end
  if length < 0
    nil
  else
    [start, length]
  end
end

.find_start_good_section(iupac_concensus_string, min_length) ⇒ Object

returns the index of the starting run of good chars



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/bio/alignment/dna_sequence.rb', line 11

def find_start_good_section(iupac_concensus_string, min_length)
  good_char_count = 0
  char_index = 0
  iupac_concensus_string.each_char do |char|
    if char =~ /[^\?\-Nn]/
      good_char_count += 1
      if good_char_count >= min_length
        break
      end
    else
      good_char_count = 0
    end
    char_index += 1
  end
  char_index - (good_char_count - 1)
end

.first_non_dash_char(string) ⇒ Object



60
61
62
63
64
65
66
67
68
69
# File 'lib/bio/alignment/dna_sequence.rb', line 60

def first_non_dash_char(string)
  char_cnt = 0
  string.each_char do |char|
    if char != '-'
      break
    end
    char_cnt += 1
  end
  char_cnt
end

.hash_opts_to_clustalopts(hash) ⇒ Object



40
41
42
43
44
45
46
47
48
# File 'lib/bio/alignment/dna_sequence.rb', line 40

def hash_opts_to_clustalopts(hash)
  array = []
  hash.each do |k,v|
    if CLUSTALW_OPTS.include?(k.to_s)
      array << "-#{k}=#{v}"
    end
  end
  array
end

.lstrip_dash(string) ⇒ Object



50
51
52
53
# File 'lib/bio/alignment/dna_sequence.rb', line 50

def lstrip_dash(string)
  chr = first_non_dash_char(string)
  string[chr..-1]
end

.merge_pairwise(aligns) ⇒ Object

assumes all were aligned to the same template (the first of a pair)



117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# File 'lib/bio/alignment/dna_sequence.rb', line 117

def merge_pairwise(aligns)
  ps = aligns.map do |align| 
    seqs = []
    align.each do |bioseq|
      seqs << bioseq.to_s
    end
    seqs
  end
  template = []
  #m,x,n
  x = 2
  ftemp = ps.first.first
  nmax = ps.map {|pair| pair.first.size }.max
  mmax = ps.size
  mar = (0...mmax).to_a
  others = mar.map { [] }
  ns = mar.map { 0 }
  tn = 0
  on = 0
  (0...nmax).each do |n|
    (t_dsh, t_no_dsh) = mar.partition do |m| 
      # this is RUBY 1.8 ONLY!!
      ps[m][0][ns[m]] == 45  # '-' is ascii 45
    end

    # if a template has a dash, all other off-templates need a dash
    if t_dsh.size > 0
      template[tn] = 45
      t_no_dsh.each do |m|
        # don't update these guys counter
        others[m][tn] = 45
      end
      t_dsh.each do |m|
        others[m][tn] = ps[m][1][ns[m]]
        ns[m] += 1
      end
    else # no dashes in the template
      t_no_dsh.each do |m|
        others[m][tn] = ps[m][1][ns[m]]
      end
      template[tn] = ps[0][0][ns[0]]
      ns.map!{|v| v+1 } 
    end
    tn += 1
  end
  [cs_to_s(template), others.map! {|ar| cs_to_s(ar) } ]
end

all gaps <blank>

template gap              ^
gap below template        .
agreement                 =
all bad matches           ^
non-consensus             ?

accepts :template => template_sequence



226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
# File 'lib/bio/alignment/dna_sequence.rb', line 226

def print_align(io, sequences, labels, opts={})
  opts = {:cutoff => 70, :start => 0, :chars => 20}.merge(opts)
  (start, length, chars) = opts.values_at(:start, :cutoff, :chars)
  spacer = "  "

  if opts[:template]
    sequences.unshift(opts[:template])
    labels.unshift(opts[:template_label])
  end

  all_stats = Array.new(6,0)
  loop do
    fin = false

    max_length = 0
    lines = []
    consensus_line = ""
    fragments = sequences.map do |string|
      fin = (start >= string.length )
      break if fin

      string_frag = string[start, length]

      string_frag
    end ; break if fin

    doubles = fragments.zip(labels)

    doubles = doubles.select {|frag, _| (frag.size > 0) && (frag =~ /[^-]/) }

    max_length = doubles.map {|frag, _| frag.size }.max

    (cs, stats) = consensus_string_and_stats( doubles.map {|frag,_| frag } )
    all_stats = all_stats.zip(stats).map {|a,b| a + b }

    doubles.push( [cs, "<CONSENSUS>"] )

    lines = doubles.map {|frag, label| [exactly_chars(label, chars),spacer,frag].join }

    ## the counters at the top of the line
    start_s = start.to_s
    finish_s = (start + max_length).to_s
    count_line_gap = max_length - (start_s.size + finish_s.size)
    count_line = [start_s, spacer]
    unless count_line_gap < 1
      count_line << " " * count_line_gap
    end
    io.puts [exactly_chars("", chars), spacer, count_line.join].join

    io.puts lines.join("\n")

    io.puts " "  # separator between lines
    start += length
  end
end

.strip_dash(string) ⇒ Object



55
56
57
58
# File 'lib/bio/alignment/dna_sequence.rb', line 55

def strip_dash(string)
  ls = lstrip_dash(string)
  lstrip_dash(ls.reverse).reverse
end