Module: Bio::Alignment::DNASequenceReads
- Defined in:
- lib/bio/alignment/dna_sequence.rb
Constant Summary collapse
- CLUSTALW_OPTS =
%w(gapopen gapext dnamatrix type)
- ALIGN_OPTS =
{ :type => 'DNA', :gapopen => 20, :gapext => 20, :dnamatrix => 'IUB', # "IUB" || "CLUSTALW" :fidelity_length => 10, :consensus_fidelity => true, }
Class Method Summary collapse
-
.align_pairwise(bioseqs, opt = {}) ⇒ Object
returns high quality pairwise alignments based on the fidelity_length option.
- .clustal_align(bioseqs, factory) ⇒ Object
-
.consensus_string_and_stats(strings) ⇒ Object
assumes the first is the template.
- .cs_to_s(ar) ⇒ Object
- .exactly_chars(string, n) ⇒ Object
-
.find_good_section(iupac_concensus_string, min_length) ⇒ Object
returns (start, length) where min_length reads are correct.
-
.find_start_good_section(iupac_concensus_string, min_length) ⇒ Object
returns the index of the starting run of good chars.
- .first_non_dash_char(string) ⇒ Object
- .hash_opts_to_clustalopts(hash) ⇒ Object
- .lstrip_dash(string) ⇒ Object
-
.merge_pairwise(aligns) ⇒ Object
assumes all were aligned to the same template (the first of a pair).
-
.print_align(io, sequences, labels, opts = {}) ⇒ Object
all gaps <blank> template gap ^ gap below template .
- .strip_dash(string) ⇒ Object
Class Method Details
.align_pairwise(bioseqs, opt = {}) ⇒ Object
returns high quality pairwise alignments based on the fidelity_length option
87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
# File 'lib/bio/alignment/dna_sequence.rb', line 87 def align_pairwise(bioseqs, opt={}) factory = Bio::ClustalW.new clustal_opts = hash_opts_to_clustalopts(opt) factory. = clustal_opts template = bioseqs.shift start_length = [] pairwise_aligns = bioseqs.map do |bseq| clust_al = clustal_align([template, bseq], factory) cl_cons = clust_al.consensus aligned_string = clust_al[1].to_s #(st, len) = find_good_section(aligned_string, opt[:fidelity_length]) seq_to_use = if opt[:consensus_fidelity] cl_cons else aligned_string end (st, len) = find_good_section(seq_to_use, opt[:fidelity_length]) if st pristine = aligned_string[st, len].gsub('-','') # pristine read (ends removed) clustal_align([template.to_s, Bio::Sequence::NA.new(pristine)], factory) else warn "a sequence does not meeting min fidelity! using original alignment" clust_al end end end |
.clustal_align(bioseqs, factory) ⇒ Object
71 72 73 74 |
# File 'lib/bio/alignment/dna_sequence.rb', line 71 def clustal_align(bioseqs, factory) al = Bio::Alignment.new(bioseqs) al.do_align(factory) end |
.consensus_string_and_stats(strings) ⇒ Object
assumes the first is the template
178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
# File 'lib/bio/alignment/dna_sequence.rb', line 178 def consensus_string_and_stats(strings) as_chars = strings.map {|v| v.split("") } stats = Array.new(6, 0) consensus_string = as_chars.shift.zip(*as_chars).map do |chrs| consensus_bool_ar = Array.new(6) symbols = [' '] + %w(^ = . ^ ?) all_gaps = 0 template_gap = 1 agreement = 2 gap_below_template = 3 all_bad_matches = 4 non_consensus = 5 first = chrs.shift if [first, *chrs].all? {|v| v.nil? or (v == '-') } consensus_bool_ar[all_gaps] = true elsif first == '-' consensus_bool_ar[template_gap] = true elsif chrs.all? {|v| v == '-'} consensus_bool_ar[gap_below_template] = true elsif chrs.all? {|v| (v == '-') or (v == first) } consensus_bool_ar[agreement] = true elsif chrs.all? {|v| (v == '-') or (v != first) } consensus_bool_ar[all_bad_matches] = true else consensus_bool_ar[non_consensus] = true end consensus_bool_ar.each_with_index {|v,i| stats[i] += 1 if v } symbols[consensus_bool_ar.index(true)] end.join [consensus_string, stats] end |
.cs_to_s(ar) ⇒ Object
165 166 167 |
# File 'lib/bio/alignment/dna_sequence.rb', line 165 def cs_to_s(ar) ar.map {|v| v.nil? ? '-' : v.chr }.join end |
.exactly_chars(string, n) ⇒ Object
212 213 214 215 |
# File 'lib/bio/alignment/dna_sequence.rb', line 212 def exactly_chars(string, n) at_least = "%#{n}s" % string at_least[0,n] end |
.find_good_section(iupac_concensus_string, min_length) ⇒ Object
returns (start, length) where min_length reads are correct
29 30 31 32 33 34 35 36 37 38 |
# File 'lib/bio/alignment/dna_sequence.rb', line 29 def find_good_section(iupac_concensus_string, min_length) start = find_start_good_section(iupac_concensus_string, min_length) from_end = find_start_good_section(iupac_concensus_string.reverse, min_length) length = iupac_concensus_string.length - start - from_end if length < 0 nil else [start, length] end end |
.find_start_good_section(iupac_concensus_string, min_length) ⇒ Object
returns the index of the starting run of good chars
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# File 'lib/bio/alignment/dna_sequence.rb', line 11 def find_start_good_section(iupac_concensus_string, min_length) good_char_count = 0 char_index = 0 iupac_concensus_string.each_char do |char| if char =~ /[^\?\-Nn]/ good_char_count += 1 if good_char_count >= min_length break end else good_char_count = 0 end char_index += 1 end char_index - (good_char_count - 1) end |
.first_non_dash_char(string) ⇒ Object
60 61 62 63 64 65 66 67 68 69 |
# File 'lib/bio/alignment/dna_sequence.rb', line 60 def first_non_dash_char(string) char_cnt = 0 string.each_char do |char| if char != '-' break end char_cnt += 1 end char_cnt end |
.hash_opts_to_clustalopts(hash) ⇒ Object
40 41 42 43 44 45 46 47 48 |
# File 'lib/bio/alignment/dna_sequence.rb', line 40 def hash_opts_to_clustalopts(hash) array = [] hash.each do |k,v| if CLUSTALW_OPTS.include?(k.to_s) array << "-#{k}=#{v}" end end array end |
.lstrip_dash(string) ⇒ Object
50 51 52 53 |
# File 'lib/bio/alignment/dna_sequence.rb', line 50 def lstrip_dash(string) chr = first_non_dash_char(string) string[chr..-1] end |
.merge_pairwise(aligns) ⇒ Object
assumes all were aligned to the same template (the first of a pair)
117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
# File 'lib/bio/alignment/dna_sequence.rb', line 117 def merge_pairwise(aligns) ps = aligns.map do |align| seqs = [] align.each do |bioseq| seqs << bioseq.to_s end seqs end template = [] #m,x,n x = 2 ftemp = ps.first.first nmax = ps.map {|pair| pair.first.size }.max mmax = ps.size mar = (0...mmax).to_a others = mar.map { [] } ns = mar.map { 0 } tn = 0 on = 0 (0...nmax).each do |n| (t_dsh, t_no_dsh) = mar.partition do |m| # this is RUBY 1.8 ONLY!! ps[m][0][ns[m]] == 45 # '-' is ascii 45 end # if a template has a dash, all other off-templates need a dash if t_dsh.size > 0 template[tn] = 45 t_no_dsh.each do |m| # don't update these guys counter others[m][tn] = 45 end t_dsh.each do |m| others[m][tn] = ps[m][1][ns[m]] ns[m] += 1 end else # no dashes in the template t_no_dsh.each do |m| others[m][tn] = ps[m][1][ns[m]] end template[tn] = ps[0][0][ns[0]] ns.map!{|v| v+1 } end tn += 1 end [cs_to_s(template), others.map! {|ar| cs_to_s(ar) } ] end |
.print_align(io, sequences, labels, opts = {}) ⇒ Object
all gaps <blank>
template gap ^
gap below template .
agreement =
all bad matches ^
non-consensus ?
accepts :template => template_sequence
226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 |
# File 'lib/bio/alignment/dna_sequence.rb', line 226 def print_align(io, sequences, labels, opts={}) opts = {:cutoff => 70, :start => 0, :chars => 20}.merge(opts) (start, length, chars) = opts.values_at(:start, :cutoff, :chars) spacer = " " if opts[:template] sequences.unshift(opts[:template]) labels.unshift(opts[:template_label]) end all_stats = Array.new(6,0) loop do fin = false max_length = 0 lines = [] consensus_line = "" fragments = sequences.map do |string| fin = (start >= string.length ) break if fin string_frag = string[start, length] string_frag end ; break if fin doubles = fragments.zip(labels) doubles = doubles.select {|frag, _| (frag.size > 0) && (frag =~ /[^-]/) } max_length = doubles.map {|frag, _| frag.size }.max (cs, stats) = consensus_string_and_stats( doubles.map {|frag,_| frag } ) all_stats = all_stats.zip(stats).map {|a,b| a + b } doubles.push( [cs, "<CONSENSUS>"] ) lines = doubles.map {|frag, label| [exactly_chars(label, chars),spacer,frag].join } ## the counters at the top of the line start_s = start.to_s finish_s = (start + max_length).to_s count_line_gap = max_length - (start_s.size + finish_s.size) count_line = [start_s, spacer] unless count_line_gap < 1 count_line << " " * count_line_gap end io.puts [exactly_chars("", chars), spacer, count_line.join].join io.puts lines.join("\n") io.puts " " # separator between lines start += length end end |
.strip_dash(string) ⇒ Object
55 56 57 58 |
# File 'lib/bio/alignment/dna_sequence.rb', line 55 def strip_dash(string) ls = lstrip_dash(string) lstrip_dash(ls.reverse).reverse end |