Class: Bio::Alignment::CIGAR

Inherits:
Object
  • Object
show all
Includes:
IteratePairs
Defined in:
lib/bio-sam-mutation/bio/alignment/cigar.rb

Overview

Parse a CIGAR string An example from Exonerate output. Ideally will also allow SAM file input to be used.

1 : CGGCTATGGGGTCGTGGGTCCCGCGTTG-CTCTGGGGCTCGGCACCCTGGGGCGGCACGGCCGT :  63
    | | || | ||||||||||||||||||| |||||||||||||||||||||||||||||||||||
1 : CAG-TA-GTGGTCGTGGGTCCCGCGTTGTCTCTGGGGCTCGGCACCCTGGGGCGGCACGGCCGT :  62

ref: CAGTAGTGGTCGTGGGTCCCGCGTTGTCTCTGG… cigar: SP-A12_D02_2015-01-16.seq 0 611 + SP-A3_ref 0 621 + 2514 M 3 I 1 M 2 I 1 M 21 D 1 M 306 D 9 M 89 I 1 M 126 D 1 M 24 D 1 M 8 I 1 M 6 D 1 M 5 D 1 M 17 I are not counted in reference Regexp (from SAM specification) - but in exonerate the number comes first: ([0-9][MIDNSHP])|*

Class Attribute Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(string, ref = nil, source = "") ⇒ CIGAR

Returns a new instance of CIGAR.



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 27

def initialize(string,ref=nil,source="")
	# strip out whitespace
	string.gsub!(/\s+/,"")

	# Auto-detect source if not supplied
	if !(Bio::Alignment::CIGAR.regexps.keys.include? source)
	  Bio::Alignment::CIGAR.regexps.each do |k,v|
			# Look for match at start of string
			if m = string.match(v)
			  source = k if m.offset(0)[0] == 0
		  end
		end
		if source == ""
			raise "Source (e.g. 'exonerate', 'sam') not given and failed to auto-detect."
		end
   end
	# Make an array of pairs of of cigar elements:
	@pairs = string.scan(Bio::Alignment::CIGAR.regexps[source])
	if source == "exonerate"
		@pairs.map!{|pair| [pair[0].to_s, pair[1].to_i]}
	else
		# Provision to have number and identifier the other way round
		@pairs.map!{|pair| [pair[1].to_s, pair[0].to_i]}
	end

	# Include reference sequence if provided
	@reference = ref
	# Check length of reference = sum(M+D)?
	#warn "Reference length is not equal to that implied by CIGAR string: #{@reference.length}, #{self.reference_length}." unless @reference.length == self.reference_length

end

Class Attribute Details

.query_operationsObject

Returns the value of attribute query_operations.



16
17
18
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 16

def query_operations
  @query_operations
end

.reference_operationsObject

Returns the value of attribute reference_operations.



16
17
18
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 16

def reference_operations
  @reference_operations
end

.regexpsObject

Returns the value of attribute regexps.



16
17
18
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 16

def regexps
  @regexps
end

.subexpObject

Returns the value of attribute subexp.



16
17
18
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 16

def subexp
  @subexp
end

Instance Attribute Details

#pairsObject

Returns the value of attribute pairs.



25
26
27
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 25

def pairs
  @pairs
end

#referenceObject

Returns the value of attribute reference.



25
26
27
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 25

def reference
  @reference
end

Instance Method Details

#combine_adjacentObject

TODO combine adjacent operations of the same type into a single pair



198
199
200
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 198

def combine_adjacent

end

#deleted_lengthObject



96
97
98
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 96

def deleted_length
	count_type("D")
end

#hgnc(reference_pos = 0, insertions = [], type = "g", *subs) ⇒ Object

Output hgnc variant format given reference position. Only deletions can be accurately annotated from the cigar string; insertions or wild type seqeunces return nil NB mutation calling and annotation now implemented as extension to Bio::DB::Alignment (SAM)



148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 148

def hgnc(reference_pos=0,insertions=[],type="g",*subs)
	if insertions
		if insertions.is_a? String
			insertions = [insertions]
		end
	end
	first_match = true
	total = 0
	hgnc_format = []
	@pairs.each do |pair|
		case pair[0]
			when "M"
				#break if first_match == false
				reference_pos += pair[1]
				total += pair[1]
				first_match = false
			when "D"
				deleted_bases = @reference[total,pair[1]].upcase
				if (pair[1] == 1)
					string = (reference_pos + 1).to_s
				else
					string = (reference_pos + 1).to_s + "_" + (reference_pos + pair[1]).to_s
				end
				string = string + "del" + deleted_bases
				hgnc_format << string
				total += pair[1]
			when "I"
				inserted_bases = (insertions.length == 0) ? "N" : insertions.shift

				hgnc_format << (reference_pos).to_s + "_" + (reference_pos + 1).to_s + "ins" + inserted_bases.upcase
		end
	end
	# Use for substitutions, but could also pass any other annotation to include in here, as an array of strings
	subs = subs.first # >1 arguments discarded
	if subs
		if (subs.length > 0 && (subs.is_a? Array))
			hgnc_format = hgnc_format + subs
		end
	end
	if hgnc_format.length == 0
		nil
	elsif hgnc_format.length == 1
		type.to_s + "." + hgnc_format[0]
	else
		type.to_s + "." + "[" + hgnc_format.join(";") + "]"
	end

end

#inserted_lengthObject



100
101
102
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 100

def inserted_length
	count_type("I")
end

#masked_lengthObject



104
105
106
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 104

def masked_length
	count_type(/[SH]/)
end

#matched_lengthObject



92
93
94
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 92

def matched_length
	count_type("M")
end

#positions(type) ⇒ Object

Returns a hash (keyed by operation type) of three element arrays: the start positions on the reference of operations of the given type(s) and the length of the operation, followed by query position (for e.g. retrieving inserted bases from SAM). A regexp can be used to specify multiple types e.g. /[ID]/.



204
205
206
207
208
209
210
211
212
213
214
215
216
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 204

def positions(type)
	total = 0
	qtotal = 0
	hash = Hash.new{|h,k| h[k] = []}
	@pairs.each do |pair|
		if pair[0].match(type)
			hash[$&] << [total, pair[1], qtotal]
		end
		total += pair[1] if pair[0].match Bio::Alignment::CIGAR.reference_operations
		qtotal += pair[1] if pair[0].match Bio::Alignment::CIGAR.query_operations
	end
	hash
end

#query(insertions = nil) ⇒ Object

Output a representation of the query: replace deleted portions with “-”, flag insertions with “*” or sim. Optionally provide the sequence (or symbols to use) of insertions, in order of appearence. Should be able to accept an array TODO: Add support for substitution highlighting (e.g lowercasing)



119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 119

def query(insertions=nil)
	if (insertions && (insertions.is_a? String))
		insertions = [insertions]
	end
	sequence = []
	total = 0
	@pairs.each do |pair|
		if pair[0].match("M")
			sequence << @reference[total..total+pair[1]-1].upcase
			total += pair[1]
		end
		if pair[0].match("I")
			if (insertions)
				insertion = insertions.shift.to_s
			else
				insertion = '['+pair[1].to_s+']'
			end
			sequence << insertion
		end
		if pair[0].match("D")
			pair[1].times{ sequence << "-" }
			total += pair[1]
		end
	end
	sequence.join("")
end

#query_lengthObject



112
113
114
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 112

def query_length
	count_type(Bio::Alignment::CIGAR.query_operations)
end

#reference_lengthObject



108
109
110
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 108

def reference_length
	count_type(Bio::Alignment::CIGAR.reference_operations)
end

#remove_empty!Object



87
88
89
90
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 87

def remove_empty!
	self.pairs.keep_if{|pair| pair[1] != 0 }
	self
end

#remove_small!(threshold = 1) ⇒ Object



80
81
82
83
84
85
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 80

def remove_small!(threshold=1)
	# Deletions convert to matches, insertions just remove
	deletions_to_matches(threshold)
	remove_small_nonmatches(threshold)
	self
end

#subalignment(offset, length, regexp = Bio::Alignment::CIGAR.reference_operations) ⇒ Object Also known as: slice

Given an offset in reference sequence and length, return an object corresponding to that subregion of the alignment



60
61
62
63
64
65
66
67
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 60

def subalignment(offset,length,regexp=Bio::Alignment::CIGAR.reference_operations)
	new_array = iterate_pairs(@pairs,offset,length,regexp)
	# Return a CIGAR instance with just the new alignment
	new_string = new_array.join(" ")
	# -1 from offset as ruby string starts at zero
	new_cigar = Bio::Alignment::CIGAR.new(new_string,@reference[offset-1,length])
	new_cigar.remove_empty!
end

#subcigar(offset, length) ⇒ Object

Given a CIGAR-based [not reference - use subalignment] offset and length, return a subregion



71
72
73
74
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 71

def subcigar(offset,length)
	# No regexp - includes everything
	self.subalignment(offset,length,//)
end

#unmasked(offset, length) ⇒ Object



76
77
78
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 76

def unmasked(offset,length)
	self.subalignment(offset,length,/[MDI]/)
end