Class: Bio::Alignment::CIGAR

Inherits:
Object
  • Object
show all
Includes:
IteratePairs
Defined in:
lib/bio-sam-mutation/bio/alignment/cigar.rb

Overview

Parse a CIGAR string An example from Exonerate output. Ideally will also allow SAM file input to be used.

1 : CGGCTATGGGGTCGTGGGTCCCGCGTTG-CTCTGGGGCTCGGCACCCTGGGGCGGCACGGCCGT :  63
    | | || | ||||||||||||||||||| |||||||||||||||||||||||||||||||||||
1 : CAG-TA-GTGGTCGTGGGTCCCGCGTTGTCTCTGGGGCTCGGCACCCTGGGGCGGCACGGCCGT :  62

ref: CAGTAGTGGTCGTGGGTCCCGCGTTGTCTCTGG… cigar: SP-A12_D02_2015-01-16.seq 0 611 + SP-A3_ref 0 621 + 2514 M 3 I 1 M 2 I 1 M 21 D 1 M 306 D 9 M 89 I 1 M 126 D 1 M 24 D 1 M 8 I 1 M 6 D 1 M 5 D 1 M 17 I are not counted in reference Regexp (from SAM specification) - but in exonerate the number comes first: ([0-9][MIDNSHP])|*

Class Attribute Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(string, ref = nil, source = "") ⇒ CIGAR

Returns a new instance of CIGAR.



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 27

def initialize(string,ref=nil,source="")
  # strip out whitespace
  string.gsub!(/\s+/,"")

  # Auto-detect source if not supplied
  if !(Bio::Alignment::CIGAR.regexps.keys.include? source)
    Bio::Alignment::CIGAR.regexps.each do |k,v|
      # Look for match at start of string
      if m = string.match(v)
        source = k if m.offset(0)[0] == 0
      end
    end
    if source == ""
      raise "Source (e.g. 'exonerate', 'sam') not given and failed to auto-detect."
    end
   end
  # Make an array of pairs of of cigar elements:
  @pairs = string.scan(Bio::Alignment::CIGAR.regexps[source])
  if source == "exonerate"
    @pairs.map!{|pair| [pair[0].to_s, pair[1].to_i]}
  else
    # Provision to have number and identifier the other way round
    @pairs.map!{|pair| [pair[1].to_s, pair[0].to_i]}
  end

  # Include reference sequence if provided
  @reference = ref
  # Check length of reference = sum(M+D)?
  #warn "Reference length is not equal to that implied by CIGAR string: #{@reference.length}, #{self.reference_length}." unless @reference.length == self.reference_length

end

Class Attribute Details

.query_operationsObject

Returns the value of attribute query_operations.



16
17
18
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 16

def query_operations
  @query_operations
end

.reference_operationsObject

Returns the value of attribute reference_operations.



16
17
18
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 16

def reference_operations
  @reference_operations
end

.regexpsObject

Returns the value of attribute regexps.



16
17
18
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 16

def regexps
  @regexps
end

.subexpObject

Returns the value of attribute subexp.



16
17
18
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 16

def subexp
  @subexp
end

Instance Attribute Details

#pairsObject

Returns the value of attribute pairs.



25
26
27
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 25

def pairs
  @pairs
end

#referenceObject

Returns the value of attribute reference.



25
26
27
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 25

def reference
  @reference
end

Instance Method Details

#combine_adjacentObject

TODO combine adjacent operations of the same type into a single pair



198
199
200
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 198

def combine_adjacent

end

#deleted_lengthObject



96
97
98
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 96

def deleted_length
  count_type("D")
end

#hgnc(reference_pos = 0, insertions = [], type = "g", *subs) ⇒ Object

Output hgnc variant format given reference position. Only deletions can be accurately annotated from the cigar string; insertions or wild type seqeunces return nil NB mutation calling and annotation now implemented as extension to Bio::DB::Alignment (SAM)



148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 148

def hgnc(reference_pos=0,insertions=[],type="g",*subs)
  if insertions
    if insertions.is_a? String
      insertions = [insertions]
    end
  end
  first_match = true
  total = 0
  hgnc_format = []
  @pairs.each do |pair|
    case pair[0]
      when "M"
        #break if first_match == false
        reference_pos += pair[1]
        total += pair[1]
        first_match = false
      when "D"
        deleted_bases = @reference[total,pair[1]].upcase
        if (pair[1] == 1)
          string = (reference_pos + 1).to_s
        else
          string = (reference_pos + 1).to_s + "_" + (reference_pos + pair[1]).to_s
        end
        string = string + "del" + deleted_bases
        hgnc_format << string
        total += pair[1]
      when "I"
        inserted_bases = (insertions.length == 0) ? "N" : insertions.shift

        hgnc_format << (reference_pos).to_s + "_" + (reference_pos + 1).to_s + "ins" + inserted_bases.upcase
    end
  end
  # Use for substitutions, but could also pass any other annotation to include in here, as an array of strings
  subs = subs.first # >1 arguments discarded
  if subs
    if (subs.length > 0 && (subs.is_a? Array))
      hgnc_format = hgnc_format + subs
    end
  end
  if hgnc_format.length == 0
    nil
  elsif hgnc_format.length == 1
    type.to_s + "." + hgnc_format[0]
  else
    type.to_s + "." + "[" + hgnc_format.join(";") + "]"
  end

end

#inserted_lengthObject



100
101
102
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 100

def inserted_length
  count_type("I")
end

#masked_lengthObject



104
105
106
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 104

def masked_length
  count_type(/[SH]/)
end

#matched_lengthObject



92
93
94
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 92

def matched_length
  count_type("M")
end

#positions(type) ⇒ Object

Returns a hash (keyed by operation type) of three element arrays: the start positions on the reference of operations of the given type(s) and the length of the operation, followed by query position (for e.g. retrieving inserted bases from SAM). A regexp can be used to specify multiple types e.g. /[ID]/.



204
205
206
207
208
209
210
211
212
213
214
215
216
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 204

def positions(type)
  total = 0
  qtotal = 0
  hash = Hash.new{|h,k| h[k] = []}
  @pairs.each do |pair|
    if pair[0].match(type)
      hash[$&] << [total, pair[1], qtotal]
    end
    total += pair[1] if pair[0].match Bio::Alignment::CIGAR.reference_operations
    qtotal += pair[1] if pair[0].match Bio::Alignment::CIGAR.query_operations
  end
  hash
end

#query(insertions = nil) ⇒ Object

Output a representation of the query: replace deleted portions with “-”, flag insertions with “*” or sim. Optionally provide the sequence (or symbols to use) of insertions, in order of appearence. Should be able to accept an array TODO: Add support for substitution highlighting (e.g lowercasing)



119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 119

def query(insertions=nil)
  if (insertions && (insertions.is_a? String))
    insertions = [insertions]
  end
  sequence = []
  total = 0
  @pairs.each do |pair|
    if pair[0].match("M")
      sequence << @reference[total..total+pair[1]-1].upcase
      total += pair[1]
    end
    if pair[0].match("I")
      if (insertions)
        insertion = insertions.shift.to_s
      else
        insertion = '['+pair[1].to_s+']'
      end
      sequence << insertion
    end
    if pair[0].match("D")
      pair[1].times{ sequence << "-" }
      total += pair[1]
    end
  end
  sequence.join("")
end

#query_lengthObject



112
113
114
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 112

def query_length
  count_type(Bio::Alignment::CIGAR.query_operations)
end

#reference_lengthObject



108
109
110
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 108

def reference_length
  count_type(Bio::Alignment::CIGAR.reference_operations)
end

#remove_empty!Object



87
88
89
90
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 87

def remove_empty!
  self.pairs.keep_if{|pair| pair[1] != 0 }
  self
end

#remove_small!(threshold = 1) ⇒ Object



80
81
82
83
84
85
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 80

def remove_small!(threshold=1)
  # Deletions convert to matches, insertions just remove
  deletions_to_matches(threshold)
  remove_small_nonmatches(threshold)
  self
end

#subalignment(offset, length, regexp = Bio::Alignment::CIGAR.reference_operations) ⇒ Object Also known as: slice

Given an offset in reference sequence and length, return an object corresponding to that subregion of the alignment



60
61
62
63
64
65
66
67
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 60

def subalignment(offset,length,regexp=Bio::Alignment::CIGAR.reference_operations)
  new_array = iterate_pairs(@pairs,offset,length,regexp)
  # Return a CIGAR instance with just the new alignment
  new_string = new_array.join(" ")
  # -1 from offset as ruby string starts at zero
  new_cigar = Bio::Alignment::CIGAR.new(new_string,@reference[offset-1,length])
  new_cigar.remove_empty!
end

#subcigar(offset, length) ⇒ Object

Given a CIGAR-based [not reference - use subalignment] offset and length, return a subregion



71
72
73
74
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 71

def subcigar(offset,length)
  # No regexp - includes everything
  self.subalignment(offset,length,//)
end

#unmasked(offset, length) ⇒ Object



76
77
78
# File 'lib/bio-sam-mutation/bio/alignment/cigar.rb', line 76

def unmasked(offset,length)
  self.subalignment(offset,length,/[MDI]/)
end