Class: Bio::MAF::Tiler
- Inherits:
-
Object
- Object
- Bio::MAF::Tiler
- Defined in:
- lib/bio/maf/tiler.rb
Overview
Tiles a given genomic interval. Inspired by: lib/bx/align/tools/tile.py in bx-python
Instance Attribute Summary collapse
-
#fill_char ⇒ String
The character used to fill regions where no sequence data is available for a particular species.
-
#index ⇒ Object
Returns the value of attribute index.
-
#interval ⇒ Object
GenomicInterval.
-
#parser ⇒ Object
Returns the value of attribute parser.
-
#reference ⇒ Object
Returns the value of attribute reference.
-
#remove_absent_species ⇒ Object
Returns the value of attribute remove_absent_species.
-
#species ⇒ Array<String>
The species of interest to extract from the MAF file.
-
#species_map ⇒ Hash
A hash mapping species to their desired output names.
Instance Method Summary collapse
-
#build_bio_alignment ⇒ Bio::BioAlignment::Alignment
Tile sequences to build a new Alignment object.
-
#initialize ⇒ Tiler
constructor
A new instance of Tiler.
- #non_fill_re ⇒ Object
- #output_text ⇒ Object
- #ref_data(range) ⇒ Object
- #runs(mask) {|cur_start...mask.size, cur| ... } ⇒ Object
- #species_for_output ⇒ Object
- #species_to_use ⇒ Object
-
#tile ⇒ Array<String>
Return an array of tiled sequence data, in the order given by #species_to_use.
-
#write_fasta(f) ⇒ Object
Write a FASTA representation of the tiled sequences to the given output stream.
Constructor Details
#initialize ⇒ Tiler
Returns a new instance of Tiler.
35 36 37 38 39 |
# File 'lib/bio/maf/tiler.rb', line 35
# Builds a Tiler with its defaults: an empty species map, '*' as the
# fill character, and removal of absent species switched on.
#
# @return [Tiler] a new instance of Tiler
def initialize
  @species_map = {}
  self.fill_char = '*'
  self.remove_absent_species = true
end
Instance Attribute Details
#fill_char ⇒ String
The character used to fill regions where no sequence data is available for a particular species. Defaults to `*`.
31 32 33 |
# File 'lib/bio/maf/tiler.rb', line 31
# Reader for the fill character.
#
# @return [String] the character used to fill regions where no sequence
#   data is available for a particular species
def fill_char
  @fill_char
end
#index ⇒ Object
Returns the value of attribute index.
11 12 13 |
# File 'lib/bio/maf/tiler.rb', line 11
# Reader for the index attribute.
#
# @return [Object] the value of attribute index
def index
  @index
end
#interval ⇒ Object
GenomicInterval
15 16 17 |
# File 'lib/bio/maf/tiler.rb', line 15
# Reader for the interval attribute.
#
# @return [Object] the GenomicInterval held by this tiler
def interval
  @interval
end
#parser ⇒ Object
Returns the value of attribute parser.
12 13 14 |
# File 'lib/bio/maf/tiler.rb', line 12
# Reader for the parser attribute.
#
# @return [Object] the value of attribute parser
def parser
  @parser
end
#reference ⇒ Object
Returns the value of attribute reference.
13 14 15 |
# File 'lib/bio/maf/tiler.rb', line 13
# Reader for the reference attribute.
#
# @return [Object] the value of attribute reference
def reference
  @reference
end
#remove_absent_species ⇒ Object
Returns the value of attribute remove_absent_species.
33 34 35 |
# File 'lib/bio/maf/tiler.rb', line 33
# Reader for the remove_absent_species attribute.
#
# @return [Object] the value of attribute remove_absent_species
def remove_absent_species
  @remove_absent_species
end
#species ⇒ Array<String>
The species of interest to extract from the MAF file. Will be set as a Parser#sequence_filter for parsing. Defaults to the keys of #species_map.
22 23 24 |
# File 'lib/bio/maf/tiler.rb', line 22
# Reader for the species of interest to extract from the MAF file.
#
# @return [Array<String>] the stored species list, or nil when none was set
def species
  @species
end
#species_map ⇒ Hash
A hash mapping species to their desired output names.
27 28 29 |
# File 'lib/bio/maf/tiler.rb', line 27
# Reader for the species-to-output-name mapping.
#
# @return [Hash] a hash mapping species to their desired output names
def species_map
  @species_map
end
Instance Method Details
#build_bio_alignment ⇒ Bio::BioAlignment::Alignment
Tile sequences to build a new Alignment object. This will have one Sequence per entry in #species or #species_map, in the same order. Each sequence will have an id given by #species_map or, if none is present, the identifier from #species.
181 182 183 184 185 |
# File 'lib/bio/maf/tiler.rb', line 181
# Tiles the sequences and wraps them in a new alignment object.
#
# Each entry of #output_text is an [identifier, sequence text] pair; the
# sequence texts are passed as the first constructor argument and the
# identifiers as the second.
#
# @return [Bio::BioAlignment::Alignment]
def build_bio_alignment
  pairs = output_text.to_a
  seqs = pairs.map { |pair| pair[1] }
  ids = pairs.map { |pair| pair[0] }
  Bio::BioAlignment::Alignment.new(seqs, ids)
end
#non_fill_re ⇒ Object
163 164 165 166 |
# File 'lib/bio/maf/tiler.rb', line 163
# Builds a regular expression matching any single character that is not
# part of #fill_char. The fill character is regexp-escaped before being
# placed inside the negated character class.
#
# @return [Regexp]
def non_fill_re
  Regexp.new("[^" + Regexp.escape(fill_char) + "]")
end
#output_text ⇒ Object
168 169 170 |
# File 'lib/bio/maf/tiler.rb', line 168
# Pairs each output name from #species_for_output with its tiled text
# from #tile, dropping pairs whose tiled text is nil.
#
# @return [Array<Array>] [output name, tiled text] pairs
def output_text
  labels = species_for_output
  labels.zip(tile).reject { |_label, seq| seq.nil? }
end
#ref_data(range) ⇒ Object
73 74 75 76 77 78 79 80 81 82 83 84 85 |
# File 'lib/bio/maf/tiler.rb', line 73
# Fetches reference sequence data for the given range.
#
# @param range [Range] the range to read; only #begin and #end are used for
#   a source responding to read_interval, while a String source is sliced
#   with the range itself
# @return [Object, nil] whatever the reference source returns, or nil when
#   no reference is set
# @raise [RuntimeError] if the reference neither responds to read_interval
#   nor is a String
def ref_data(range)
  source = reference
  return nil unless source
  # A source offering read_interval takes precedence over a plain String.
  return source.read_interval(range.begin, range.end) if source.respond_to?(:read_interval)
  raise "Unhandled reference data source: #{source}" unless source.is_a?(String)

  source.slice(range)
end
#runs(mask) {|cur_start...mask.size, cur| ... } ⇒ Object
199 200 201 202 203 204 205 206 207 208 209 210 |
# File 'lib/bio/maf/tiler.rb', line 199
# Walks +mask+ and yields one (range, value) pair per maximal run of
# consecutive identical elements. Elements are compared by object identity
# (#equal?), not by value.
#
# An empty mask yields nothing. Without this guard the trailing yield
# below would hand the caller a range starting at nil together with a nil
# value.
#
# Elements are expected to be truthy: a run of nil/false is only yielded
# when it is the final run.
#
# @param mask [Array] the elements to scan
# @yield [range, value] called once per run
# @yieldparam range [Range] exclusive index range covering the run
# @yieldparam value [Object] the element making up the run
# @return [Object, nil] the result of the last yield, or nil for an empty mask
def runs(mask)
  return if mask.empty?

  cur = nil
  cur_start = nil
  mask.each_with_index do |obj, i|
    next if cur.equal?(obj)

    # A new run starts at i; emit the one that just ended, if any.
    yield(cur_start...i, cur) if cur
    cur = obj
    cur_start = i
  end
  # The last run extends to the end of the mask.
  yield(cur_start...mask.size, cur)
end
#species_for_output ⇒ Object
91 92 93 |
# File 'lib/bio/maf/tiler.rb', line 91
# Translates every entry of #species_to_use into its output name.
# A species with no (or a nil/false) entry in #species_map keeps its own
# identifier.
#
# @return [Array] output names, in the order of #species_to_use
def species_for_output
  names = species_map
  species_to_use.map { |sp| names[sp] || sp }
end
#species_to_use ⇒ Object
87 88 89 |
# File 'lib/bio/maf/tiler.rb', line 87
# Picks the species list to work with: #species when it is set,
# otherwise the keys of #species_map.
#
# @return [Array] the species identifiers to use
def species_to_use
  chosen = species
  chosen ? chosen : species_map.keys
end
#tile ⇒ Array<String>
Return an array of tiled sequence data, in the order given by #species_to_use.
98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
# File 'lib/bio/maf/tiler.rb', line 98 def tile parser.sequence_filter[:only_species] = species_to_use parser.opts[:remove_gaps] = true LOG.debug { "finding blocks covering interval #{interval}." } blocks = index.find([interval], parser).sort_by { |b| b.vars[:score] } mask = Array.new(interval.length, :ref) i_start = interval.zero_start i_end = interval.zero_end if reference LOG.debug { "using a #{reference.class} reference." } ref_region = ref_data(i_start...i_end) end LOG.debug "tiling #{blocks.count} blocks." blocks.each do |block| ref = block.ref_seq LOG.debug { "tiling with block #{ref.start}-#{ref.end}" } slice_start = [i_start, ref.start].max slice_end = [i_end, ref.end].min mask.fill(block, (slice_start - i_start)...(slice_end - i_start)) end text = [] species_to_use.each { |s| text << '' } nonref_text = text[1...text.size] runs(mask) do |range, block| g_range = (range.begin + i_start)...(range.end + i_start) if block == :ref # not covered by an alignment block # use the reference sequence if given, otherwise 'N' range_size = range.end - range.begin text[0] << if ref_region ref_region.slice(range) else 'N' * range_size end fill_text = fill_char * range_size nonref_text.each { |t| t << fill_text } else # covered by an alignment block t_range = block.ref_seq.text_range(g_range) species_to_use.each_with_index do |species, i| sp_text = text[i] seq = block.sequences.find { |s| s.source == species || s.species == species } if seq # got alignment text sp_text << seq.text.slice(t_range) else # no alignment for this one here, use the fill char sp_text << fill_char * (t_range.end - t_range.begin) end end end end if remove_absent_species non_fill = non_fill_re LOG.debug { "searching for non-fill characters with #{non_fill}" } text.each_with_index do |seq, i| unless non_fill.match(seq) text[i] = nil end end end text end |
#write_fasta(f) ⇒ Object
Write a FASTA representation of the tiled sequences to the given output stream.
192 193 194 195 196 197 |
# File 'lib/bio/maf/tiler.rb', line 192
# Writes the tiled sequences to +f+ in FASTA form: for every pair from
# #output_text, a ">name" header line followed by the sequence text.
#
# @param f [#puts] the output stream to write to
# @return [Object] whatever iterating #output_text with each returns
def write_fasta(f)
  output_text.each do |pair|
    header, body = pair
    f.puts(">#{header}")
    f.puts(body)
  end
end