Class: Bio::DB::Fasta::FastaFile

Inherits:
Object
  • Object
show all
Defined in:
lib/bio/db/fastadb.rb

Overview

Class that holds the fasta file. It is used as a database.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(fasta: nil, samtools: false) ⇒ FastaFile

Initialize the fasta file. If the fai file doesn't exists, it is generated at startup

  • fasta path to the fasta file

  • samtools path to samtools, if it is not provided, use the bundled version

Raises:


190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# File 'lib/bio/db/fastadb.rb', line 190

def initialize(fasta: nil, samtools: false)
  #puts "The arguments are: '#{fasta}':'#{samtools}'"
  @fasta_path = fasta
  @samtools = samtools  
  @index = nil
  @fasta_file = nil
  @samtools = File.join(File.expand_path(File.dirname(__FILE__)),'sam','external','samtools') if samtools == true
  raise FastaDBException.new(), "No path for the refernce fasta file. " if @fasta_path.nil?
  @fai_file = @fasta_path + ".fai" 
  unless File.file?(@fai_file) then
    command = "#{@samtools} faidx '#{@fasta_path}'"
    @last_command = command
    system(command)
  end
end

Instance Attribute Details

#fasta_pathObject (readonly)

Returns the value of attribute fasta_path


185
186
187
# File 'lib/bio/db/fastadb.rb', line 185

def fasta_path
  @fasta_path
end

Instance Method Details

#faidx(opts = {}) ⇒ Object

Index reference sequence in the FASTA format or extract subsequence from indexed reference sequence. If no region is specified, faidx will index the file and create <ref.fasta>.fai on the disk. If regions are speficified, the subsequences will be retrieved and printed to stdout in the FASTA format. Options - if a subsequence is required

  • chr - [STRING] the reference name of the subsequence

  • start - [INT] the start position for the subsequence

  • stop - [INT] the stop position for the subsequence


225
226
227
228
229
230
231
232
233
234
# File 'lib/bio/db/fastadb.rb', line 225

def faidx(opts={})
  if opts.has_key?(:chr) and opts.has_key?(:start) and opts.has_key?(:stop)
    opts={:as_bio => false}
    self.fetch_reference(:chr,:start,:stop,opts)
  else
    command = "#{@samtools} faidx #{@fasta_path}"
    @last_command = command
    system(command)
  end
end

#fetch_sequence(region) ⇒ Object

The region needs to have a method to_region or a method to_s that ha the format “chromosome:start-end” as in samtools


282
283
284
285
286
287
288
289
290
291
292
293
294
# File 'lib/bio/db/fastadb.rb', line 282

def fetch_sequence(region)
  load_fai_entries
  region = Region.parse_region(region.to_s) unless region.is_a?(Region) 
  entry = index.region_for_entry(region.entry)
  raise FastaDBException.new "Entry (#{region.entry})not found in reference" unless entry
  raise FastaDBException.new "Region in invalid range (#{region}): Valid range: #{entry.to_region.to_s} has a size of #{entry.size}." if region.end > entry.size or region.start < 1
  seq = @samtools ?  fetch_sequence_samtools(region): fetch_sequence_native(region)
  reference = Bio::Sequence::NA.new(seq)
  if region.respond_to? :orientation and region.orientation == :reverse
    reference.reverse_complement!()
  end
  reference
end

#fetch_sequence_native(region) ⇒ Object


262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# File 'lib/bio/db/fastadb.rb', line 262

def fetch_sequence_native(region)
  query = region
  query = Region.parse_region(region) unless region.is_a?(Region) 
  seq = ""
  #In order to make this reentrant, if we want to make a multithreaded
  #version of this function, we need to get a lock. Currently, only one thred
  #can be assosiated with eache fastadb object
  @fasta_file = File.open(@fasta_path) unless @fasta_file
  entry = index.region_for_entry(query.entry)
 
  start_pointer  =  entry.get_base_coordinate(query.start)
  @fasta_file.seek(start_pointer, IO::SEEK_SET)
  end_pointer  =  entry.get_base_coordinate(query.end)
  to_read = end_pointer - start_pointer + 1
  seq = @fasta_file.read(to_read)
  seq.gsub!(/\s+/, '')
  seq 
end

#fetch_sequence_samtools(region) ⇒ Object


251
252
253
254
255
256
257
258
259
260
# File 'lib/bio/db/fastadb.rb', line 251

def fetch_sequence_samtools(region)
  query = region.to_s
  query = region.to_region.to_s if region.respond_to?(:to_region) 
  command = "#{@samtools} faidx #{@fasta_path} '#{query}'"
  puts "Running: #{command}"  if $DEBUG
  @last_command = command
  seq = ""
  yield_from_pipe(command, String, :text ) {|line| seq = seq + line unless line =~ /^>/}
  seq
end

#indexObject


236
237
238
239
240
241
242
243
244
245
246
247
248
249
# File 'lib/bio/db/fastadb.rb', line 236

def index
  return @index if @index
  if @samtools
    faidx
  else
    samtools = File.join(File.expand_path(File.dirname(__FILE__)),'sam','external','samtools')
    #TODO: make a ruby implementations 
    command = "#{samtools} faidx #{@fasta_path}"
    @last_command = command
    system(command)
  end
    load_fai_entries
  return @index
end

#load_fai_entriesObject

Loads the fai entries


207
208
209
210
211
212
213
214
215
216
# File 'lib/bio/db/fastadb.rb', line 207

def load_fai_entries()
  return  @index.length if @index
  @index = Index.new
  fai_file = @fai_file
  File.open(fai_file).each do | line |
    fields = line.split("\t")
    @index << Entry.new(fields[0], fields[1], fields[2], fields[3], fields[4])
  end     
  @index.length
end