Class: Fastabin
- Inherits:
-
Object
- Object
- Fastabin
- Defined in:
- lib/scbi_fqbin/fastabin.rb
Constant Summary collapse
- COMPRESSION =
Compression type. Valid values are Zlib::NO_COMPRESSION, Zlib::BEST_SPEED, Zlib::BEST_COMPRESSION, Zlib::DEFAULT_COMPRESSION, and an integer from 0 to 9.
Zlib::BEST_COMPRESSION
- SEQUENCES_PER_BLOCK =
10
- READ_BIN_REG_EXP =
/^([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)/
- READ_REG_EXP =
/^([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)/
Instance Method Summary collapse
-
#add_fasta_qual(fasta_file_name, qual_file_name) ⇒ Object
Add a pair of fasta and qual files to the Fastabin file.
-
#add_seq(seq_name, seq_fasta, seq_qual, seq_extras = nil) ⇒ Object
Add one seq to the fastabin file.
-
#close ⇒ Object
Close files.
-
#count ⇒ Object
Count lines in index file.
-
#each(get_fasta = true, get_qual = true, get_extras = true) ⇒ Object
Iterate over all sequences of a fastabin file.
-
#each_by_index(get_fasta = true, get_qual = true) ⇒ Object
Iterate over all sequences of a fastabin file, following the entries of the index file.
-
#initialize(filename, mode = 'r', index_filename = nil) ⇒ Fastabin
constructor
Constructor.
-
#read_seq(seq_name) ⇒ Object
Read one seq from the fastabin file by name.
Constructor Details
#initialize(filename, mode = 'r', index_filename = nil) ⇒ Fastabin
Constructor
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# File 'lib/scbi_fqbin/fastabin.rb', line 29

# Opens a fastabin file for reading or writing.
#
# In write mode (+mode+ contains "w", case-insensitive) an existing index
# file is deleted and the bin file is opened with 'wb', which truncates it.
# In any other mode the bin file must already exist, and a missing index
# file is rebuilt through regenerate_index before the bin file is opened.
#
# @param filename [String] path of the gzip-compressed bin file
# @param mode [String] open mode; any value containing "w"/"W" means write
# @param index_filename [String, nil] path of the index file; defaults to
#   "<filename>.index"
# @raise [RuntimeError] in read mode when +filename+ does not exist
# @raise [Zlib::GzipFile::Error] in read mode when +filename+ is not gzip data
def initialize(filename, mode = 'r', index_filename = nil)
  @filename = filename
  @index_filename = index_filename || "#{filename}.index"
  @bin_file = nil
  @need_to_regenerate_index = false
  @added_sequence_count = 0

  write_mode = mode.upcase.include?('W')

  if write_mode
    # A stale index would describe data that is about to be truncated.
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    File.delete(@index_filename) if File.exist?(@index_filename)
  else
    raise "File #{filename} doesn't exists'" unless File.exist?(filename)

    regenerate_index unless File.exist?(@index_filename)
  end

  @bin_io = File.open(filename, write_mode ? 'wb' : 'rb')

  # Fix: a read-only handle used to be wrapped in a GzipWriter, which offers
  # no read/eof? and raises IOError on close. Read mode now gets a reader.
  @bin_file = if write_mode
                Zlib::GzipWriter.new(@bin_io)
              else
                Zlib::GzipReader.new(@bin_io)
              end
end
Instance Method Details
#add_fasta_qual(fasta_file_name, qual_file_name) ⇒ Object
Add a pair of fasta and qual files to the Fastabin file
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# File 'lib/scbi_fqbin/fastabin.rb', line 71

# Adds every sequence of a fasta/qual file pair to this fastabin file.
#
# The pair is read with FastaQualFile and each record is passed to add_seq
# without extras. A progress line (time, sequence count, name) is printed to
# stdout whenever the reader's num_seqs is a multiple of 10,000.
#
# @param fasta_file_name [String] path handed to FastaQualFile as fasta file
# @param qual_file_name [String] path handed to FastaQualFile as qual file
# @return [Object] not meaningful; callers should not rely on it
def add_fasta_qual(fasta_file_name, qual_file_name)
  qf = FastaQualFile.new(fasta_file_name, qual_file_name)

  begin
    qf.each do |name, fasta, qual|
      puts "#{Time.now},#{qf.num_seqs}:#{name}" if (qf.num_seqs % 10_000).zero?

      add_seq(name, fasta, qual, nil)
    end
  ensure
    # Fix: release the reader even when iteration or add_seq raises.
    qf.close
  end
end
#add_seq(seq_name, seq_fasta, seq_qual, seq_extras = nil) ⇒ Object
Add one seq to the fastabin file
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
# File 'lib/scbi_fqbin/fastabin.rb', line 96

# Appends one sequence record to the fastabin file.
#
# A record is written as: the header length right-justified in 4 characters,
# the header "<name> <fasta size> <qual size> <extras size>", and then the
# fasta, qual and extras strings, each written with +puts+.
#
# Whenever the number of sequences added so far is a multiple of
# SEQUENCES_PER_BLOCK, the current gzip stream is closed and a new one is
# opened on the same file in append mode.
#
# NOTE(review): +puts+ normally appends a newline after each field, and those
# newlines are not counted in the sizes stored in the header - confirm the
# reading side accounts for them.
#
# @param seq_name [String] sequence name stored in the header
# @param seq_fasta [String] sequence; surrounding whitespace is stripped
# @param seq_qual [String, nil] space-separated integer qualities; each value
#   is stored as the character with code (value + 33). nil or empty means
#   no qualities
# @param seq_extras [String, nil] extra data stored verbatim; nil means none
def add_seq(seq_name, seq_fasta, seq_qual, seq_extras = nil)
  if (@added_sequence_count % SEQUENCES_PER_BLOCK).zero?
    # Finish the current gzip stream and append a fresh one to the file.
    @bin_file.close
    @bin_io = File.open(@filename, 'ab')
    @bin_file = Zlib::GzipWriter.new(@bin_io)
    puts "NEW BLOCK"
  end

  @added_sequence_count += 1
  @need_to_regenerate_index = true

  fasta = seq_fasta.strip

  # Fix: a nil qual used to crash on nil.empty?; it now means "no qualities".
  qual = (seq_qual || '').strip.split(' ').map { |score| (score.to_i + 33).chr }.join

  extras = seq_extras.nil? ? '' : seq_extras

  # The header is embedded in the bin file so the index can be rebuilt later.
  head = "#{seq_name} #{fasta.size} #{qual.size} #{extras.size}"
  @bin_file.write "#{head.size.to_s.rjust(4)}#{head}"

  @bin_file.puts fasta
  @bin_file.puts qual
  @bin_file.puts extras
end
#close ⇒ Object
Close files
279 280 281 282 283 284 285 |
# File 'lib/scbi_fqbin/fastabin.rb', line 279

# Closes the bin stream if it is still open and, when sequences were added
# since the index was last known to be valid, rebuilds the index file.
#
# @return [Object, nil] the result of regenerate_index, or nil when no
#   regeneration was needed
def close
  stream = @bin_file
  stream.close unless stream.closed?

  return unless @need_to_regenerate_index

  regenerate_index
end
#count ⇒ Object
Count the lines in the index file; this is the number of sequences.
266 267 268 269 270 271 272 273 274 |
# File 'lib/scbi_fqbin/fastabin.rb', line 266

# Counts the lines of the gzip-compressed index file, i.e. one per indexed
# sequence.
#
# The block form of GzipReader.open closes the reader even if reading fails,
# and the lines are streamed instead of being loaded into memory at once.
#
# @return [Integer] number of lines in the index file
def count
  Zlib::GzipReader.open(@index_filename) do |index_file|
    index_file.each_line.count
  end
end
#each(get_fasta = true, get_qual = true, get_extras = true) ⇒ Object
Iterate over all sequences of a fastabin file
169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 |
# File 'lib/scbi_fqbin/fastabin.rb', line 169

# Walks the bin file from position 0 and yields every stored sequence.
#
# Each record starts with a 4-character header length followed by the header
# itself, which must match READ_BIN_REG_EXP (name, fasta size, qual size,
# extras size). The stream is moved past the record data before the record
# is handed to extract_seq.
#
# NOTE(review): get_fasta, get_qual and get_extras are accepted but not used
# by this method.
# NOTE(review): this relies on @bin_file supporting pos=, read and eof? -
# confirm the stream type opened by the constructor provides all three.
#
# @yield [name, fasta, qual, extras] the values returned by extract_seq
# @raise [RuntimeError] when a header does not match READ_BIN_REG_EXP
def each(get_fasta = true, get_qual = true, get_extras = true)
  @bin_file.pos = 0

  until @bin_file.eof?
    header_length = @bin_file.read(4).to_i
    header = @bin_file.read(header_length)

    raise "Invalid index line found at each" unless header =~ READ_BIN_REG_EXP

    fields = Regexp.last_match
    seq_name = fields[1]
    fasta_size = fields[2].to_i
    qual_size = fields[3].to_i
    extras_size = fields[4].to_i

    # Record data starts right after the header; skip over it.
    data_start = @bin_file.pos
    @bin_file.pos = data_start + fasta_size + qual_size + extras_size

    name, fasta, qual, extras = extract_seq(seq_name, data_start, fasta_size, qual_size, extras_size)
    yield(name, fasta, qual, extras)
  end
end
#each_by_index(get_fasta = true, get_qual = true) ⇒ Object
Iterate over all sequences of a fastabin file, following the entries of the index file
205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 |
# File 'lib/scbi_fqbin/fastabin.rb', line 205

# Iterates over the sequences listed in the gzip-compressed index file.
#
# Every index line matching READ_REG_EXP (name, offset, fasta size, qual
# size, extras size) is resolved through extract_seq and yielded; lines that
# do not match are skipped.
#
# The block form of GzipReader.open closes the index even when the caller's
# block breaks out of the iteration or raises.
#
# NOTE(review): get_fasta and get_qual are accepted but not used by this
# method.
#
# @yield [name, fasta, qual, extras] the values returned by extract_seq
# @return [Object] not meaningful; callers should not rely on it
def each_by_index(get_fasta = true, get_qual = true)
  Zlib::GzipReader.open(@index_filename) do |index_file|
    index_file.each_line do |line|
      fields = READ_REG_EXP.match(line)
      next unless fields

      name, fasta, qual, extras = extract_seq(
        fields[1], fields[2].to_i, fields[3].to_i, fields[4].to_i, fields[5].to_i
      )
      yield(name, fasta, qual, extras)
    end
  end
end
#read_seq(seq_name) ⇒ Object
Read one seq from the fastabin file by name
233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 |
# File 'lib/scbi_fqbin/fastabin.rb', line 233

# Reads one sequence by name, looking it up in the gzip-compressed index.
#
# Only the first index line whose leading field equals +seq_name+ is
# considered. If that line does not match READ_REG_EXP, nil is returned.
#
# @param seq_name [String] exact sequence name; it is matched literally, so
#   characters such as ".", "|" or "+" have no special meaning
# @return [Object, nil] whatever extract_seq returns for the entry, or nil
#   when no usable index entry exists
def read_seq(seq_name)
  # Fix: escape the name so regexp metacharacters in it cannot match other
  # records (for example "a.b" matching "axb", or "|" acting as alternation).
  name_pattern = /^#{Regexp.escape(seq_name.to_s)}\s/

  # The block form closes the index even if extract_seq raises.
  Zlib::GzipReader.open(@index_filename) do |index_file|
    entry = index_file.find { |line| line =~ name_pattern }
    fields = entry && READ_REG_EXP.match(entry.chomp)

    fields && extract_seq(
      fields[1], fields[2].to_i, fields[3].to_i, fields[4].to_i, fields[5].to_i
    )
  end
end