Class: Fastabin

Inherits:
Object
  • Object
show all
Defined in:
lib/scbi_fqbin/fastabin.rb

Constant Summary collapse

COMPRESSION =

Compression type. Valid values are Zlib::NO_COMPRESSION, Zlib::BEST_SPEED, Zlib::BEST_COMPRESSION, Zlib::DEFAULT_COMPRESSION, or an integer from 0 to 9.

Zlib::BEST_COMPRESSION
SEQUENCES_PER_BLOCK =
10
READ_BIN_REG_EXP =
/^([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)/
READ_REG_EXP =
/^([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)/

Instance Method Summary collapse

Constructor Details

#initialize(filename, mode = 'r', index_filename = nil) ⇒ Fastabin


Constructor




29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/scbi_fqbin/fastabin.rb', line 29

# Open a fastabin file for writing or reading.
#
# @param filename [String] path of the gzip-compressed fastabin file
# @param mode [String] any value containing the letter 'w' (case-insensitive)
#   selects write mode; every other value selects read mode
# @param index_filename [String, nil] path of the index file; when nil,
#   +filename+ with '.index' appended is used
# @raise [RuntimeError] in read mode, when +filename+ does not exist
# @raise [Zlib::Error] in read mode, when the gzip header of +filename+
#   cannot be read
def initialize(filename, mode = 'r', index_filename=nil)
  @filename = filename
  @index_filename = index_filename || (filename + '.index')
  @bin_file = nil
  @need_to_regenerate_index = false
  @added_sequence_count = 0

  if mode.upcase.index('W')
    # Write mode: an index left over from a previous file no longer matches.
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    File.delete(@index_filename) if File.exist?(@index_filename)

    @bin_io = File.open(filename, 'wb')
    @bin_file = Zlib::GzipWriter.new(@bin_io)
  else
    # Read mode: the data file must already be there.
    raise "File #{filename} doesn't exists'" unless File.exist?(filename)

    # Rebuild a missing index before the data file is opened.
    regenerate_index unless File.exist?(@index_filename)

    @bin_io = File.open(filename, 'rb')
    begin
      # A handle opened read-only needs a reader: a GzipWriter cannot read
      # and fails on close when it tries to write its header to the handle.
      @bin_file = Zlib::GzipReader.new(@bin_io)
    rescue Zlib::Error
      # Do not leak the raw handle when the gzip header is invalid.
      @bin_io.close
      raise
    end
  end
end

Instance Method Details

#add_fasta_qual(fasta_file_name, qual_file_name) ⇒ Object


Add a pair of fasta and qual files to the Fastabin file




71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/scbi_fqbin/fastabin.rb', line 71

# Add every sequence of a fasta/qual file pair to the fastabin file.
#
# The pair is read through FastaQualFile and each yielded record is passed
# to #add_seq without extras. A progress line is printed to stdout whenever
# the reader's sequence counter is a multiple of 10000.
#
# @param fasta_file_name [String] path handed to FastaQualFile as fasta file
# @param qual_file_name [String] path handed to FastaQualFile as qual file
# @return [Object] whatever closing the FastaQualFile reader returns
def add_fasta_qual(fasta_file_name,qual_file_name)
  qf = FastaQualFile.new(fasta_file_name, qual_file_name)

  begin
    qf.each do |name, fasta, qual|
      puts "#{Time.now},#{qf.num_seqs}:#{name}" if (qf.num_seqs % 10000).zero?

      add_seq(name, fasta, qual, nil)
    end
  ensure
    # Close the reader even when add_seq (or the reader itself) raises.
    close_result = qf.close
  end

  close_result
end

#add_seq(seq_name, seq_fasta, seq_qual, seq_extras = nil) ⇒ Object


Add one seq to the fastabin file




96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# File 'lib/scbi_fqbin/fastabin.rb', line 96

# Append one sequence record to the fastabin file.
#
# Record layout written to the gzip stream:
#   * the byte length of the header, right-justified to 4 characters
#   * the header "<name> <fasta bytes> <qual bytes> <extras bytes>"
#   * fasta, encoded qualities and extras, each written with +puts+
#
# Every SEQUENCES_PER_BLOCK sequences (including before the very first one)
# the current gzip stream is closed and a new one is appended to the file.
#
# @param seq_name [String] sequence name; written verbatim into the header
# @param seq_fasta [String] sequence; surrounding whitespace is stripped
# @param seq_qual [String, nil] space-separated integer qualities; each one
#   is stored as the character with code (value + 33). nil or an empty
#   string stores no qualities
# @param seq_extras [String, nil] extra data stored verbatim (presumably a
#   String - TODO confirm against callers)
# @return [nil]
def add_seq(seq_name, seq_fasta, seq_qual, seq_extras=nil)
  if (@added_sequence_count % SEQUENCES_PER_BLOCK).zero?
    # Start a new gzip stream appended to the same file.
    @bin_file.close
    @bin_io = File.open(@filename, 'ab')
    @bin_file = Zlib::GzipWriter.new(@bin_io)
    puts "NEW BLOCK"
  end

  @added_sequence_count += 1
  @need_to_regenerate_index = true

  fasta = seq_fasta.strip

  qual =
    if seq_qual.nil? || seq_qual.empty?
      ''
    else
      seq_qual.strip.split(' ').map { |value| (value.to_i + 33).chr }.join
    end

  extras = seq_extras.nil? ? '' : seq_extras

  # Lengths are recorded in bytes, not characters: the header is consumed
  # with read(n), which counts bytes, so multi-byte text must not shorten it.
  head = "#{seq_name} #{fasta.bytesize} #{qual.bytesize} #{extras.bytesize}"
  bin_index_line = "#{head.bytesize.to_s.rjust(4)}#{head}"

  @bin_file.write bin_index_line

  # NOTE(review): puts appends a newline after each field that the sizes in
  # the header do not count, while #each skips exactly fasta+qual+extras
  # bytes - confirm which side extract_seq / regenerate_index expect.
  @bin_file.puts fasta
  @bin_file.puts qual
  @bin_file.puts extras
end

#close ⇒ Object


Close files




279
280
281
282
283
284
285
# File 'lib/scbi_fqbin/fastabin.rb', line 279

# Close the gzip stream (unless it is already closed) and rebuild the index
# when @need_to_regenerate_index is set.
#
# @return [Object, nil] the result of regenerate_index when it ran, else nil
def close
  @bin_file.close unless @bin_file.closed?

  return unless @need_to_regenerate_index

  regenerate_index
end

#count ⇒ Object


Count the lines in the index file; this is the number of sequences.




266
267
268
269
270
271
272
273
274
# File 'lib/scbi_fqbin/fastabin.rb', line 266

# Count the lines of the gzip-compressed index file (@index_filename).
#
# Lines are streamed instead of being loaded into an array, and the block
# form of open closes the reader even if reading fails.
#
# @return [Integer] number of lines in the index file
def count
  Zlib::GzipReader.open(@index_filename) do |index_file|
    index_file.each_line.count
  end
end

#each(get_fasta = true, get_qual = true, get_extras = true) ⇒ Object


Iterate over all sequences of a fastabin file




169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# File 'lib/scbi_fqbin/fastabin.rb', line 169

# Walk the data stream from its start and yield every sequence found.
#
# Each record starts with 4 characters holding the header length, followed
# by a header matching READ_BIN_REG_EXP (name and three sizes). The stream
# position is moved past the three fields before extract_seq is called with
# the position where they start.
#
# The get_fasta, get_qual and get_extras arguments are accepted but not
# used by this method.
#
# @yield [name, fasta, qual, extras] the values returned by extract_seq
# @raise [RuntimeError] when a header does not match READ_BIN_REG_EXP
# @return [nil]
def each(get_fasta=true,get_qual=true,get_extras=true)
  @bin_file.pos = 0

  until @bin_file.eof?
    header_length = @bin_file.read(4).to_i
    header = @bin_file.read(header_length)

    fields = READ_BIN_REG_EXP.match(header)
    raise "Invalid index line found at each" if fields.nil?

    data_start = @bin_file.pos
    fasta_size = fields[2].to_i
    qual_size = fields[3].to_i
    extras_size = fields[4].to_i

    # Jump to the next record before handing this one to the caller.
    @bin_file.pos = data_start + fasta_size + qual_size + extras_size

    name, fasta, qual, extras =
      extract_seq(fields[1], data_start, fasta_size, qual_size, extras_size)

    yield(name, fasta, qual, extras)
  end
end

#each_by_index(get_fasta = true, get_qual = true) ⇒ Object


Iterate over all sequences of a fastabin file, following the entries of its index file




205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# File 'lib/scbi_fqbin/fastabin.rb', line 205

# Yield every sequence listed in the gzip-compressed index file.
#
# Each index line matching READ_REG_EXP (name, position and three sizes) is
# resolved through extract_seq; lines that do not match are skipped.
# The index reader is closed even when the caller's block raises or breaks.
#
# The get_fasta and get_qual arguments are accepted but not used by this
# method.
#
# @yield [name, fasta, qual, extras] the values returned by extract_seq
# @return [Object] whatever closing the index reader returns
def each_by_index(get_fasta=true,get_qual=true)
  index_file = Zlib::GzipReader.open(@index_filename)

  begin
    index_file.each_line do |entry|
      fields = READ_REG_EXP.match(entry)
      next unless fields

      name, fasta, qual, extras = extract_seq(
        fields[1], fields[2].to_i, fields[3].to_i, fields[4].to_i, fields[5].to_i
      )

      yield(name, fasta, qual, extras)
    end
  ensure
    # Runs on normal completion, on exceptions and on break from the block.
    close_result = index_file.close
  end

  close_result
end

#read_seq(seq_name) ⇒ Object


Read one seq from the fastabin file by name




233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
# File 'lib/scbi_fqbin/fastabin.rb', line 233

# Look up one sequence by name in the gzip-compressed index file.
#
# The first index line that starts with +seq_name+ followed by whitespace is
# used. The name is matched literally: it is escaped before being placed in
# the search pattern, so characters such as '.' or '|' are not treated as
# regular-expression syntax.
#
# @param seq_name [String] exact name of the sequence
# @return [Object, nil] the result of extract_seq for that entry, or nil when
#   no line has that name or the line does not match READ_REG_EXP
def read_seq(seq_name)
  name_pattern = /^#{Regexp.escape(seq_name.to_s)}\s/

  # Block form closes the reader even if extract_seq raises.
  Zlib::GzipReader.open(@index_filename) do |index_file|
    entry = index_file.each_line.find { |line| line =~ name_pattern }
    fields = entry && READ_REG_EXP.match(entry.chomp)

    if fields
      extract_seq(fields[1], fields[2].to_i, fields[3].to_i, fields[4].to_i, fields[5].to_i)
    end
  end
end