Class: FastaReader
- Inherits:
-
Object
- Object
- FastaReader
- Includes:
- Indexer
- Defined in:
- lib/bigbio/db/fasta/fastareader.rb,
lib/bigbio/db/fasta/fastareader.rb
Overview
The following is actually a module/trait implementation without state
Class Method Summary collapse
-
.emit(getbuf_func) {|id, descr, seq| ... } ⇒ Object
func passes in a FASTA buffer.
- .emit_fastarecord(getbuf_func) ⇒ Object
Instance Method Summary collapse
- #close ⇒ Object
- #digest_tag(tag) ⇒ Object
-
#each ⇒ Object
returns a FastaRecord for every item (invokes parse_each).
- #first ⇒ Object
-
#get(id) ⇒ Object
Return a record by its id, nil when not found.
- #get_by_index(idx) ⇒ Object
- #get_rec(fpos) ⇒ Object
-
#initialize(fn, opts = {}) ⇒ FastaReader
constructor
Initialize the reader of FASTA file fn.
-
#parse_each ⇒ Object
Parse the FASTA file and yield id, descr, sequence.
-
#size ⇒ Object
Returns the size of the dataset - as read.
Methods included from Indexer
#indexer_get, #indexer_get_by_index, #indexer_set, #indexer_use
Constructor Details
#initialize(fn, opts = {}) ⇒ FastaReader
Initialize the reader of FASTA file fn. Options can be :regex and :index (true/false)
12 13 14 15 16 17 18 |
# File 'lib/bigbio/db/fasta/fastareader.rb', line 12
# Open the FASTA file +fn+ for reading.
#
# opts[:regex] - pattern stored in @regex; falls back to '^(\S+)' when nil.
# opts[:index] - handed unchanged to indexer_use.
def initialize(fn, opts = {})
  @f          = File.open(fn)
  @fread_once = false
  @regex      = opts[:regex]
  @regex      = '^(\S+)' if @regex.nil?
  indexer_use(opts[:index])
end
Class Method Details
.emit(getbuf_func) {|id, descr, seq| ... } ⇒ Object
getbuf_func is called repeatedly and returns the next FASTA buffer (nil or false when there is no more data). Every time a record is parsed it is yielded.
141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
# File 'lib/bigbio/db/fasta/fastareader.rb', line 141
# Parse FASTA text delivered in chunks and yield every record found.
#
# getbuf_func - object responding to #call. It is invoked repeatedly; each
#               truthy result is treated as a buffer of newline separated
#               FASTA lines, a nil/false result ends the parse.
#
# Yields id, descr, seq for each record: descr is the header line without
# the leading '>' (stripped), id is the first run of non-whitespace
# characters of descr, and seq is the concatenation of the stripped
# sequence lines.
#
# Every buffer is split on "\n" on its own, so a line that straddles two
# buffers is seen as two separate lines.
#
# Raises RuntimeError when a header line holds no id (e.g. a bare '>').
def FastaReader.emit(getbuf_func)
  seq = "".dup
  id = nil
  descr = nil
  while (buf = getbuf_func.call)
    buf.split(/\n/).each do |line|
      if line.start_with?('>')
        # a new header closes the record collected so far
        yield id, descr, seq if descr
        descr = line[1..-1].strip
        matched = /^(\S+)/.match(descr)
        # an empty description has no id; fail with a readable message
        # instead of calling [] on nil
        raise "Can not digest '#{line}' using '^(\\S+)'" unless matched
        id = matched[0]
        seq = "".dup # fresh buffer, the yielded one is never mutated
      else
        seq << line.strip
      end
    end
  end
  # the trailing record is only emitted when it carries sequence data
  yield id, descr, seq if descr && seq.size > 0
end
.emit_fastarecord(getbuf_func) ⇒ Object
161 162 163 164 165 |
# File 'lib/bigbio/db/fasta/fastareader.rb', line 161
# Run emit over getbuf_func and hand each parsed record to the caller's
# block wrapped in a FastaRecord (built from id, descr and seq).
def FastaReader.emit_fastarecord(getbuf_func)
  emit(getbuf_func) do |id, descr, seq|
    record = FastaRecord.new(id, descr, seq)
    yield record
  end
end
Instance Method Details
#close ⇒ Object
117 118 119 |
# File 'lib/bigbio/db/fasta/fastareader.rb', line 117
# Close the file handle held in @f.
def close()
  @f.close()
end
#digest_tag(tag) ⇒ Object
97 98 99 100 101 102 103 104 105 106 107 108 109 |
# File 'lib/bigbio/db/fasta/fastareader.rb', line 97
# Split a FASTA header line into its id and description.
#
# tag - header line; it has to start with '>'.
#
# Returns id, descr: descr is the text following '>' (stripped) and id is
# the first capture group of @regex matched against descr.
#
# Raises RuntimeError when tag is not a header line or when descr does not
# match @regex. In the latter case descr and @regex are printed first.
def digest_tag(tag)
  if tag =~ /^>/
    descr = $'.strip
    if descr =~ /#{@regex}/
      return $1, descr
    end
    p descr # do not remove these
    p @regex
  end
  # interpolate instead of concatenating: @regex may be a Regexp, and
  # String#+ would raise TypeError before the real error is reported
  raise "Can not digest '#{tag}' using '#{@regex}'"
end
#each ⇒ Object
returns a FastaRecord for every item (invokes parse_each)
55 56 57 |
# File 'lib/bigbio/db/fasta/fastareader.rb', line 55
# Iterate over the records produced by parse_each, yielding a FastaRecord
# (built from id, descr and seq) for every one of them.
#
# Without a block an Enumerator over the same records is returned instead
# of failing on the first yield.
def each
  return to_enum(:each) unless block_given?

  parse_each { |id, descr, seq| yield FastaRecord.new(id, descr, seq) }
end
#first ⇒ Object
59 60 61 62 63 |
# File 'lib/bigbio/db/fasta/fastareader.rb', line 59
# Start parse_each and leave it as soon as the first record arrives,
# returning that record as a FastaRecord.
def first
  parse_each do |id, descr, seq|
    return FastaRecord.new(id, descr, seq)
  end
end
#get(id) ⇒ Object
Return a record by its id, nil when not found.
66 67 68 69 70 71 72 73 |
# File 'lib/bigbio/db/fasta/fastareader.rb', line 66
# Look up +id+ through indexer_get and load the record stored at the
# file position it reports.
#
# Returns the result of get_rec, or nil when indexer_get finds nothing.
def get(id)
  indexed?
  fpos = indexer_get(id)
  return nil unless fpos

  get_rec(fpos)
end
#get_by_index(idx) ⇒ Object
88 89 90 91 92 93 94 95 |
# File 'lib/bigbio/db/fasta/fastareader.rb', line 88
# Load the record registered at position +idx+ of the index.
#
# The file position is taken from element [1] of whatever
# indexer_get_by_index returns (presumably an [id, fpos] pair - confirm
# against Indexer).
#
# Returns the result of get_rec, or nil when the lookup yields nothing.
def get_by_index(idx)
  indexed?
  entry = indexer_get_by_index(idx)
  # guard the subscript: an empty lookup result must lead to nil,
  # not to a NoMethodError
  fpos = entry && entry[1]
  return nil unless fpos

  get_rec(fpos)
end
#get_rec(fpos) ⇒ Object
75 76 77 78 79 80 81 82 83 84 85 86 |
# File 'lib/bigbio/db/fasta/fastareader.rb', line 75
# Read the single record whose header line starts at byte offset +fpos+.
#
# The header is read first; the following lines are stripped and joined
# into the sequence until the next line starting with '>' or the end of
# the file. A record without any sequence lines yields an empty sequence.
#
# Returns FastaRecord.new(id, descr, seq), with id and descr obtained from
# digest_tag.
def get_rec(fpos)
  @f.seek(fpos)
  tag = @f.gets
  seq = "".dup
  # gets returns nil at end of file, which ends the loop; this also covers
  # a header that is the very last line of the file
  while (line = @f.gets)
    break if line.start_with?('>')
    seq << line.strip
  end
  id, descr = digest_tag(tag)
  FastaRecord.new(id, descr, seq)
end
#parse_each ⇒ Object
Parse the FASTA file and yield id, descr, sequence. When the indexer is on it will index the records the first time. Note that, with indexing, when you don’t complete parsing there will be an error the second time. This is a trade-off, otherwise one would always have to index the file and read it twice.
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/bigbio/db/fasta/fastareader.rb', line 25
# Rewind the file, parse it record by record and yield id, descr, seq
# for each record.
#
# id and descr come from digest_tag applied to the header line; seq is
# the concatenation of the stripped lines up to the next header or the
# end of the file (empty when a header has no sequence lines).
#
# @count is reset and incremented per record. While @indexer is set and
# no earlier pass has completed (@fread_once false), every record is
# registered through indexer_set(id, offset of its header line).
# @fread_once becomes true only when the whole file has been parsed.
def parse_each
  @f.seek 0 # force file rewind
  @rec_fpos = 0
  @rec_line = @f.gets
  @count = 0
  begin
    # digest id from record description; the body always runs at least
    # once, so an empty file (@rec_line nil) is still passed to digest_tag
    id, descr = digest_tag(@rec_line)
    id_fpos = @rec_fpos
    # collect the sequence; fpos trails one line behind so that it points
    # at the start of the header that ends this record
    seq = "".dup
    fpos = @f.tell
    while (line = @f.gets) && !line.start_with?('>')
      seq << line.strip
      fpos = @f.tell
    end
    # new record: line is the next header, or nil at end of file
    @count += 1
    @rec_fpos = fpos
    @rec_line = line
    indexer_set(id, id_fpos) if @indexer && !@fread_once
    yield id, descr, seq
    # continue while a header is pending rather than testing eof, so a
    # final header without sequence lines is still emitted
  end while @rec_line
  @fread_once = true
end
#size ⇒ Object
Returns the size of the dataset - as read. After the final record the size represents the number of items in the FASTA file
113 114 115 |
# File 'lib/bigbio/db/fasta/fastareader.rb', line 113
# Number of records counted so far, i.e. the current value of @count.
def size()
  @count
end