Class: Mspire::Ident::Peptide::Db::Creator
- Inherits:
-
Object
- Object
- Mspire::Ident::Peptide::Db::Creator
- Defined in:
- lib/mspire/ident/peptide/db/creator.rb
Constant Summary collapse
- MAX_NUM_AA_EXPANSION =
3
- STANDARD_AA =
the twenty standard amino acids
%w(A C D E F G H I K L M N P Q R S T V W Y)
- EXPAND_AA =
{'X' => STANDARD_AA}
- DEFAULT_PEPTIDE_CENTRIC_DB =
{ missed_cleavages: 2, min_length: 5, enzyme: Mspire::Digester[:trypsin], remove_digestion_file: true, cleave_initiator_methionine: true, expand_aa: false, uniprot: true, data_type: :hash, }
Class Method Summary collapse
-
.cmdline(argv) ⇒ Object
sets defaults according to DEFAULT_PEPTIDE_CENTRIC_DB.
Instance Method Summary collapse
-
#create(fasta_file, opts = {}) ⇒ Object
writes a new file with the added ‘min_aaseq<Integer>’ creates a temporary digestion file that contains all peptides digesting with certain missed_cleavages (i.e., min_seq_length is not applied to this file but on the final peptide centric db) returns the full name of the written file.
-
#create_digestion_file(fasta_file, opts = {}) ⇒ Object
returns the name of the digestion file that was written.
-
#db_from_fasta_digestion_file(digestion_file, opts = {}) ⇒ Object
returns the full path of the created file.
-
#expand_peptides(peptide, expand_aa_hash) ⇒ Object
does combinatorial expansion of all letters requesting it.
- #get_a_trie ⇒ Object
-
#hash_like_from_digestion_file(digestion_file, min_length, data_type = :hash) ⇒ Object
data_type can be :hash (default), :google_hash (better memory and speed), or :trie (experimental/broken).
Class Method Details
.cmdline(argv) ⇒ Object
sets defaults according to DEFAULT_PEPTIDE_CENTRIC_DB
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
# File 'lib/mspire/ident/peptide/db/creator.rb', line 25 def self.cmdline(argv) opt = DEFAULT_PEPTIDE_CENTRIC_DB.dup opts = OptionParser.new do |op| op. = "usage: #{File.basename($0)} <file>.fasta ..." op.separator "output: " op.separator " <file>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml" op.separator "format:" op.separator " PEPTIDE: ID1<tab>ID2<tab>ID3..." op.separator "" op.separator " Initiator Methionines - by default, will generate two peptides" op.separator " for any peptide found at the N-termini starting with 'M'" op.separator " (i.e., one with and one without the leading methionine)" op.separator "" op.on("--missed-cleavages <#{opt[:missed_cleavages]}>", Integer, "max num of missed cleavages") {|v| opt[:missed_cleavages] = v } op.on("--min-length <#{opt[:min_length]}>", Integer, "the minimum peptide aaseq length") {|v| opt[:min_length] = v } op.on("--no-cleaved-methionine", "does not cleave off initiator methionine") { opt[:cleave_initiator_methionine] = false } op.on("--expand-x", "enumerate all aa possibilities for 'X'", "(default is to remove these peptides)") {|v| opt[:expand_aa] = v } op.on("--no-uniprot", "use entire protid section of fasta header", "for non-uniprot fasta files") { opt[:uniprot] = false } #op.on("--trie", "use a trie (for very large uniprot files)", "must have fast_trie gem installed") {|v| opt[:trie] = v } op.on("--data-type <#{opt[:data_type]}>", "hash|google_hash|trie", "need google_hash or fast_trie gem installed") {|v| opt[:data_type] = v.to_sym } op.on("-e", "--enzyme <#{opt[:enzyme].name}>", "enzyme for digestion (Mspire::Digester[name])") {|v| opt[:enzyme] = Mspire::Digester[v] } op.on("-v", "--verbose", "talk about it") { $VERBOSE = 5 } op.on("--list-enzymes", "lists approved enzymes and exits") do op.on("-v", "--verbose", "talk about it") { $VERBOSE = 5 } puts Mspire::Digester::ENZYMES.keys.join("\n") exit end end opts.parse!(argv) if argv.size == 0 puts opts || exit end argv.map do |file| creator = Mspire::Ident::Peptide::Db::Creator.new creator.create(file, opt) end end |
Instance Method Details
#create(fasta_file, opts = {}) ⇒ Object
writes a new file with the added ‘min_aaseq<Integer>’ creates a temporary digestion file that contains all peptides digesting with certain missed_cleavages (i.e., min_seq_length is not applied to this file but on the final peptide centric db) returns the full name of the written file.
223 224 225 226 227 228 |
# File 'lib/mspire/ident/peptide/db/creator.rb', line 223 def create(fasta_file, opts={}) opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts) digestion_file = create_digestion_file(fasta_file, opts) puts "created file of size: #{File.size(digestion_file)}" if $VERBOSE db_from_fasta_digestion_file(digestion_file, opts) end |
#create_digestion_file(fasta_file, opts = {}) ⇒ Object
returns the name of the digestion file that was written
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
# File 'lib/mspire/ident/peptide/db/creator.rb', line 69 def create_digestion_file(fasta_file, opts={}) opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts) (missed_cleavages, enzyme, cleave_initiator_methionine, ) = opts.values_at(:missed_cleavages, :enzyme, :cleave_initiator_methionine, :expand_aa) start_time = Time.now print "Digesting #{fasta_file} ..." if $VERBOSE = Regexp.new("[" << Regexp.escape(EXPAND_AA.keys.join) << "]") base = fasta_file.chomp(File.extname(fasta_file)) digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides" File.open(digestion_file, "w") do |fh| Mspire::Fasta.open(fasta_file) do |fasta| fasta.each do |prot| peptides = enzyme.digest(prot.sequence, missed_cleavages) if (cleave_initiator_methionine && (prot.sequence[0,1] == "M")) m_peps = [] init_methionine_peps = [] peptides.each do |pep| # if the peptide is at the beginning of the protein sequence if prot.sequence[0,pep.size] == pep m_peps << pep[1..-1] end end peptides.push(*m_peps) end peptides = if peptides.flat_map do |pep| (pep =~ ) ? (pep, EXPAND_AA) : pep end else peptides.select {|pep| pep !~ } end header = prot.header id = opts[:uniprot] ? Mspire::Fasta.uniprot_id(header) : header.split(/\s/,2).first fh.puts( id + "\t" + peptides.join(" ") ) end end end puts "#{Time.now - start_time} sec" if $VERBOSE digestion_file end |
#db_from_fasta_digestion_file(digestion_file, opts = {}) ⇒ Object
returns the full path of the created file
114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
# File 'lib/mspire/ident/peptide/db/creator.rb', line 114 def db_from_fasta_digestion_file(digestion_file, opts={}) opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts) start_time = Time.now puts "Organizing raw digestion #{digestion_file} ..." if $VERBOSE puts "#{Time.now - start_time} sec" if $VERBOSE hash_like = hash_like_from_digestion_file(digestion_file, opts[:min_length], opts[:data_type]) base = digestion_file.chomp(File.extname(digestion_file)) final_outfile = if opts[:data_type] == :trie base + ".min_aaseq#{opts[:min_length]}" else base + ".min_aaseq#{opts[:min_length]}" + ".yml" end start_time = Time.now print "Writing #{hash_like.size} peptides to #{} ..." if $VERBOSE if opts[:data_type] == :trie trie = hash_like trie.save(final_outfile) else File.open(final_outfile, 'w') do |out| hash_like.each do |k,v| out.puts( [k, v.join(Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER)].join(Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER) ) #out.puts "#{k}#{Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER}#{v}" end end end puts "#{Time.now - start_time} sec" if $VERBOSE if opts[:remove_digestion_file] File.unlink(digestion_file) end File.(final_outfile) end |
#expand_peptides(peptide, expand_aa_hash) ⇒ Object
does combinatorial expansion of all letters requesting it. expand_aa is hash like: ‘X’=>STANDARD_AA returns nil if there are more than MAX_NUM_AA_EXPANSION amino acids to be expanded returns an empty array if there is no expansion
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 |
# File 'lib/mspire/ident/peptide/db/creator.rb', line 235 def (peptide, ) letters_in_order = .keys.sort index_and_key = [] peptide.split('').each_with_index do |char,i| if let_index = letters_in_order.index(char) index_and_key << [i, letters_in_order[let_index]] end end if index_and_key.size > MAX_NUM_AA_EXPANSION return nil end = [peptide] index_and_key.each do |i,letter| new_peps = [] while current_pep = .shift do new_peps << [letter].map {|v| dp = current_pep.dup ; dp[i] = v ; dp } end = new_peps.flatten end end |
#get_a_trie ⇒ Object
153 154 155 156 157 158 159 160 |
# File 'lib/mspire/ident/peptide/db/creator.rb', line 153 def get_a_trie begin require 'trie' rescue raise LoadError, "must first install fast_trie" end Trie.new end |
#hash_like_from_digestion_file(digestion_file, min_length, data_type = :hash) ⇒ Object
data_type can be :hash (default), :google_hash (better memory and speed), or :trie (experimental/broken)
164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 |
# File 'lib/mspire/ident/peptide/db/creator.rb', line 164 def hash_like_from_digestion_file(digestion_file, min_length, data_type=:hash) case data_type when :trie trie = get_a_trie ::IO.foreach(digestion_file) do |line| line.chomp! (prot, *peps) = line.split(/\s+/) # prot is something like this: "P31946" peps.uniq! peps.each do |pep| if pep.size >= min_length if trie.has_key?(pep) ar = trie.get(pep) ar << prot else trie.add( pep, [prot] ) end end end end trie when :hash, :google_hash hash = case data_type when :hash Hash.new when :google_hash require 'google_hash' # this guy is very slow compared to ruby hash (10X slower?) but more # memory efficient (but only ~10% or so improvement with this # application) GoogleHashDenseRubyToRuby.new # still need to try Sparse one out and compare performance #GoogleHashSparseRubyToRuby.new end ::IO.foreach(digestion_file) do |line| line.chomp! (prot, *peps) = line.split(/\s+/) # prot is something like this: "P31946" peps.uniq! peps.each do |pep| if pep.size >= min_length if hash.key?(pep) hash[pep] << prot else hash[pep] = [prot] end end end end hash end end |