Module: MS::Sequest::Srf::Pepxml
- Included in:
- MS::Sequest::Srf
- Defined in:
- lib/ms/sequest/srf/pepxml.rb,
lib/ms/sequest/srf/pepxml.rb
Constant Summary collapse
- DEFAULT_OPTIONS =
A hash with the following symbol keys may be set:
Run Info
- :ms_model
-
nil
- :ms_ionization
-
‘ESI’
- :ms_detector
-
‘UNKNOWN’
- :ms_mass_analyzer
-
nil - typically extracted from the srf file and matched with ModelToMsAnalyzer
- :ms_manufacturer
-
‘Thermo’
Raw data
- :mz_dir
-
nil - path to the mz[X]ML directory, defaults to the directory the srf file is contained in. mz[X]ML data must be available to embed retention times
- :raw_data
-
[‘.mzML’, ‘.mzXML’] - preferred extension for raw data
Database
- :db_seq_type
-
‘AA’ - AA or NA
- :db_dir
-
nil - the directory the fasta file used for the search is housed in. A valid pepxml file must point to a valid fasta file!
- :db_residue_size
-
nil - An integer for the number of residues in the database. If true, calculates the size of the fasta database.
- :db_name
-
nil
- :db_orig_url
-
nil
- :db_release_date
-
nil
- :db_release_id
-
nil
Search Hits
- :num_hits
-
1 - the top number of hits to include
- :retention_times
-
false - include retention times in the file (requires mz_dir to be set)
- :deltacn_orig
-
false - when true, the original SEQUEST deltacn values are used. If false, Bioworks deltacn values are used which are derived by taking the original deltacn of the following hit. This gives the top ranking hit an informative deltacn but makes the deltacn meaningless for other hits.
- :pepxml_version
-
MS::Ident::Pepxml::DEFAULT_PEPXML_VERSION - Integer to set the pepxml version. The converter and xml output attempts to produce xml specific to the version.
- :verbose
-
true - set to false to quiet warnings
# Default option values; to_pepxml merges caller-supplied options over
# this hash.
{
  # Run info
  :ms_model => nil,
  :ms_ionization => 'ESI',
  :ms_detector => 'UNKNOWN',
  :ms_mass_analyzer => nil,  # when nil, to_pepxml derives it via ModelToMsAnalyzer
  :ms_manufacturer => 'Thermo',

  # Raw data
  :mz_dir => nil,  # when nil, to_pepxml falls back to the srf file's directory
  #:raw_data => [".mzXML", '.mzML'],
  :raw_data => ['.mzML', '.mzXML'],  # extensions, in order of preference

  # Database
  :db_seq_type => 'AA',
  :db_dir => nil,
  :db_residue_size => nil,  # Integer, or true to have the size calculated
  :db_name => nil,
  :db_orig_url => nil,
  :db_release_date => nil,
  :db_release_id => nil,

  # Search hits
  :num_hits => 1,
  :retention_times => false,
  :deltacn_orig => false,
  :pepxml_version => MS::Ident::Pepxml::DEFAULT_PEPXML_VERSION,
  :verbose => true,
}
- ModelToMsAnalyzer =
An array of regexp to string pairs. The regexps are matched against the model (srf.header.model) and the corresponding string will be used as the mass analyzer.
- /Orbitrap/
-
‘Orbitrap’
- /LCQ Deca XP/
-
‘Ion Trap’
- /LTQ/
-
‘Ion Trap’
- /\w+/
-
‘UNKNOWN’
# Ordered [regexp, mass analyzer name] pairs. to_pepxml walks the list
# from the top and stops at the first regexp matching the model string, so
# the specific patterns must come before the catch-all at the end.
[
  [/Orbitrap/, 'Orbitrap'],
  [/LCQ Deca XP/, 'Ion Trap'],
  [/LTQ/, 'Ion Trap'],
  [/\w+/, 'UNKNOWN'],  # fallback: any model containing a word character
]
Class Method Summary collapse
Instance Method Summary collapse
-
#to_pepxml(opts = {}) ⇒ Object
returns an MS::Ident::Pepxml object.
Class Method Details
.commandline(argv, progname = $0) ⇒ Object
263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 |
# File 'lib/ms/sequest/srf/pepxml.rb', line 263
#
# Command-line driver: parses +argv+, then converts every remaining
# argument (an srf file path) to pepxml and writes it to disk.
#
# @param argv [Array<String>] command-line arguments; option parsing is
#   expected to leave only the srf file names in it (the code below relies
#   on this when it iterates over +argv+ after parsing)
# @param progname [String] program name shown in the usage text
# @return [Object] whatever the final iteration over the inputs returns;
#   callers should not depend on it
def self.commandline(argv, progname=$0)
  opts = Trollop::Parser.new do
    # The usage text has to be handed to `banner`; as a bare expression the
    # string was built and then thrown away, so it never appeared in the help.
    banner(%Q{
      usage: #{progname} [OPTIONS] <file>.srf ...
      output: <file>.xml ...
    }.lines.map(&:lstrip).join)
    text ""
    text "major options:"
    opt :db_dir, "The dir holding the DB if different than in Srf. (pepxml requires a valid database path)", :type => :string
    opt :enzyme, "overide the enzyme name embedded in the params file", :type => :string
    opt :mz_dir, "directory holding mz[X]ML files (defaults to the folder holding the srf file)", :type => :string
    opt :retention_times, "include retention times (requires mz-dir)"
    opt :deltacn_orig, "use original deltacn values created by SEQUEST. By default, the top hit gets the next hit's original deltacn."
    opt :no_filter, "do not filter hits by peptide_mass_tolerance (per sequest params)"
    opt :num_hits, "include N top hits", :default => 1
    opt :outdirs, "list of output directories", :type => :strings
    opt :quiet, "do not print warnings, etc."
    text ""
    text "minor options:"
    opt :pepxml_version, 'schema version number to use', :default => MS::Ident::Pepxml::DEFAULT_PEPXML_VERSION
    opt :ms_model, 'mass spectrometer model', :type => :string
    opt :ms_ionization, 'type of ms ionization', :default => 'ESI'
    opt :ms_detector, 'ms detector', :default => 'UNKNOWN'
    opt :ms_mass_analyzer, 'ms mass analyzer', :type => :string
    opt :ms_manufacturer, 'ms manufacturer', :default => 'Thermo'
    opt :raw_data, 'preferred extension for raw data', :default => '.mzML'
    opt :db_seq_type, "'AA' or 'NA'", :default => 'AA'
    opt :db_residue_size, 'calculate the size of the fasta file'
    opt :db_name, 'the database name', :type => :string
    opt :db_orig_url, 'original database url', :type => :string
    opt :db_release_date, 'database release date', :type => :string
    opt :db_release_id, 'the database release identifier', :type => :string
  end

  opt = opts.parse argv

  # No input files: show the help text and stop.
  if argv.empty?
    opts.educate
    exit
  end

  # NOTE(review): the parser is built with Trollop::Parser.new rather than
  # Trollop.options, so the error is reported through the parser itself;
  # the module-level Trollop.die is understood to work only after
  # Trollop.options -- confirm against the Trollop version in use.
  if opt.outdirs && opt.outdirs.size != argv.size
    opts.die :outdirs, "outdirs must be same size as number of input files"
  end

  opt[:filter] = !opt.delete(:no_filter)
  opt[:outdirs] ||= []
  opt[:raw_data] = [opt[:raw_data]] if opt[:raw_data]  # to_pepxml expects a list of extensions
  opt[:verbose] = !opt[:quiet]

  # Pull the filter flag out once. Deleting it inside the loop meant that
  # only the first file saw the requested value and all others got nil.
  filter = opt.delete(:filter)

  argv.zip(opt.delete(:outdirs)) do |srf_file, outdir|
    outdir ||= File.dirname(srf_file)  # default: write next to the srf file
    srf = MS::Sequest::Srf.new(srf_file, :link_protein_hits => false, :filter_by_precursor_mass_tolerance => filter)
    pepxml = srf.to_pepxml(opt)
    outfile = pepxml.to_xml(outdir)
    puts "wrote file: #{outfile}" if opt[:verbose]
  end
end
Instance Method Details
#to_pepxml(opts = {}) ⇒ Object
returns an MS::Ident::Pepxml object. See that object for creating an xml string or writing to file.
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 |
# File 'lib/ms/sequest/srf/pepxml.rb', line 85
#
# Builds an MS::Ident::Pepxml object from this srf object (the receiver).
#
# @param opts [Hash] option overrides, merged over DEFAULT_OPTIONS
# @return [Object] the object created by MS::Ident::Pepxml.new
# @raise [NotImplementedError] when :retention_times is set (embedding
#   retention times is not implemented yet)
def to_pepxml(opts={})
  opt = DEFAULT_OPTIONS.merge(opts)
  srf = self

  # with newer pepxml version these are not required anymore
  hidden_opts = {
    # format of file storing the runner up peptides (if not present in
    # pepXML) this was made optional after version 19
    :out_data_type => "out", ## may be srf??
    # runner up search hit data type extension (e.g. .tgz)
    :out_data => ".srf",
  }
  opt.merge!(hidden_opts)

  params = srf.params
  header = srf.header

  opt[:ms_model] ||= srf.header.model

  # Derive the mass analyzer from the model name unless the caller set it;
  # the first matching ModelToMsAnalyzer entry wins.
  unless opt[:ms_mass_analyzer]
    ModelToMsAnalyzer.each do |regexp, val|
      if opt[:ms_model].match(regexp)
        opt[:ms_mass_analyzer] = val
        break
      end
    end
  end

  # get the database name
  db_filename = header.db_filename.sub(/\.hdr$/, '')
  if opt[:db_dir]
    # keep only the file's base name (either path separator style) and
    # look for it in the user-supplied directory
    db_filename = File.join(opt[:db_dir], db_filename.split(/[\/\\]+/).last)
  end
  if File.exist?(db_filename)
    # Record the absolute path. (This previously read `File.(db_filename)`,
    # i.e. File.call, which raises NoMethodError.)
    db_filename = File.expand_path(db_filename)
  else
    msg = ["!!! WARNING !!!"]
    msg << "!!! Can't find database: #{db_filename}"
    msg << "!!! pepxml *requires* that the db path be valid"
    msg << "!!! make sure 1) the fasta file is available on this system"
    msg << "!!! 2) you've specified a valid directory with --db-dir (or :db_dir)"
    puts msg.join("\n") if opt[:verbose]
  end

  modifications_obj = MS::Sequest::Pepxml::Modifications.new(params, srf.header.modifications)
  mass_index = params.mass_index(:precursor)
  h_plus = mass_index['h+']

  opt[:mz_dir] ||= srf.resident_dir
  # pick the first preferred extension for which a raw data file exists
  found_ext = opt[:raw_data].find do |raw_data|
    Dir[File.join(opt[:mz_dir], srf.base_name_noext + raw_data)].first
  end
  opt[:raw_data] = [found_ext] if found_ext

  scan_to_ret_time =
    if opt[:retention_times]
      raise NotImplementedError, "will implement shortly"
      #mz_file = Dir[File.join(opt[:mz_dir], srf.base_name_noext + opt[:raw_data].first)].first
      #if mz_file
      #  MS::Msrun.scans_to_times(mz_file)
      #else
      #  warn "turning retention_times off since no valid mz[X]ML file was found!!!"
      #  opt[:retention_times] = false
      #  nil
      #end
    end

  summary_xml_filename = srf.base_name_noext + '.xml'

  pepxml = MS::Ident::Pepxml.new do |msms_pipeline_analysis|
    msms_pipeline_analysis.merge!(:summary_xml => summary_xml_filename, :pepxml_version => opt[:pepxml_version]) do |msms_run_summary|
      # prep the sample enzyme and search_summary
      msms_run_summary.merge!(
        :base_name => File.join(opt[:mz_dir], srf.base_name_noext),
        :ms_manufacturer => opt[:ms_manufacturer],
        :ms_model => opt[:ms_model],
        :ms_ionization => opt[:ms_ionization],
        :ms_mass_analyzer => opt[:ms_mass_analyzer],
        :ms_detector => opt[:ms_detector],
        :raw_data => opt[:raw_data].first,
        :raw_data_type => opt[:raw_data].first,
      ) do |sample_enzyme, search_summary, spectrum_queries|
        sample_enzyme.merge!(params.sample_enzyme_hash)
        sample_enzyme.name = opt[:enzyme] if opt[:enzyme]
        search_summary.merge!(
          :base_name=> srf.resident_dir + '/' + srf.base_name_noext,
          :search_engine => 'SEQUEST',
          :precursor_mass_type => params.precursor_mass_type,
          :fragment_mass_type => params.fragment_mass_type,
          :out_data_type => opt[:out_data_type],
          :out_data => opt[:out_data],
        ) do |search_database, enzymatic_search_constraint, modifications_ar, parameters_hash|
          search_database.merge!(:local_path => db_filename, :seq_type => opt[:db_seq_type], :database_name => opt[:db_name], :orig_database_url => opt[:db_orig_url], :database_release_date => opt[:db_release_date], :database_release_identifier => opt[:db_release_id])

          # an Integer is used as given; true asks the database object to
          # compute the size itself; anything else leaves it unset
          case opt[:db_residue_size]
          when Integer
            search_database.size_of_residues = opt[:db_residue_size]
          when true
            search_database.set_size_of_residues!
          end

          enzymatic_search_constraint.merge!(
            :enzyme => opt[:enzyme] ? opt[:enzyme] : params.enzyme,
            :max_num_internal_cleavages => params.max_num_internal_cleavages,
            :min_number_termini => params.min_number_termini,
          )
          modifications_ar.replace(modifications_obj.modifications)
          parameters_hash.merge!(params.opts)
        end

        # i_ar is read below as [first_scan, last_scan, charge]
        spec_queries = srf.dta_files.zip(srf.out_files, index).map do |dta_file,out_file,i_ar|
          precursor_neutral_mass = dta_file.mh - h_plus

          search_hits = out_file.hits[0,opt[:num_hits]].each_with_index.map do |pep,i|
            (prev_aa, pure_aaseq, next_aa) = MS::Ident::Peptide.prepare_sequence(pep.sequence)
            calc_neutral_pep_mass = pep.mh - h_plus
            sh = MS::Ident::Pepxml::SearchHit.new(
              :hit_rank => i+1,
              :peptide => pure_aaseq,
              :peptide_prev_aa => prev_aa,
              :peptide_next_aa => next_aa,
              :protein => pep.proteins.first.reference.split(' ')[0],
              :num_tot_proteins => pep.proteins.size,
              :num_matched_ions => pep.ions_matched,
              :tot_num_ions => pep.ions_total,
              :calc_neutral_pep_mass => calc_neutral_pep_mass,
              :massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
              :num_tol_term => sample_enzyme.num_tol_term(prev_aa, pure_aaseq, next_aa),
              :num_missed_cleavages => sample_enzyme.num_missed_cleavages(pure_aaseq),
              :modification_info => modifications_obj.modification_info(MS::Ident::Peptide.split_sequence(pep.sequence)[1])
            ) do |search_scores|
              if opt[:deltacn_orig]
                deltacn = pep.deltacn_orig
                deltacnstar = nil
              else
                deltacn = pep.deltacn
                deltacn = 1.0 if deltacn == 1.1
                # '1' when no hit follows this one, otherwise '0'
                deltacnstar = out_file.hits[i+1].nil? ? '1' : '0'
              end
              search_scores.merge!( :xcorr => pep.xcorr, :deltacn => deltacn, :spscore => pep.sp, :sprank => pep.rsp)
              search_scores[:deltacnstar] = deltacnstar if deltacnstar
            end
          end

          sr = MS::Ident::Pepxml::SearchResult.new(:search_hits => search_hits)

          ret_time =
            if opt[:retention_times]
              (first_scan, last_scan) = i_ar[0,2]
              if first_scan==last_scan
                scan_to_ret_time[i_ar[0]]
              else
                # average the known retention times over the scan range
                times = ((i_ar[0])..(i_ar[1])).step(1).map {|i| scan_to_ret_time[i] }.compact
                times.inject(&:+) / times.size.to_f
              end
            end
          MS::Ident::Pepxml::SpectrumQuery.new(
            :spectrum => [srf.base_name_noext, *i_ar].join('.'),
            :start_scan => i_ar[0],
            :end_scan => i_ar[1],
            :precursor_neutral_mass => dta_file.mh - h_plus,
            :assumed_charge => i_ar[2],
            :retention_time_sec => ret_time,
            :search_results => [sr],
          )
        end
        spectrum_queries.replace(spec_queries)
      end
    end
  end
  pepxml
end