Module: Mspire::Sequest::Srf::Pepxml

Included in:
Mspire::Sequest::Srf
Defined in:
lib/mspire/sequest/srf/pepxml.rb,
lib/mspire/sequest/srf/pepxml.rb

Constant Summary collapse

DEFAULT_OPTIONS =

A hash with the following symbol keys may be set:

Run Info

:ms_model

nil

:ms_ionization

‘ESI’

:ms_detector

‘UNKNOWN’

:ms_mass_analyzer

nil - typically extracted from the srf file and matched with ModelToMsAnalyzer

:ms_manufacturer

‘Thermo’

Raw data

:mz_dir

nil - path to the mz[X]ML directory, defaults to the directory the srf file is contained in. mz[X]ML data must be available to embed retention times

:raw_data

[‘.mzML’, ‘.mzXML’] - preferred extension for raw data

Database

:db_seq_type

‘AA’ - AA or NA

:db_dir

nil - the directory the fasta file used for the search is housed in. A valid pepxml file must point to a valid fasta file!

:db_residue_size

nil - An integer for the number of residues in the database. if true, calculates the size of the fasta database.

*:db_name

nil

:db_orig_url

nil

:db_release_date

nil

:db_release_id

nil

Search Hits

:num_hits

1 - the top number of hits to include

:retention_times

false - include retention times in the file (requires mz_dir to be set)

:deltacn_orig

false - when true, the original SEQUEST deltacn values are used. If false, Bioworks deltacn values are used which are derived by taking the original deltacn of the following hit. This gives the top ranking hit an informative deltacn but makes the deltacn meaningless for other hits.

:pepxml_version

Mspire::Ident::Pepxml::DEFAULT_PEPXML_VERSION, - Integer to set the pepxml version. The converter and xml output attempts to produce xml specific to the version.

:verbose

true - set to false to quiet warnings

{
  :ms_model => nil,
  :ms_ionization => 'ESI',
  :ms_detector => 'UNKNOWN',
  :ms_mass_analyzer => nil,
  :ms_manufacturer => 'Thermo',

  :mz_dir => nil,
  #:raw_data => [".mzXML", '.mzML'],
  :raw_data => ['.mzML', '.mzXML'],

  :db_seq_type => 'AA', 
  :db_dir => nil, 
  :db_residue_size => nil,
  :db_name => nil,
  :db_orig_url => nil,
  :db_release_date => nil,
  :db_release_id => nil,

  :num_hits => 1,
  :retention_times => false,
  :deltacn_orig => false,

  :pepxml_version => Mspire::Ident::Pepxml::DEFAULT_PEPXML_VERSION,
  :verbose => true,
}
ModelToMsAnalyzer =

An array of regexp to string pairs. The regexps are matched against the model (srf.header.model) and the corresponding string will be used as the mass analyzer.

/Orbitrap/

‘Orbitrap’

/LCQ Deca XP/

‘Ion Trap’

/LTQ/

‘Ion Trap’

/w+/

‘UNKNOWN’

[
  [/Orbitrap/, 'Orbitrap'], 
  [/LCQ Deca XP/, 'Ion Trap'],
  [/LTQ/, 'Ion Trap'],
  [/\w+/, 'UNKNOWN'],
]

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.commandline(argv, progname = $0) ⇒ Object



276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
# File 'lib/mspire/sequest/srf/pepxml.rb', line 276

def self.commandline(argv, progname=$0)
  opts = Trollop::Parser.new do
    banner %Q{
      usage: #{progname} [OPTIONS] <file>.srf ...
      output: <file>.xml ...
    }.lines.map(&:lstrip).join

    text ""
    text "major options:"
    opt :db_dir, "The dir holding the DB if different than in Srf. (pepxml requires a valid database path)", :type => :string
    opt :enzyme, "overide the enzyme name embedded in the params file", :type => :string
    opt :mz_dir, "directory holding mz[X]ML files (defaults to the folder holding the srf file)", :type => :string
    opt :retention_times, "include retention times (requires mz-dir)"
    opt :deltacn_orig, "use original deltacn values created by SEQUEST.  By default, the top hit gets the next hit's original deltacn."
    opt :no_filter, "do not filter hits by peptide_mass_tolerance (per sequest params)"
    opt :num_hits, "include N top hits", :default => 1
    opt :outdirs, "list of output directories", :type => :strings
    opt :quiet, "do not print warnings, etc."

    text ""
    text "minor options:"
    opt :pepxml_version, 'schema version number to use', :default => Mspire::Ident::Pepxml::DEFAULT_PEPXML_VERSION
    opt :ms_model, 'mass spectrometer model', :type => :string
    opt :ms_ionization, 'type of ms ionization', :default => 'ESI'
    opt :ms_detector, 'ms detector', :default => 'UNKNOWN'
    opt :ms_mass_analyzer, 'ms mass analyzer', :type => :string
    opt :ms_manufacturer, 'ms manufacturer', :default => 'Thermo'
    opt :raw_data, 'preferred extension for raw data', :default => '.mzML'
    opt :db_seq_type, "'AA' or 'NA'", :default => 'AA'
    opt :db_residue_size, 'calculate the size of the fasta file'
    opt :db_name, 'the database name', :type => :string
    opt :db_orig_url, 'original database url', :type => :string
    opt :db_release_date, 'database release date', :type => :string
    opt :db_release_id, 'the database release identifier', :type => :string
  end

  opt = opts.parse argv
  opts.educate && exit if argv.empty?

  Trollop.die :outdirs, "outdirs must be same size as number of input files" if opt.outdirs && opt.outdirs.size != argv.size
  opt[:filter] = !opt.delete(:no_filter)
  opt[:outdirs] ||= []
  opt[:raw_data] = [opt[:raw_data]] if opt[:raw_data]
  opt[:verbose] = !opt[:quiet]

  argv.zip(opt.delete(:outdirs)) do |srf_file,outdir|
    outdir ||= File.dirname(srf_file)
    srf = Mspire::Sequest::Srf.new(srf_file, :link_protein_hits => false, :filter_by_precursor_mass_tolerance => opt.delete(:filter))
    pepxml = srf.to_pepxml(opt)
    outfile = pepxml.to_xml(outdir)
    puts "wrote file: #{outfile}" if opt[:verbose]
  end
end

Instance Method Details

#to_pepxml(opts = {}) ⇒ Object

returns an Mspire::Ident::Pepxml object. See that object for creating an xml string or writing to file.



86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
# File 'lib/mspire/sequest/srf/pepxml.rb', line 86

def to_pepxml(opts={})
  opt = DEFAULT_OPTIONS.merge(opts)
  srf = self

  # with newer pepxml version these are not required anymore
  hidden_opts = {
    # format of file storing the runner up peptides (if not present in
    # pepXML) this was made optional after version 19
    :out_data_type => "out", ## may be srf??
    # runner up search hit data type extension (e.g. .tgz)
    :out_data => ".srf",
  }
  opt.merge!(hidden_opts)

  params = srf.params
  header = srf.header

  opt[:ms_model] ||= srf.header.model

  unless opt[:ms_mass_analyzer]
    ModelToMsAnalyzer.each do |regexp, val|
      if opt[:ms_model].match(regexp)
        opt[:ms_mass_analyzer] = val
        break
      end
    end
  end

  # get the database name
  db_filename = header.db_filename.sub(/\.hdr$/, '')
  if opt[:db_dir]
    db_filename = File.join(opt[:db_dir], db_filename.split(/[\/\\]+/).last)
  end
  if File.exist?(db_filename)
    db_filename = File.expand_path(db_filename)
  else
    msg = ["!!! WARNING !!!"]
    msg << "!!! Can't find database: #{db_filename}"
    msg << "!!! pepxml *requires* that the db path be valid"
    msg << "!!! make sure 1) the fasta file is available on this system"
    msg << "!!!           2) you've specified a valid directory with --db-dir (or :db_dir)"
    puts msg.join("\n") if opt[:verbose]
  end

  modifications_obj = Mspire::Sequest::Pepxml::Modifications.new(params, srf.header.modifications)
  mass_index = params.mass_index(:precursor)
  h_plus = mass_index['h+']

  opt[:mz_dir] ||= srf.resident_dir
  found_ext = opt[:raw_data].find do |raw_data|
    Dir[File.join(opt[:mz_dir], srf.base_name_noext + raw_data)].first
  end
  opt[:raw_data] = [found_ext] if found_ext

  scan_to_ret_time = 
    if opt[:retention_times]
      raise NotImplementedError, "will implement shortly"
      #mz_file = Dir[File.join(opt[:mz_dir], srf.base_name_noext + opt[:raw_data].first)].first
      #if mz_file
      #  Mspire::Msrun.scans_to_times(mz_file) 
      #else
      #  warn "turning retention_times off since no valid mz[X]ML file was found!!!"
      #  opt[:retention_times] = false
      #  nil
      #end
    end

  summary_xml_filename = srf.base_name_noext + '.xml'

  pepxml = Mspire::Ident::Pepxml.new do |msms_pipeline_analysis|
    msms_pipeline_analysis.merge!(:summary_xml => summary_xml_filename, :pepxml_version => opt[:pepxml_version]) do |msms_run_summary|
      # prep the sample enzyme and search_summary
      msms_run_summary.merge!(
        :base_name => File.join(opt[:mz_dir], srf.base_name_noext),
        :ms_manufacturer => opt[:ms_manufacturer],
        :ms_model => opt[:ms_model],
        :ms_ionization => opt[:ms_ionization],
        :ms_mass_analyzer => opt[:ms_mass_analyzer],
        :ms_detector => opt[:ms_detector],
        :raw_data => opt[:raw_data].first,
        :raw_data_type => opt[:raw_data].first,
      ) do |sample_enzyme, search_summary, spectrum_queries|
        enzyme_data = Hash[ [:offset, :cut, :no_cut].zip(params.enzyme_specificity) ]
        # if the offset is 1, it is C terminal, offset == 0 then it is N
        # terminal
        enzyme_data[:sense] = 
          case enzyme_data.delete(:offset)
          when 1 ; 'C'
          when 0 ; 'N'
          else
          raise "pepxml cannot deal with enzymes that don't have an offset of 0 or 1"
        end
        
        enzyme_data[:name] = params.enzyme
        sample_enzyme.merge!( enzyme_data )
        sample_enzyme.name = opt[:enzyme] if opt[:enzyme]
        search_summary.merge!(
          :base_name=> srf.resident_dir + '/' + srf.base_name_noext,
          :search_engine => 'SEQUEST',
          :precursor_mass_type => params.precursor_mass_type,
          :fragment_mass_type => params.fragment_mass_type,
          :out_data_type => opt[:out_data_type],
          :out_data => opt[:out_data],
        ) do |search_database, enzymatic_search_constraint, modifications_ar, parameters_hash|
          search_database.merge!(:local_path => db_filename, :seq_type => opt[:db_seq_type], :database_name => opt[:db_name], :orig_database_url => opt[:db_orig_url], :database_release_date => opt[:db_release_date], :database_release_identifier => opt[:db_release_id])

          case opt[:db_residue_size]
          when Integer
            search_database.size_of_residues = opt[:db_residue_size]
          when true
            search_database.set_size_of_residues!
          end

          enzymatic_search_constraint.merge!(
            :enzyme => opt[:enzyme] ? opt[:enzyme] : params.enzyme, 
            :max_num_internal_cleavages => params.max_num_internal_cleavages,
            :min_number_termini => params.min_number_termini,
          )
          modifications_ar.replace(modifications_obj.modifications)
          parameters_hash.merge!(params.opts)
        end

        spec_queries = srf.dta_files.zip(srf.out_files, index).map do |dta_file,out_file,i_ar|
          precursor_neutral_mass = dta_file.mh - h_plus

          search_hits = out_file.hits[0,opt[:num_hits]].each_with_index.map do |pep,i|
            (prev_aa, pure_aaseq, next_aa) = Mspire::Ident::Peptide.prepare_sequence(pep.sequence)
            calc_neutral_pep_mass = pep.mh - h_plus
            sh = Mspire::Ident::Pepxml::SearchHit.new(
              :hit_rank => i+1, 
              :peptide => pure_aaseq, 
              :peptide_prev_aa => prev_aa,
              :peptide_next_aa => next_aa,
              :protein => pep.proteins.first.reference.split(' ')[0],
              :num_tot_proteins => pep.proteins.size,
              :num_matched_ions => pep.ions_matched,
              :tot_num_ions => pep.ions_total,
              :calc_neutral_pep_mass => calc_neutral_pep_mass, 
              :massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
              :num_tol_term => sample_enzyme.num_tol_term(prev_aa, pure_aaseq, next_aa),
              :num_missed_cleavages => sample_enzyme.num_missed_cleavages(pure_aaseq),
              :modification_info => modifications_obj.modification_info(Mspire::Ident::Peptide.split_sequence(pep.sequence)[1])
            ) do |search_scores|
              if opt[:deltacn_orig]
                deltacn = pep.deltacn_orig
                deltacnstar = nil
              else
                deltacn = pep.deltacn
                deltacn = 1.0 if deltacn == 1.1
                deltacnstar = out_file.hits[i+1].nil? ? '1' : '0'
              end
              search_scores.merge!( :xcorr => pep.xcorr, :deltacn => deltacn, 
                                   :spscore => pep.sp, :sprank => pep.rsp)
              search_scores[:deltacnstar] = deltacnstar if deltacnstar
            end
          end

          sr = Mspire::Ident::Pepxml::SearchResult.new(:search_hits => search_hits)

          ret_time = 
            if opt[:retention_times]
              (first_scan, last_scan) = i_ar[0,2]
              if first_scan==last_scan
                scan_to_ret_time[i_ar[0]]
              else
                times = ((i_ar[0])..(i_ar[1])).step(1).map {|i| scan_to_ret_time[i] }.compact
                times.inject(&:+) / times.size.to_f
              end
            end
          Mspire::Ident::Pepxml::SpectrumQuery.new(
            :spectrum  => [srf.base_name_noext, *i_ar].join('.'), :start_scan => i_ar[0], :end_scan => i_ar[1], 
            :precursor_neutral_mass => dta_file.mh - h_plus, :assumed_charge => i_ar[2],
            :retention_time_sec => ret_time,  
            :search_results => [sr], 
          )
        end
        spectrum_queries.replace(spec_queries)
      end
    end
  end
  pepxml
end