Module: MS::Sequest::Srf::Pepxml

Included in:: MS::Sequest::Srf

Defined in:: lib/ms/sequest/srf/pepxml.rb,
lib/ms/sequest/srf/pepxml.rb

Constant Summary collapse

DEFAULT_OPTIONS =

A hash with the following symbol keys may be set:

Run Info

:ms_model

nil

:ms_ionization

‘ESI’

:ms_detector

‘UNKNOWN’

:ms_mass_analyzer

nil - typically extracted from the srf file and matched with ModelToMsAnalyzer

:ms_manufacturer

‘Thermo’

Raw data

:mz_dir

nil - path to the mz[X]ML directory, defaults to the directory the srf file is contained in. mz[X]ML data must be available to embed retention times

:raw_data

[‘.mzML’, ‘.mzXML’] - preferred extension for raw data

Database

:db_seq_type

‘AA’ - AA or NA

:db_dir

nil - the directory the fasta file used for the search is housed in. A valid pepxml file must point to a valid fasta file!

:db_residue_size

nil - An integer for the number of residues in the database. if true, calculates the size of the fasta database.

*:db_name

nil

:db_orig_url

nil

:db_release_date

nil

:db_release_id

nil

Search Hits

:num_hits

1 - the top number of hits to include

:retention_times

false - include retention times in the file (requires mz_dir to be set)

:deltacn_orig

false - when true, the original SEQUEST deltacn values are used. If false, Bioworks deltacn values are used which are derived by taking the original deltacn of the following hit. This gives the top ranking hit an informative deltacn but makes the deltacn meaningless for other hits.

:pepxml_version

MS::Ident::Pepxml::DEFAULT_PEPXML_VERSION, - Integer to set the pepxml version. The converter and xml output attempts to produce xml specific to the version.

:verbose

true - set to false to quiet warnings

{
  :ms_model => nil,
  :ms_ionization => 'ESI',
  :ms_detector => 'UNKNOWN',
  :ms_mass_analyzer => nil,
  :ms_manufacturer => 'Thermo',

  :mz_dir => nil,
  #:raw_data => [".mzXML", '.mzML'],
  :raw_data => ['.mzML', '.mzXML'],

  :db_seq_type => 'AA', 
  :db_dir => nil, 
  :db_residue_size => nil,
  :db_name => nil,
  :db_orig_url => nil,
  :db_release_date => nil,
  :db_release_id => nil,

  :num_hits => 1,
  :retention_times => false,
  :deltacn_orig => false,

  :pepxml_version => MS::Ident::Pepxml::DEFAULT_PEPXML_VERSION,
  :verbose => true,
}

ModelToMsAnalyzer = An array of regexp to string pairs. The regexps are matched against the model (srf.header.model) and the corresponding string will be used as the mass analyzer. /Orbitrap/ ‘Orbitrap’ /LCQ Deca XP/ ‘Ion Trap’ /LTQ/ ‘Ion Trap’ /w+/ ‘UNKNOWN’

[
  [/Orbitrap/, 'Orbitrap'], 
  [/LCQ Deca XP/, 'Ion Trap'],
  [/LTQ/, 'Ion Trap'],
  [/\w+/, 'UNKNOWN'],
]

Class Method Summary collapse

.commandline(argv, progname = $0) ⇒ Object

Instance Method Summary collapse

#to_pepxml(opts = {}) ⇒ Object

returns an MS::Ident::Pepxml object.

Class Method Details

.commandline(argv, progname = $0) ⇒ `Object`

# File 'lib/ms/sequest/srf/pepxml.rb', line 263

def self.commandline(argv, progname=$0)
  opts = Trollop::Parser.new do
    banner %Q{
      usage: #{progname} [OPTIONS] <file>.srf ...
      output: <file>.xml ...
    }.lines.map(&:lstrip).join

    text ""
    text "major options:"
    opt :db_dir, "The dir holding the DB if different than in Srf. (pepxml requires a valid database path)", :type => :string
    opt :enzyme, "overide the enzyme name embedded in the params file", :type => :string
    opt :mz_dir, "directory holding mz[X]ML files (defaults to the folder holding the srf file)", :type => :string
    opt :retention_times, "include retention times (requires mz-dir)"
    opt :deltacn_orig, "use original deltacn values created by SEQUEST.  By default, the top hit gets the next hit's original deltacn."
    opt :no_filter, "do not filter hits by peptide_mass_tolerance (per sequest params)"
    opt :num_hits, "include N top hits", :default => 1
    opt :outdirs, "list of output directories", :type => :strings
    opt :quiet, "do not print warnings, etc."

    text ""
    text "minor options:"
    opt :pepxml_version, 'schema version number to use', :default => MS::Ident::Pepxml::DEFAULT_PEPXML_VERSION
    opt :ms_model, 'mass spectrometer model', :type => :string
    opt :ms_ionization, 'type of ms ionization', :default => 'ESI'
    opt :ms_detector, 'ms detector', :default => 'UNKNOWN'
    opt :ms_mass_analyzer, 'ms mass analyzer', :type => :string
    opt :ms_manufacturer, 'ms manufacturer', :default => 'Thermo'
    opt :raw_data, 'preferred extension for raw data', :default => '.mzML'
    opt :db_seq_type, "'AA' or 'NA'", :default => 'AA'
    opt :db_residue_size, 'calculate the size of the fasta file'
    opt :db_name, 'the database name', :type => :string
    opt :db_orig_url, 'original database url', :type => :string
    opt :db_release_date, 'database release date', :type => :string
    opt :db_release_id, 'the database release identifier', :type => :string
  end

  opt = opts.parse argv
  opts.educate && exit if argv.empty?

  Trollop.die :outdirs, "outdirs must be same size as number of input files" if opt.outdirs && opt.outdirs.size != argv.size
  opt[:filter] = !opt.delete(:no_filter)
  opt[:outdirs] ||= []
  opt[:raw_data] = [opt[:raw_data]] if opt[:raw_data]
  opt[:verbose] = !opt[:quiet]

  argv.zip(opt.delete(:outdirs)) do |srf_file,outdir|
    outdir ||= File.dirname(srf_file)
    srf = MS::Sequest::Srf.new(srf_file, :link_protein_hits => false, :filter_by_precursor_mass_tolerance => opt.delete(:filter))
    pepxml = srf.to_pepxml(opt)
    outfile = pepxml.to_xml(outdir)
    puts "wrote file: #{outfile}" if opt[:verbose]
  end
end

Instance Method Details

#to_pepxml(opts = {}) ⇒ `Object`

returns an MS::Ident::Pepxml object. See that object for creating an xml string or writing to file.

# File 'lib/ms/sequest/srf/pepxml.rb', line 85

def to_pepxml(opts={})
  opt = DEFAULT_OPTIONS.merge(opts)
  srf = self

  # with newer pepxml version these are not required anymore
  hidden_opts = {
    # format of file storing the runner up peptides (if not present in
    # pepXML) this was made optional after version 19
    :out_data_type => "out", ## may be srf??
    # runner up search hit data type extension (e.g. .tgz)
    :out_data => ".srf",
  }
  opt.merge!(hidden_opts)

  params = srf.params
  header = srf.header

  opt[:ms_model] ||= srf.header.model

  unless opt[:ms_mass_analyzer]
    ModelToMsAnalyzer.each do |regexp, val|
      if opt[:ms_model].match(regexp)
        opt[:ms_mass_analyzer] = val
        break
      end
    end
  end

  # get the database name
  db_filename = header.db_filename.sub(/\.hdr$/, '')
  if opt[:db_dir]
    db_filename = File.join(opt[:db_dir], db_filename.split(/[\/\\]+/).last)
  end
  if File.exist?(db_filename)
    db_filename = File.expand_path(db_filename)
  else
    msg = ["!!! WARNING !!!"]
    msg << "!!! Can't find database: #{db_filename}"
    msg << "!!! pepxml *requires* that the db path be valid"
    msg << "!!! make sure 1) the fasta file is available on this system"
    msg << "!!!           2) you've specified a valid directory with --db-dir (or :db_dir)"
    puts msg.join("\n") if opt[:verbose]
  end

  modifications_obj = MS::Sequest::Pepxml::Modifications.new(params, srf.header.modifications)
  mass_index = params.mass_index(:precursor)
  h_plus = mass_index['h+']

  opt[:mz_dir] ||= srf.resident_dir
  found_ext = opt[:raw_data].find do |raw_data|
    Dir[File.join(opt[:mz_dir], srf.base_name_noext + raw_data)].first
  end
  opt[:raw_data] = [found_ext] if found_ext

  scan_to_ret_time = 
    if opt[:retention_times]
      raise NotImplementedError, "will implement shortly"
      #mz_file = Dir[File.join(opt[:mz_dir], srf.base_name_noext + opt[:raw_data].first)].first
      #if mz_file
      #  MS::Msrun.scans_to_times(mz_file) 
      #else
      #  warn "turning retention_times off since no valid mz[X]ML file was found!!!"
      #  opt[:retention_times] = false
      #  nil
      #end
    end

  summary_xml_filename = srf.base_name_noext + '.xml'

  pepxml = MS::Ident::Pepxml.new do |msms_pipeline_analysis|
    msms_pipeline_analysis.merge!(:summary_xml => summary_xml_filename, :pepxml_version => opt[:pepxml_version]) do |msms_run_summary|
      # prep the sample enzyme and search_summary
      msms_run_summary.merge!(
        :base_name => File.join(opt[:mz_dir], srf.base_name_noext),
        :ms_manufacturer => opt[:ms_manufacturer],
        :ms_model => opt[:ms_model],
        :ms_ionization => opt[:ms_ionization],
        :ms_mass_analyzer => opt[:ms_mass_analyzer],
        :ms_detector => opt[:ms_detector],
        :raw_data => opt[:raw_data].first,
        :raw_data_type => opt[:raw_data].first,
      ) do |sample_enzyme, search_summary, spectrum_queries|
        sample_enzyme.merge!(params.sample_enzyme_hash)
        sample_enzyme.name = opt[:enzyme] if opt[:enzyme]
        search_summary.merge!(
          :base_name=> srf.resident_dir + '/' + srf.base_name_noext,
          :search_engine => 'SEQUEST',
          :precursor_mass_type => params.precursor_mass_type,
          :fragment_mass_type => params.fragment_mass_type,
          :out_data_type => opt[:out_data_type],
          :out_data => opt[:out_data],
        ) do |search_database, enzymatic_search_constraint, modifications_ar, parameters_hash|
          search_database.merge!(:local_path => db_filename, :seq_type => opt[:db_seq_type], :database_name => opt[:db_name], :orig_database_url => opt[:db_orig_url], :database_release_date => opt[:db_release_date], :database_release_identifier => opt[:db_release_id])

          case opt[:db_residue_size]
          when Integer
            search_database.size_of_residues = opt[:db_residue_size]
          when true
            search_database.set_size_of_residues!
          end

          enzymatic_search_constraint.merge!(
            :enzyme => opt[:enzyme] ? opt[:enzyme] : params.enzyme, 
            :max_num_internal_cleavages => params.max_num_internal_cleavages,
            :min_number_termini => params.min_number_termini,
          )
          modifications_ar.replace(modifications_obj.modifications)
          parameters_hash.merge!(params.opts)
        end

        spec_queries = srf.dta_files.zip(srf.out_files, index).map do |dta_file,out_file,i_ar|
          precursor_neutral_mass = dta_file.mh - h_plus

          search_hits = out_file.hits[0,opt[:num_hits]].each_with_index.map do |pep,i|
            (prev_aa, pure_aaseq, next_aa) = MS::Ident::Peptide.prepare_sequence(pep.sequence)
            calc_neutral_pep_mass = pep.mh - h_plus
            sh = MS::Ident::Pepxml::SearchHit.new(
              :hit_rank => i+1, 
              :peptide => pure_aaseq, 
              :peptide_prev_aa => prev_aa,
              :peptide_next_aa => next_aa,
              :protein => pep.proteins.first.reference.split(' ')[0],
              :num_tot_proteins => pep.proteins.size,
              :num_matched_ions => pep.ions_matched,
              :tot_num_ions => pep.ions_total,
              :calc_neutral_pep_mass => calc_neutral_pep_mass, 
              :massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
              :num_tol_term => sample_enzyme.num_tol_term(prev_aa, pure_aaseq, next_aa),
              :num_missed_cleavages => sample_enzyme.num_missed_cleavages(pure_aaseq),
              :modification_info => modifications_obj.modification_info(MS::Ident::Peptide.split_sequence(pep.sequence)[1])
            ) do |search_scores|
              if opt[:deltacn_orig]
                deltacn = pep.deltacn_orig
                deltacnstar = nil
              else
                deltacn = pep.deltacn
                deltacn = 1.0 if deltacn == 1.1
                deltacnstar = out_file.hits[i+1].nil? ? '1' : '0'
              end
              search_scores.merge!( :xcorr => pep.xcorr, :deltacn => deltacn, 
                                   :spscore => pep.sp, :sprank => pep.rsp)
              search_scores[:deltacnstar] = deltacnstar if deltacnstar
            end
          end

          sr = MS::Ident::Pepxml::SearchResult.new(:search_hits => search_hits)

          ret_time = 
            if opt[:retention_times]
              (first_scan, last_scan) = i_ar[0,2]
              if first_scan==last_scan
                scan_to_ret_time[i_ar[0]]
              else
                times = ((i_ar[0])..(i_ar[1])).step(1).map {|i| scan_to_ret_time[i] }.compact
                times.inject(&:+) / times.size.to_f
              end
            end
          MS::Ident::Pepxml::SpectrumQuery.new(
            :spectrum  => [srf.base_name_noext, *i_ar].join('.'), :start_scan => i_ar[0], :end_scan => i_ar[1], 
            :precursor_neutral_mass => dta_file.mh - h_plus, :assumed_charge => i_ar[2],
            :retention_time_sec => ret_time,  
            :search_results => [sr], 
          )
        end
        spectrum_queries.replace(spec_queries)
      end
    end
  end
  pepxml
end