Class: Bio::PepXML
- Inherits:
-
Object
- Object
- Bio::PepXML
- Includes:
- DivvyProteomics::Logging
- Defined in:
- lib/pep_xml.rb
Defined Under Namespace
Instance Attribute Summary collapse
-
#peptide_name_to_object ⇒ Object
Returns the value of attribute peptide_name_to_object.
-
#protein_name_to_object ⇒ Object
Returns the value of attribute protein_name_to_object.
Class Method Summary collapse
Methods included from DivvyProteomics::Logging
Instance Attribute Details
#peptide_name_to_object ⇒ Object
Returns the value of attribute peptide_name_to_object.
6 7 8 |
# File 'lib/pep_xml.rb', line 6
#
# Reader for the peptide_name_to_object attribute.
#
# Bio::PepXML.parse assigns a Hash to this attribute whose keys are the
# stripped `spectrum` attribute values and whose values are Peptide objects.
#
# @return [Object] whatever is currently stored in @peptide_name_to_object
#   (nil if it has never been assigned)
def peptide_name_to_object
  @peptide_name_to_object
end
#protein_name_to_object ⇒ Object
Returns the value of attribute protein_name_to_object.
6 7 8 |
# File 'lib/pep_xml.rb', line 6
#
# Reader for the protein_name_to_object attribute.
#
# Bio::PepXML.parse assigns a Hash to this attribute whose keys are protein
# names and whose values are Protein objects.
#
# @return [Object] whatever is currently stored in @protein_name_to_object
#   (nil if it has never been assigned)
def protein_name_to_object
  @protein_name_to_object
end
Class Method Details
.parse(io) ⇒ Object
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
# File 'lib/pep_xml.rb', line 40
#
# Parses pepXML search results into linked Protein and Peptide objects.
#
# Only `search_hit` elements with hit_rank="1" are used. For each such hit the
# enclosing spectrum (the `spectrum` attribute two levels above the hit) is
# turned into a Peptide, and the hit's protein plus every child element that
# carries a `protein_descr` attribute is turned into a Protein. Peptides and
# proteins are linked in both directions.
#
# @param io [Object] XML source, passed unchanged to REXML::Document.new
# @return [Bio::PepXML] instance whose protein_name_to_object and
#   peptide_name_to_object hashes hold the parsed objects
# @raise [RuntimeError] if a search_hit has no `hit_rank` attribute, if no
#   protein name can be derived for a rank-1 hit, or if the spectrum name
#   attribute is missing
def self.parse(io)
  protein_name_to_object = {}
  peptide_name_to_object = {}

  xml = REXML::Document.new(io)

  # Derives [name, description] from an element's `protein` and
  # `protein_descr` attributes. A missing attribute is treated like an empty
  # one (previously this crashed with NoMethodError on nil). When `protein`
  # is blank the description doubles as the name; otherwise the name is
  # prefixed to the description.
  extract_name_and_description = lambda do |element|
    name = element.attributes['protein'].to_s.strip
    description = element.attributes['protein_descr'].to_s.strip
    if name.empty?
      name = description.dup
    else
      description = "#{name} #{description}"
    end
    # The name is cut at the first tab; tabs/newlines in the description
    # are flattened to spaces.
    [name.gsub(/\t.*/, ''), description.gsub(/[\t\n]/, ' ')]
  end

  # Returns the Protein already registered under +name+, creating and
  # registering it first if necessary. The description is only used on
  # creation.
  find_or_create_protein = lambda do |name, description|
    protein = protein_name_to_object[name]
    if protein.nil?
      protein = Protein.new
      protein.identifier = name
      protein.descriptive_name = description
      protein.peptides = []
      protein_name_to_object[name] = protein
    end
    protein
  end

  # TODO: some better sanity checking here would be ideal.
  num_hits_parsed = 0
  hit_path = 'msms_pipeline_analysis/msms_run_summary/spectrum_query/search_result/search_hit'
  xml.elements.each(hit_path) do |hit|
    hit_rank = hit.attributes['hit_rank']
    raise "Parsing error on #{hit}" if hit_rank.nil?
    next unless hit_rank == '1'

    # Parse the primary hit.
    name1, description1 = extract_name_and_description.call(hit)
    raise "No protein name found in this xml fragment: #{hit}" if name1.empty?

    # Check for nil *before* stripping so the descriptive error is reachable.
    spectrum_attribute = hit.parent.parent.attributes['spectrum']
    if spectrum_attribute.nil?
      raise "Parsing error (couldn't find spectrum name) with spectra #{hit.inspect}"
    end
    spectrum_name = spectrum_attribute.strip

    # It is possible to have multiple peptides both hit the same spectrum
    # with hit_rank="1". This happens when e.g. leucine and isoleucine are
    # both possible, so reuse an already-registered Peptide.
    spectrum = peptide_name_to_object[spectrum_name]
    if spectrum.nil?
      spectrum = Peptide.new
      spectrum.identifier = spectrum_name
      peptide_name_to_object[spectrum_name] = spectrum
    end
    spectrum.parent_proteins ||= []

    protein1 = find_or_create_protein.call(name1, description1)
    # NOTE(review): a protein's peptide list is not de-duplicated, so the same
    # peptide can appear more than once here - confirm callers expect that.
    protein1.peptides.push(spectrum)
    spectrum.parent_proteins.push(protein1)

    # Parse the alternate hits. Only children with a protein_descr attribute
    # are considered - these are the alternate proteins.
    hit.each_element_with_attribute('protein_descr') do |alternate_element|
      name, description = extract_name_and_description.call(alternate_element)
      alternate = find_or_create_protein.call(name, description)
      alternate.peptides.push(spectrum)
      spectrum.parent_proteins.push(alternate)
    end

    # Don't count the same protein multiple times for one spectrum.
    spectrum.parent_proteins.uniq!
    num_hits_parsed += 1
  end
  log.info "Parsed #{num_hits_parsed} search hits"

  pepxml = Bio::PepXML.new
  pepxml.protein_name_to_object = protein_name_to_object
  pepxml.peptide_name_to_object = peptide_name_to_object
  pepxml
end