Class: Bio::PepXML

Inherits:
Object
  • Object
show all
Includes:
DivvyProteomics::Logging
Defined in:
lib/pep_xml.rb

Defined Under Namespace

Classes: Peptide, Protein

Instance Attribute Summary collapse

Class Method Summary collapse

Methods included from DivvyProteomics::Logging

#log

Instance Attribute Details

#peptide_name_to_object ⇒ Object

Returns the value of attribute peptide_name_to_object.



6
7
8
# File 'lib/pep_xml.rb', line 6

# Reader for the peptide lookup table.
#
# @return [Object] the current value of @peptide_name_to_object. When the
#   instance was built by Bio::PepXML.parse this is a Hash keyed by spectrum
#   name whose values are the Peptide objects created for those spectra.
def peptide_name_to_object
  @peptide_name_to_object
end

#protein_name_to_object ⇒ Object

Returns the value of attribute protein_name_to_object.



6
7
8
# File 'lib/pep_xml.rb', line 6

# Reader for the protein lookup table.
#
# @return [Object] the current value of @protein_name_to_object. When the
#   instance was built by Bio::PepXML.parse this is a Hash keyed by protein
#   name whose values are the Protein objects created for those names.
def protein_name_to_object
  @protein_name_to_object
end

Class Method Details

.log ⇒ Object



36
37
38
# File 'lib/pep_xml.rb', line 36

# Class-level shortcut to the instance-level #log method, so that class
# methods can log without already holding a PepXML object.
#
# A fresh Bio::PepXML is instantiated on every call purely to reach #log.
#
# @return [Object] whatever the instance method #log returns
def self.log
  instance = Bio::PepXML.new
  instance.log
end

.parse(io) ⇒ Object



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# File 'lib/pep_xml.rb', line 40

# Parses a pepXML document and builds the protein <-> spectrum cross
# references for every top-ranked (hit_rank="1") search hit.
#
# Only elements found at
# msms_pipeline_analysis/msms_run_summary/spectrum_query/search_result/search_hit
# are examined. For each rank-1 hit, the hit's own protein and every child
# element that carries a protein_descr attribute (the alternate proteins) are
# linked to the Peptide object representing the enclosing spectrum_query.
#
# A missing protein or protein_descr attribute is treated like an empty one
# instead of crashing on nil.
#
# @param io [Object] anything REXML::Document.new accepts as a source
#   (presumably a String or an IO -- confirm against callers)
# @return [Bio::PepXML] an instance whose protein_name_to_object and
#   peptide_name_to_object hashes have been populated
# @raise [RuntimeError] if a search_hit has no hit_rank attribute, if a rank-1
#   hit yields no protein name at all, or if the enclosing spectrum_query has
#   no spectrum attribute
def self.parse(io)
  protein_name_to_object = {}
  peptide_name_to_object = {}

  xml = REXML::Document.new(io)

  # Returns [name, description] for an element carrying protein and/or
  # protein_descr attributes. When there is no protein name the description
  # doubles as the name; otherwise the description is prefixed with the name.
  parse_name_and_description = lambda do |e|
    name = e.attributes['protein'].to_s.strip
    description = e.attributes['protein_descr'].to_s.strip
    if name.empty?
      name = description
    else
      description = "#{name} #{description}"
    end

    # The name is cut at its first tab; tabs and newlines in the description
    # are flattened to spaces.
    [name.gsub(/\t.*/, ''), description.gsub(/[\t\n]/, ' ')]
  end

  # Returns the Protein registered under +name+, creating and registering it
  # on first sight. The description of an already-known protein is kept.
  find_or_create_protein = lambda do |name, description|
    protein_name_to_object[name] ||= begin
      protein = Protein.new
      protein.identifier = name
      protein.descriptive_name = description
      protein.peptides = []
      protein
    end
  end

  # TODO: some better sanity checking here would be ideal.
  num_hits_parsed = 0
  xml.elements.each('msms_pipeline_analysis/msms_run_summary/spectrum_query/search_result/search_hit') do |hit|
    hit_number = hit.attributes['hit_rank']
    raise "Parsing error on #{hit}" if hit_number.nil?
    next if hit_number != '1'

    # Parse the primary hit
    name1, description1 = parse_name_and_description.call(hit)
    raise "No protein name found in this xml fragment: #{hit}" if name1.empty?

    # search_hit -> search_result -> spectrum_query, which holds the spectrum name.
    # Check for nil before stripping so the intended error is the one raised.
    spectrum_name = hit.parent.parent.attributes['spectrum']
    raise "Parsing error (couldn't find spectrum name) with spectra #{hit.inspect}" if spectrum_name.nil?

    spectrum_name = spectrum_name.strip

    # It is possible to have multiple peptides both hit the same spectrum with
    # hit_rank="1". This happens when e.g. leucine and isoleucine are both
    # possible, so reuse the Peptide already registered for this spectrum.
    spectrum = peptide_name_to_object[spectrum_name]
    if spectrum.nil?
      spectrum = Peptide.new
      spectrum.identifier = spectrum_name
      peptide_name_to_object[spectrum_name] = spectrum
    end
    spectrum.parent_proteins ||= []

    # NOTE(review): parent_proteins is de-duplicated below but protein.peptides
    # is not, so a spectrum with several rank-1 hits on one protein appears in
    # that protein's peptides more than once -- confirm whether that is intended.
    link_protein = lambda do |name, description|
      protein = find_or_create_protein.call(name, description)
      protein.peptides.push(spectrum)
      spectrum.parent_proteins.push(protein)
    end

    link_protein.call(name1, description1)

    # Parse the alternate hits. Only look at children with protein_descr
    # attributes - these are the alternate proteins.
    hit.each_element_with_attribute('protein_descr') do |e|
      link_protein.call(*parse_name_and_description.call(e))
    end

    # Don't count the same protein multiple times for one spectrum - this can
    # happen when the spectrum has more than one rank-1 hit.
    spectrum.parent_proteins.uniq!

    num_hits_parsed += 1
  end
  log.info "Parsed #{num_hits_parsed} search hits"

  pepxml = Bio::PepXML.new
  pepxml.protein_name_to_object = protein_name_to_object
  pepxml.peptide_name_to_object = peptide_name_to_object
  pepxml
end