Class: Mspire::Mzml

Inherits:
Object
  • Object
show all
Includes:
Enumerable
Defined in:
lib/mspire/mzml.rb,
lib/mspire/mzml/cv.rb,
lib/mspire/mzml/run.rb,
lib/mspire/mzml/list.rb,
lib/mspire/mzml/scan.rb,
lib/mspire/mzml/plms1.rb,
lib/mspire/mzml/sample.rb,
lib/mspire/mzml/contact.rb,
lib/mspire/mzml/product.rb,
lib/mspire/mzml/software.rb,
lib/mspire/mzml/spectrum.rb,
lib/mspire/mzml/component.rb,
lib/mspire/mzml/precursor.rb,
lib/mspire/mzml/scan_list.rb,
lib/mspire/mzml/activation.rb,
lib/mspire/mzml/data_array.rb,
lib/mspire/mzml/index_list.rb,
lib/mspire/mzml/scan_window.rb,
lib/mspire/mzml/source_file.rb,
lib/mspire/mzml/chromatogram.rb,
lib/mspire/mzml/file_content.rb,
lib/mspire/mzml/selected_ion.rb,
lib/mspire/mzml/scan_settings.rb,
lib/mspire/mzml/spectrum_list.rb,
lib/mspire/mzml/data_processing.rb,
lib/mspire/mzml/file_description.rb,
lib/mspire/mzml/isolation_window.rb,
lib/mspire/mzml/chromatogram_list.rb,
lib/mspire/mzml/processing_method.rb,
lib/mspire/mzml/instrument_configuration.rb,
lib/mspire/mzml/data_array_container_like.rb,
lib/mspire/mzml/referenceable_param_group.rb

Overview

Reading an mzml file:

# open wraps File.open and hands the block a new Mspire::Mzml built on that io
Mspire::Mzml.open("somefile.mzML") do |mzml|
  # each is an alias of each_spectrum, so this visits the spectra in index order
  mzml.each do |spectrum|
    scan = spectrum.scan
    spectrum.mzs                  # array of m/zs
    spectrum.intensities          # array of intensities
    spectrum.peaks do |mz,intensity|
      puts "mz: #{mz} intensity: #{intensity}" 
    end
  end
end

Note that the mzml object supports random spectrum access (even if the mzml was not indexed):

mzml[22]  # retrieve spectrum at index 22

Writing an mzml file from scratch:

# NOTE(review): 'scan=1' is presumably the spectrum id and params a list of
# CV accessions (optionally with a value) -- confirm against Spectrum.new
spec1 = Mspire::Mzml::Spectrum.new('scan=1', params: ['MS:1000127', ['MS:1000511', 1]]) do |spec|
  spec.data_arrays = [[1,2,3], [4,5,6]]
  spec.scan_list = Mspire::Mzml::ScanList.new do |sl|
    scan = Mspire::Mzml::Scan.new do |scan|
      # retention time of 40 seconds
      scan.describe! ['MS:1000016', 40.0, 'UO:0000010']
    end
    sl << scan
  end
end

# with no IO or Hash argument, initialize only yields the new object to the
# block; cvs, software_list, instrument_configurations and
# data_processing_list already start out as empty arrays
mzml = Mspire::Mzml.new do |mzml|
  mzml.id = 'the_little_example'
  # to_xml raises unless at least one CV is present
  mzml.cvs = Mspire::Mzml::CV::DEFAULT_CVS
  mzml.file_description = Mspire::Mzml::FileDescription.new  do |fd|
    fd.file_content = Mspire::Mzml::FileContent.new
    fd.source_files << Mspire::Mzml::SourceFile.new
  end
  default_instrument_config = Mspire::Mzml::InstrumentConfiguration.new("IC",[], params: ['MS:1000031'])
  mzml.instrument_configurations << default_instrument_config
  software = Mspire::Mzml::Software.new
  mzml.software_list << software
  default_data_processing = Mspire::Mzml::DataProcessing.new("did_nothing")
  mzml.data_processing_list << default_data_processing
  # the run is the last element written by to_xml
  mzml.run = Mspire::Mzml::Run.new("little_run", default_instrument_config) do |run|
    spectrum_list = Mspire::Mzml::SpectrumList.new(default_data_processing)
    spectrum_list.push(spec1)
    run.spectrum_list = spectrum_list
  end
end

Defined Under Namespace

Modules: Component, DataArrayContainerLike, Default, List, Parser Classes: Activation, CV, Chromatogram, ChromatogramList, Contact, DataArray, DataProcessing, FileContent, FileDescription, Index, IndexList, InstrumentConfiguration, IsolationWindow, Precursor, ProcessingMethod, Product, ReferenceableParamGroup, Run, Sample, Scan, ScanList, ScanNumbersNotFound, ScanNumbersNotUnique, ScanSettings, ScanWindow, SelectedIon, Software, SourceFile, Spectrum, SpectrumList

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(arg = nil, &block) ⇒ Mzml

arg must be an IO object for automatic index and header parsing to occur. If arg is a hash, then attributes are set. In addition (or alternatively), a block may be given; it is called with self to set up the object.

io must respond_to?(:size), giving the size of the io object in bytes which allows seeking. get_index_list is called to get or create the index list.



147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# File 'lib/mspire/mzml.rb', line 147

# Builds an Mzml object.
#
# @param arg [IO, Hash, nil] when an IO, the encoding is taken from the first
#   line, the index list is read or created (get_index_list) and the header
#   is parsed (read_header!); when a Hash, every key/value pair is applied
#   through the matching "key=" setter; any other value is ignored.
# @yield [self] optional block, called with the new object after arg has
#   been handled
# @raise [RuntimeError] if arg is an IO that is empty or whose first line
#   carries no encoding attribute
def initialize(arg=nil, &block)
  # the required list attributes always start out as empty arrays
  %w(cvs software_list instrument_configurations data_processing_list).each {|guy| self.send( guy + '=', [] ) }

  case arg
  when IO
    @io = arg
    @encoding =
      begin
        @io.bookmark(true) do |io|
          # match returns nil when the first line has no encoding attribute;
          # guard it so the failure below is the intended RuntimeError and
          # not a NoMethodError on nil
          declaration = io.readline.match(/encoding=["'](.*?)["']/)
          declaration && declaration[1]
        end
      rescue EOFError
        nil  # empty stream: reported by the check below
      end
    unless @encoding
      raise RuntimeError, "no encoding present in XML!  (Is this even an xml file?)"
    end
    @index_list = get_index_list
    read_header!
  when Hash
    arg.each {|k,v| self.send("#{k}=", v) }
  end
  block.call(self) if block
end

Instance Attribute Details

#accessionObject

(optional) e.g. a PRIDE accession number



98
99
100
# File 'lib/mspire/mzml.rb', line 98

# (optional) e.g. a PRIDE accession number; written as the mzML element's
# accession attribute by to_xml when set
def accession
  @accession
end

#cvsObject

(required) an array of Mspire::Mzml::CV objects



105
106
107
# File 'lib/mspire/mzml.rb', line 105

# (required) an array of Mspire::Mzml::CV objects; starts as an empty array
# and must be non-empty before to_xml is called
def cvs
  @cvs
end

#data_processing_listObject

(required) an array of Mspire::Mzml::DataProcessing objects



126
127
128
# File 'lib/mspire/mzml.rb', line 126

# (required) an array of Mspire::Mzml::DataProcessing objects; starts as an
# empty array
def data_processing_list
  @data_processing_list
end

#encodingObject

Returns the value of attribute encoding.



138
139
140
# File 'lib/mspire/mzml.rb', line 138

# The encoding named on the first line of the io given to initialize; it is
# passed to Nokogiri whenever parts of the file are parsed.
def encoding
  @encoding
end

#file_descriptionObject

(required) an Mspire::Mzml::FileDescription



108
109
110
# File 'lib/mspire/mzml.rb', line 108

# (required) an Mspire::Mzml::FileDescription
def file_description
  @file_description
end

#idObject

(optional) an id for accessing from external files



92
93
94
# File 'lib/mspire/mzml.rb', line 92

# (optional) an id for accessing from external files; written as the mzML
# element's id attribute by to_xml when set
def id
  @id
end

#index_listObject

Returns the value of attribute index_list.



137
138
139
# File 'lib/mspire/mzml.rb', line 137

# The index list obtained from get_index_list when the object is built from
# an IO; it maps spectra (and chromatograms) to their start bytes.
def index_list
  @index_list
end

#instrument_configurationsObject

(required) an array of Mspire::Mzml::InstrumentConfiguration objects



123
124
125
# File 'lib/mspire/mzml.rb', line 123

# (required) an array of Mspire::Mzml::InstrumentConfiguration objects;
# starts as an empty array
def instrument_configurations
  @instrument_configurations
end

#ioObject

Returns the value of attribute io.



136
137
138
# File 'lib/mspire/mzml.rb', line 136

# The IO object handed to initialize; get_xml_string seeks and reads on it.
def io
  @io
end

#referenceable_param_groupsObject

(optional) an array of Mspire::Mzml::ReferenceableParamGroup objects



111
112
113
# File 'lib/mspire/mzml.rb', line 111

# (optional) an array of referenceable param group objects; to_xml writes
# them through Mspire::Mzml::ReferenceableParamGroup.list_xml when set
def referenceable_param_groups
  @referenceable_param_groups
end

#runObject

(required) an Mspire::Mzml::Run object



129
130
131
# File 'lib/mspire/mzml.rb', line 129

# (required) an Mspire::Mzml::Run object
def run
  @run
end

#samplesObject

(optional) an array of Mspire::Mzml::Sample objects



114
115
116
# File 'lib/mspire/mzml.rb', line 114

# (optional) an array of Mspire::Mzml::Sample objects
def samples
  @samples
end

#scan_settings_listObject

(optional) an array of Mspire::Mzml::ScanSettings objects



120
121
122
# File 'lib/mspire/mzml.rb', line 120

# (optional) an array of Mspire::Mzml::ScanSettings objects; to_xml skips it
# when it is unset or empty
def scan_settings_list
  @scan_settings_list
end

#software_listObject

(required) an array of Mspire::Mzml::Software objects



117
118
119
# File 'lib/mspire/mzml.rb', line 117

# (required) an array of Mspire::Mzml::Software objects; starts as an empty
# array
def software_list
  @software_list
end

#versionObject

(required) the Mzml document version



95
96
97
# File 'lib/mspire/mzml.rb', line 95

# (required) the Mzml document version; to_xml falls back to
# Default::VERSION when it is unset
def version
  @version
end

Class Method Details

.foreach(filename, &block) ⇒ Object



220
221
222
223
224
225
# File 'lib/mspire/mzml.rb', line 220

# Opens filename and passes every spectrum to the block.
#
# @param filename [String] path handed to open
# @yield [spectrum] each item produced by the opened object's each
# @return [Enumerator] when no block is given; otherwise whatever open returns
def foreach(filename, &block)
  return enum_for(__method__, filename) unless block

  open(filename) { |mzml| mzml.each(&block) }
end

.open(filename, &block) ⇒ Object

read-only right now



214
215
216
217
218
# File 'lib/mspire/mzml.rb', line 214

# Opens filename for reading and wraps it in a new object (read-only for now).
#
# With a block, the object is passed to the block, the file is closed when
# the block finishes, and the block's value is returned. Without a block the
# object is returned with its io still open, mirroring File.open; the caller
# is then responsible for closing that io.
#
# @param filename [String] path of the mzML file
# @yield [mzml] the object built on the opened file
# @return [Object] the block's value, or the new object when no block is given
def open(filename, &block)
  # previously a missing block failed with NoMethodError on nil
  return self.new(File.open(filename)) unless block

  File.open(filename) do |io|
    block.call(self.new(io))
  end
end

Instance Method Details

#create_index_listMspire::Mzml::IndexList

Reads through and captures start bytes



325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
# File 'lib/mspire/mzml.rb', line 325

# Builds an index list by scanning the whole io line by line and recording
# the byte at which each <spectrum ...> / <chromatogram ...> start tag with an
# id attribute begins. Only the first such tag on a line is recorded.
#
# @return [IndexList] one Index per element type (:spectrum, :chromatogram),
#   each holding the start bytes with the matching ids in #ids
def create_index_list
  start_bytes_by_type = @io.bookmark(true) do |_inner_io|   # sets to beginning of file
    found = {:spectrum => {}, :chromatogram => {}}
    bytes_read = 0
    @io.each do |line|
      md = %r{<(spectrum|chromatogram).*?id=['"](.*?)['"][ >]}.match(line)
      # offset of the tag = bytes of all previous lines + bytes before the tag
      found[md[1].to_sym][md[2]] = bytes_read + md.pre_match.bytesize if md
      bytes_read += line.bytesize
    end
    found
  end

  index_objects = start_bytes_by_type.map do |type, start_byte_by_id|
    index = Index.new
    start_byte_by_id.each_value { |start_byte| index << start_byte }
    index.ids = start_byte_by_id.keys
    index.name = type
    index
  end
  IndexList.new(index_objects)
end

#each_spectrum(&block) ⇒ Object Also known as: each



242
243
244
245
246
247
248
249
250
251
# File 'lib/mspire/mzml.rb', line 242

# Passes every spectrum, in index order, to the block.
#
# @yield [spectrum] the result of spectrum(i) for each position i of the
#   spectrum index
# @return [Enumerator] when no block is given
def each_spectrum(&block)
  return enum_for(__method__) unless block

  spectrum_count = @index_list[:spectrum].size
  (0...spectrum_count).each { |position| block.call(spectrum(position)) }
end

#each_spectrum_node(&block) ⇒ Object

returns the Nokogiri::XML::Node object associated with each spectrum



254
255
256
257
258
# File 'lib/mspire/mzml.rb', line 254

# Passes the Nokogiri::XML::Node of every spectrum, in index order, to the
# block.
#
# @yield [node] the node parsed from each start byte of the spectrum index
# @return [Enumerator] when no block is given (consistent with each_spectrum)
def each_spectrum_node(&block)
  # without this guard a missing block failed with NoMethodError on nil
  block or return enum_for(__method__)
  @index_list[:spectrum].each do |start_byte|
    block.call spectrum_node_from_start_byte(start_byte)
  end
end

#get_index_listArray

reads or creates an index list

Returns:

  • (Mspire::Mzml::IndexList)

    the index list read from the file or, if none could be read, created by scanning the file



349
350
351
# File 'lib/mspire/mzml.rb', line 349

# Returns the index list stored in the file when read_index_list finds one,
# and otherwise builds it with create_index_list.
def get_index_list
  stored = read_index_list
  return stored if stored

  create_index_list
end

#get_xml_string(start_byte, name = :spectrum) ⇒ Object

name can be :spectrum or :chromatogram



229
230
231
232
233
234
235
236
237
238
239
240
# File 'lib/mspire/mzml.rb', line 229

# Reads the raw XML of one element, starting at start_byte and continuing
# line by line up to and including the first line that contains the closing
# tag (or to the end of the io if no such line exists).
#
# @param start_byte [Integer] byte offset to seek to
# @param name [Symbol] element name, :spectrum or :chromatogram
# @return [String] the lines read, joined together
def get_xml_string(start_byte, name=:spectrum)
  io.seek(start_byte)
  closing_tag = %r{</#{name}>}
  lines = []
  io.each_line do |line|
    lines.push(line)
    break if line =~ closing_tag
  end
  lines.join
end

#read_header!Object



168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# File 'lib/mspire/mzml.rb', line 168

# Parses the part of the file that precedes the first spectrum/chromatogram
# and fills in cvs and file_description from it. The remaining header
# sections are recognised by name but not processed yet (the branches below
# are placeholders).
def read_header!
  @io.rewind
  chunk_size = 2**12
  loc = 0
  string = ''
  # read in 4096-byte chunks until the text contains "<spectrum" or
  # "<chromatogram" (or the io is exhausted)
  while chunk = @io.read(chunk_size)
    string << chunk
    # re-check the 20 bytes before the newest chunk so that a tag split
    # across two chunks is still found
    start_looking = ((loc-20) < 0) ? 0 : (loc-20)
    break if string[start_looking..-1] =~ /<(spectrum|chromatogram)/
    loc += chunk_size
  end
  # NOTE(review): string is cut off mid-document here, so this presumably
  # relies on Nokogiri recovering from the truncated XML -- confirm
  doc = Nokogiri::XML.parse(string, nil, @encoding, Parser::NOBLANKS)
  mzml_n = doc.root
  # an indexed file wraps the mzML element in an indexedmzML element
  if mzml_n.name == 'indexedmzML'
    mzml_n = mzml_n.child
  end
  # the first two children are taken to be the cv list and the file description
  cv_list_n = mzml_n.child
  file_description_n = cv_list_n.next
  self.cvs = cv_list_n.children.map do |cv_n|
    Mspire::Mzml::CV.from_xml(cv_n)
  end
  self.file_description = Mspire::Mzml::FileDescription.from_xml(file_description_n)
  next_n = file_description_n.next
  # walk the following siblings until the run element is reached
  # NOTE(review): if no 'run' sibling exists, next_n presumably becomes nil and
  # next_n.name raises NoMethodError -- confirm whether a guard is wanted
  loop do
    case next_n.name
    when 'referenceableParamGroupList'
      # get a hash ready
    when 'sampleList'
      # set objects
    when 'softwareList'  # required
      # set objects
    when 'instrumentConfigurationList'
      # set objects
    when 'dataProcessingList'
      # set objects
    when 'run'
      # get defaults ready
      break
    end
    next_n = next_n.next
  end
end

#read_index_listMspire::Mzml::IndexList

Reads the index list stored in an indexed mzML file. Returns nil when no index offset is found.

Returns:



302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
# File 'lib/mspire/mzml.rb', line 302

# Reads the index list stored at the end of an indexed mzML file.
#
# The io is left positioned wherever the read finished.
#
# @return [IndexList, nil] one Index per index element (named by its 'name'
#   attribute, holding the offsets as integers and the idRef values in #ids),
#   or nil when Mspire::Mzml::Index.index_offset finds no offset
def read_index_list
  offset = Mspire::Mzml::Index.index_offset(@io)
  return nil unless offset

  @io.seek(offset)
  xml = Nokogiri::XML.parse(@io.read, nil, @encoding, Parser::NOBLANKS)
  indices = xml.root.children.map do |index_n|
    index = Index.new
    index.name = index_n['name'].to_sym
    ids = []
    # each child carries one start byte (text) and the id it belongs to (idRef)
    index_n.children.each do |offset_n|
      index << offset_n.text.to_i
      ids << offset_n['idRef']
    end
    index.ids = ids
    index
  end
  IndexList.new(indices)
end

#sizeObject

returns the number of spectra



282
283
284
# File 'lib/mspire/mzml.rb', line 282

# Number of spectra, i.e. the number of entries in the spectrum index.
def size
  spectrum_index = @index_list[:spectrum]
  spectrum_index.size
end

#spectrum(arg) ⇒ Mspire::Spectrum Also known as: []

Returns a spectrum object.

Parameters:

  • arg (Object)

    an index number (Integer) or id string (String)

Returns:



275
276
277
278
279
# File 'lib/mspire/mzml.rb', line 275

# Returns the spectrum found through the spectrum index.
#
# @param arg [Object] an index number (Integer) or id string (String); it is
#   passed to the spectrum index's start_byte to locate the spectrum
# @return [Mspire::Mzml::Spectrum] built by Mspire::Mzml::Spectrum.from_xml
#   from the node parsed at that start byte
def spectrum(arg)
  # look the index up by name, like the other spectrum methods, instead of
  # assuming that the spectrum index is the first entry of the index list
  start_byte = index_list[:spectrum].start_byte(arg)
  spec_n = spectrum_node_from_start_byte(start_byte)
  Mspire::Mzml::Spectrum.from_xml(spec_n)
end

#spectrum_from_scan_num(scan_num) ⇒ Mspire::Spectrum

Returns a spectrum object, or nil if not found.

Parameters:

  • scan_num (Integer)

    the scan number

Returns:

Raises:



293
294
295
296
297
298
# File 'lib/mspire/mzml.rb', line 293

# Returns the spectrum with the given scan number, or nil if no spectrum
# carries that scan number.
#
# The scan-number lookup is built on first use from the spectrum index and
# kept in @scan_to_index.
#
# NOTE(review): this calls create_scan_index while to_plms1 calls
# create_scan_to_index on the same index object -- confirm which name the
# Index class actually provides.
#
# @param scan_num [Integer] the scan number
# @return [Mspire::Mzml::Spectrum, nil]
# @raise [ScanNumbersNotUnique] if the lookup is reported as false
# @raise [ScanNumbersNotFound] if the lookup is reported as nil
def spectrum_from_scan_num(scan_num)
  # use the named spectrum index rather than relying on it being first
  @scan_to_index ||= @index_list[:spectrum].create_scan_index
  raise ScanNumbersNotUnique if @scan_to_index == false
  raise ScanNumbersNotFound if @scan_to_index == nil
  index = @scan_to_index[scan_num]
  # an unknown scan number used to be passed on to spectrum as nil
  index && spectrum(index)
end

#spectrum_node(index) ⇒ Object

returns the nokogiri xml node for the spectrum at that index



263
264
265
# File 'lib/mspire/mzml.rb', line 263

# Returns the nokogiri xml node of the spectrum at the given position of the
# spectrum index.
def spectrum_node(index)
  start_byte = @index_list[:spectrum][index]
  spectrum_node_from_start_byte(start_byte)
end

#spectrum_node_from_start_byte(start_byte) ⇒ Object



267
268
269
270
271
# File 'lib/mspire/mzml.rb', line 267

# Reads the spectrum element that begins at start_byte (via get_xml_string),
# parses it with Nokogiri using the file's encoding, and returns the root
# node of the parsed fragment.
def spectrum_node_from_start_byte(start_byte)
  fragment = get_xml_string(start_byte, :spectrum)
  Nokogiri::XML.parse(fragment, nil, @encoding, Parser::NOBLANKS).root
end

#to_plms1(use_scan_nums = true) ⇒ Object

will use scan numbers if use_scan_nums is true, otherwise it will use index numbers in place of scan nums



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/mspire/mzml/plms1.rb', line 8

# Converts this object into an Mspire::Plms1 object.
#
# Retention times come from the cvParam with accession MS:1000016 found under
# scanList/scan of every spectrum node; they are returned in seconds (values
# whose unitName is 'minute' are multiplied by 60).
#
# @param use_scan_nums [Boolean] when true, the scan numbers are the keys of
#   the spectrum index's create_scan_to_index result; otherwise the index
#   positions (0, 1, 2, ...) are used in their place
# @return [Mspire::Plms1]
# @raise [RuntimeError] if a spectrum has no retention time cvParam, or its
#   unitName is neither 'minute' nor 'second'
def to_plms1(use_scan_nums=true)
  spectrum_index = self.index_list[:spectrum]
  scan_nums = use_scan_nums ? spectrum_index.create_scan_to_index.keys : (0...spectrum_index.size).to_a

  retention_times = self.enum_for(:each_spectrum_node).map do |node|
    rt_param = node.xpath("scanList/scan/cvParam[@accession='MS:1000016']")[0]
    raise 'no retention time xml node' unless rt_param
    value = rt_param['value'].to_f
    unit = rt_param['unitName']
    if unit == 'minute'
      value * 60
    elsif unit == 'second'
      value
    else
      raise 'retention time must be in minutes or seconds (or add some code to handle)'
    end
  end

  # plms1 only requires that the object respond to :each, giving a spectrum
  # object, so an Mzml object will work.
  Mspire::Plms1.new(scan_nums, retention_times, self)
end

#to_xml(filename = nil) ⇒ Object

Because mzml files are often very large, we try to avoid storing the entire object tree in memory before writing.

Takes a filename and uses builder to write to it; if no filename is given, returns the XML as a string.



358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
# File 'lib/mspire/mzml.rb', line 358

# Serializes the object as (non-indexed) mzML using Builder, writing straight
# to the target so the whole document is not held in an object tree first.
#
# @param filename [String, nil] file to write to; when nil the XML is built
#   in memory
# @return [self, String] self when a filename was given, otherwise the XML
#   string
# @raise [RuntimeError] if cvs is empty; this is checked before the output
#   file is opened so an invalid object never creates or truncates it
def to_xml(filename=nil)
  # TODO: support indexed mzml files
  raise "#{self.class}#cvs must have > 0 Mspire::Mzml::CV objects" unless @cvs.size > 0

  io = filename ? File.open(filename, 'w') : StringIO.new
  begin
    xml = Builder::XmlMarkup.new(:target => io, :indent => 2)
    xml.instruct!

    mzml_atts = Default::NAMESPACE.dup
    mzml_atts[:version] = @version || Default::VERSION
    mzml_atts[:accession] = @accession if @accession
    mzml_atts[:id] = @id if @id

    xml.mzML(mzml_atts) do |mzml_n|
      # the 'if' statements capture whether or not the list is required or not
      Mspire::Mzml::CV.list_xml(@cvs, mzml_n)
      @file_description.to_xml(mzml_n)
      if @referenceable_param_groups
        Mspire::Mzml::ReferenceableParamGroup.list_xml(@referenceable_param_groups, mzml_n)
      end
      if @samples
        Mspire::Mzml::Sample.list_xml(@samples, mzml_n)
      end
      Mspire::Mzml::Software.list_xml(@software_list, mzml_n)
      if @scan_settings_list && @scan_settings_list.size > 0
        Mspire::Mzml::ScanSettings.list_xml(@scan_settings_list, mzml_n)
      end
      Mspire::Mzml::InstrumentConfiguration.list_xml(@instrument_configurations, mzml_n)
      Mspire::Mzml::DataProcessing.list_xml(@data_processing_list, mzml_n)
      @run.to_xml(mzml_n)
    end
  ensure
    # close the file even when serialization fails part-way through
    io.close if filename
  end

  filename ? self : io.string
end