Class: Ms::Sequest::Srf
- Inherits:
- Ident::Search
- Inheritance hierarchy: Object → Ident::Search → Ms::Sequest::Srf
- Defined in:
- lib/ms/sequest/srf.rb,
lib/ms/sequest/srf/sqt.rb,
lib/ms/sequest/srf/pepxml.rb,
lib/ms/sequest/srf/search.rb
Defined Under Namespace
Modules: Pepxml, Search, Sqt Classes: Dta, DtaGen, Header, NoSequestParamsError, Out
Constant Summary
Constants included from Pepxml
Pepxml::DEFAULT_OPTIONS, Pepxml::ModelToMsAnalyzer
Instance Attribute Summary collapse
-
#base_name ⇒ Object
(also: #base_name_noext)
the base name of the file with no extension.
-
#dta_files ⇒ Object
Returns the value of attribute dta_files.
-
#filtered_by_precursor_mass_tolerance ⇒ Object
a boolean to indicate if the results have been filtered by the sequest.params precursor mass tolerance.
-
#header ⇒ Object
Returns the value of attribute header.
-
#index ⇒ Object
a parallel array to dta_files and out_files where each entry is: [first_scan, last_scan, charge].
-
#out_files ⇒ Object
Returns the value of attribute out_files.
-
#params ⇒ Object
Returns the value of attribute params.
-
#resident_dir ⇒ Object
the directory the srf file was residing in when the filename was passed in.
-
#version ⇒ Object
a String: 3.5, 3.3 or 3.2.
Class Method Summary collapse
-
.get_sequest_params_and_finish_pos(filename) ⇒ Object
returns a Sequest::Params object or nil if none.
Instance Method Summary collapse
- #dta_start_byte ⇒ Object
-
#filter_by_precursor_mass_tolerance! ⇒ Object
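filters each out_file's hits by the sequest.params precursor mass tolerance and recalculates deltacn values if the number of hits changed.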
-
#from_file(filename, opts) ⇒ Object
returns self; opts are the same as for ‘new’.
-
#initialize(filename = nil, opts = {}) ⇒ Srf
constructor
opts: :filter_by_precursor_mass_tolerance => true | false (default true) # this will filter by the sequest params prec tolerance as is # typically done by Bioworks.
- #protein_class ⇒ Object
- #read_dta_and_out_interleaved(fh, num_files, unpack_35, dup_refs_gt_0) ⇒ Object
-
#read_dta_files(fh, num_files, unpack_35) ⇒ Object
returns an array of dta_files.
-
#read_out_files(fh, number_files, unpack_35, dup_refs_gt_0) ⇒ Object
filehandle (fh) must be at the start of the outfiles.
-
#read_scan_index(fh, num) ⇒ Object
returns an index where each entry is [first_scan, last_scan, charge].
Methods included from Search
Methods included from Pepxml
Methods included from Sqt
Constructor Details
#initialize(filename = nil, opts = {}) ⇒ Srf
opts:
:filter_by_precursor_mass_tolerance => true | false (default true)
# this will filter by the sequest params prec tolerance as is
# typically done by Bioworks.
:link_protein_hits => true | false (default true)
# if true, generates the @proteins attribute for the :proteins method
# and creates one protein per reference that is linked to each
# relevant peptide hit.
# if false, each protein for each peptide hit is a unique object
# and the :proteins method returns nil. If you are merging multiple
# searches then you probably want to set this to false to avoid
# recalculation.
:read_pephits => true | false (default true)
# will attempt to read peptide hit information (equivalent to .out
# files), otherwise, just reads the dta information.
103 104 105 106 107 108 109 110 111 |
# File 'lib/ms/sequest/srf.rb', line 103
# Builds an empty Srf and, when a filename is given, populates it via
# #from_file.
#
# filename - path to an .srf file, or nil to create an empty object.
# opts     - options hash handed unchanged to #from_file.
def initialize(filename=nil, opts={})
  @peptide_hits = []
  @dta_files = []
  @out_files = []
  from_file(filename, opts) if filename
end
Instance Attribute Details
#base_name ⇒ Object Also known as: base_name_noext
the base name of the file with no extension
37 38 39 |
# File 'lib/ms/sequest/srf.rb', line 37
# Returns the base name of the file with no extension.
def base_name
  @base_name
end
#dta_files ⇒ Object
Returns the value of attribute dta_files.
29 30 31 |
# File 'lib/ms/sequest/srf.rb', line 29
# Returns the value of attribute dta_files.
def dta_files
  @dta_files
end
#filtered_by_precursor_mass_tolerance ⇒ Object
a boolean to indicate if the results have been filtered by the sequest.params precursor mass tolerance
48 49 50 |
# File 'lib/ms/sequest/srf.rb', line 48
# Returns a boolean indicating whether the results have been filtered by the
# sequest.params precursor mass tolerance.
def filtered_by_precursor_mass_tolerance
  @filtered_by_precursor_mass_tolerance
end
#header ⇒ Object
Returns the value of attribute header.
28 29 30 |
# File 'lib/ms/sequest/srf.rb', line 28
# Returns the value of attribute header.
def header
  @header
end
#index ⇒ Object
a parallel array to dta_files and out_files where each entry is: [first_scan, last_scan, charge]
34 35 36 |
# File 'lib/ms/sequest/srf.rb', line 34
# Returns the scan index: an array parallel to dta_files and out_files where
# each entry is [first_scan, last_scan, charge].
def index
  @index
end
#out_files ⇒ Object
Returns the value of attribute out_files.
30 31 32 |
# File 'lib/ms/sequest/srf.rb', line 30
# Returns the value of attribute out_files.
def out_files
  @out_files
end
#params ⇒ Object
Returns the value of attribute params.
31 32 33 |
# File 'lib/ms/sequest/srf.rb', line 31
# Returns the value of attribute params.
def params
  @params
end
#resident_dir ⇒ Object
the directory the srf file was residing in when the filename was passed in. May not be available.
44 45 46 |
# File 'lib/ms/sequest/srf.rb', line 44
# Returns the directory the srf file was residing in when the filename was
# passed in. May not be available.
def resident_dir
  @resident_dir
end
#version ⇒ Object
a String: 3.5, 3.3 or 3.2
26 27 28 |
# File 'lib/ms/sequest/srf.rb', line 26
# Returns the srf version as a String: 3.5, 3.3 or 3.2.
def version
  @version
end
Class Method Details
.get_sequest_params_and_finish_pos(filename) ⇒ Object
returns a Sequest::Params object or nil if none
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
# File 'lib/ms/sequest/srf.rb', line 55
# Locates the '[SEQUEST]' params section in the second half of the file and
# parses it.
#
# filename - path to the srf file.
#
# Returns [params, finish_pos]: the value returned by
# Ms::Sequest::Params#parse_io and the io position where parsing stopped, or
# [nil, nil] when no '[SEQUEST]' marker is found in the second half.
def self.get_sequest_params_and_finish_pos(filename)
  params = nil
  finish_parsing_io_pos = nil
  File.open(filename, 'rb') do |handle|
    # split the file in half and only read the second half (since we can be
    # confident that the params file will be there!)
    halfway = handle.stat.size / 2
    handle.seek halfway
    marker_offset = handle.read.rindex('[SEQUEST]')
    if marker_offset
      # rindex is relative to the second half; convert to an absolute offset
      handle.seek(marker_offset + halfway)
      params = Ms::Sequest::Params.new.parse_io(handle)
      finish_parsing_io_pos = handle.pos
    end
  end
  [params, finish_parsing_io_pos]
end
Instance Method Details
#dta_start_byte ⇒ Object
77 78 79 80 81 82 83 |
# File 'lib/ms/sequest/srf.rb', line 77
# Returns the byte offset at which the dta readers position the filehandle,
# chosen by @version. Returns nil for any other version string.
def dta_start_byte
  case @version
  when '3.2' then 3260
  when '3.3', '3.5' then 3644
  end
end
#filter_by_precursor_mass_tolerance! ⇒ Object
1. updates the out_file’s list of hits based on passing peptides (but not
the original hit id; rank is implicit in array ordering)
2. recalculates deltacn values completely if number of hits changed (does
not touch deltacn orig)
This can spoil proper protein -> peptide linkages. Ms::Id::Search.merge! should be run after this method to ensure correct protein -> peptide linkages.
122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
# File 'lib/ms/sequest/srf.rb', line 122
# Removes, from every out_file, the hits whose precursor mass error exceeds
# params.peptide_mass_tolerance, then marks this object as filtered.
#
# params.peptide_mass_units selects the comparison:
#   '0' - deltamass against the tolerance
#   '1' - deltamass against tolerance / 1000
#   '2' (or any other value) - ppm against the tolerance
#
# For each out_file whose hit count changed, deltacn values are recalculated
# and num_hits is updated.
#
# This can spoil proper protein -> peptide linkages; merge! should be run
# afterwards to restore them.
#
# Returns self.
def filter_by_precursor_mass_tolerance!
  pmt = params.peptide_mass_tolerance.to_f

  # Decide once which measurement and limit apply, instead of per hit.
  out_of_tolerance =
    case params.peptide_mass_units
    when '0' then lambda { |pep| pep.deltamass.abs > pmt }
    when '1' then lambda { |pep| pep.deltamass.abs > (pmt/1000) }
    else          lambda { |pep| pep.ppm.abs > pmt }
    end

  self.filtered_by_precursor_mass_tolerance = true
  out_files.each do |out_file|
    hits = out_file.hits
    before = hits.size
    hits.reject!(&out_of_tolerance)
    next if hits.size == before

    out_file.hits = hits # <- is this necessary
    Ms::Sequest::Srf::Out::Peptide.update_deltacns_from_xcorr(hits)
    out_file.num_hits = hits.size
  end
  self
end
#from_file(filename, opts) ⇒ Object
returns self; opts are the same as for ‘new’
178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 |
# File 'lib/ms/sequest/srf.rb', line 178
# Reads an srf file into this object: params, header, dta files, out files
# (peptide hits) and the scan index.
#
# filename - path to the srf file.
# opts     - same options as for 'new'; merged over these defaults:
#              :filter_by_precursor_mass_tolerance => true
#              :link_protein_hits                  => true
#              :read_pephits                       => true
#
# Returns self, or nil (leaving the object unpopulated) when the file holds
# no '[SEQUEST]' params section.
# Raises NotImplementedError for combined files when :read_pephits is false.
def from_file(filename, opts)
  @resident_dir = File.dirname(File.expand_path(filename))
  opts = { :filter_by_precursor_mass_tolerance => true, :link_protein_hits => true, :read_pephits => true}.merge(opts)

  (@params, after_params_io_pos) = Ms::Sequest::Srf.get_sequest_params_and_finish_pos(filename)
  return unless @params

  # print_duplicate_references == 0 means the file lists only one protein per
  # peptide; the out-file reader needs to know which layout to expect.
  #
  # warn %Q{
  #*****************************************************************************
  #WARNING: This srf file lists only 1 protein per peptide! (based on the
  #print_duplicate_references parameter in the sequest.params file used in its
  #creation) So, downstream output will likewise only contain a single protein
  #for each peptide hit. In many instances this is OK since downstream programs
  #will recalculate protein-to-peptide linkages from the database file anyway.
  #For complete protein lists per peptide hit, .srf files must be created with
  #print_duplicate_references > 0. HINT: to capture all duplicate references,
  #set the sequest parameter 'print_duplicate_references' to 100 or greater.
  #*****************************************************************************
  # }
  dup_refs_gt_0 = (@params.print_duplicate_references.to_i != 0)

  File.open(filename, 'rb') do |fh|
    @header = Ms::Sequest::Srf::Header.from_io(fh)
    @version = @header.version

    unpack_35 = case @version
                when '3.2', '3.3' then false
                when '3.5' then true
                end

    if @header.combined
      @base_name = File.basename(filename, '.*')
      # I'm not sure why this is the case, but the reported number is too
      # big by one on the 2 files I've seen so far, so we will correct it here!
      @header.dta_gen.num_dta_files = @header.dta_gen.num_dta_files - 1
      if opts[:read_pephits] == false
        raise NotImplementedError, "on combined files must read everything right now!"
      end
      (@dta_files, @out_files) = read_dta_and_out_interleaved(fh, @header.num_dta_files, unpack_35, dup_refs_gt_0)
    else
      @base_name = @header.raw_filename.scan(/[\\\/]([^\\\/]+)\.RAW$/).first.first
      @dta_files = read_dta_files(fh, @header.num_dta_files, unpack_35)
      if opts[:read_pephits]
        # need the params file to know if the duplicate_references is set > 0
        raise NoSequestParamsError, "no sequest params info in srf file!\npass in path to sequest.params file" if @params.nil?
        @out_files = read_out_files(fh,@header.num_dta_files, unpack_35, dup_refs_gt_0)
        if fh.eof?
          #warn "FILE: '#{filename}' appears to be an abortive run (no params in srf file)\nstill continuing..."
          @params = nil
          @index = []
        end
      end
    end

    fh.pos = after_params_io_pos

    # This is very sensitive to the grab_params method in sequest params
    fh.read(12)  ## gap between last params entry and index
    @index = read_scan_index(fh,@header.num_dta_files)
  end

  ### UPDATE SOME THINGS:
  # give each hit a base_name, first_scan, last_scan
  if opts[:read_pephits] && !@header.combined
    @index.each_with_index do |ind,i|
      mass_measured = @dta_files[i][0]
      outfile = @out_files[i]
      outfile.first_scan = ind[0]
      outfile.last_scan = ind[1]
      outfile.charge = ind[2]

      pep_hits = outfile.hits
      @peptide_hits.push( *pep_hits )
      pep_hits.each do |pep_hit|
        pep_hit[15] = @base_name
        pep_hit[16] = ind[0]
        pep_hit[17] = ind[1]
        pep_hit[18] = ind[2]
        # add the deltamass
        pep_hit[12] = pep_hit[0] - mass_measured  # real - measured (deltamass)
        pep_hit[13] = 1.0e6 * pep_hit[12].abs / mass_measured  ## ppm
        pep_hit[19] = self  ## link with the srf object
      end
    end

    # Honor the documented option; it defaults to true above, so callers that
    # do not pass it still get filtered results.
    filter_by_precursor_mass_tolerance! if params && opts[:filter_by_precursor_mass_tolerance]

    if opts[:link_protein_hits]
      (@peptide_hits, @proteins) = merge!([self.peptide_hits]) do |_protein, _peptides|
        Ms::Sequest::Srf::Out::Protein.new(_protein.reference, _peptides)
      end
    end
  end

  self
end
#protein_class ⇒ Object
50 51 52 |
# File 'lib/ms/sequest/srf.rb', line 50
# Returns the class used for protein objects: Ms::Sequest::Srf::Out::Protein.
def protein_class
  Ms::Sequest::Srf::Out::Protein
end
#read_dta_and_out_interleaved(fh, num_files, unpack_35, dup_refs_gt_0) ⇒ Object
161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
# File 'lib/ms/sequest/srf.rb', line 161
# Reads num_files (dta, out) pairs that are stored alternately in the file.
#
# fh            - open filehandle; repositioned to dta_start_byte before reading.
# num_files     - number of dta/out pairs to read.
# unpack_35     - flag forwarded to Dta.from_io and Out.from_io.
# dup_refs_gt_0 - flag forwarded to Out.from_io.
#
# Returns [dta_files, out_files], two arrays of length num_files.
def read_dta_and_out_interleaved(fh, num_files, unpack_35, dup_refs_gt_0)
  dta_files = []
  out_files = []
  fh.pos = dta_start_byte
  num_files.times do
    # order matters: each dta record is immediately followed by its out record
    dta_files << Ms::Sequest::Srf::Dta.from_io(fh, unpack_35)
    out_files << Ms::Sequest::Srf::Out.from_io(fh, unpack_35, dup_refs_gt_0)
  end
  [dta_files, out_files]
end
#read_dta_files(fh, num_files, unpack_35) ⇒ Object
returns an array of dta_files
320 321 322 323 324 325 326 327 328 329 |
# File 'lib/ms/sequest/srf.rb', line 320
# Reads num_files consecutive dta records.
#
# fh        - open filehandle; repositioned to dta_start_byte before reading.
# num_files - number of dta records to read.
# unpack_35 - flag forwarded to Dta.from_io.
#
# Returns an array of dta_files of length num_files.
def read_dta_files(fh, num_files, unpack_35)
  fh.pos = dta_start_byte
  # Iterate on the num_files argument so the loop and the result size cannot
  # disagree (the count was previously re-read from header).
  Array.new(num_files) { Ms::Sequest::Srf::Dta.from_io(fh, unpack_35) }
end
#read_out_files(fh, number_files, unpack_35, dup_refs_gt_0) ⇒ Object
filehandle (fh) must be at the start of the outfiles. ‘read_dta_files’ will put the fh there.
333 334 335 336 337 338 339 |
# File 'lib/ms/sequest/srf.rb', line 333
# Reads number_files consecutive out records.
#
# fh            - open filehandle; must already be at the start of the
#                 outfiles ('read_dta_files' will put the fh there).
# number_files  - number of out records to read.
# unpack_35     - flag forwarded to Out.from_io.
# dup_refs_gt_0 - flag forwarded to Out.from_io.
#
# Returns an array of out_files of length number_files.
def read_out_files(fh,number_files, unpack_35, dup_refs_gt_0)
  # Iterate on the number_files argument so the loop and the result size
  # cannot disagree (the count was previously re-read from header).
  Array.new(number_files) do
    Ms::Sequest::Srf::Out.from_io(fh, unpack_35, dup_refs_gt_0)
  end
end
#read_scan_index(fh, num) ⇒ Object
returns an index where each entry is [first_scan, last_scan, charge]
299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 |
# File 'lib/ms/sequest/srf.rb', line 299
# Reads the scan index from the current filehandle position.
#
# fh  - open filehandle positioned at the first index entry.
# num - number of index entries to read.
#
# Each entry occupies 24 bytes; only the three leading integers are decoded
# (unpack directive 'III', i.e. native-size, native-endian unsigned ints).
#
# Returns an index where each entry is [first_scan, last_scan, charge].
def read_scan_index(fh, num)
  ind_len = 24
  unpack_string = 'III'
  st = String.new  # reusable receive buffer for each 24 byte entry
  Array.new(num) do
    fh.read(ind_len, st)
    st.unpack(unpack_string)
  end
end