Class: ROCker

Inherits:
Object
  • Object
show all
Defined in:
lib/rocker/step/plot.rb,
lib/rocker.rb,
lib/rocker/step/build.rb,
lib/rocker/step/filter.rb,
lib/rocker/step/search.rb,
lib/rocker/step/compile.rb

Overview

Author:

  • Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>

  • Luis (Coto) Orellana

Constant Summary collapse

@@DEFAULTS =
[ Class ]
{
   # General
   :q=>false, :r=>'R', :nucl=>false, :debug=>false,:thr=>2,:search=>:blast,
   # External software
   :searchbins=>'',
   :searchcmd=>{
	 :blast=>'%1$s%2$s -query "%3$s" -db "%4$s" -out "%5$s" -num_threads %6$d -outfmt 6 -max_target_seqs 1',
	 :diamond=>'%1$sdiamond %2$s -q "%3$s" -d "%4$s" -o "%5$s" -t %6$d -k 1 --min-score 20 --sensitive'},
   :makedbcmd=>{
	 :blast=>'%1$smakeblastdb -dbtype %2$s -in "%3$s" -out "%4$s"',
	 :diamond=>'%1$sdiamond makedb --in "%3$s" -d "%4$s"'}
}
@@EBIREST =
[ Class ]
'http://www.ebi.ac.uk/Tools'
@@HAS_BUILD_GEMS =
nil

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(opts) ⇒ ROCker

Returns a new instance of ROCker.



30
31
32
33
34
# File 'lib/rocker.rb', line 30

def initialize(opts)
   @o = ROCker.defaults
   opts.each{ |k,v| @o[k] = v }
   RInterface.R_BIN = opts[:r] unless opts[:r].nil?
end

Instance Attribute Details

#oObject (readonly)

[ Instance ]


29
30
31
# File 'lib/rocker.rb', line 29

def o
  @o
end

Class Method Details

.default(k) ⇒ Object



26
# File 'lib/rocker.rb', line 26

def self.default(k) @@DEFAULTS[k] ; end

.defaultsObject



25
# File 'lib/rocker.rb', line 25

def self.defaults() @@DEFAULTS ; end

.ebirestObject



22
# File 'lib/rocker/step/build.rb', line 22

def self.ebirest() @@EBIREST ; end

.has_build_gems?Boolean

Returns:

  • (Boolean)


23
24
25
26
27
28
29
30
31
32
33
# File 'lib/rocker/step/build.rb', line 23

def self.has_build_gems?
   return @@HAS_BUILD_GEMS unless @@HAS_BUILD_GEMS.nil?
   @@HAS_BUILD_GEMS = TRUE
   begin
	 require 'rubygems'
	 require 'restclient'
   rescue LoadError
	 @@HAS_BUILD_GEMS = FALSE
   end
   @@HAS_BUILD_GEMS
end

Instance Method Details

#bash(cmd, err_msg = nil) ⇒ Object



47
48
49
50
51
# File 'lib/rocker.rb', line 47

def bash(cmd, err_msg=nil)
   o = `#{cmd} 2>&1 && echo '{'`
   raise (err_msg.nil? ? "Error executing: #{cmd}\n\n#{o}" : err_msg) unless o[-2]=='{'
   true
end

#blast2table(blast_f, table_f, aln, minscore) ⇒ Object

[ Utilities ]


37
38
39
40
41
42
43
44
45
46
# File 'lib/rocker.rb', line 37

def blast2table(blast_f, table_f, aln, minscore)
   ifh = File.open(blast_f, "r")
   ofh = File.open(table_f, "w")
   while ln = ifh.gets
	 bh = BlastHit.new(ln, aln)
	 ofh.print bh.to_s if bh.bits >= minscore
   end
   ifh.close
   ofh.close
end

#build!Object

[ Build ]


124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
# File 'lib/rocker/step/build.rb', line 124

def build!
   # Check requirements
   puts "Testing environment." unless @o[:q]
   @o[:searchcmd] = @o[:searchcmd][@o[:search]] if @o[:searchcmd].is_a? Hash
   @o[:makedbcmd] = @o[:makedbcmd][@o[:search]] if @o[:makedbcmd].is_a? Hash
   @o[:alignercmd] = @o[:alignercmd][@o[:aligner]] if @o[:alignercmd].is_a? Hash
   @o[:simulatorcmd] = @o[:simulatorcmd][@o[:simulator]] if @o[:simulatorcmd].is_a? Hash
   @o[:alignerbin] = @o[:alignerbin][@o[:aligner]] if @o[:alignerbin].is_a? Hash
   @o[:simulatorbin] = @o[:simulatorbin][@o[:simulator]] if @o[:simulatorbin].is_a? Hash
   @o[:nosearch]=true if @o[:nosimulate]
   raise "Unsatisfied requirements, please see the help message (-h)." unless ROCker.has_build_gems?
   @o[:positive] += @o[:posori] unless @o[:posori].nil?
   @o[:positive] += File.readlines(@o[:posfile]).map{ |l| l.chomp } unless @o[:posfile].nil?
   @o[:negative] += File.readlines(@o[:negfile]).map{ |l| l.chomp } unless @o[:negfile].nil?
   unless @o[:aln].nil?
      aln = Alignment.new
	 aln.read_fasta @o[:aln]
	 @o[:positive] += aln.get_ids
   end
   raise "-p or -P are mandatory." if @o[:positive].size==0
   raise "-o/--baseout is mandatory." if @o[:baseout].nil?
   if @o[:positive].size == 1 and not @o[:noaln]
	 warn "\nWARNING: Positive set contains only one sequence, turning off alignment.\n\n"
	 @o[:noaln] = true
   end
   unless @o[:nosimulate]
	 self.bash "#{@o[:simulatorbin]} --version", "--simulator-bin must be executable. Is Grinder installed?" if @o[:simulator]==:grinder
   end
   unless @o[:noaln]
	 self.bash "#{@o[:alignerbin]} -version", "--aligner-bin must be executable. Is Muscle installed?" if @o[:aligner]==:muscle
	 self.bash "#{@o[:alignerbin]} --version", "--aligner-bin must be executable. Is ClustalOmega installed?" if @o[:aligner]==:clustalo
   end
   unless @o[:nosearch]
	 self.bash "#{@o[:searchbins]}makeblastdb -version", "--search-bins must contain executables. Is BLAST+ installed?" if @o[:search]==:blast
	 self.bash "#{@o[:searchbins]}diamond --help", "--search-bins must contain executables. Is DIAMOND installed?" if @o[:search]==:diamond
   end

   # Download genes
   puts "Downloading gene data." unless @o[:q]
   ref_file = @o[:baseout] + ".ref.fasta"
   if @o[:posori].nil? and @o[:posfile].nil? and not @o[:aln].nil?
	 puts "  * reusing aligned sequences as positive set." unless @o[:q]
	 f = File.open(ref_file, "w")
	 f.print aln.to_seq_s
	 f.close
	 @o[:noaln] = true
   elsif @o[:reuse] and File.size? ref_file
	 puts "  * reusing positive set: #{ref_file}." unless @o[:q]
   else
	 puts "  * downloading #{@o[:positive].size} sequence(s) in positive set." unless @o[:q]
	 $stderr.puts "   # #{@o[:positive]}" if @o[:debug]
	 ids = Array.new(@o[:positive])
	 f = File.open(ref_file, "w")
	 while ids.size>0
  f.print ebiFetch(:uniprotkb, ids.shift(200), :fasta)
	 end
	 f.close
   end
   genome_ids = {:positive=>[], :negative=>[]}
   transl_ids = {:positive=>[], :negative=>[]}
   [:positive, :negative].each do |set|
      unless @o[set].size==0
  puts "  * linking genomes from #{@o[set].size} #{set.to_s} sequence(s)." unless @o[:q]
  $stderr.puts "   # #{@o[set]}" if @o[:debug]
  r = genes2genomes(@o[set])
  genome_ids[set] = r.map{|i| i[:genome_id]}.uniq
  transl_ids[set] = r.map{|i| i[:transl_id]}.uniq
	 end
   end
   raise "No genomes associated with the positive set." if genome_ids[:positive].size==0
   genome_ids[:positive] = genome_ids[:positive].sample( (genome_ids[:positive].size*@o[:genomefrx]).round ) if @o[:genomefrx]
   raise "No positive genomes selected for metagenome construction, is --genome-frx too small?" if genome_ids[:positive].empty?
   all_genome_ids = genome_ids.values.reduce(:+).uniq
   
   # Locate genes
   puts "Analyzing genome data." unless @o[:q]
   coords_file = @o[:baseout] + ".src.coords"
   if @o[:reuse] and File.size? coords_file
	 puts "  * reusing coordinates: #{coords_file}." unless @o[:q]
	 c = JSON.parse File.read(coords_file), {:symbolize_names=>true}
	 positive_coords = c[:positive_coords]
	 genome_org = c[:genome_org]
   else
	 thrs = [@o[:thr], genome_ids[:positive].size].min
	 puts "  * downloading and parsing #{genome_ids[:positive].size} GFF3 document(s) in #{thrs} threads." unless @o[:q]
	 $stderr.puts "   # Looking for proteins: #{@o[:positive]}" if @o[:debug]
	 $stderr.puts "   # Looking for translations: #{transl_ids[:positive]}" if @o[:debug]
	 $stderr.puts "   # Looking into: #{genome_ids[:positive]}" if @o[:debug]
	 thr_obj = []
	 (0 .. (thrs-1)).each do |thr_i|
  ids_to_parse = []
  (0 .. (genome_ids[:positive].size-1)).each do |i|
     ids_to_parse << genome_ids[:positive][i] if (i % thrs)==thr_i
  end
  json_file = @o[:baseout] + ".src.coords." + thr_i.to_s
  thr_obj << json_file
  fork do
     get_coords_from_gff3(ids_to_parse, @o[:positive], transl_ids[:positive], thr_i, json_file)
  end
	 end
	 Process.waitall
	 # Combine results
	 positive_coords = {}
	 genomes_org = {}
	 genome_org = {}
	 thr_obj.each do |t|
  raise "Thread failed without error trace: #{t}" unless File.exist? t
  o = JSON.parse File.read(t), {:symbolize_names=>true, :create_additions=>true}
  o[:positive_coords].each_pair do |k,v|
     positive_coords[ k ] ||= []
     positive_coords[ k ] += v
  end
  o[:genomes_org].each_pair do |k,v|
     genomes_org[ k ] ||= []
     genomes_org[ k ] << v
  end
  File.unlink t
	 end
	 # Select one genome per taxon
	 unless @o[:pertaxon].nil?
  genomes_org.each_pair{ |k,v| genome_org[ k ] = v.sample.first }
	 end
	 # Save coordinates
	 ofh = File.open(coords_file, "w")
	 ofh.print JSON.pretty_generate({:positive_coords=>positive_coords, :genome_org=>genome_org})
	 ofh.close
   end
   unless @o[:pertaxon].nil?
	 genome_ids[:positive] = genome_org.values
	 puts "  Using #{genome_org.size} genome(s) after filtering by #{@o[:pertaxon]}." unless @o[:q]
   end
   all_genome_ids = genome_ids.values.reduce(:+).uniq
   found = positive_coords.values.map{ |a| a.map{ |b| b[:prot_id] } }.reduce(:+).compact.uniq
   unknown_pid = positive_coords.values.map{ |a| a.map{ |b| b[:prot_id].nil? ? b[:tran_id] : nil } }.reduce(:+).compact.uniq
   raise "Cannot find the genomic location of any provided sequence." if found.nil?
   missing = @o[:positive] - found
   warn "\nWARNING: Cannot find genomic location of sequence(s) #{missing.join(',')}.\nMissing: #{missing.size}, Unlinked translations: #{unknown_pid.size}\n\n" unless missing.size==0 or missing.size==unknown_pid.size or @o[:genomefrx]<1.0
   
   # Download genomes
   genomes_file = @o[:baseout] + '.src.fasta'
   if @o[:reuse] and File.size? genomes_file
	 puts "  * reusing existing file: #{genomes_file}." unless @o[:q]
   else
	 puts "  * downloading #{all_genome_ids.size} genome(s) in FastA." unless @o[:q]
	 $stderr.puts "   # #{all_genome_ids}" if @o[:debug]
	 ids = Array.new(all_genome_ids)
	 ofh = File.open(genomes_file, 'w')
	 while ids.size>0
  ofh.print ebiFetch('embl', ids.shift(200), 'fasta')
	 end
	 ofh.close
   end
   
   # Generate metagenome
   unless @o[:nosimulate]
	 puts "Generating in silico metagenome" unless @o[:q]
	 if @o[:reuse] and File.size? @o[:baseout] + ".mg.fasta"
  puts "  * reusing existing file: #{@o[:baseout]}.mg.fasta." unless @o[:q]
	 else
  all_src = File.readlines("#{@o[:baseout]}.src.fasta").select{ |l| l =~ /^>/ }.size
  thrs = [@o[:thr], all_src].min
  puts "  * simulating metagenomes and tagging positive reads in #{thrs} threads." unless @o[:q]
  $stderr.puts "   # #{positive_coords}" if @o[:debug]
  thr_obj = []
  seqs_per_thr = (all_src/thrs).ceil
  (0 .. (thrs-1)).each do |thr_i|
     output = @o[:baseout] + ".mg.fasta.#{thr_i.to_s}"
     thr_obj << output
     fork do
 seqs_a = thr_i*seqs_per_thr + 1
 seqs_b = [seqs_a + seqs_per_thr, all_src].min
 # Create sub-fasta
 ofh = File.open("#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", "w")
 ifh = File.open("#{@o[:baseout]}.src.fasta", "r")
 seq_i = 0
 while l = ifh.gets
    seq_i+=1 if l =~ /^>/
    break if seq_i > seqs_b
    ofh.print l if seq_i >= seqs_a
 end
 ifh.close
 ofh.close

 # Run simulator (except if the temporal file is already there and can be reused)
 unless @o[:reuse] and File.size? @o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa"
    bash sprintf(@o[:simulatorcmd], @o[:simulatorbin], "#{@o[:baseout]}.src.fasta.#{thr_i.to_s}", @o[:seqdepth]*@o[:readlen].to_f, @o[:readlen], "#{@o[:baseout]}.mg.tmp.#{thr_i.to_s}")
 end

 # Tag positives
 puts "  * tagging positive reads [thread #{thr_i.to_s}]." unless @o[:q]
 ifh = File.open(@o[:baseout] + ".mg.tmp.#{thr_i.to_s}-reads.fa", 'r')
 ofh = File.open(@o[:baseout] + ".mg.fasta.#{thr_i.to_s}", 'w')
 while l = ifh.gets
    if l =~ /^>/
rd = /^>(?<id>\d+) reference=[A-Za-z]+\|(?<genome_id>[A-Za-z0-9_]+)\|.* position=(?<comp>complement\()?(?<from>\d+)\.\.(?<to>\d+)\)? /.match(l)
raise "Cannot parse simulated read's defline, are you using Grinder?: #{l}" if rd.nil?
positive = false
positive_coords[rd[:genome_id].to_sym] ||= []
positive_coords[rd[:genome_id].to_sym].each do |gn|
   left  = rd[:to].to_i - gn[:from]
   right = gn[:to] - rd[:from].to_i
   if (left*right >= 0) and ([left, right].min >= @o[:minovl])
      positive = true
      break
   end
end
l = ">#{thr_i.to_s}_#{rd[:id]}#{positive ? "@%" : ""} " +
   "ref=#{rd[:genome_id]}:#{rd[:from]}..#{rd[:to]}#{(rd[:comp]=='complement(')?'-':'+'}\n"
    end
    ofh.print l
 end
 ofh.close
 ifh.close
     end # fork
  end # (1 .. thrs).each
  Process.waitall
  # Concatenate results
  ofh = File.open(@o[:baseout] + ".mg.fasta", 'w')
  thr_obj.each do |t|
     raise "Thread failed without error trace: #{t}" unless File.exist? t
     ifh = File.open(t, "r")
     while l = ifh.gets
        ofh.print l
     end
     ifh.close
     File.unlink t
  end
  ofh.close
      end
   end # unless @o[:nosimulate]
   
   # Align references
   unless @o[:noaln]
	 puts "Aligning reference set." unless @o[:q]
	 if @o[:reuse] and File.size? "#{@o[:baseout]}.ref.aln"
  puts "  * reusing existing file: #{@o[:baseout]}.ref.aln." unless @o[:q]
	 else
  bash sprintf(@o[:alignercmd], @o[:alignerbin], "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref.aln", @o[:thr])
  puts "  +--\n  | IMPORTANT NOTE: Manually checking the alignment before\n  | the 'compile' step is *strongly* encouraged.\n  +--\n" unless @o[:q]
	 end
   end
   
   # Run similarity search
   unless @o[:nosearch]
	 puts "Running homology search." unless @o[:q]
	 if @o[:reuse] and File.size? "#{@o[:baseout]}.ref.blast"
  puts "  * reusing existing file: #{@o[:baseout]}.ref.blast." unless @o[:q]
	 else
  puts "  * preparing database." unless @o[:q]
  bash sprintf(@o[:makedbcmd][@o[:search]], @o[:searchbins], 'prot', "#{@o[:baseout]}.ref.fasta", "#{@o[:baseout]}.ref")
  puts "  * running similarity search." unless @o[:q]
  bash sprintf(@o[:searchcmd][@o[:search]], @o[:searchbins], 'blastx', "#{@o[:baseout]}.mg.fasta", "#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr])
	 end
   end
   
   # Clean
   unless @o[:noclean]
	 puts "Cleaning." unless @o[:q]
	 sff  = %w{.src.xml .src.fasta}
	 sff += %w{.mg.tmp-reads.fa .mg.tmp-ranks.txt} unless @o[:nosimulate]
	 sff += %w{.ref.phr .ref.pin .ref.psq} unless @o[:nosearch]
	 sff.each { |sf| File.unlink @o[:baseout] + sf if File.exist? @o[:baseout] + sf }
   end
end

#compile!Object

[ Compile ]


13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/rocker/step/compile.rb', line 13

def compile!
   raise "-a/--alignment is mandatory." if @o[:aln].nil?
   raise "-a/--alignment must exist." unless File.exist? @o[:aln]
   if @o[:table].nil?
	 raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil? or not File.exist? @o[:blast]
	 @o[:table] = "#{@o[:blast]}.table"
   else
	 @o[:reuse] = true
   end
   raise "-b/--blast is mandatory unless -t exists." if @o[:blast].nil? and not File.exist? @o[:table]
   raise "-k/--rocker is mandatory." if @o[:rocker].nil?

   puts "Testing environment." unless @o[:q]
   bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"
   bash "echo \"library('pROC')\" | #{@o[:r]} --vanilla", "Please install the 'pROC' library for R first."

   puts "Reading files." unless @o[:q]
   puts "  * loading alignment: #{@o[:aln]}." unless @o[:q]
   aln = Alignment.new
   aln.read_fasta @o[:aln]
   
   if @o[:reuse] and File.exist? @o[:table]
	 puts "  * reusing existing file: #{@o[:table]}." unless @o[:q]
   else
	 puts "  * generating table: #{@o[:table]}." unless @o[:q]
	 blast2table(@o[:blast], @o[:table], aln, @o[:minscore])
   end

   puts "Analyzing data." unless @o[:q]
   puts "  * computing windows." unless @o[:q]
   data = ROCData.new(@o[:table], aln, @o[:win])
   data.nucl = @o[:nucl]
   if @o[:refine]
	 puts "  * refining windows." unless @o[:q]
	 warn "Insufficient hits to refine results." unless data.refine! @o[:table]
   end
   puts "  * saving ROCker file: #{@o[:rocker]}." unless @o[:q]
   data.save @o[:rocker]
end

#ebiFetch(db, ids, format, outfile = nil) ⇒ Object



68
69
70
71
72
73
74
75
76
77
# File 'lib/rocker/step/build.rb', line 68

def ebiFetch(db, ids, format, outfile=nil)
   url = "#{ROCker.ebirest}/dbfetch/dbfetch/#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
   res = self.restcall url
   unless outfile.nil?
	 ohf = File.open(outfile, 'w')
	 ohf.print res
	 ohf.close
   end
   res
end

#filter!Object

[ Filter ]


13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/rocker/step/filter.rb', line 13

def filter!
   raise "-k/--rocker is mandatory." if @o[:rocker].nil?
   raise "-x/--query-blast is mandatory." if @o[:qblast].nil?
   raise "-o/--out-blast is mandatory." if @o[:oblast].nil?
   
   puts "Reading ROCker file." unless @o[:q]
   data = ROCData.new @o[:rocker]

   puts "Filtering BLAST." unless @o[:q]
   ih = File.open(@o[:qblast], 'r')
   oh = File.open(@o[:oblast], 'w')
   while ln = ih.gets
	 bh = BlastHit.new(ln, data.aln)
	 oh.print ln if not(bh.sfrom.nil?) and bh.bits >= data.win_at_col(bh.midpoint).thr
   end
   ih.close
   oh.close
end

#genes2genomes(gene_ids) ⇒ Object

[ Utilities ]


36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/rocker/step/build.rb', line 36

def genes2genomes(gene_ids)
   genomes = []
   ids = Array.new(gene_ids)
   while ids.size>0
	 doc = ebiFetch(:uniprotkb, ids.shift(200), :annot).split("\n")
	 genomes += doc.grep( /^DR\s+EMBL;/ ).map do |ln|
  r=ln.split('; ')
  {:genome_id=>r[1], :transl_id=>r[2]}
	 end
   end
   genomes.uniq
end

#genome2taxid(genome_id) ⇒ Object



48
49
50
51
52
# File 'lib/rocker/step/build.rb', line 48

def genome2taxid(genome_id)
   ln = ebiFetch('embl', [genome_id], 'annot').split(/[\n\r]/).grep(/^FT\s+\/db_xref="taxon:/).first
   return ln if ln.nil?
   ln.sub(/.*"taxon:(\d+)".*/, "\\1")
end

#genome2taxon(genome_id, rank = 'species') ⇒ Object



53
54
55
56
# File 'lib/rocker/step/build.rb', line 53

def genome2taxon(genome_id, rank='species')
   xml = ebiFetch('taxonomy', [genome2taxid(genome_id)], 'enataxonomyxml').gsub(/\s*\n\s*/,'')
   xml.scan(/<taxon [^>]+>/).grep(/rank="#{rank}"/).first.sub(/.* taxId="(\d+)".*/,"\\1")
end

#get_coords_from_gff3(genome_ids, protein_ids, transl_ids, thread_id, json_file) ⇒ Object



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/rocker/step/build.rb', line 78

def get_coords_from_gff3(genome_ids, protein_ids, transl_ids, thread_id, json_file)
   positive_coords = {}
   genomes_org = {}
   i = 0
   genome_ids.each do |genome_id|
	 print "  * scanning #{(i+=1).ordinalize} genome out of #{genome_ids.size} in first thread.  \r" if thread_id==0 and not @o[:q]
	 unless @o[:pertaxon].nil?
  genome_taxon = genome2taxon(genome_id, @o[:pertaxon])
  genomes_org[ genome_taxon.to_sym ] ||= []
  genomes_org[ genome_taxon.to_sym ] << genome_id
	 end
	 genome_file = @o[:baseout] + ".src." + genome_id + ".gff3"
	 if @o[:reuse] and File.size? genome_file
  ifh = File.open(genome_file, 'r')
  doc = ifh.readlines.grep(/^[^#]/)
  ifh.close
	 else
  genome_file=nil unless @o[:noclean]
  doc = ebiFetch(:embl, [genome_id], :gff3, genome_file).split("\n").grep(/^[^#]/)
	 end
	 doc.each do |ln|
  next if ln =~ /^#/
  r = ln.chomp.split /\t/
  next if r.size < 9
  prots = r[8].split(/;/).grep(/^db_xref=UniProtKB[\/A-Za-z-]*:/){ |xref| xref.split(/:/)[1] }
  p = prots.select{ |id| protein_ids.include? id }.first
  trans = r[8].split(/;/).grep(/^protein_id=/){ |pid| pid.split(/=/)[1] }
  t = trans.select{ |id|  transl_ids.include? id }.first
  next if p.nil? and t.nil?
  positive_coords[ r[0].to_sym ] ||= []
  positive_coords[ r[0].to_sym ] << {
     :prot_id	=> p,
     :tran_id => t,
     :from	=> r[3].to_i,
     :to	=> r[4].to_i,
     :strand	=> r[6]
  }
	 end
   end
   print "\n" if thread_id==0 and not @o[:q]
   ofh = File.open json_file, "w"
   ofh.print({:positive_coords=>positive_coords, :genomes_org=>genomes_org}.to_json)
   ofh.close
end

#plot!Object

[ Search ]


13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/rocker/step/plot.rb', line 13

def plot!
   raise "-k/--rocker is mandatory." if o[:rocker].nil?
   if @o[:table].nil?
	 raise "-t/--table is mandatory unless -b is provided." if @o[:blast].nil?
	 @o[:table] = "#{@o[:blast]}.table"
   end
   raise "-b/--blast is mandatory unless -t exists." if @o[:blast].nil? and not File.exist? @o[:table]

   puts "Testing environment." unless @o[:q]
   bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be executable. Is R installed?"

   puts "Reading files." unless @o[:q]
   puts "  * loding ROCker file: #{@o[:rocker]}." unless @o[:q]
   data = ROCData.new @o[:rocker]
   if File.exist? @o[:table]
	 puts "  * reusing existing file: #{@o[:table]}." unless @o[:q]
   else
	 puts "  * generating table: #{@o[:table]}." unless @o[:q]
	 blast2table(@o[:blast], @o[:table], data.aln, @o[:minscore])
   end

   puts "Plotting matches." unless @o[:q]
   extra = @o[:gformat]=='pdf' ? "" : ", units='in', res=300"
   @o[:gout] ||= "#{@o[:rocker]}.#{@o[:gformat]}"
   data.rrun "#{@o[:gformat]}('#{@o[:gout]}', #{@o[:width]}, #{@o[:height]}#{extra});"
   data.rrun "layout(c(2,1,3), heights=c(2-1/#{data.aln.size},3,1));"
   some_thr = data.load_table! @o[:table], @o[:sbj], @o[:minscore]
   data.rrun "par(mar=c(0,4,0,0.5)+.1);"
   data.rrun "plot(1, t='n', xlim=c(0.5,#{data.aln.cols}+0.5), ylim=range(x$V4)+c(-0.04,0.04)*diff(range(x$V4)), xlab='', ylab='Bit score', xaxs='i', xaxt='n');"
   data.rrun "noise <- runif(ncol(x),-.2,.2)"
   data.rrun "arrows(x0=x$V2, x1=x$V3, y0=x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,#{@o[:transparency] ? ".2" : "1"}), rgb(.5,0,0,#{@o[:transparency] ? ".2" : "1"})), length=0);"
   data.rrun "points(x$V6, x$V4+noise, col=ifelse(x$V5==1, rgb(0,0,.5,#{@o[:transparency] ? ".5" : "1"}), rgb(.5,0,0,#{@o[:transparency] ? ".5" : "1"})), pch=19, cex=1/4);"

   puts "Plotting windows." unless @o[:q]
   if some_thr
	 data.rrun "arrows(x0=w$V1, x1=w$V2, y0=w$V5, lwd=2, length=0)"
	 data.rrun "arrows(x0=w$V2[-nrow(w)], x1=w$V1[-1], y0=w$V5[-nrow(w)], y1=w$V5[-1], lwd=2, length=0)"
   end
   data.rrun "legend('bottomright',legend=c('Match span','Match mid-point','Reference','Non-reference')," +
	 "lwd=c(1,NA,1,1),pch=c(NA,19,19,19),col=c('black','black','darkblue','darkred'),ncol=4,bty='n')"

   puts "Plotting alignment." unless @o[:q]
   data.rrun "par(mar=c(0,4,0.5,0.5)+0.1);"
   data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(1,#{data.aln.seqs.size}),xlab='',ylab='Alignment',xaxs='i',xaxt='n',yaxs='i',yaxt='n',bty='n');"
   i = 0
   data.rrun "clr <- rainbow(26, v=1/2, s=3/4);" if @o[:color]
   data.aln.seqs.values.each do |s|
      color = s.aln.split(//).map{|c| c=="-" ? "'grey80'" : (@o[:sbj].include?(s.id) ? "'red'" : (@o[:color] ? "clr[#{c.ord-64}]" : "'black'"))}.join(',')
	 data.rrun "rect((1:#{data.aln.cols-1})-0.5, rep(#{i}, #{data.aln.cols-1}), (1:#{data.aln.cols-1})+0.5, rep(#{i+1}, #{data.aln.cols-1}), col=c(#{color}), border=NA);"
	 i += 1
   end

   puts "Plotting statistics." unless @o[:q]
   data.rrun "par(mar=c(5,4,0,0.5)+.1);"
   data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}),ylim=c(#{@o[:ylim].nil? ? (@o[:impact] ? "-2,.1" : "50,100") : @o[:ylim]}),xlab='Alignment position (amino acids)',ylab='Precision',xaxs='i');"
   if some_thr
	 sn = data.rrun "100*sum(w$tp)/(sum(w$tp)+sum(w$fn))", :float
	 sp = data.rrun "100*sum(w$tn)/(sum(w$fp)+sum(w$tn))", :float
	 ac = data.rrun "100*(sum(w$tp)+sum(w$tn))/(sum(w$p)+sum(w$n))", :float
	 unless @o[:q]
  puts "  * sensitivity: #{sn}%"
  puts "  * specificity: #{sp}%"
  puts "  * accuracy: #{ac}%"
	 end
	 data.rrun "pos <- (w$V1+w$V2)/2"
	 if @o[:impact]
  data.rrun "lines(pos[!is.na(w$specificity)], (w$specificity[!is.na(w$specificity)]-#{sp})*w$tp[!is.na(w$specificity)]/sum(w$tp), col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
  data.rrun "lines(pos[!is.na(w$sensitivity)], (w$sensitivity[!is.na(w$sensitivity)]-#{sn})*w$tn[!is.na(w$sensitivity)]/sum(w$tn), col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
  data.rrun "lines(pos[!is.na(w$accuracy)], (w$accuracy[!is.na(w$accuracy)]-#{ac})*(w$tp+w$tn)[!is.na(w$accuracy)]/sum(c(w$tp, w$tn)), col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
	 else
  data.rrun "lines(pos[!is.na(w$specificity)], w$specificity[!is.na(w$specificity)], col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
  data.rrun "lines(pos[!is.na(w$sensitivity)], w$sensitivity[!is.na(w$sensitivity)], col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
  data.rrun "lines(pos[!is.na(w$accuracy)], w$accuracy[!is.na(w$accuracy)], col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
	 end
	 #data.rrun "lines(pos[!is.na(w$precision)], w$precision[!is.na(w$precision)], col='purple', lwd=2, t='o', cex=1/3, pch=19);"
   end
   data.rrun "legend('bottomright',legend=c('Specificity','Sensitivity','Accuracy'),lwd=2,col=c('darkred','darkgreen','darkblue'),ncol=3,bty='n')"
   data.rrun "dev.off();"
end

#restcall(url, outfile = nil) ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
# File 'lib/rocker/step/build.rb', line 57

def restcall(url, outfile=nil)
   $stderr.puts "   # Calling: #{url}" if @o[:debug]
   response = RestClient::Request.execute(:method=>:get,  :url=>url, :timeout=>600)
   raise "Unable to reach EBI REST client, error code #{response.code}." unless response.code == 200
   unless outfile.nil?
	 ohf = File.open(outfile, 'w')
	 ohf.print response.to_s
	 ohf.close
   end
   response.to_s
end

#search!Object

[ Search ]


13
14
15
16
17
18
# File 'lib/rocker/step/search.rb', line 13

def search!
   raise "-k/--rocker is mandatory." if @o[:rocker].nil?
   raise "Code Under development..."
   # ToDo
   # [ ... ]
end