Class: BacterialAnnotator

Inherits:
Object
  • Object
show all
Defined in:
lib/bacterial-annotator.rb

Instance Method Summary collapse

Constructor Details

#initialize(options, root) ⇒ BacterialAnnotator

Initialize BacterialAnnotator options, options, ROOT, options, options)



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/bacterial-annotator.rb', line 21

def initialize options, root

  @root = root
  @options = options
  @outdir = @options[:outdir]

  @minlength = @options[:minlength].to_i
  @pidentity = @options[:pidentity].to_f
  @pidentity = @pidentity * 100 if @pidentity < 1.00

  if File.exists? (@outdir)
    if ! options.has_key? :force
      abort "Output directory already exist ! Choose another one or use -f to overwrite"
    else
      puts "Overwriting output directory #{@outdir}"
      FileUtils.remove_dir(@outdir, force=true)
    end
  end
  Dir.mkdir(@outdir)

  @fasta = FastaManip.new(@options[:input], @options[:meta])

  @with_refence_genome = false
  if @options.has_key? :refgenome
    @with_refence_genome = true
    @refgenome = GenbankManip.new(@options[:refgenome], @outdir)
  end

  @prot_synteny = nil
  @annotation_stats = {by_contigs: {},
                       annotated_cds: 0,
                       total_cds: 0,
                       foreign_contigs: [],
                       synteny_contigs: [],
                       short_contigs: []}

  @contig_foreign_cds = {}
  @contig_annotations = {}

end

Instance Method Details

#cumulate_annotation_stats_reference(contig, contig_prots_ann) ⇒ Object

cumulate the stats for the synteny return : unannotated cds array



282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
# File 'lib/bacterial-annotator.rb', line 282

def cumulate_annotation_stats_reference contig, contig_prots_ann

  remaining_cds = []
  contig_prots = @fasta.prodigal_files[:prot_ids_by_contig][contig]

  @annotation_stats[:total_cds] += contig_prots.length if contig_prots
  contig_prots_ann.each do |k,v|
    if v != nil
      @annotation_stats[:annotated_cds] += 1
    else
      remaining_cds << k
    end
  end

  # Annotated Contigs
  if contig_prots_ann.keys.length < 1
    @annotation_stats[:foreign_contigs] << contig
  else
    @annotation_stats[:synteny_contigs] << contig
  end

  remaining_cds
end

#extract_externaldb_prot_info(db) ⇒ Object

extract the information on protein from an externaldb



366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
# File 'lib/bacterial-annotator.rb', line 366

def extract_externaldb_prot_info db

  # NCBI
  # >gi|103485499|ref|YP_615060.1| chromosomal replication initiation protein [Sphingopyxis alaskensis RB2256]
  # Swissprot
  # >sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae GN=blaNDM-1 PE=1 SV=1
  # TrEMBL
  # >tr|E5KIY2|E5KIY2_ECOLX Beta-lactamase NDM-1 OS=Escherichia coli GN=blaNDM-1 PE=1 SV=1

  ref_cds = {}

  File.open(db, "r") do |dbfile|
    while l=dbfile.gets

      if l[0] == ">"

        lA = l.chomp.split("|")
        key_gi = lA[1]
        product_long = lA[-1]

        organism = ""
        product = ""

        if product_long.include? " [" and product_long.include? "]" # NCBI
          organism = product_long[/\[.*?\]/]
          product = product_long.split(" [")[0].strip
        elsif product_long.include? "OS="
          product_tmp = product.split("OS=")
          organism = product_tmp[1].split(/[A-Z][A-Z]=/)[0].strip
          product = product_tmp[0].strip
        elsif product_long.include? "[A-Z][A-Z]="
          product = product_long.split(/[A-Z][A-Z]=/)[0].strip
        end
        org = organism.gsub("[","").gsub("]","")
        product.lstrip!
        ref_cds[key_gi] = {product: product, org: org}

      end

    end

  end                         # end of file reading

  ref_cds

end

#finish_annotation(remaining_cds_file) ⇒ Object

Finishing the annotation of the remaining CDS



169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
# File 'lib/bacterial-annotator.rb', line 169

def finish_annotation remaining_cds_file

  # only finish the annotation with an external DB
  if @options.has_key? :external_db	# from an external DB

    db_file = @options[:external_db]
    ref_cds = extract_externaldb_prot_info db_file

    externaldb_synteny = SyntenyManip.new(remaining_cds_file, db_file, "Prot-ExternalDB", @pidentity)
    puts "\nRunning BLAT alignment with External Database.."
    externaldb_synteny.run_blat @root, @outdir
    externaldb_synteny.extract_hits :externaldb

    externaldb_synteny.aln_hits.each do |k,v|
      contig_of_protein = k.split("_")[0..-2].join("_")

      if ! @contig_annotations.has_key? contig_of_protein
        @contig_annotations[contig_of_protein] = {}
      end

      hit_gi = v[:hits][0]

      note = "Protein homology (#{v[:pId]}% identity) with gi:#{hit_gi}"

      # p v
      # p ref_cds[hit_gi]

      if ref_cds[hit_gi][:org] != ""
        note +=  " from #{ref_cds[hit_gi][:org]}"
      end
      @contig_annotations[contig_of_protein][k] = {product: ref_cds[hit_gi][:product],
                                                   gene: nil,
                                                   locustag: nil,
                                                   note: note}

    end


  elsif @options.has_key? :remote_db	# from a remote DB

    # do it by chunk to avoid NCBI CPU exceeding limit
    cds_files = split_remaining_cds_file remaining_cds_file
    @remotedb = @options[:remote_db]

    puts "\n# NCBI Blast on #{@remotedb}"

    cds_files.each do |cds_file|

      # remotedb = @options[:remote_db]
      valid = true
      begin
        # puts "\nNCBI blast on #{@remotedb} for #{cds_file}"
        ncbiblast = RemoteNCBI.new(@remotedb,
                                   cds_file,
                                   "#{cds_file}.#{@remotedb}.xml",
                                   @pidentity)
      rescue
        valid = false
      end

      # ncbi blast didn't worked out
      if !valid
        puts "Problem NCBI blast for foreign proteins"
      else
        ncbiblast.extract_blast_results
        if ! ncbiblast.aln_hits
          puts "Didn't produce the annotation for #{cds_file}"
          next
        end
        ncbiblast.aln_hits.each do |k,v|
          contig_of_protein = k.split("_")[0..-2].join("_")
          # @contig_annotations[contig_of_protein][k][:product] = v[:hits][0][:product]
          if ! @contig_annotations.has_key? contig_of_protein
            @contig_annotations[contig_of_protein] = {}
          end

          note = "Protein homology (#{v[:pId]}% identity) with gi:#{v[:hits][0][:gi]}"
          # note = "correspond to gi:#{v[:hits][0][:gi]}"
          if v[:hits][0][:org] != ""
            note +=  " from #{v[:hits][0][:org]}"
          end
          @contig_annotations[contig_of_protein][k] = {product: v[:hits][0][:product],
                                                       gene: nil,
                                                       locustag: nil,
                                                       note: note}
        end

      end

    end

  end

end

#parsing_genbank_filesObject

parse all genbank files



266
267
268
269
270
271
272
273
274
275
276
277
# File 'lib/bacterial-annotator.rb', line 266

def parsing_genbank_files

  puts "\nParsing annotation into genbank files.."
  @contig_annotations.each do |contig, contig_prot_annotations|
    gbk_path = @fasta.prodigal_files[:gbk_path]
    gbk_to_annotate = GenbankManip.new("#{gbk_path}/#{contig}.gbk", "#{gbk_path}")
    reference_locus = nil
    reference_locus = @refgenome.gbk.locus if @with_refence_genome
    gbk_to_annotate.add_annotation contig_prot_annotations, gbk_path, 0, reference_locus
  end

end

#prepare_files_for_annotationObject

Prepare files for the annotation Will run prodigal on the query and prepare reference genome files



64
65
66
67
68
69
70
71
72
# File 'lib/bacterial-annotator.rb', line 64

def prepare_files_for_annotation
  puts "\nRunning Prodigal on your genome.."
  @fasta.run_prodigal @root, @outdir
  puts "Prodigal done."
  if @with_refence_genome
    @refgenome.write_cds_to_file @outdir
    puts "Successfully loaded #{@refgenome.gbk.definition}"
  end
end

print statistics to file



308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
# File 'lib/bacterial-annotator.rb', line 308

def print_stats file

  total_nb_contigs = @annotation_stats[:foreign_contigs].length +
                     @annotation_stats[:synteny_contigs].length +
                     @annotation_stats[:short_contigs].length
  p_contigs_annotated = @annotation_stats[:synteny_contigs].length.to_f/total_nb_contigs.to_f
  p_cds_annotated = @annotation_stats[:annotated_cds].to_f/@annotation_stats[:total_cds].to_f

  File.open(file, "w") do |fopen|
    fopen.write("#Contigs annotation based on reference genomes\n")
    fopen.write("Short Contigs (< #{@minlength}) :\t\t" + @annotation_stats[:short_contigs].length.to_s + "\n")
    fopen.write("Foreign Contigs :\t\t" + @annotation_stats[:foreign_contigs].length.to_s + "\n")
    fopen.write("Annotated Contigs :\t\t" + @annotation_stats[:synteny_contigs].length.to_s + "\n")
    fopen.write("Total Contigs :\t\t\t" + total_nb_contigs.to_s + "\n")
    fopen.write("% Contigs annotated :\t\t" + (p_contigs_annotated*100).round(2).to_s + "\n")
    fopen.write("\n")

    fopen.write("#CDS annotations based on reference genomes\n")
    fopen.write("Annotated CDS :\t\t\t" + @annotation_stats[:annotated_cds].to_s + "\n")
    fopen.write("Total CDS :\t\t\t" + @annotation_stats[:total_cds].to_s + "\n")
    fopen.write("% CDS annotated :\t\t" + (p_cds_annotated*100).round(2).to_s + "\n")
    fopen.write("\n")

  end

end

#run_annotationObject

run_alignment of reference genome proteins and the query



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# File 'lib/bacterial-annotator.rb', line 75

def run_annotation

  # process reference genome synteny
  if @with_refence_genome        # Annotation with the Reference Genome

    @prot_synteny = SyntenyManip.new(@fasta.prodigal_files[:proteins], @refgenome.cds_file, "Prot-Ref", @pidentity)
    puts "\nRunning BLAT alignment with Reference Genome.."
    @prot_synteny.run_blat @root, @outdir
    @prot_synteny.extract_hits :refgenome

    @fasta.prodigal_files[:contigs].each_with_index do |contig, contig_index|

      # Skip short contigs
      if @fasta.prodigal_files[:contigs_length][contig_index] < @minlength
        @annotation_stats[:short_contigs] << contig
        next
      end

      contig_prots = @fasta.prodigal_files[:prot_ids_by_contig][contig]

      # contig_prot_annotations = @prot_synteny.get_annotation_for_contig contig_prots, @refgenome.coding_seq
      @contig_annotations[contig] = @prot_synteny.get_annotation_for_contig contig_prots, @refgenome.coding_seq

      remaining_cds = cumulate_annotation_stats_reference contig, @contig_annotations[contig]

      if ! remaining_cds.empty?
        @contig_foreign_cds[contig] = remaining_cds
      end

    end

    # dump foreign proteins to file
    foreign_cds_file = dump_cds

    # Iterate over each Ref protein and print syntheny
    synteny_file = File.open("#{@outdir}/Prot-Synteny.tsv","w")
    synteny_file.write("RefLocusTag\tRefProtID\tRefLength\tRefCoverage\tIdentity\tQueryGene\tQueryLength\tQueryCoverage\n")
    ref_annotated = {}
    @contig_annotations.each do |contig,prot_annotations|
      prot_annotations.each do |key,prot|
        # p key
        # p prot
        ref_annotated[prot[:protId]] = {key: key, length: prot[:length], pId: prot[:pId]} if prot != nil
      end
    end

    @refgenome.coding_seq.each do |ref_k, ref_v|
      gene = ""
      coverage_ref = ""
      coverage_query = ""
      query_length = ""
      pId = ""
      if ref_annotated[ref_v[:protId]] != nil
        gene = ref_annotated[ref_v[:protId]][:key]
        coverage_ref = (ref_annotated[ref_v[:protId]][:length].to_f/ref_v[:bioseq].seq.length.to_f).round(2)
        query_length = @fasta.prodigal_files[:prot_ids_length][gene]
        coverage_query = (ref_annotated[ref_v[:protId]][:length].to_f/query_length.to_f).round(2)
        pId = ref_annotated[ref_v[:protId]][:pId]
      end

      synteny_file.write(ref_v[:protId])
      synteny_file.write("\t"+ref_v[:locustag])
      synteny_file.write("\t"+ref_v[:bioseq].seq.length.to_s)
      synteny_file.write("\t"+coverage_ref.to_s)
      synteny_file.write("\t"+pId.to_s)
      synteny_file.write("\t"+gene)
      synteny_file.write("\t"+query_length.to_s)
      synteny_file.write("\t"+coverage_query.to_s)
      synteny_file.write("\n")

    end
    synteny_file.close

  else                        # no reference genome

    # no reference genome .. will process all the CDS
    foreign_cds_file = @fasta.prodigal_files[:proteins]

  end

  # Finishing annotation for foreign proteins
  finish_annotation foreign_cds_file

  # Parse annotations to genbank files
  parsing_genbank_files

  puts "\nPrinting Statistics.."
  print_stats "#{@outdir}/Annotation-Stats.txt"


end