Module: FlAnalysis

Defined in: lib/full_lengther_next/classes/fl_analysis.rb

Instance Method Summary collapse

Instance Method Details

#analiza_orf_y_fl(seq, blast_query, options, db_name) ⇒ `Object`

# File 'lib/full_lengther_next/classes/fl_analysis.rb', line 6

# Analyses one query sequence against its BLAST result for database +db_name+
# and records the outcome on +seq+ through annotations.
#
# Steps, in order:
#   1. Raise if +seq+ and +blast_query+ do not carry the same name.
#   2. With no hit at all, only "tr_" databases flush/record annotations; return.
#   3. Reduce the hits to one +final_hit+ (UneLosHit for several hits, otherwise
#      hits[0], reverse-complemented first when its frame is negative).
#   4. ERROR#1 (sense + antisense hits) and ERROR#2 (negative x_number) are sent
#      to error_log and abort; ERROR#3 only prefixes the warnings and continues.
#   5. Search the start (find_start) and the end (find_end) of the ORF, updating
#      final_hit.q_beg / final_hit.q_end in place.
#   6. Derive the final status, add length warnings and call print_annotations.
#
# seq         - sequence object (reads seq_name, seq_fasta, sec_desc; annotated)
# blast_query - BLAST query object (reads query_def and hits)
# options     - Hash read for :distance, :ident and :evalue
# db_name     - database name; names matching /^tr_/ get special handling
#
# NOTE(review): the local +errors+ and +distance_s_atg+ are assigned but never
# read inside this method.
def analiza_orf_y_fl(seq, blast_query, options, db_name)
	aas_n_end = options[:distance]
	pident_threshold = options[:ident]
	evalue_threshold = options[:evalue]
	# @verbose = options[:verbose]

	# test_blast_hits(blast_query)
	
	# guard: the sequence and the BLAST result must belong to the same query
	if seq.seq_name != blast_query.query_def
		raise "BLAST query name and sequence are different"
	end
	
	q=blast_query
	msgs = ''
	atg_status = ''
	end_status = ''
	final_status = ''
	
	# working copy of the nucleotide sequence (may be replaced by a repaired or reversed one below)
	query_fasta = seq.seq_fasta

	if q.hits[0].nil? # no BLAST match: the sequence moves on to the next database
		# puts "#{db_name} -- #{q.query_def} --> NO BLASTX match"
		
		# For a "tr_" database with nothing to align, emit whatever is already known:
		# stored annotations from a previous database, a stored description, or a testcode request.
		if (db_name =~ /^tr_/)
			if (seq.get_annotations(:tmp_annotation).empty?)
				if (seq.sec_desc.empty?)
					seq.annotate(:apply_tcode,'')
				else
					seq.annotate(:tmp_annotation,[seq.sec_desc, '','',''],true)
				end
			else
				save_last_db_annotations(seq)
			end
		end
		
		return
	end
#----------------------------------------------------------------------------------------------------------
	warnings = ''
	errors = ''
	wrong_seq = false

	# with more than one hit, the frames are checked and fixed to obtain a single hit
	if (q.hits.count > 1)
		
		seq_unida = UneLosHit.new(q, query_fasta, pident_threshold)
		
		wrong_seq = seq_unida.wrong_seq
		is_ok = seq_unida.is_ok
		q_index_start = seq_unida.q_index_start
		full_prot = seq_unida.full_prot
		
		query_fasta = seq_unida.output_seq # repaired fasta
		
		final_hit = seq_unida.final_hit # single hit
		msgs = seq_unida.msgs # warning messages
		x_number = seq_unida.number_x # number of nucleotides used to fix frame errors
		
	else # there is only one hit

		if (q.hits[0].q_frame.to_i < 0) # the hit is on the reverse strand: flip sequence and coordinates
			(query_fasta, q.hits[0].q_frame, q.hits[0].q_beg, q.hits[0].q_end) = reverse_seq(query_fasta, q.hits[0].q_frame, q.hits[0].q_beg, q.hits[0].q_end)
			q.hits[0].reversed = true
		end
		
		final_hit = q.hits[0] # single hit
		x_number = 0 # number of nucleotides used to fix frame errors
		
		# translate from the hit frame offset (q_frame - 1) to the end of the sequence
		full_prot = query_fasta[final_hit.q_frame-1, query_fasta.length+1].translate
		(is_ok, q_index_start) = contenidos_en_prot(final_hit, full_prot, q)
	end
	# test_final_hit(final_hit, query_fasta)
#----------------------------------------------------------------------------------------------------------
	# ERROR#1 aborts the analysis: the failure is handed to error_log and nothing else is annotated here
	if wrong_seq
		warnings = "ERROR#1, contains sense and antisense hits!!!, putative chimeric sequence, " + warnings
		# puts "ERROR#1, contains sense and antisense hits!!!, putative chimeric sequence"
		errors = "#{db_name}\t#{q.hits[0].acc}\tERROR#1\tcontains sense and antisense hits!!!, putative chimeric sequence, "
		error_log(q, seq, warnings, db_name)
		return
	end
	#----------------------------------------------------------------------------------------------------------
	# fold the hit-joining messages into the warnings; msgs is reused below for find_start messages
	warnings += msgs
	msgs = ''
	#----------------------------------------------------------------------------------------------------------
	# ERROR#2 also aborts
	if (x_number < 0)
		warnings = "ERROR#2, unexpected negative index in x_number, " + warnings
		# puts "ERROR#2, unexpected negative index in x_number"
		errors = "#{db_name}\t#{q.hits[0].acc}\tERROR#2\tunexpected negative index in x_number, "
		error_log(q, seq, warnings, db_name)
		return
	end
	#----------------------------------------------------------------------------------------------------------
	# ERROR#3 does NOT abort (error_log/return are commented out): it only prefixes the warnings
	if (!is_ok)
		warnings = "ERROR#3, very serious frame error, " + warnings
		# puts "#{q.query_def} ERROR#3, hit was NOT found in the protein"
		errors = "#{db_name}\t#{q.hits[0].acc}\tERROR#3\thit was NOT found in the protein, "
		# error_log(q, seq, warnings, db_name)
		# return
	end
#----------------------------------------------------------------------------------------------------------
	# a hit is "fiable" (reliable) when it passes both the identity and the e-value thresholds
	fiable = false
	if ((final_hit.ident >= pident_threshold) && (final_hit.e_val <= evalue_threshold))
		fiable = true
	end
	# if the query reaches far enough upstream of the alignment (within aas_n_end residues
	# of the subject start) it should contain the start codon
	if (final_hit.q_beg/3 + aas_n_end >= final_hit.s_beg.to_i) 
		# 5' part handed to find_start: everything up to the hit start plus 10 more residues
		substring = full_prot[0, q_index_start + 10]
		resto_substring = full_prot[q_index_start + 10, full_prot.length - q_index_start - 10]

		# look for the beginning of the protein
		(m_substring, atg_status, msgs) = find_start(final_hit.s_beg, substring, fiable, aas_n_end)

		# paste the (possibly trimmed) 5' part back onto the rest of the protein
		tmp_prot = "#{m_substring}#{resto_substring}"
		# move q_beg back to the ORF start; the 10 extra residues are discounted
		final_hit.q_beg = final_hit.q_beg.to_i - ((m_substring.length - 10) * 3)
	else
		# if (@verbose)
			# puts "beginning too short!"
		# end

		atg_status = 'incomplete'
		substring = full_prot[0, q_index_start]
		distance_s_atg = (final_hit.s_beg.to_i - final_hit.q_beg/3) + 1

		if (substring.rindex('*'))
			warnings += "Unexpected stop codon in the beginning of your sequence, "
			# if (@verbose)
				# puts "#{db_name} -- #{q.query_def} --> Unexpected stop codon in the beginning of your sequence"
			# end
		end

		# extend q_beg upstream by the whole 5' part (3 nt per residue)
		final_hit.q_beg = final_hit.q_beg.to_i - (substring.length * 3)
		tmp_prot = full_prot
	end
#----------------------------------------------------------------------------------------------------------
	# look for the end of the protein
	(resto_substring, end_substring, end_status, warnings, putative_end) = find_end(final_hit, q, full_prot, tmp_prot, end_status, warnings, aas_n_end)
#----------------------------------------------------------------------------------------------------------
	final_prot = "#{resto_substring}#{end_substring}"
	
	# append the messages produced by find_start (empty when the "beginning too short" branch ran)
	warnings += msgs
	
	# compute the end_ORF index
	if (atg_status == 'complete')
		final_hit.q_end = final_hit.q_beg - 3 + (final_prot.length * 3)
	else
		if (putative_end)
			# NOTE(review): 45 nt = 15 codons; presumably mirrors the 15-residue offset find_end
			# uses when it slices end_substring - confirm
			final_hit.q_end = final_hit.q_end - 45 + (putative_end*3)
		end
	end
	
#--------------------------------------------------------------------------------------------------------------
	# decide the sequence status (Complete, Putative Complete, Internal, N-terminus, Putative N-terminus, C-terminus)
	final_status = determine_status(atg_status,end_status)
	#----------------------------------------------------------------------------------------------------------
	# length sanity checks against the subject protein
	if (final_prot.length - 2*aas_n_end > final_hit.full_subject_length)
		warnings += " your sequence is longer than subject: #{final_prot.length} - #{final_hit.full_subject_length}"

	elsif (final_prot.length + aas_n_end < final_hit.full_subject_length)
		warnings += " your sequence is shorter than subject: #{final_prot.length} - #{final_hit.full_subject_length}"
		# more than 100 residues shorter, or less than half the subject length: demote Complete
		if (final_prot.length + 100 < final_hit.full_subject_length) || (final_prot.length*2 < final_hit.full_subject_length)
			
			if (final_status == 'Complete')
				final_status = 'Putative Complete'
				warnings += ". Was predicted as Complete, but is very much shorter than de subject"
				# if (@verbose)
				# 	puts "#{db_name} -- #{q.query_def} --> your sequence is 100 aas shorter than the subject or shorter than the half length of the subject"
				# end
			end
		end
	end
	
	# test_final_hit(final_hit, query_fasta)
	print_annotations(seq, q, final_hit, final_status, final_prot, warnings, query_fasta, db_name)
	
end

#determine_status(atg_status, end_status) ⇒ `Object`

# File 'lib/full_lengther_next/classes/fl_analysis.rb', line 448

# Maps the verdict on the 5' end (+atg_status+) and on the 3' end
# (+end_status+) to the final label of the sequence. Each argument is expected
# to be 'complete', 'putative' or 'incomplete'.
#
# Returns one of 'Complete', 'Putative Complete', 'Internal', 'N-terminus',
# 'Putative N-terminus', 'C-terminus', 'Putative C-terminus', or nil when the
# pair is not one of the known combinations.
def determine_status(atg_status,end_status)
	case [atg_status, end_status]
	when ['complete', 'complete'] # full protein
		'Complete'
	when ['putative', 'complete'], ['complete', 'putative'], ['putative', 'putative'] # start and/or end is only putative
		'Putative Complete'
	when ['incomplete', 'incomplete'] # internal region
		'Internal'
	when ['complete', 'incomplete'] # the beginning of the protein is present
		'N-terminus'
	when ['putative', 'incomplete'] # the beginning of the protein may be present
		'Putative N-terminus'
	when ['incomplete', 'complete'] # the end of the protein is present
		'C-terminus'
	when ['incomplete', 'putative'] # the end of the protein may be present
		'Putative C-terminus'
	end
end

#error_log(q, seq, warnings, db_name) ⇒ `Object`

# File 'lib/full_lengther_next/classes/fl_analysis.rb', line 242

# Records a failed analysis (see the ERROR#n callers) on +seq+.
#
# The warnings are always prefixed with "Coding sequence with some errors, ".
# What happens next depends on the database:
#   * db_name matching /^tr_/ and annotations already stored in
#     :tmp_annotation  -> save_last_db_annotations(seq)
#   * db_name matching /^tr_/, nothing stored, seq.sec_desc present
#     -> :tmp_annotation is written from sec_desc, replacing the text
#        'my_warning' with the warnings
#   * db_name matching /^tr_/, nothing stored, no sec_desc
#     -> a "Misassembled" row built from the first hit is stored in sec_desc
#        and annotated, or :apply_tcode is annotated when the first hit has no
#        definition
#   * any other database -> only seq.sec_desc is filled (when it is empty and
#     the first hit has a definition); nothing is annotated
#
# q        - BLAST query object (query_def, hits)
# seq      - sequence object being annotated
# warnings - String with the accumulated warnings
# db_name  - database name
def error_log(q, seq, warnings, db_name)
	# seq.annotate(:error,"#{q.query_def}\t#{warnings}\t#{q.hits[0].definition}")

	flagged = "Coding sequence with some errors, #{warnings}"

	# tab-separated "Misassembled" row describing the first hit
	misassembled_row = lambda do |note|
		"#{q.query_def}\t#{seq.fasta_length}\t#{q.hits[0].acc}\t#{db_name}\tMisassembled\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t#{q.hits[0].full_subject_length}\t#{note}\t\t\t\t\t\t#{q.hits[0].definition}\t"
	end

	in_trembl = (db_name =~ /^tr_/)

	if !in_trembl
		if seq.sec_desc.empty? && !q.hits[0].definition.nil?
			seq.sec_desc = misassembled_row.call(flagged)
		end
	elsif !seq.get_annotations(:tmp_annotation).empty?
		save_last_db_annotations(seq)
	elsif !seq.sec_desc.empty?
		tmp_annot = seq.sec_desc.sub('my_warning', flagged)
		seq.annotate(:tmp_annotation,[tmp_annot, '','',''],true)
	elsif q.hits[0].definition.nil?
		seq.annotate(:apply_tcode,'')
	else
		seq.sec_desc = misassembled_row.call(flagged)
		seq.annotate(:tmp_annotation,[seq.sec_desc, '','',''],true)
	end
end

#find_end(final_hit, q, full_prot, tmp_prot, end_status, warnings, aas_n_end) ⇒ `Object`

# File 'lib/full_lengther_next/classes/fl_analysis.rb', line 382

# Looks for the 3' end (stop codon) of the protein and classifies it.
#
# +tmp_prot+ is split 15 residues before the end of the hit (q_end/3, corrected
# by the residues that were trimmed from the 5' end of +full_prot+); the stop
# codon ('*') is searched in the second piece, which is cut right after the
# first stop when one is found.
#
# final_hit  - hit object (full_subject_length, s_end, q_end)
# q          - BLAST query object (full_query_length)
# full_prot  - full translated protein
# tmp_prot   - protein after the 5' end analysis
# end_status - incoming status; every branch below overwrites it
# warnings   - accumulated warnings (a new String is returned, the argument is not mutated)
# aas_n_end  - tolerance, in residues
#
# Returns [head, tail, end_status, warnings, putative_end] where end_status is
# 'complete', 'putative' or 'incomplete' and putative_end is the index of the
# first '*' inside the tail (nil when there is none).
def find_end(final_hit, q, full_prot, tmp_prot, end_status, warnings, aas_n_end)
	# residues still to be covered on the subject after the alignment
	subject_left = final_hit.full_subject_length - (final_hit.s_end.to_i + 1)
	# residues available on the query after the alignment
	query_left = (q.full_query_length.to_i - final_hit.q_end.to_i)/3
	sq_end_distance = query_left - subject_left

	trimmed_5p = full_prot.length - tmp_prot.length
	split_at = final_hit.q_end/3 - trimmed_5p - 15

	resto_substring = tmp_prot[0..split_at - 1]
	end_substring = tmp_prot[split_at..tmp_prot.length]
	putative_end = end_substring.index('*')

	# whenever a stop is present the tail is cut right after it
	end_substring = end_substring[0..putative_end] if putative_end

	if sq_end_distance + aas_n_end < 0
		# not enough sequence left to reach the subject stop
		end_status = 'incomplete'
		if putative_end
			warnings = warnings + " Unexpected STOP codon at 3' end. Distance to subject end: #{sq_end_distance.abs} aas, "
		else
			warnings = warnings + "Distance to subject end: #{sq_end_distance.abs} aas, "
		end
	elsif putative_end
		# enough sequence and a stop codon: compare its position with the subject stop.
		# putative_end - 15 is the distance from q_end to the stop (negative: stop before q_end)
		qs_stop_distance = (putative_end - 15) - subject_left

		if qs_stop_distance + aas_n_end >= 0
			end_status = 'complete'
		elsif qs_stop_distance + 2*aas_n_end < 0
			end_status = 'putative'
			warnings = warnings + " query STOP codon too far from subject stop. Distance to subject end: #{qs_stop_distance.abs} aas, putative chimeric sequence, "
		elsif qs_stop_distance + aas_n_end < 0
			end_status = 'putative'
			warnings = warnings + " query STOP codon is far from subject stop. Distance to subject end: #{qs_stop_distance.abs} aas, "
		end
	else
		# enough sequence but no stop codon
		end_status = 'putative'
		warnings = warnings + " STOP codon was not found. Distance to subject end: #{sq_end_distance.abs} aas, "
	end

	[resto_substring, end_substring, end_status, warnings, putative_end]
end

#find_start(subject_start, substring, fiable, aas_n_end) ⇒ `Object`

# File 'lib/full_lengther_next/classes/fl_analysis.rb', line 294

# Looks for the start of the protein in the 5' piece of the translation.
#
# subject_start - start of the alignment on the subject (used in integer arithmetic)
# substring     - 5' piece of the translated protein; its last 10 residues are
#                 discounted in every distance computed here
# fiable        - true when the hit is considered reliable
# aas_n_end     - tolerance, in residues
#
# Returns [substring, atg_status, msgs]:
#   substring  - the piece trimmed after the last stop codon and/or from the
#                first 'M' that follows, depending on the case
#   atg_status - 'complete', 'putative' or 'incomplete'
#   msgs       - warning text ('' when there is nothing to report)
def find_start(subject_start, substring, fiable, aas_n_end)
	last_stop = substring.rindex('*')

	upstream_len = substring.length - 10
	# distance to s_beg measured from the beginning of the substring
	s_beg_distance = upstream_len - subject_start
	# distance to s_beg measured from the end of the substring, never negative
	atg_distance = [(subject_start + 1) - upstream_len, 0].max

	msgs = ''
	msgs = "atg_distance in limit (1-15): atg_distance = #{atg_distance}, " if atg_distance > 0

	if last_stop
		# keep only what follows the last stop codon of the 5' piece
		resume_at = last_stop + 1
		downstream = substring[resume_at..-1]
		met = downstream.index('M')

		if resume_at > s_beg_distance
			# stop codon after the subject start: a frame shift prevents analysing the beginning
			downstream = downstream[met..-1] if met
			return [downstream, 'putative', msgs + " Unexpected STOP codon in 5 prime region, "]
		end

		# stop codon before the subject start
		return [downstream[met..-1], 'complete', msgs] if met # stop and M
		return [downstream, 'putative', msgs + "W1: There is no M at the beginning, "] # stop without M
	end

	# no stop codon upstream
	met = substring.index('M')
	return [substring, 'putative', msgs + "W2: There is no M at the beginning, "] unless met # neither M nor stop

	m_distance = subject_start - (upstream_len - met)
	from_met = substring[met..-1]

	if m_distance > aas_n_end*2
		# M without stop, but too far from the start reported by the subject
		[from_met, 'incomplete', msgs + "No stop codon before M and M found is too far from subject M, "]
	elsif fiable
		# M without stop, accepted because the hit is reliable
		[from_met, 'complete', msgs]
	else
		# M without stop and the hit is not reliable
		[from_met, 'putative', msgs + "No stop codon before M and low homology subject, "]
	end
end

#print_annotations(seq, q, final_hit, final_status, final_prot, warnings, query_fasta, db_name) ⇒ `Object`

# File 'lib/full_lengther_next/classes/fl_analysis.rb', line 470

# Writes the final annotations for +seq+ according to +final_status+ and to the
# database being processed.
#
#   * 'Complete': annotates :protein, :complete and :alignment (and the marked
#     nucleotide sequence through print_nt_seqs).
#   * not complete, annotations stored by a previous database, db_name =~ /^tr_/:
#     the stored tuple (message[3]) and the stored row (message[0]) are
#     recovered and re-emitted; the current hit is ignored.
#   * not complete, nothing stored, db_name =~ /^tr_/: the current hit is
#     annotated as it is.
#   * not complete, nothing stored, any other database: the current result is
#     stored in :tmp_annotation for the next database and seq.sec_desc is set to
#     a "Misassembled" row.
#
# When final_hit.reversed is set, the coordinates are converted with
# reverse_seq before being written (note that q_end and q_beg are assigned in
# swapped order from its result).
#
# NOTE(review): in the "recovering" branch the parameters db_name, final_status,
# warnings and final_prot are overwritten by the fields of the stored row.
def print_annotations(seq, q, final_hit, final_status, final_prot, warnings, query_fasta, db_name)
	# padding that lines the accession up with the (longer) query name in the alignment text
	name_diff = q.query_def.length - final_hit.acc.length
	if (name_diff > 0)
		spnum = ' '*name_diff.to_i
	else
		spnum = ''
	end
#-------------------------------------------------------------------------------------------------------------------------------------
	# if the sequence is Complete it is printed                   --------------------------------------------------------------------
	if (final_status == 'Complete') 
		seq.annotate(:protein,">#{q.query_def}\n#{final_prot}")
		print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # marks the ATG (_-_) and the STOP (___)

		if (final_hit.reversed) 
			(kk, final_hit.q_frame, final_hit.q_end, final_hit.q_beg) = reverse_seq(seq.seq_fasta, final_hit.q_frame.to_i, final_hit.q_beg.to_i, final_hit.q_end.to_i)
		end
		# coordinates are written 1-based (stored value + 1)
		seq.annotate(:complete,"#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\t#{final_status}\t\t#{final_hit.e_val}\t#{final_hit.ident}\t#{final_prot.length}\t#{final_hit.full_subject_length}\t#{warnings}\t#{final_hit.q_frame}\t#{final_hit.q_beg.to_i + 1}\t#{final_hit.q_end.to_i + 1}\t#{final_hit.s_beg.to_i + 1}\t#{final_hit.s_end.to_i + 1}\t#{final_hit.definition}\t#{final_prot}")
		seq.annotate(:alignment,"#{q.query_def}\t#{final_hit.q_seq}\n#{final_hit.acc}#{spnum}\t#{final_hit.s_seq}\n\n")
#-------------------------------------------------------------------------------------------------------------------------------------
	else # the protein is not complete                       -------------------------------------------------------------------------
		# NOTE(review): the .nil? test is evaluated after .empty? has already been called on the same value
		if (!seq.get_annotations(:tmp_annotation).empty?) && (!seq.get_annotations(:tmp_annotation).nil?) # ---> carries information from a previous database
			if (db_name =~/^tr_/) #                                          --->  TrEMBL is in use: the stored annotations are kept
				# puts "#{db_name} -- #{q.query_def} --> print_annotations: sequence not complete! recovering annotations from previous database!"
				# message[3] holds [q, final_hit, final_prot, query_fasta, final_status] as stored by the last branch of this method
				(kk1, final_hit, final_prot, query_fasta, final_status) = seq.get_annotations(:tmp_annotation).first[:message][3]
				print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # marks the ATG (_-_) and the STOP (___)

				# message[0] is the stored tab-separated row; its fields replace several locals
				(name,fasta_length,acc,db_name,final_status,testcode,e_val,ident,my_length,subject_length,warnings,q_frame,q_beg,q_end,s_beg,s_end,description,final_prot) = seq.get_annotations(:tmp_annotation).first[:message][0].split("\t")
				if (final_hit.reversed) 
					(kk, q_frame, q_end, q_beg) = reverse_seq(query_fasta, q_frame.to_i, q_beg.to_i, q_end.to_i)
				end
				
				# message[1] is the stored protein text and message[2] the stored alignment text
				my_prot = seq.get_annotations(:tmp_annotation).first[:message][1]
				seq.annotate(:protein,my_prot)
				my_align = seq.get_annotations(:tmp_annotation).first[:message][2]
				seq.annotate(:alignment,my_align)
				
				# rebuild the row (the testcode column is emitted empty) and store it with the extra `true` argument
				tmp_annot = "#{name}\t#{query_fasta.length}\t#{acc}\t#{db_name}\t#{final_status}\t\t#{e_val}\t#{ident}\t#{my_length}\t#{subject_length}\t#{warnings}\t#{q_frame}\t#{q_beg}\t#{q_end}\t#{s_beg}\t#{s_end}\t#{description}\t#{final_prot}"
				seq.annotate(:tmp_annotation,[tmp_annot, '','',''],true)
			#-----------------------------------------------------------------------------------------------------------------------------
			# elsif (db_name =~ /^sp_/) #                                       ---> SwissProt is in use: the stored annotations are kept
				
				# puts "#{db_name} -- #{q.query_def} --> print_annotations: Mantenemos las anotaciones de la BD de usuario y pasamos la secuencia al trembl"
			end
#-------------------------------------------------------------------------------------------------------------------------------------
		elsif (seq.get_annotations(:tmp_annotation).empty?) #                                ---> NO information from a previous database
			if (db_name =~ /^tr_/) #                                                                         ---> TrEMBL is in use
				# puts "#{db_name} -- #{q.query_def} --> print_annotations: #{q.query_def} is not complete!! se anota con trembl"
				print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # marks the ATG (_-_) and the STOP (___)

				if (final_hit.reversed) 
					(kk, final_hit.q_frame, final_hit.q_end, final_hit.q_beg) = reverse_seq(seq.seq_fasta, final_hit.q_frame.to_i, final_hit.q_beg.to_i, final_hit.q_end.to_i)
				end

				seq.annotate(:alignment,"#{q.query_def}\t#{final_hit.q_seq}\n#{final_hit.acc}#{spnum}\t#{final_hit.s_seq}\n\n")
				seq.annotate(:protein,">#{q.query_def}\n#{final_prot}")
				tmp_annot = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\t#{final_status}\t\t#{final_hit.e_val}\t#{final_hit.ident}\t#{final_prot.length}\t#{final_hit.full_subject_length}\t#{warnings}\t#{final_hit.q_frame}\t#{final_hit.q_beg.to_i + 1}\t#{final_hit.q_end.to_i + 1}\t#{final_hit.s_beg.to_i + 1}\t#{final_hit.s_end.to_i + 1}\t#{final_hit.definition}\t#{final_prot}"
				seq.annotate(:tmp_annotation,[tmp_annot, '','',''])
#-------------------------------------------------------------------------------------------------------------------------------------
			else #                                                                               store the annotations for the next database
				tmp_prot = ">#{q.query_def}\n#{final_prot}"
				tmp_align = "#{q.query_def}\t#{final_hit.q_seq}\n#{final_hit.acc}#{spnum}\t#{final_hit.s_seq}\n\n"
				tmp_annot = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\t#{final_status}\t\t#{final_hit.e_val}\t#{final_hit.ident}\t#{final_prot.length}\t#{final_hit.full_subject_length}\t#{warnings}\t#{final_hit.q_frame}\t#{final_hit.q_beg.to_i + 1}\t#{final_hit.q_end.to_i + 1}\t#{final_hit.s_beg.to_i + 1}\t#{final_hit.s_end.to_i + 1}\t#{final_hit.definition}\t#{final_prot}"
				# fallback description (status "Misassembled") read later by error_log and analiza_orf_y_fl
				seq.sec_desc = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\tMisassembled\t\t#{final_hit.e_val}\t#{final_hit.ident}\t\t#{final_hit.full_subject_length}\t#{warnings}\t\t\t\t\t\t#{final_hit.definition}\t"
				# message layout: [row, protein text, alignment text, [q, final_hit, final_prot, query_fasta, final_status]]
				seq.annotate(:tmp_annotation,[tmp_annot, tmp_prot,tmp_align,[q, final_hit, final_prot, query_fasta, final_status]])
				
				# puts "\n\n\n-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.---#{q.query_def}\t#{final_status}\n#{tmp_prot}"
				# puts "#{db_name} -- #{q.query_def} --> print_annotations: cargamos anotaciones para utilizarlas en la siguiente BD"
			end
		end
	end
end

#print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) ⇒ `Object`

# File 'lib/full_lengther_next/classes/fl_analysis.rb', line 543

# Builds a copy of the nucleotide sequence in which the start codon is preceded
# by the marker '_-_' and the stop codon is followed by the marker '___', and
# records it on +seq+ as a :nucleotide annotation together with a diagnostic
# text (e.g. "NO ATG", "NO STOP", "NO STOP exacto").
#
# The ATG is searched in an 11-nt window around final_hit.q_beg, only for
# statuses that claim a 5' end. The stop codon is read around final_hit.q_end,
# only for statuses that claim a 3' end; when it is not found there, a second
# attempt is made one codon downstream and, if that succeeds, final_hit.q_end
# is advanced by 3 (side effect on the hit object).
#
# seq          - sequence object that receives the :nucleotide annotation
# q            - BLAST query object (query_def)
# final_hit    - hit object (q_beg, q_end; q_end may be modified)
# final_prot   - protein predicted for the sequence, compared with the marked ORF
# query_fasta  - nucleotide sequence
# final_status - status label as produced by determine_status
#
# NOTE(review): my_atg_index is computed but never read in this method.
def print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status)
	
	bad_atg = false
#-------------------------------------------------------------------------------------------------------------  ATG

	if (final_status == 'Complete') || (final_status == 'Putative Complete') || (final_status == 'Putative N-terminus') || (final_status == 'N-terminus')
		# puts "entra aqui, final_status: #{final_status}"
		# 11-nt window centred on q_beg
		my_seq_n = query_fasta[final_hit.q_beg - 5..final_hit.q_beg + 5]

		beg5 = false
		# -------------------------------------   near the 5' edge (q_beg < 6) the window is the first 11 nt instead
		if (final_hit.q_beg < 6)
			my_seq_n = query_fasta[0..10]
			beg5 = true
			# puts "empieza en el borde de la seq"
		end

		# first and last ATG in the window; they coincide when there is exactly one
		atg_found = my_seq_n.index(/ATG/i)
		atg_found_rv = my_seq_n.rindex(/ATG/i)
		my_atg_index = nil
	end

	# atg_found is nil both when no ATG was found and when the status does not claim a 5' end
	if (!atg_found.nil?)
		if (beg5)

			my_seq_n.sub!(/ATG/i,'_-_ATG')
			my_atg_index = atg_found
			my_seq = my_seq_n + query_fasta[11..query_fasta.length + 1]

		elsif (atg_found == atg_found_rv)

			# a single ATG in the window: mark it
			my_seq_n.sub!(/ATG/i,'_-_ATG')
			my_atg_index = final_hit.q_beg - 5 + atg_found
			
			my_seq = query_fasta[0..final_hit.q_beg - 6] + my_seq_n + query_fasta[final_hit.q_beg + 6..query_fasta.length + 1]
			
			# puts "my_seq despues de encontrar el atg: #{my_seq}"
		elsif (atg_found == 5) || (atg_found_rv == 5)

			# several ATGs, one of them exactly at q_beg (window offset 5): mark that one
			my_seq_n = my_seq_n[0..4]+'_-_'+my_seq_n[5..10]
			my_atg_index = final_hit.q_beg - 5 + atg_found
			my_seq = query_fasta[0..final_hit.q_beg - 6] + my_seq_n + query_fasta[final_hit.q_beg + 6..query_fasta.length + 1]

		else

			# several ATGs and none at q_beg: leave the sequence unmarked
			# puts "#{q.query_def}  tiene mas de un ATG  my_seq_n: #{my_seq_n}"
			bad_atg = true
			my_seq = query_fasta
		end

	else

		bad_atg = true
		# puts "#{q.query_def}  NO TIENE ATG  my_seq_n: #{my_seq_n}"
		my_seq = query_fasta

	end
#-------------------------------------------------------------------------------------------------------------  STOP
stop_c = nil
	if (final_status == 'Complete') || (final_status == 'Putative Complete') || (final_status == 'C-terminus') || (final_status == 'Putative C-terminus')

		# with a marked ATG my_seq is 3 characters longer ('_-_'), hence the +3 shift of every index
		if (bad_atg == true)
			stop_c = my_seq[final_hit.q_end - 2..final_hit.q_end]
			stop_c_longer = my_seq[final_hit.q_end - 7..final_hit.q_end + 5]
		else
			stop_c = my_seq[final_hit.q_end + 3..final_hit.q_end + 5]
			stop_c_longer = my_seq[final_hit.q_end - 2..final_hit.q_end + 10]
		end

	end

	if (!stop_c.nil?)
		# puts stop_c
		# puts stop_c_longer
		if (stop_c.translate == '*')

			if (bad_atg == true)
				my_seq = my_seq[0..final_hit.q_end] +'___'+ my_seq[final_hit.q_end + 1..my_seq.length + 1]
				seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG\t\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
			else

				my_seq = my_seq[0..final_hit.q_end + 5] +'___'+ my_seq[final_hit.q_end + 6..my_seq.length + 1]
				# cut out what lies between the two markers and translate it
				my_prot = my_seq.sub(/\w+_\-_/,'')
				my_prot = my_prot.sub(/___\w+/,'')
				my_prot = my_prot.translate
				my_prot = my_prot.sub(/x$/,'')

				# the marked ORF must reproduce final_prot exactly (lcs is provided elsewhere;
				# presumably the longest common substring - confirm)
				simliar_fragment = final_prot.lcs(my_prot)

				if (simliar_fragment.length == final_prot.length) && (simliar_fragment.length == my_prot.length)
					seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\t\t\t\t\t\t#{my_seq}")
				else
					seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tthe nucleotide sequence contain a lot of errors\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
					# puts "nt seq: was no possible to find stop codon, the nucleotide sequence contain a lot of errors"
				end

			end

		else
			# no stop at q_end: for the non-'Complete' statuses retry one codon (3 nt) downstream
			if (final_status == 'Putative Complete') || (final_status == 'C-terminus') || (final_status == 'Putative C-terminus')

				if (bad_atg == true)
					stop_c = my_seq[final_hit.q_end+1..final_hit.q_end+3]
					stop_c_longer = my_seq[final_hit.q_end - 4..final_hit.q_end + 8]
				else
					stop_c = my_seq[final_hit.q_end + 7..final_hit.q_end + 9]
					stop_c_longer = my_seq[final_hit.q_end..final_hit.q_end + 13]
				end
				
				if (!stop_c.nil?)
					if (stop_c.translate == '*')
						# stop found one codon later: move q_end onto it
						final_hit.q_end = final_hit.q_end + 3
						if (bad_atg == true)
							my_seq = my_seq[0..final_hit.q_end] +'___'+ my_seq[final_hit.q_end + 1..my_seq.length + 1]
							seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG\t\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
						else
							# NOTE(review): here the stop WAS found, yet the text says "NO STOP exacto" and no '___' marker is inserted - confirm this is intended
							seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
						end
					else
						if (bad_atg == true)
							seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG NO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
							# puts "find nt end: NO ATG, NO exact STOP"
						else
							seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}")
							# puts "find nt end: GOOD ATG, NO exact STOP"
						end
					end
				end
			end
			

		end

	else

		# the status does not claim a 3' end (or the slice fell outside the sequence)
		if (bad_atg == true)
			seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG NO STOP\t\t\t\t\t#{my_seq}")
		else
			seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP\t\t\t\t\t#{my_seq}")
		end

	end
	
end

#save_last_db_annotations(seq) ⇒ `Object`

# File 'lib/full_lengther_next/classes/fl_analysis.rb', line 275

# Re-emits the annotations a previous database stored in :tmp_annotation.
#
# The first stored annotation carries a message with four slots:
#   [0] tab-separated row, [1] protein text, [2] alignment text,
#   [3] [q, final_hit, final_prot, query_fasta, final_status]
# Slot 3 feeds print_nt_seqs; slots 1 and 2 are annotated as :protein and
# :alignment; the row is rebuilt (testcode column emptied, coordinates
# converted with reverse_seq when the hit is reversed) and written back as
# :tmp_annotation with the extra `true` argument.
def save_last_db_annotations(seq)
	# every read goes through seq, exactly as many times as the data is needed
	stored = lambda { seq.get_annotations(:tmp_annotation).first[:message] }

	# puts "sequence not complete! recovering annotations from previous database! sldba!!"
	saved_q, saved_hit, saved_prot, saved_fasta, saved_status = stored.call[3]
	print_nt_seqs(seq, saved_q, saved_hit, saved_prot, saved_fasta, saved_status) # marks the ATG (_-_) and the STOP (___)

	# columns: 0 name, 1 fasta_length, 2 acc, 3 db_name, 4 final_status, 5 testcode, 6 e_val, 7 ident,
	# 8 my_length, 9 subject_length, 10 warnings, 11 q_frame, 12 q_beg, 13 q_end, 14 s_beg, 15 s_end,
	# 16 description, 17 final_prot
	fields = stored.call[0].split("\t")
	cols = Array.new(18) { |i| fields[i] }

	if saved_hit.reversed
		flipped = reverse_seq(saved_fasta, cols[11].to_i, cols[12].to_i, cols[13].to_i)
		# reverse_seq's 2nd, 3rd and 4th values go to q_frame, q_end and q_beg, in that order
		cols[11] = flipped[1]
		cols[13] = flipped[2]
		cols[12] = flipped[3]
	end

	seq.annotate(:protein, stored.call[1])
	seq.annotate(:alignment, stored.call[2])

	cols[5] = '' # the testcode column is always written empty
	seq.annotate(:tmp_annotation,[cols.join("\t"), '','',''],true)

end

#test_blast_hits(q) ⇒ `Object`

# File 'lib/full_lengther_next/classes/fl_analysis.rb', line 188

# Debug helper: prints the query header followed by one line per field for every
# hit of +q+ to standard output. Coordinates (q_beg, q_end, s_beg, s_end) are
# shown with 1 added. The "subject_id" line prints the accession (h.acc), just
# like the "acc" line.
def test_blast_hits(q)

		puts "query_def: #{q.query_def} full_query_length: #{q.full_query_length} ------------------------------------------------"

		# label => getter, evaluated lazily so each value is read right before it is printed
		fields = [
			['subject_id', lambda { |h| h.acc }],
			['acc', lambda { |h| h.acc }],
			['full_subject_length', lambda { |h| h.full_subject_length }],
			['q_beg', lambda { |h| h.q_beg + 1 }],
			['q_end', lambda { |h| h.q_end + 1 }],
			['q_frame', lambda { |h| h.q_frame }],
			['s_beg', lambda { |h| h.s_beg + 1 }],
			['s_end', lambda { |h| h.s_end + 1 }],
			['s_frame', lambda { |h| h.s_frame }],
			['align_len', lambda { |h| h.align_len }],
			['gaps', lambda { |h| h.gaps }],
			['mismatches', lambda { |h| h.mismatches }],
			['reversed', lambda { |h| h.reversed }],
			['score', lambda { |h| h.score }],
			['bit_score', lambda { |h| h.bit_score }],
			['ident', lambda { |h| h.ident }],
			['e_val', lambda { |h| h.e_val }],
			['definition', lambda { |h| h.definition }],
			['q_seq', lambda { |h| h.q_seq }],
			['s_seq', lambda { |h| h.s_seq }]
		]

		q.hits.each do |hit|
			fields.each do |label, getter|
				puts "\t #{label}: #{getter.call(hit)}"
			end
		end

end

#test_final_hit(final_hit, query_fasta) ⇒ `Object`

# File 'lib/full_lengther_next/classes/fl_analysis.rb', line 219

# Debug helper: prints the main fields of +final_hit+ to standard output,
# followed by the nucleotides of +query_fasta+ between q_beg and q_end
# (inclusive) and their translation. Coordinates are shown with 1 added.
def test_final_hit(final_hit, query_fasta)

	# each entry is rendered only when its turn to be printed comes
	report = [
		lambda { "\t acc: #{final_hit.acc}" },
		lambda { "\t full_subject_length: #{final_hit.full_subject_length}" },
		lambda { "\n\t q_frame: #{final_hit.q_frame}" },
		lambda { "\t reversed: #{final_hit.reversed}" },
		lambda { "\n\t q_beg-q_end: #{final_hit.q_beg + 1} - #{final_hit.q_end + 1}" },
		lambda { "\t s_beg - s_end: #{final_hit.s_beg + 1} - #{final_hit.s_end + 1}" },
		lambda { "\n\t score: #{final_hit.score}, bit_score: #{final_hit.bit_score}, ident: #{final_hit.ident}, e_val: #{final_hit.e_val}" },
		lambda { "\n\t definition: #{final_hit.definition}" },
		lambda { "\t q_seq: #{final_hit.q_seq}" },
		lambda { "\t s_seq: #{final_hit.s_seq}" },
		lambda { "\nnt q_beg-q_end\n#{query_fasta[final_hit.q_beg..final_hit.q_end]}" },
		lambda { "\n\nprot q_beg-q_end\n#{query_fasta[final_hit.q_beg..final_hit.q_end].translate}" }
	]

	report.each { |line| puts line.call }
	nil

end

Module: FlAnalysis

Instance Method Summary collapse

Instance Method Details

#analiza_orf_y_fl(seq, blast_query, options, db_name) ⇒ Object

#determine_status(atg_status, end_status) ⇒ Object

#error_log(q, seq, warnings, db_name) ⇒ Object

#find_end(final_hit, q, full_prot, tmp_prot, end_status, warnings, aas_n_end) ⇒ Object

#find_start(subject_start, substring, fiable, aas_n_end) ⇒ Object

#print_annotations(seq, q, final_hit, final_status, final_prot, warnings, query_fasta, db_name) ⇒ Object

#print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) ⇒ Object

#save_last_db_annotations(seq) ⇒ Object

#test_blast_hits(q) ⇒ Object

#test_final_hit(final_hit, query_fasta) ⇒ Object

#analiza_orf_y_fl(seq, blast_query, options, db_name) ⇒ `Object`

#determine_status(atg_status, end_status) ⇒ `Object`

#error_log(q, seq, warnings, db_name) ⇒ `Object`

#find_end(final_hit, q, full_prot, tmp_prot, end_status, warnings, aas_n_end) ⇒ `Object`

#find_start(subject_start, substring, fiable, aas_n_end) ⇒ `Object`

#print_annotations(seq, q, final_hit, final_status, final_prot, warnings, query_fasta, db_name) ⇒ `Object`

#print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) ⇒ `Object`

#save_last_db_annotations(seq) ⇒ `Object`

#test_blast_hits(q) ⇒ `Object`

#test_final_hit(final_hit, query_fasta) ⇒ `Object`