Top Level Namespace

Includes:: ChimericSeqs, CommonFunctions, FlAnalysis, FlnStats, NcRna

Defined Under Namespace

Modules: Bio, ChimericSeqs, CommonFunctions, FlAnalysis, FlnStats, FullLengtherNext, NcRna, ScbiMapreduce Classes: Cdhit, ExoBlastHit, ExonerateResult, Mapping, Mpileup, MyWorker, MyWorkerEst, MyWorkerManagerEst, MyWorkerManagerFln, Orf, Seq, Sequence, String, TestCode, UneLosHit

Constant Summary collapse

FAILED =

-5

OTHER =

-4

UNMAPPED =

-3

CHIMERA =

-2

MISASSEMBLED =

-1

UNKNOWN =

COMPLETE =

N_TERMINAL =

C_TERMINAL =

INTERNAL =

NCRNA =

CODING =

OPERATION =

QUERY =

TARGET =

Constants included from FlnStats

FlnStats::REPORT_FOLDER

Constants included from ChimericSeqs

ChimericSeqs::BEG, ChimericSeqs::HIT, ChimericSeqs::STOP

Instance Method Summary collapse

Instance Method Details

#analysis_over_DB_annotated_seqs(seqs_annotation_DB, reptrans_fasta, cluster_file_path, stats_hash, key_stats, pfam_clustering) ⇒ `Object`

# File 'lib/full_lengther_next/reptrans.rb', line 67

def analysis_over_DB_annotated_seqs(seqs_annotation_DB, reptrans_fasta, cluster_file_path, stats_hash, key_stats, pfam_clustering)
	clusters_seqs_annot_DB = clustering_by_id(seqs_annotation_DB)
	representative_seqs_annot_DB = select_representative(clusters_seqs_annot_DB)
	if pfam_clustering
		clusters_seqs_annot_DB = clustering_by_annot(representative_seqs_annot_DB, :pfam_id) # pfam id, fix get the annotation guide on my_worker_manager_fln (@@func_annot_type) to this scope
		representative_seqs_annot_DB = select_representative(clusters_seqs_annot_DB) # merge clusters by id and by pfam
	end
	stats_hash[key_stats] += representative_seqs_annot_DB.length
	report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB)
	write_fasta(representative_seqs_annot_DB, reptrans_fasta, 'w')
end

#artifact?(seq, query, db_name, db_path, options, new_seqs) ⇒ `Boolean`

MAIN FUNCTION

Returns:

(Boolean)

# File 'lib/full_lengther_next/artifacts.rb', line 9

def artifact?(seq, query, db_name, db_path, options, new_seqs)
	artifact = false
	# UNMAPPED CONTIG DETECTION
	if query.nil? && seq.unmapped? #If seq is misassembled stop chimera analisys
		seq.hit = nil
		artifact = true
		seq.type = UNMAPPED
	end

	if !query.nil?
		# MISASSEMBLED DETECTION
		if !artifact && misassembled_detection(query) #If seq is misassembled stop chimera analisys
			seq.hit = query.hits.first
			artifact = true
			seq.type = MISASSEMBLED
			seq.warnings('ERROR#1')
		end

		# OVERLAPPING HSPS ON SUBJECT DETECTION
=begin
		if !artifact
			hit_reference = query.hits.first.dup
			query, overlapping = overlapping_hsps_on_subject(query)
			if overlapping
				if query.hits.first.nil?
					seq.hit = hit_reference
				else
					seq.hit = query.hits.first
				end
				artifact = true
				seq.type = OTHER
				seq.warnings('ERROR#2')
			end
		end
=end

		# MULTIPLE HSP DETECTION
		if !artifact && multiple_hsps(query, 3)   
			seq.hit = query.hits.first
			seq.warnings('ERROR#3')
		end

		# CHIMERA DETECTION
		if !artifact && !options[:chimera].include?('d')  
			chimera = search_chimeras(seq, query, options, db_name, db_path)			
			if !chimera.nil?   
				new_seqs.concat(chimera)
				seq.db_name = db_name
				seq.type = CHIMERA
				artifact = true
			end
		end
	end
	if artifact
		if $verbose > 1
			puts seq.prot_annot_calification
		end
		seq.db_name = db_name
		seq.save_fasta = false
		seq.ignore = true
	end
	return artifact
end

#clean_by_identity(blast_result, ident) ⇒ `Object`

# File 'lib/full_lengther_next/blast_functions.rb', line 137

def clean_by_identity(blast_result, ident)
	blast_result.querys.each do |query|
		if !query.hits.first.nil?
			new_hits = query.hits.select{|hit| hit.ident > ident}	
			new_hits = [nil] if new_hits.empty? #When no hit, set new_hits to [nil]
			query.hits = new_hits
		end
		query.full_query_length = query.full_query_length.to_i #to_i is used to correct a scbi_blast's bug. Returns this attribute like string instead integer
	end
end

#clean_by_query_length_match(blast_result, min_len_nt) ⇒ `Object`

# File 'lib/full_lengther_next/blast_functions.rb', line 148

def clean_by_query_length_match(blast_result, min_len_nt)
	blast_result.querys.each do |query|
		if !query.hits.first.nil?
			new_hits = query.hits.select{|hit| hit.align_len * 3 > min_len_nt}	
			new_hits = [nil] if new_hits.empty? #When no hit, set new_hits to [nil]
			query.hits = new_hits
		end
		query.full_query_length = query.full_query_length.to_i #to_i is used to correct a scbi_blast's bug. Returns this attribute like string instead integer

	end
end

#clean_hsp_by_identity(hit, identity) ⇒ `Object`

# File 'lib/full_lengther_next/blast_functions.rb', line 298

def clean_hsp_by_identity(hit, identity)
	hit.select!{|hsp| hsp.ident >= identity}
	return hit
end

#clean_overlapping_hsps(blast_result, keep_if_diff_sense = false) ⇒ `Object`

# File 'lib/full_lengther_next/blast_functions.rb', line 161

def clean_overlapping_hsps(blast_result, keep_if_diff_sense = false)
	blast_result.querys.each do |query|
		if query.hits.length > 1
			query.hits.each_with_index do |hit, j|
				if hit.nil?
					next
				end
				query.hits.each_with_index do |second_hit, i|
					if second_hit.nil? || i == j #Same hit
						next
					end
					if same_query_hsp(hit, second_hit) #|| same_subject_hsp(hit, second_hit)
						if keep_if_diff_sense 
							if same_sense?(hit, second_hit) #Delete second_hit if is into the hit and has same sense 
								query.hits[i] = nil
							end
						else
							query.hits[i] = nil
						end
					end
				end
			end
			query.hits.compact!
		end
	end
end

#clean_subject_overlapping_hsps(complete_hit, cleaned_hits) ⇒ `Object`

COMPLEMENTARY FUNCTIONS

# File 'lib/full_lengther_next/blast_functions.rb', line 246

def clean_subject_overlapping_hsps(complete_hit, cleaned_hits)
	if complete_hit.length > 1
		complete_hit, overlapping = subject_overlapping_hsps(complete_hit)
	end
	cleaned_hits.concat(complete_hit)
	return complete_hit, overlapping
end

#cluster_hsps(hsps) ⇒ `Object`

# File 'lib/full_lengther_next/blast_functions.rb', line 303

def cluster_hsps(hsps)
	hits = []
	last_acc = ''
	hsps.each do |hsp|
		if hsp.acc != last_acc
			hits << [hsp]
		else
			hits.last << hsp
		end
		last_acc = hsp.acc
	end
	return hits
end

#clustering_by_annot(seqs_with_hit, annotation_type) ⇒ `Object`

# File 'lib/full_lengther_next/reptrans.rb', line 165

def clustering_by_annot(seqs_with_hit, annotation_type)
	clusters = []
	annot_id = []
	no_annotation_clusters = []
	seqs_with_hit.each do |seq|
		annot = seq.functional_annotations[annotation_type]
		annot = annot.split(';').sort.join(';') if !annot.nil?
		if annot == '-' || annot.nil?
			no_annotation_clusters << [seq]
		else
			position = annot_id.index(annot)
			if position.nil?
				annot_id << annot
				clusters << [seq]
			else
				clusters[position] << seq 
			end
		end
	end
	clusters.concat(no_annotation_clusters)
	return clusters
end

#clustering_by_id(seqs_with_hit) ⇒ `Object`

# File 'lib/full_lengther_next/reptrans.rb', line 150

def clustering_by_id(seqs_with_hit)
	clusters=[]
	hit_id=[]
	seqs_with_hit.each do |seq|
		position=hit_id.index(seq.get_acc)
		if position.nil?
			hit_id << seq.get_acc
			clusters << [seq]
		else
			clusters[position] << seq 
		end
	end	
	return clusters
end

#count_cpu(options) ⇒ `Object`

# File 'lib/full_lengther_next/reptrans.rb', line 206

def count_cpu(options)
	cpu = 0
	if options[:workers].class.to_s == 'Array'
		cpu = options[:workers].length + 1
	else
		cpu = options[:workers]
	end
	return cpu
end

#do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash) ⇒ `Object`

Second server to representative transcriptome

# File 'lib/full_lengther_next/reptrans.rb', line 101

def do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash) # Second server to representative transcriptome
	$LOG.info 'Starting server for EST analysis'
		custom_worker_file = File.join(File.dirname(ROOT_PATH),'lib','full_lengther_next','classes','my_worker_EST.rb')
		options[:chimera] = nil #Inactive chimeras system on RepTrans, this resume the BLAST's output
	
		MyWorkerManagerEst.init_work_manager(putative_seqs, options, blast_path)
		server_EST = ScbiMapreduce::Manager.new(options[:server_ip], options[:port], options[:workers], MyWorkerManagerEst, custom_worker_file, STDOUT, FULL_LENGTHER_NEXT_INIT)
		server_EST.chunk_size = options[:chunk_size]
		server_EST.start_server
	$LOG.info 'Closing server for EST analysis'

	seqs_with_EST, putative_seqs = MyWorkerManagerEst.get_array_seqs
	if !seqs_with_EST.empty?
		analysis_over_DB_annotated_seqs(seqs_with_EST, reptrans_fasta, cluster_EST_annotated_path, stats_hash, 'est_annotated') 
	end
	return putative_seqs
end

#do_makeblastdb(seqs, output, dbtype) ⇒ `Object`

# File 'lib/full_lengther_next/handle_db.rb', line 35

def do_makeblastdb(seqs, output, dbtype)
	cmd="makeblastdb -in - -out #{output} -title #{File.basename(output)} -dbtype #{dbtype} -parse_seqids"
	IO.popen(cmd,'w+') {|makedb|
		makedb.sync = true
		makedb.write(seqs)
		makedb.close_write
		puts makedb.readlines
		makedb.close_read
	}
end

#filter_hits(query, select_hits = 10) ⇒ `Object`

Select best hits

# File 'lib/full_lengther_next/blast_functions.rb', line 4

def filter_hits(query, select_hits=10) # Select best hits
	hits = query.hits
	if !hits.first.nil?
		hits = cluster_hsps(hits)
		hits = hits[0..select_hits]
		hits = select_hits_by_identity_query(hits)
		hits = select_hits_by_coverage_subject(hits)
	end
	if hits.empty? 
		if select_hits >= query.hits.length || select_hits >= 100 # Condition to stop a infinite recursive function
			hits = [cluster_hsps(query.hits).first]
		else
			hits = filter_hits(query, select_hits+10)
		end
	end
	return hits
end

#find_hit(hit_acc, ar_hits) ⇒ `Object`

# File 'lib/full_lengther_next/blast_functions.rb', line 317

def find_hit(hit_acc, ar_hits)
	selected_hit = nil
	ar_hits.each do |hit|
		if hit.first.acc == hit_acc
			selected_hit = hit
			break
		end
	end
	return selected_hit
end

#get_coverage_subject(hit) ⇒ `Object`

# File 'lib/full_lengther_next/blast_functions.rb', line 22

def get_coverage_subject(hit)
	perc_identity = hit.align_len*100.0/hit.s_len
	if perc_identity > 100 && hit.class.to_s == 'ExoBlastHit' && !hit.q_frameshift.empty?
		hit.q_frameshift.length.times do |n| #Align len correction by frameshift. FS can create a gap in alignment adding extra aa. FS can be deletions or insertions so we check until get a perc_identity of 100
			align_len = hit.align_len- (n + 1)
			perc_identity = align_len*100.0/hit.s_len
			break if perc_identity <= 100
		end
	end 
	return perc_identity
end

#go_for_graph(sequences_by_ontologies, fpkm = {}) ⇒ `Object`

# File 'lib/full_lengther_next/go_methods.rb', line 1

def go_for_graph(sequences_by_ontologies, fpkm = {})
	container = {}
	go_data = [
		[:function_go, 'F:'],
		[:component_go, 'C:'],
		[:process_go, 'P:']
	]

	go_data.each do |key, prefix|		
		go_ontology = sequences_by_ontologies.select{|go, seq_ids| go =~ /^#{prefix}/}
		go_names = [] 
		go_vals = []
		go_ontology.each do |go_name, seq_names|
			go_label = go_name.gsub(prefix, '')
			if fpkm.empty?
				go_vals << seq_names.length
				go_names << go_label
			else
				sum = seq_names.map{|seq_name| fpkm[seq_name].first }.inject { |sum, n| sum + n }
				if sum > 0
					go_vals << sum 
					go_names << go_label
				end
			end
		end
		go_table = []
		go_names.each_with_index do |name, index|
			go_table << [name, go_vals[index]]
		end
		go_table.sort!{|v1, v2| v2[1] <=> v1[1]}
		go_table.unshift([key.to_s, 'GO'])
		if !go_names.empty?
			container[key] = go_table 
		else
			container[key] = [
				[key.to_s, 'GO'],
				['No_data', 1]
			] 
		end
	end
	return container
end

#hsps_relationship_subject(hit) ⇒ `Object`

# File 'lib/full_lengther_next/blast_functions.rb', line 271

def hsps_relationship_subject(hit)
	hsps = []
	hit.each_with_index do |hsp, j|
		hit.each_with_index do |second_hsp, i|
			if i == j #Same hit
				next
			end
			if same_subject_hsp(hsp, second_hsp)
				if !hsps.include?([hsp, second_hsp]) && !hsps.include?([second_hsp, hsp]) # Save if no exists direct relationship or his inverse
					hsps << [hsp, second_hsp]
				end
			end
		end
	end
	return hsps
end

#load_cd_hit_sequences_names(file) ⇒ `Object`

# File 'lib/full_lengther_next/reptrans.rb', line 120

def load_cd_hit_sequences_names(file)
	names=[]
	File.open(file).readlines.each do |line|
		if line =~ /^>/
			line.chomp!
			line.gsub!('>','')
			names << line
		end	
	end
	return names
end

#load_isoform_hash(file) ⇒ `Object`

# File 'lib/full_lengther_next/handle_db.rb', line 3

def load_isoform_hash(file)
	isoform_hash = {}
	if File.exists?(file)
		fasta = ScbiZcatFile.new(file)
	    filtered_fasta = ''
		seq_name = nil
		seq = ''
		while !fasta.eof
		        line = fasta.readline.chomp
		        if line[0] == '>'		                
						load_seq_in_hash(seq_name, seq, isoform_hash) if !seq_name.nil?
		                seq_name = line
		                seq = ''
		        else
		                seq << line
		        end
		end
		load_seq_in_hash(seq_name, seq, isoform_hash)
	end
	return isoform_hash
end

#load_seq_in_hash(seq_name, seq, isoform_hash) ⇒ `Object`

# File 'lib/full_lengther_next/handle_db.rb', line 25

def load_seq_in_hash(seq_name, seq, isoform_hash)
	name, desc = seq_name.split(' ', 2) 
	name =~ /(\w+\|(\w+)\-\d+\|)/
	if isoform_hash[$2].nil?
		isoform_hash[$2] = ">#{$1}#{desc}\n#{seq}"
	else
		isoform_hash[$2] += "\n>#{$1}#{desc}\n#{seq}"
	end
end

#misassembled_detection(query) ⇒ `Object`

DETECTION FUNCTIONS

# File 'lib/full_lengther_next/blast_functions.rb', line 192

def misassembled_detection(query)
	miss=false
	hits = cluster_hsps(query.hits)
	misassembled_hits = []
	hits.each do |hit|
		if hit.length > 1
			negative_frame = hit.select{|hsp| hsp.q_frame < 0}
			if negative_frame.length > 0 && negative_frame.length != hit.length
				misassembled_hits << hit.first.acc
			end
		end
	end
	if misassembled_hits.length*1.0/ hits.length > 0.5
		miss = true
	else #Remove missassembled hits to avoid broken analysis
		query.hits.reverse_each do |hsp|
			if misassembled_hits.include?(hsp.acc)
				query.hits.delete(hsp)
			end
		end
	end
	return miss
end

#multiple_hsps(query, num) ⇒ `Object`

# File 'lib/full_lengther_next/blast_functions.rb', line 216

def multiple_hsps(query, num)
	multiple = false
	hsps = query.hits.select{|h| h.acc == query.hits.first.acc}	
	if hsps.length >= num
		multiple = true
	end
	return multiple
end

#overlapping_hsps_on_subject(query) ⇒ `Object`

# File 'lib/full_lengther_next/blast_functions.rb', line 225

def overlapping_hsps_on_subject(query)
	overlapping = false
	current_hit = query.hits.first.acc
	complete_hit = []
	cleaned_hits = []
	query.hits.each do |hit|
		if hit.acc != current_hit
			complete_hit, overlapping = clean_subject_overlapping_hsps(complete_hit, cleaned_hits)
			complete_hit = []
		end
		complete_hit << hit
		current_hit = hit.acc
	end
	complete_hit, overlapping = clean_subject_overlapping_hsps(complete_hit, cleaned_hits)
	query.hits = cleaned_hits
	return query, overlapping
end

#reduce_pool_sequences(putative_seqs, main_path, cpu) ⇒ `Object`

# File 'lib/full_lengther_next/reptrans.rb', line 88

def reduce_pool_sequences(putative_seqs, main_path, cpu)
	temp_fasta = File.join(main_path, 'temp.fasta')
	temp_fasta_clean = File.join(main_path, 'temp_cln.fasta')
	log_file = File.join(main_path, 'log_cd_hit_Cod_Unk')
	write_fasta(putative_seqs, temp_fasta, 'w')
	$LOG.info "Start cd-hit with coding and unknow sequences"	
	system("cd-hit -i #{temp_fasta} -o #{temp_fasta_clean} -c 0.95 -M 0 -T #{cpu} > #{log_file}") if !File.exists?(temp_fasta_clean)
	$LOG.info "Ended cd-hit with coding and unknow sequences"	
	cd_hit_names_putative_seqs = load_cd_hit_sequences_names(temp_fasta_clean)
	putative_seqs = select_seqs_with_name(putative_seqs, cd_hit_names_putative_seqs)
	return putative_seqs
end

#report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB) ⇒ `Object`

# File 'lib/full_lengther_next/reptrans.rb', line 79

def	report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB)
	cluster_file = File.open(cluster_file_path, 'w')
	representative_seqs_annot_DB.each_with_index do |rep_seq, i|
		cluster_seqs = clusters_seqs_annot_DB[i].map{|seq| seq.seq_name}.join(';')
		cluster_file.puts "#{rep_seq.seq_name}\t#{cluster_seqs}"
	end
	cluster_file.close
end

#reptrans(seqs_annotation_prot, seqs_some_coding, seqs_unknown, options) ⇒ `Object`

MAIN FUNCTION

# File 'lib/full_lengther_next/reptrans.rb', line 9

def reptrans(seqs_annotation_prot, seqs_some_coding ,seqs_unknown, options)
	cpus = count_cpu(options)
	stats_hash = initialize_stats_hash_reptrans
	# Paths
	#---------------------------------------------	
	main_path = File.join(Dir.pwd, 'fln_results')
	reptrans_fasta = File.join(main_path, 'Representative_transcriptome.fasta')
	blast_path = File.join(main_path, 'ESTdb')
	cluster_prot_annotated_path =File.join(main_path, 'Prot_clusters')
	cluster_EST_annotated_path =File.join(main_path, 'EST_clusters')
	html_file = File.join(main_path, 'Representative_transcriptome_stats.html')
	txt_file = File.join(main_path, 'Representative_transcriptome_stats.txt')

	# Prot annotations sequence analysis
	#---------------------------------------------
	analysis_over_DB_annotated_seqs(seqs_annotation_prot, reptrans_fasta, cluster_prot_annotated_path, stats_hash, 'prot_annotated', options[:high_clustering])
	seqs_annotation_prot = nil

	# NOT Prot annotations sequence analysis
	#---------------------------------------------
	putative_seqs = seqs_some_coding	
	if !options[:est_db].nil? # WITH EST DATABASE
		putative_seqs += seqs_unknown # Coding & unknown 
		putative_seqs = reduce_pool_sequences(putative_seqs, main_path, cpus)
		if !File.exists?(blast_path +'.nsq')
			$LOG.info "Start makeblastdb over EST DB"
			system("makeblastdb -in #{options[:est_db]} -out #{blast_path} -dbtype nucl -parse_seqids > #{File.join(main_path, 'log_makeblast_db')}")
			$LOG.info "Ended makeblastdb over EST DB"
		end
		putative_seqs = do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash)
	end

	# Coding sequence analysis
	#---------------------------------------------
	if !putative_seqs.nil? && !putative_seqs.empty?
		putative_seqs = select_seqs_more_500pb(putative_seqs)
		putative_seqs = reduce_pool_sequences(putative_seqs, main_path, cpus) if options[:est_db].nil? # NOT EST database
		putative_seqs.sort!{|s1, s2| #Order by testcode (first) and sequence length (last)
			if s2.t_code == s1.t_code
				s2.fasta_length <=> s1.fasta_length
			else
				s2.t_code <=> s1.t_code
			end
		}
		count = 0
		putative_seqs.each do |coding_seq|
			coding_stats_reptrans(coding_seq, stats_hash)
			count +=1
		end
 
		write_fasta(putative_seqs, reptrans_fasta, 'a')
	end
	write_reptrans_stats(stats_hash, html_file, txt_file)
end

#same_query_hsp(hit, second_hit) ⇒ `Object`

# File 'lib/full_lengther_next/blast_functions.rb', line 117

def same_query_hsp(hit, second_hit)
	same = false
	if hit.acc == second_hit.acc
		if hit.q_beg <= second_hit.q_beg && hit.q_end >= hit.q_end && (second_hit.q_beg - hit.q_end).abs > 1
			same = true
		end
	end
	return same		
end

#same_sense?(hit, second_hit) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/full_lengther_next/blast_functions.rb', line 127

def same_sense?(hit, second_hit)
	same= false
	hit_sense = hit.q_frame <=> 0
	second_hit_sense = second_hit.q_frame <=> 0
	if hit_sense == second_hit_sense
		same = true
	end
	return same
end

#same_subject_hsp(hit, second_hit) ⇒ `Object`

# File 'lib/full_lengther_next/blast_functions.rb', line 107

def same_subject_hsp(hit, second_hit)
	same = false
	if hit.acc == second_hit.acc
		if hit.s_beg <= second_hit.s_beg && hit.s_end >= hit.s_end && (second_hit.s_beg - hit.s_end).abs > 1
			same = true
		end
	end
	return same
end

#select_hits_by_coverage_subject(hits) ⇒ `Object`

# File 'lib/full_lengther_next/blast_functions.rb', line 34

def select_hits_by_coverage_subject(hits)
	selected_hits = []
	coverage_thresold = get_coverage_subject(hits.first.first)
	coverage_thresold = 100 if coverage_thresold > 100

	hits.map{|hit|
		hit.each do |hsp|
			coverage = get_coverage_subject(hsp)
			if coverage > 100
				next
			end
			if  coverage >= coverage_thresold
				selected_hits << hit
				break
			end
		end	
	}
	return selected_hits
end

#select_hits_by_evalue(hits, evalue) ⇒ `Object`

# File 'lib/full_lengther_next/blast_functions.rb', line 68

def select_hits_by_evalue(hits, evalue)
	selected_hits = []
	hits.map{|hit|
		hit.each do |hsp|
			if hsp.e_val <= evalue 
				selected_hits << hit
			end
		end
	}
	return selected_hits
end

#select_hits_by_identity_query(hits) ⇒ `Object`

# File 'lib/full_lengther_next/blast_functions.rb', line 54

def select_hits_by_identity_query(hits)
	selected_hits = []
	identity = hits.first.first.ident
	hits.map{|hit|
		hit.each do |hsp|
			if hsp.ident >= identity
				selected_hits << hit
				break
			end
		end
	}
	return selected_hits
end

#select_hsps_by_id(hits, selected_ids) ⇒ `Object`

# File 'lib/full_lengther_next/blast_functions.rb', line 80

def select_hsps_by_id(hits, selected_ids)
	selected_hits = []
	hits.map{|hsp|
		if selected_ids.include?(hsp.acc)
			selected_hits << hsp
		end
	}
	return selected_hits
end

#select_representative(clusters_seqs_annot_prot) ⇒ `Object`

# File 'lib/full_lengther_next/reptrans.rb', line 188

def select_representative(clusters_seqs_annot_prot)
	seqs = []	
	clusters_seqs_annot_prot.each do |cluster|
		if !cluster.first.coverage_analysis.empty? # filtering by mapping coverage
			max_transcript_mean_coverage = cluster.map{|seq| seq.coverage_analysis[3] }.max - 0.05 # Relaxed limit of 5%
			cluster.select!{|seq| seq.coverage_analysis[3] >= max_transcript_mean_coverage}
		end
		seq = cluster.select{|s| s.type == COMPLETE}.sort{|fl1, fl2| fl2.seq_fasta <=> fl1.seq_fasta}.first # Take longest full-length, s -> sequence, fl -> full-lentgh
		if seq.nil?
			cluster.sort!{|cl1, cl2| cl2.get_pident <=> cl1.get_pident}
			best_pident = cluster.first.get_pident
			seq = cluster.select{|s| s.get_pident == best_pident}.sort{|s1, s2| s2.seq_fasta <=> s1.seq_fasta}.first
		end
		seqs << seq
	end
	return seqs
end

#select_seqs_more_500pb(seqs_array) ⇒ `Object`

# File 'lib/full_lengther_next/reptrans.rb', line 132

def select_seqs_more_500pb(seqs_array)
	seqs = seqs_array.select{|seq| seq.fasta_length > 500 }	
	return seqs
end

#select_seqs_with_name(array_seqs, array_names) ⇒ `Object`

# File 'lib/full_lengther_next/reptrans.rb', line 137

def select_seqs_with_name(array_seqs, array_names)
	seqs = array_seqs.select{|seq| array_names.include?(seq.seq_name)}
	return seqs
end

#set_thresold_evalue(hits) ⇒ `Object`

# File 'lib/full_lengther_next/blast_functions.rb', line 90

def set_thresold_evalue(hits)
	evalue = 100
	hits.map{|hit|
		if hit.e_val != 0 && hit.e_val < evalue
			evalue = hit.e_val
		end
	}
	if evalue == 100
		evalue = 0
	else
		exp = Math.log10(evalue).abs.to_i
		min_exp = (exp/10.0).ceil
		evalue = 10.0**-(exp-min_exp)
	end
	return evalue
end

#subject_overlapping_hsps(hit) ⇒ `Object`

# File 'lib/full_lengther_next/blast_functions.rb', line 254

def subject_overlapping_hsps(hit)
	overlapping = false
	hsp_table = hsps_relationship_subject(hit)
	if !hsp_table.empty?
		hit = clean_hsp_by_identity(hit, 55)
		if hit.empty?
			overlapping = true
		else 
			hsp_table = hsps_relationship_subject(hit)
			if !hsp_table.empty?
				overlapping = true
			end	
		end
	end
	return hit, overlapping
end

#write_fasta(seqs_array, file_name, mode) ⇒ `Object`

# File 'lib/full_lengther_next/reptrans.rb', line 142

def write_fasta(seqs_array, file_name, mode)
	file=File.open(file_name, mode)
	seqs_array.each do |seq|
		file.puts ">#{seq.seq_name}\n#{seq.seq_fasta}"
	end
	file.close
end

Top Level Namespace

Defined Under Namespace

Constant Summary collapse

Constants included from FlnStats

Constants included from ChimericSeqs

Instance Method Summary collapse

Methods included from FlnStats

Methods included from CommonFunctions

Methods included from NcRna

Methods included from FlAnalysis

Methods included from ChimericSeqs

Instance Method Details

#analysis_over_DB_annotated_seqs(seqs_annotation_DB, reptrans_fasta, cluster_file_path, stats_hash, key_stats, pfam_clustering) ⇒ Object

#artifact?(seq, query, db_name, db_path, options, new_seqs) ⇒ Boolean

#clean_by_identity(blast_result, ident) ⇒ Object

#clean_by_query_length_match(blast_result, min_len_nt) ⇒ Object

#clean_hsp_by_identity(hit, identity) ⇒ Object

#clean_overlapping_hsps(blast_result, keep_if_diff_sense = false) ⇒ Object

#clean_subject_overlapping_hsps(complete_hit, cleaned_hits) ⇒ Object

#cluster_hsps(hsps) ⇒ Object

#clustering_by_annot(seqs_with_hit, annotation_type) ⇒ Object

#clustering_by_id(seqs_with_hit) ⇒ Object

#count_cpu(options) ⇒ Object

#do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash) ⇒ Object

#do_makeblastdb(seqs, output, dbtype) ⇒ Object

#filter_hits(query, select_hits = 10) ⇒ Object

#find_hit(hit_acc, ar_hits) ⇒ Object

#get_coverage_subject(hit) ⇒ Object

#go_for_graph(sequences_by_ontologies, fpkm = {}) ⇒ Object

#hsps_relationship_subject(hit) ⇒ Object

#load_cd_hit_sequences_names(file) ⇒ Object

#load_isoform_hash(file) ⇒ Object

#load_seq_in_hash(seq_name, seq, isoform_hash) ⇒ Object

#misassembled_detection(query) ⇒ Object

#multiple_hsps(query, num) ⇒ Object

#overlapping_hsps_on_subject(query) ⇒ Object

#reduce_pool_sequences(putative_seqs, main_path, cpu) ⇒ Object

#report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB) ⇒ Object

#reptrans(seqs_annotation_prot, seqs_some_coding, seqs_unknown, options) ⇒ Object

#same_query_hsp(hit, second_hit) ⇒ Object

#same_sense?(hit, second_hit) ⇒ Boolean

#same_subject_hsp(hit, second_hit) ⇒ Object

#select_hits_by_coverage_subject(hits) ⇒ Object

#select_hits_by_evalue(hits, evalue) ⇒ Object

#select_hits_by_identity_query(hits) ⇒ Object

#select_hsps_by_id(hits, selected_ids) ⇒ Object

#select_representative(clusters_seqs_annot_prot) ⇒ Object

#select_seqs_more_500pb(seqs_array) ⇒ Object

#select_seqs_with_name(array_seqs, array_names) ⇒ Object

#set_thresold_evalue(hits) ⇒ Object

#subject_overlapping_hsps(hit) ⇒ Object

#write_fasta(seqs_array, file_name, mode) ⇒ Object