Top Level Namespace

Defined Under Namespace

Modules: Pets Classes: Cohort, Cohort_Parser, Genomic_Feature, Reference_parser

Constant Summary collapse

COMMON_OPTPARSE = Needs define ROOT_PATH constant in file requiring this file

File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets', 'common_optparse.rb'))

REPORT_FOLDER =

File.expand_path(File.join(ROOT_PATH, '..', 'templates'))

EXTERNAL_DATA =

File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))

EXTERNAL_CODE =

File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))

HPO_FILE =

File.join(EXTERNAL_DATA, 'hp.json')

MONDO_FILE =

File.join(EXTERNAL_DATA, 'mondo.obo')

IC_FILE =

File.join(EXTERNAL_DATA, 'uniq_hpo_with_CI.txt')

Instance Method Summary collapse

Instance Method Details

#add_parentals_of_not_found_hpos_in_regions(patient_hpo_profile, trainingData, region2hpo, regionAttributes, association_scores, hpo_metadata) ⇒ `Object`

# File 'lib/pets/phen2reg_methods.rb', line 48

def add_parentals_of_not_found_hpos_in_regions(
	patient_hpo_profile, 
	trainingData, 
	region2hpo, 
	regionAttributes, 
	association_scores,
	hpo_metadata # hpo_code => [phenotype, relations]
	)
	new_hpos = []
	region2hpo.each do |regionID, hpos|
		hpos_not_found = patient_hpo_profile - hpos
		parental_hpos = []
		hpo_scores = {}
		hpos_not_found.each do |hpo|
			region, parental_hpo = get_region_with_parental_hpo(hpo, regionID, trainingData , hpo_metadata)
			if !region.nil? && 
				!parental_hpos.include?(parental_hpo) && 
				!patient_hpo_profile.include?(parental_hpo)
				parental_hpos << parental_hpo
				hpo_scores[parental_hpo] = region.last
			end
		end
		hpos.concat(parental_hpos)
		new_hpos.concat(parental_hpos)
		association_scores[regionID].merge!(hpo_scores)
	end
	patient_hpo_profile.concat(new_hpos.uniq)
end

#add_record(hash, key, record, uniq = false) ⇒ `Object`

# File 'lib/pets/generalMethods.rb', line 16

def add_record(hash, key, record, uniq=false)
	query = hash[key]
	if query.nil?
		hash[key] = [record]
	elsif !uniq # We not take care by repeated entries
		query << record
	elsif !query.include?(record) # We want uniq entries
		query << record
	end
end

#binom(n, k) ⇒ `Object`

# File 'lib/pets/generalMethods.rb', line 174

def binom(n,k)
	if k > 0 && k < n
		res = (1+n-k..n).inject(:*)/(1..k).inject(:*)
	else
		res = 1
	end
end

#calculate_coverage(regions_data, delete_thresold = 0) ⇒ `Object`

# File 'lib/pets/coPatReporterMethods.rb', line 138

def calculate_coverage(regions_data, delete_thresold = 0)
	raw_coverage = {}
	n_regions = 0
	patients = 0
	nt = 0
	regions_data.each do |start, stop, chr, reg_id|
		number_of_patients = reg_id.split('.').last.to_i
		if number_of_patients <= delete_thresold
			number_of_patients = 0
		else
			n_regions += 1
			nt += stop - start			
		end
    add_record(raw_coverage, chr, [start, stop, number_of_patients])
		patients += number_of_patients
	end
	return raw_coverage, n_regions, nt, patients.fdiv(n_regions)
end

#calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, min_hpo_recovery_percentage, patient_number) ⇒ `Object`

# File 'lib/pets/phen2reg_methods.rb', line 262

def calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, min_hpo_recovery_percentage, patient_number)     
  records_to_delete = []
  counter = 0
  adjacent_regions_joined.each do |chr, start, stop, hpo_list, association_values, score|
    hpo_coincidences = patient_original_phenotypes & hpo_list
    original_hpo_recovery_percentage = hpo_coincidences.length / patient_original_phenotypes.length.to_f * 100
    records_to_delete << counter if original_hpo_recovery_percentage < min_hpo_recovery_percentage
    query = predicted_hpo_percentage[patient_number] 
    if query.nil?
     predicted_hpo_percentage[patient_number] = [original_hpo_recovery_percentage]
    else
     query << original_hpo_recovery_percentage
    end   
    counter += 1 
  end
  records_to_delete.reverse_each do |record_number|
    adjacent_regions_joined.delete_at(record_number)
  end
end

#compute_hyper_prob(a, b, c, d, n) ⇒ `Object`

# File 'lib/pets/generalMethods.rb', line 167

def compute_hyper_prob(a, b, c, d, n)
	binomA = binom(a + b, a)
	binomC = binom(c + d, c)
	divisor = binom(n, a + c)
	return (binomA * binomC).fdiv(divisor)
end

#compute_IC_values(patient_data, total_patients) ⇒ `Object`

# File 'lib/pets/generalMethods.rb', line 28

def compute_IC_values(patient_data, total_patients)
	patients_per_hpo = Hash.new(0)
	last_patient_ID = ''
	patient_data.each do |patient_ID, metadata|
		patient, count = patient_ID.split('_i')
		if patient != last_patient_ID
			hpos, chr, start, stop = metadata
			hpos.each do |h|
				patients_per_hpo[h] += 1
			end
		end
		last_patient_ID = patient
	end
	# STDERR.puts patients_per_hpo.inspect
	# Process.exit
	patients_per_hpo.each do |hpo, patient_number|
		patients_per_hpo[hpo] = -Math.log10(patient_number.fdiv(total_patients))
	end
	return patients_per_hpo
end

#compute_pathway_enrichment(genes_clusters, genes_with_kegg) ⇒ `Object`

# File 'lib/pets/generalMethods.rb', line 109

def compute_pathway_enrichment(genes_clusters, genes_with_kegg)
	pathways_genes_in_predictions = {}
	genes_in_predictions = []
	genes_clusters.each do |cluster|
		cluster.each do |geneID, data|
			geneName, description, pathways, geneSyns = data
			pathways.each do |pathway|
				query = pathways_genes_in_predictions[pathway]
				if query.nil?
					pathways_genes_in_predictions[pathway] = [geneID]
				else
					query << geneID if !query.include?(geneID)
				end
			end
			genes_in_predictions << geneID if !genes_in_predictions.include?(geneID)
		end
	end
	genes_out_of_predictions = genes_with_kegg.keys - genes_in_predictions
	gene_number = genes_with_kegg.length
	stats = []
	pathways_genes_in_predictions.each do |pathway, pathway_predicted_genes|
		pathway_id, pathway_name = pathway
		no_pathway_predicted_genes = genes_in_predictions - pathway_predicted_genes 
		pathway_no_predicted_genes_count = 0
		no_pathway_no_predicted_genes_count = 0
		genes_out_of_predictions.each do |geneID|
			query = genes_with_kegg[geneID]
			if query[2].map{|pathway_info| pathway_info.first}.include?(pathway_id)
				pathway_no_predicted_genes_count += 1
			else
				no_pathway_no_predicted_genes_count += 1
			end
		end
		#Fisher => http://www.biostathandbook.com/fishers.html
		no_pathway_predicted_genes_count = no_pathway_predicted_genes.length
		pathway_predicted_genes_count = pathway_predicted_genes.length
		accumulated_prob = 0
		pathway_no_predicted_genes_count.times do |n|
			no_pathway_predicted_genes_count_shifted = no_pathway_predicted_genes_count - n
			pathway_predicted_genes_count_shifted = pathway_predicted_genes_count - n
			if no_pathway_predicted_genes_count_shifted >= 0 && pathway_predicted_genes_count_shifted >= 0
				accumulated_prob += compute_hyper_prob(
					n, 
					no_pathway_predicted_genes_count_shifted, 
					pathway_predicted_genes_count_shifted, 
					no_pathway_no_predicted_genes_count + n, 
					gene_number
				)
			else
				break
			end
		end
		contigency = [pathway_no_predicted_genes_count, no_pathway_predicted_genes_count, pathway_predicted_genes_count, no_pathway_no_predicted_genes_count]
		stats << [pathway, pathway_predicted_genes, contigency, accumulated_prob]
	end
	return stats
end

#coor_overlap?(ref_start, ref_stop, start, stop) ⇒ `Boolean`

Common methods for predictors Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5

Indexing by chr (region)

Returns:

(Boolean)

# File 'lib/pets/io.rb', line 295

def coor_overlap?(ref_start, ref_stop, start, stop)
  overlap = false
  if (stop > ref_start && stop <= ref_stop) ||
    (start >= ref_start && start < ref_stop) ||
    (start <= ref_start && stop >= ref_stop) ||
    (start > ref_start && stop < ref_stop)
    overlap = true
  end
  return overlap
end

#download(ftp_server, path, name) ⇒ `Object`

# File 'lib/pets/io.rb', line 475

def download(ftp_server, path, name)
  ftp = Net::FTP.new()
  ftp.connect(ftp_server)
  ftp.login
  ftp.getbinaryfile(path, name)
  ftp.close
end

#dummy_cluster_patients(patient_data, matrix_file, clust_pat_file) ⇒ `Object`

# File 'lib/pets/coPatReporterMethods.rb', line 87

def dummy_cluster_patients(patient_data, matrix_file, clust_pat_file)
  if !File.exists?(matrix_file)
    pat_hpo_matrix, pat_id, hp_id  = patient_data.to_bmatrix
    x_axis_file = matrix_file.gsub('.npy','_x.lst')
    y_axis_file = matrix_file.gsub('.npy','_y.lst')
    pat_hpo_matrix.save(matrix_file, hp_id, x_axis_file, pat_id, y_axis_file)
  end
  system_call(EXTERNAL_CODE, 'get_clusters.R', "-d #{matrix_file} -o #{clust_pat_file} -y #{matrix_file.gsub('.npy','')}") if !File.exists?(clust_pat_file)
  clustered_patients = load_clustered_patients(clust_pat_file)
  return(clustered_patients)
end

#generate_gene_locations(gene_location) ⇒ `Object`

# File 'lib/pets/reg2phen_methods.rb', line 56

def generate_gene_locations(gene_location)
  gene_locations = {}
  gene_location.each do |chr, genesInfo|
    genesInfo.each do |geneID, geneStart, geneStop|
      gene_locations[geneID] = [chr, geneStart, geneStop]
    end
  end
  return gene_locations
end

#generate_genes_dictionary(gene_location, genes_with_kegg) ⇒ `Object`

# File 'lib/pets/reg2phen_methods.rb', line 66

def generate_genes_dictionary(gene_location, genes_with_kegg)
  reestructured_gene_locations = generate_gene_locations(gene_location)
  genes_dictionary = {}
  genes_with_kegg.each do |geneID, annotInfo|
    #STDERR.puts annotInfo.shift.inspect
    gene_location_data = reestructured_gene_locations[geneID]
    unless gene_location_data.nil?
      geneName, description, pathways, geneSyns = annotInfo
      genes_dictionary[geneName] = gene_location_data.dup.concat([false])
      geneSyns.each do |gene_symbol|
        genes_dictionary[gene_symbol] = gene_location_data.dup.concat([true])
      end
    end
  end
  return genes_dictionary
end

#generate_hpo_region_matrix(region2hpo, association_scores, info2predict, null_value = 0) ⇒ `Object`

# File 'lib/pets/phen2reg_methods.rb', line 107

def generate_hpo_region_matrix(region2hpo, association_scores, info2predict, null_value=0)
	# #method for creating the hpo to region matrix for plotting
	# #info2predict = hpo list from user
	# #hpo_associated_regions = [[chr, start, stop, [hpos_list], [weighted_association_scores]]]
	hpo_region_matrix = []
	region2hpo.each do |regionID, hpos_list|
		row = []
		info2predict.each do |user_hpo|
			value = association_scores[regionID][user_hpo]
			if value.nil?
				row << null_value
			else
				row << value
			end
		end
		hpo_region_matrix << row
	end
	return hpo_region_matrix
end

#get_and_parse_external_data(all_paths) ⇒ `Object`

# File 'lib/pets/generalMethods.rb', line 185

def get_and_parse_external_data(all_paths)
	sources = [
	  ['ftp.ncbi.nlm.nih.gov', 'genomes/H_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz', all_paths[:gene_data]],
	  ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/biosystems_gene.gz', all_paths[:biosystems_gene]],
	  ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/bsid2info.gz', all_paths[:biosystems_info]]
	]
	sources.each do |server, path, output|
	  download(server, path, output) if !File.exists?(output)
	end

	genes_with_kegg = {}
	gene_location = {}
	if !File.exists?(all_paths[:gene_data_with_pathways]) || !File.exists?(all_paths[:gene_location])
	  gene_list, gene_location = load_gene_data(all_paths[:gene_data])
	  ### kegg_data = parse_kegg_data(genes_found_attributes.keys)
	  kegg_data = parse_kegg_from_biosystems(all_paths[:biosystems_gene], all_paths[:biosystems_info])
	  genes_with_kegg = merge_genes_with_kegg_data(gene_list, kegg_data)
	  write_compressed_plain_file(genes_with_kegg, all_paths[:gene_data_with_pathways])
	  write_compressed_plain_file(gene_location, all_paths[:gene_location])
	else
	  gene_location = read_compressed_json(all_paths[:gene_location])
	  genes_with_kegg = read_compressed_json(all_paths[:gene_data_with_pathways])
	end
	return gene_location, genes_with_kegg
end

#get_cluster_metadata(clusters_info) ⇒ `Object`

# File 'lib/pets/coPatReporterMethods.rb', line 284

def get_cluster_metadata(clusters_info)
  average_hp_per_pat_distribution = []
  clusters_info.each do |cl_id, pat_info|
      hp_per_pat_in_clust = pat_info.values.map{|a| a.length}
      hp_per_pat_ave = hp_per_pat_in_clust.sum.fdiv(hp_per_pat_in_clust.length)
      average_hp_per_pat_distribution << [pat_info.length, hp_per_pat_ave]
  end
  return average_hp_per_pat_distribution
end

#get_detailed_similarity(profile, candidates, evidences, hpo) ⇒ `Object`

# File 'lib/pets/generalMethods.rb', line 211

def get_detailed_similarity(profile, candidates, evidences, hpo)
	profile_length = profile.length
	matrix = []
	profile_length.times{ matrix << Array.new(candidates.length, 0)}
	cand_number = 0
	candidates.each do |candidate_id, similarity|
		local_sim = []
		candidate_evidence = evidences[candidate_id]
		profile.each do |profile_term|
			candidate_evidence.each do |candidate_term|
				term_sim = hpo.compare([candidate_term], [profile_term], sim_type: :lin, bidirectional: false)
				local_sim << [profile_term, candidate_term, term_sim]
			end
		end
		local_sim.sort!{|s1, s2| s2.last <=> s1.last}

		final_pairs = []
		processed_profile_terms = []
		processed_candidate_terms = []
		local_sim.each do |pr_term, cd_term, sim|
			if !processed_profile_terms.include?(pr_term) && !processed_candidate_terms.include?(cd_term)
				final_pairs << [pr_term, cd_term, sim]
				processed_profile_terms << pr_term
				processed_candidate_terms << cd_term
			end
			break if profile_length == processed_profile_terms.length
		end
		final_pairs.each do |pr_term, cd_term, similarity|
			matrix[profile.index(pr_term)][cand_number] = similarity
		end
		cand_number += 1
	end
	return matrix
end

#get_final_coverage(raw_coverage, bin_size) ⇒ `Object`

# File 'lib/pets/coPatReporterMethods.rb', line 110

def get_final_coverage(raw_coverage, bin_size)
	coverage_to_plot = []
	raw_coverage.each do |chr, coverages|
		coverages.each do |start, stop, coverage|
			bin_start = start - start % bin_size
			bin_stop = stop - stop%bin_size
			while bin_start < bin_stop
				coverage_to_plot << [chr, bin_start, coverage]
				bin_start += bin_size
			end
		end
	end
	return coverage_to_plot
end

#get_genes_by_coordinates(genes_dictionary, chr, start, stop) ⇒ `Object`

# File 'lib/pets/reg2phen_methods.rb', line 34

def get_genes_by_coordinates(genes_dictionary, chr, start, stop)
  genes_in_sor = genes_dictionary.select do |gene_sym, attributes|
    gene_chr, gene_start, gene_stop, is_GeneSynom = attributes
    if gene_chr == chr && !is_GeneSynom
      coor_overlap?(gene_start, gene_stop, start, stop)
    else
      false
    end
  end
  return genes_in_sor.keys
end

#get_mean_size(all_sizes) ⇒ `Object`

# File 'lib/pets/coPatReporterMethods.rb', line 99

def get_mean_size(all_sizes)
    accumulated_size = 0
    number = 0
    all_sizes.each do |size, occurrences|
      accumulated_size += size *occurrences
      number += occurrences
    end
    return accumulated_size.fdiv(number)
end

#get_profile_ic(hpo_names, phenotype_ic) ⇒ `Object`

# File 'lib/pets/coPatReporterMethods.rb', line 60

def get_profile_ic(hpo_names, phenotype_ic)
  ic = 0
  profile_length = 0
  hpo_names.each do |hpo_id|
    hpo_ic = phenotype_ic[hpo_id]
    raise("The term #{hpo_id} not exists in the given ic table") if hpo_ic.nil?
    ic += hpo_ic 
    profile_length += 1
  end
  profile_length = 1 if profile_length == 0
  return ic.fdiv(profile_length)
end

#get_region_with_parental_hpo(hpo, regionID, trainingData, hpo_metadata) ⇒ `Object`

# File 'lib/pets/phen2reg_methods.rb', line 77

def get_region_with_parental_hpo(hpo, regionID, trainingData , hpo_metadata)
	region = nil
	final_hpo = nil
	hpos = [hpo]
	while !hpos.empty?
		temp = []
		hpos.each do |hp| 
			hpo_data = hpo_metadata[hp]
			if !hpo_data.nil?
				main_hpo_code, phenotype, relations = hpo_data
				temp.concat(relations.map{|rel| rel.first})
			end
		end
		temp.each do |temp_hpo|
			regions = trainingData[temp_hpo]
			if !regions.nil?
				final_reg = regions.select{|reg| reg[3] == regionID}
				if !final_reg.empty?
					region = final_reg.first
					final_hpo = temp_hpo
					temp = []
					break
				end
			end
		end
		hpos = temp
	end
	return region, final_hpo
end

#get_semantic_similarity_clustering(options, patient_data, temp_folder) ⇒ `Object`

# File 'lib/pets/coPatReporterMethods.rb', line 181

def get_semantic_similarity_clustering(options, patient_data, temp_folder)
  template = File.open(File.join(REPORT_FOLDER, 'cluster_report.erb')).read
  hpo = Cohort.get_ontology(Cohort.act_ont)
  reference_profiles = nil
  reference_profiles = load_profiles(options[:reference_profiles], hpo) if !options[:reference_profiles].nil?
  Parallel.each(options[:clustering_methods], in_processes: options[:threads] ) do |method_name|
    matrix_filename = File.join(temp_folder, "similarity_matrix_#{method_name}.npy")
    profiles_similarity_filename = File.join(temp_folder, ['profiles_similarity', method_name].join('_').concat('.txt'))
    clusters_distribution_filename = File.join(temp_folder, ['clusters_distribution', method_name].join('_').concat('.txt'))
    if !File.exists?(matrix_filename)
      if reference_profiles.nil? 
        profiles_similarity = patient_data.compare_profiles(sim_type: method_name.to_sym, external_profiles: reference_profiles)
      else # AS reference profiles are constant, the sematic comparation will be A => B (A reference). So, we have to invert the elements to perform the comparation
        ont = Cohort.get_ontology(:hpo)
        pat_profiles = ont.profiles
        ont.load_profiles(reference_profiles, reset_stored: true)
        profiles_similarity = ont.compare_profiles(sim_type: method_name.to_sym, 
          external_profiles: pat_profiles, 
          bidirectional: false)
        ont.load_profiles(pat_profiles, reset_stored: true)
        profiles_similarity = invert_nested_hash(profiles_similarity)
      end
      remove_nested_entries(profiles_similarity){|id, sim| sim >= options[:sim_thr] } if !options[:sim_thr].nil?
      write_profile_pairs(profiles_similarity, profiles_similarity_filename)
      if reference_profiles.nil?
        axis_file = matrix_filename.gsub('.npy','.lst')
        similarity_matrix, axis_names = profiles_similarity.to_wmatrix(squared: true, symm: true)
        similarity_matrix.save(matrix_filename, axis_names, axis_file)
      else
        axis_file_x = matrix_filename.gsub('.npy','_x.lst')
        axis_file_y = matrix_filename.gsub('.npy','_y.lst')
        similarity_matrix, y_names, x_names = profiles_similarity.to_wmatrix(squared: false, symm: true)
        similarity_matrix.save(matrix_filename, y_names, axis_file_y, x_names, axis_file_x)
      end
    end
    ext_var = ''
    if method_name == 'resnik'
      ext_var = '-m max'
    elsif method_name == 'lin'
      ext_var = '-m comp1'
    end
    cluster_file = "#{method_name}_clusters.txt"
    if !reference_profiles.nil?
      ext_var << ' -s' 
      axis_file = "#{axis_file_y},#{axis_file_x}"
      cluster_file = "#{method_name}_clusters_rows.txt"
    end
    out_file = File.join(temp_folder, method_name)
    system_call(EXTERNAL_CODE, 'plot_heatmap.R', "-y #{axis_file} -d #{matrix_filename} -o #{out_file} -M #{options[:minClusterProportion]} -t dynamic -H #{ext_var}") if !File.exists?(out_file +  '_heatmap.png')
    clusters_codes, clusters_info = parse_clusters_file(File.join(temp_folder, cluster_file), patient_data)  
    write_patient_hpo_stat(get_cluster_metadata(clusters_info), clusters_distribution_filename)
    out_file = File.join(temp_folder, ['clusters_distribution', method_name].join('_'))
    system_call(EXTERNAL_CODE, 'xyplot_graph.R', "-d #{clusters_distribution_filename} -o #{out_file} -x PatientsNumber -y HPOAverage") if !File.exists?(out_file)
    sim_mat4cluster = {}
    if options[:detailed_clusters]
      clusters_codes.each do |cluster|
        cluster_cohort = Cohort.new
        clID, patient_number, patient_ids, hpo_codes = cluster
        patient_ids.each_with_index {|patID, i| cluster_cohort.add_record([patID, hpo_codes[i], []])}
        cluster_profiles = cluster_cohort.profiles
        ref_profile = cluster_cohort.get_general_profile
        hpo.load_profiles({ref: ref_profile}, reset_stored: true)    
        similarities = hpo.compare_profiles(external_profiles: cluster_profiles, sim_type: :lin, bidirectional: false)
        candidate_sim_matrix, candidates, candidates_ids = get_similarity_matrix(ref_profile, similarities[:ref], cluster_profiles, hpo, 100, 100)
        candidate_sim_matrix.unshift(['HP'] + candidates_ids)
        sim_mat4cluster[clID] = candidate_sim_matrix
      end
    end


    clusters = translate_codes(clusters_codes, hpo)
    container = {
      :temp_folder => temp_folder,
      :cluster_name => method_name,
      :clusters => clusters,
      :hpo => hpo,
      :sim_mat4cluster => sim_mat4cluster
     }

    report = Report_html.new(container, 'Patient clusters report')
    report.build(template)
    report.write(options[:output_file]+"_#{method_name}_clusters.html")
    system_call(EXTERNAL_CODE, 'generate_boxpot.R', "-i #{temp_folder} -m #{method_name} -o #{File.join(temp_folder, method_name + '_sim_boxplot')}") if !File.exists?(File.join(temp_folder, 'sim_boxplot.png'))
  end
end

#get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit, other_scores = {}, id2label = {}) ⇒ `Object`

# File 'lib/pets/generalMethods.rb', line 246

def get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit, other_scores = {}, id2label = {})
		candidates = similarities.to_a
		if other_scores.empty?
			candidates.sort!{|s1, s2| s2.last <=> s1.last}
			candidates = candidates.first(candidate_limit)
		else # Prioritize first by the external list of scores, select the candidates and then rioritize by similarities
			selected_candidates = []
			candidates.each do |cand|
				cand_id = cand[0]
				cand_lab = id2label[cand_id.to_s]
				next if cand_lab.nil?
				other_score = other_scores[cand_lab]
				next if other_score.nil?
				cand << other_score
				selected_candidates << cand
			end
			selected_candidates.sort!{|e1, e2| e2[2] <=> e1[2]}
			candidates = selected_candidates.first(candidate_limit)
			candidates.sort!{|e1, e2| e2[1] <=> e1[1]}
			candidates.each{|c| c.pop}
		end
		candidates_ids = candidates.map{|c| c.first}
		candidate_similarity_matrix = get_detailed_similarity(reference_prof, candidates, evidence_profiles, hpo)
		candidate_similarity_matrix.each_with_index do |row, i|
			row.unshift(hpo.translate_id(reference_prof[i]))
		end
		candidate_similarity_matrix.sort!{|r1,r2| r2[1..r2.length].inject(0){|sum,n| sum +n} <=> r1[1..r1.length].inject(0){|sum,n| sum +n}}
		candidate_similarity_matrix = candidate_similarity_matrix.first(term_limit)
		return candidate_similarity_matrix, candidates, candidates_ids
end

#get_sor_length_distribution(raw_coverage) ⇒ `Object`

# File 'lib/pets/coPatReporterMethods.rb', line 125

def get_sor_length_distribution(raw_coverage)
	all_cnvs_length = []
	cnvs_count = []
	raw_coverage.each do |chr, coords_info|
		coords_info.each do |start, stop, pat_records|
			region_length = stop - start + 1
			all_cnvs_length << [region_length, pat_records]
		end
	end
	all_cnvs_length.sort!{|c1, c2| c1[1] <=> c2[1]}
	return all_cnvs_length
end

#get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos) ⇒ `Object`

# File 'lib/pets/coPatReporterMethods.rb', line 73

def get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos)
  stats = []
  stats << ['Unique HPO terms', hpo_stats.length]
  stats << ['Cohort size', patient_data.profiles.length]
  stats << ['Rejected patients by empty profile', rejected_patients.length]
  stats << ['HPOs per patient (average)', patient_data.get_profiles_mean_size]
  stats << ['HPO terms per patient: percentile 90', patient_data.get_profile_length_at_percentile(perc=90)]
  stats << ['Percentage of HPO with more specific children', (fraction_terms_specific_childs * 100).round(4)]
  stats << ['DsI for uniq HP terms', patient_data.get_dataset_specifity_index('uniq')]
  stats << ['DsI for frequency weigthed HP terms', patient_data.get_dataset_specifity_index('weigthed')]
  stats << ['Number of unknown phenotypes', rejected_hpos.length]
  return stats
end

#get_top_dummy_clusters_stats(top_clust_phen) ⇒ `Object`

# File 'lib/pets/coPatReporterMethods.rb', line 157

def get_top_dummy_clusters_stats(top_clust_phen)
  new_cluster_phenotypes = {}
  top_clust_phen.each_with_index do |cluster, clusterID|
    phenotypes_frequency = Hash.new(0)
    total_patients = cluster.length
    cluster.each do |phenotypes|
      phenotypes.each do |p|
        phenotypes_frequency[p] += 1
      end
    end
    new_cluster_phenotypes[clusterID] = [total_patients, phenotypes_frequency.keys, phenotypes_frequency.values.map{|v| v.fdiv(total_patients) * 100}]
  end
  return new_cluster_phenotypes
end

#group_by_region(hpo_regions) ⇒ `Object`

# File 'lib/pets/phen2reg_methods.rb', line 18

def group_by_region(hpo_regions)
	#hpo_regions-> hpo => [[chr, start, stop, regID, score], [...]]
	region2hpo = {}
	regionAttributes = {}
	association_scores = {}
	hpo_regions.each do |hpo, regions|
		regions.each do |chr, start, stop, regionID, association_score|
			query = region2hpo[regionID]
			if query.nil?
				region2hpo[regionID] = [hpo]
			else
				query << hpo
			end
			query = regionAttributes[regionID]
			if query.nil?
				total_patients_in_region = regionID.split('.')[3].to_i
				region_length = stop - start
				regionAttributes[regionID] = [chr, start, stop, total_patients_in_region, region_length]
			end
			query = association_scores[regionID]
			if query.nil?
				association_scores[regionID] = {hpo => association_score}
			else
				query[hpo] = association_score 
			end
		end		
	end
	return region2hpo, regionAttributes, association_scores
end

#hpo_quality_control(prediction_data, hpos_ci_values, hpo) ⇒ `Object`

def hpo_quality_control(prediction_data, hpo_metadata_file, information_coefficient_file)

# File 'lib/pets/phen2reg_methods.rb', line 230

def hpo_quality_control(prediction_data, hpos_ci_values, hpo)
	characterised_hpos = []
	##information_coef_file= hpo_code, ci
	##prediction_data = [hpo1, hpo2, hpo3...]
	# hpos_ci_values = load_hpo_ci_values(information_coefficient_file)
	prediction_data.each do |hpo_code|
		# names, rejected = hpo.translate_codes2names([hpo_code])
		names, rejected = hpo.translate_ids([hpo_code])
		tmp = [names.first, hpo_code] # col hpo name, col hpo code
		ci = hpos_ci_values[hpo_code]
		unless ci.nil? # col exists? and ci values
			tmp.concat(["yes", ci])
		else
			tmp.concat(["no", "-"])
		end
		# parent = prediction_data & hpo.get_parents(hpo_code)
		parent = prediction_data & hpo.get_ancestors(hpo_code, true)
		if parent.empty?
			parent << "-"
		else
			# n, r = hpo.translate_codes2names(parent)
			n, r = hpo.translate_ids(parent)
			parent = parent.zip(n) # Combine code ids with hpo names
		end
		tmp << parent # col parents
		specific_childs = hpo.get_childs_table([hpo_code], true)
		tmp << specific_childs.first.last
		characterised_hpos << tmp
	end
	return characterised_hpos
end

#invert_nested_hash(h) ⇒ `Object`

# File 'lib/pets/coPatReporterMethods.rb', line 267

def invert_nested_hash(h)
  new_h = {}
  h.each do |k1, vals1|
    vals1.each do |v1|
      vals1.each do |k2, vals2|
        query = new_h[k2]
        if query.nil?
          new_h[k2] = {k1 => vals2}
        else
          query[k1] = vals2
        end
      end
    end
  end
  return new_h
end

#join_regions(regions) ⇒ `Object`

# File 'lib/pets/phen2reg_methods.rb', line 197

def join_regions(regions)
	#[chr, start, stop, association_values.keys, association_values.values, score]
	merged_regions = []
	sorted_regions = regions.sort_by{|reg | [reg[0], reg[1]]}
	ref_reg = sorted_regions.shift
	while !sorted_regions.empty?
		next_region = sorted_regions.shift
		if ref_reg[0] == next_region[0] &&
			(ref_reg[2] - next_region[1]).abs <= 1 &&
			(ref_reg[5] - next_region[5]).abs.fdiv([ref_reg[5], next_region[5]].max) <= 0.05 &&
			ref_reg[3] == next_region[3]

			ref_reg[2] = next_region[2]
			ref_assoc_values = ref_reg[4] 
			next_assoc_values = next_region[4]
			assoc_values = []
			ref_assoc_values.each_with_index do |ref_val, i|
				#assoc_values << (ref_val + next_assoc_values[i]).fdiv(2)
				assoc_values << [ref_val, next_assoc_values[i]].max
			end
			ref_reg[4] = assoc_values
			#ref_reg[5] = (ref_reg[5] + next_region[5]).fdiv(2)
			ref_reg[5] = [ref_reg[5], next_region[5]].max
		else
			merged_regions << ref_reg
			ref_reg = next_region
		end
	end
	merged_regions << ref_reg
	return merged_regions
end

#load_biosystem2gene_dictionary(biosystems_gene_path) ⇒ `Object`

# File 'lib/pets/generalMethods.rb', line 81

def load_biosystem2gene_dictionary(biosystems_gene_path)
	gene2kegg = {}
	infile = open(biosystems_gene_path)
	gz = Zlib::GzipReader.new(infile)
	gz.each_line do |line|
		line.chomp!
		biosystem, gene, score = line.split("\t")
		query = gene2kegg[gene]
		if query.nil?
			gene2kegg[gene] = [biosystem]
		else
			query << biosystem
		end
	end
	return gene2kegg
end

#load_clustered_patients(file) ⇒ `Object`

# File 'lib/pets/io.rb', line 370

def load_clustered_patients(file)
  clusters = {}
  File.open(file).each do |line|
    line.chomp!
    pat_id, cluster_id = line.split("\t")
    query = clusters[cluster_id]
    if query.nil?
      clusters[cluster_id] = [pat_id]
    else
      query << pat_id
    end
  end
  return clusters
end

#load_coordinates(file_path) ⇒ `Object`

# File 'lib/pets/io.rb', line 256

def load_coordinates(file_path)
  coordinates = {}
  header = true
  File.open(file_path).each do |line|
    fields = line.chomp.split("\t")
    if header
      header = false
    else
      entity, chr, strand, start, stop = fields
      if chr == 'NA'
        STDERR.puts "Warning: Record #{fields.inspect} is undefined"
        next
      end
      coordinates[entity] = [chr, start.to_i, stop.to_i, strand]
    end
  end
  return coordinates
end

#load_evidence_profiles(file_path, hpo) ⇒ `Object`

# File 'lib/pets/io.rb', line 275

def load_evidence_profiles(file_path, hpo)
  profiles = {}
  id2label = {}
  #count = 0
  File.open(file_path).each do |line|
    id, label, profile = line.chomp.split("\t")
    hpos = profile.split(',').map{|a| a.to_sym}
    hpos, rejected_hpos = hpo.check_ids(hpos)
    if !hpos.empty?
      hpos = hpo.clean_profile(hpos)
      profiles[id] = hpos if !hpos.empty?
      id2label[id] = label
    end
  end
  return profiles, id2label
end

#load_evidences(evidences_path, hpo) ⇒ `Object`

# File 'lib/pets/io.rb', line 238

def load_evidences(evidences_path, hpo)
  genomic_coordinates = {}
  coord_files = Dir.glob(File.join(evidences_path, '*.coords'))
  coord_files.each do |cd_f|
    entity = File.basename(cd_f, '.coords')
    coordinates = load_coordinates(cd_f)
    genomic_coordinates[entity] = coordinates
  end
  evidences = {}
  evidence_files = Dir.glob(File.join(evidences_path, '*_HP.txt'))
  evidence_files.each do |e_f|
    pair = File.basename(e_f, '.txt')
    profiles, id2label = load_evidence_profiles(e_f, hpo)
    evidences[pair] = {prof: profiles, id2lab: id2label}
  end
  return evidences, genomic_coordinates
end

#load_gene_data(gene_data_path) ⇒ `Object`

# File 'lib/pets/io.rb', line 385

def load_gene_data(gene_data_path)
  gene_list = {} #geneID => attr
  gene_location = {} # chr => gene
  infile = open(gene_data_path)
  gz = Zlib::GzipReader.new(infile)
  current_chr = nil
  genes = []
  gz.each_line do |line|
    line.chomp!
    next if line =~ /^#/ 
    fields = line.split("\t")
    if fields[8].include?('genome=chromosome')
      chr = fields[8].split(';')[1].split('=').last
      gene_location[current_chr] = genes
      genes = []
      current_chr = chr
    elsif fields[2] == 'gene'
      attributes = {}
      fields[8].split(';').each do |pair| 
        key, value = pair.split('=')
        attributes[key] = value
      end
      geneName = nil
      geneName = attributes['gene'] if !attributes['gene'].nil?
      geneSyns = []
      geneSyns = attributes['gene_synonym'].split(',') if !attributes['gene_synonym'].nil?
      description = attributes['description']
      description = URI.unescape(description) if !description.nil?
      attributes['Dbxref'] =~ /GeneID:(\d+)/
      gene_list[$1] = [geneName, geneSyns, description]
      genes << [$1, fields[3].to_i, fields[4].to_i]
    end
  end
  gene_location[current_chr] = genes
  return gene_list, gene_location
end

#load_hpo_ci_values(information_coefficient_file) ⇒ `Object`

# File 'lib/pets/io.rb', line 360

def load_hpo_ci_values(information_coefficient_file)
  hpos_ci_values = {}
  File.open(information_coefficient_file).each do |line|
    line.chomp!
    hpo_code, ci = line.split("\t")
    hpos_ci_values[hpo_code.to_sym] = ci.to_f
  end
  return hpos_ci_values
end

#load_hpo_ontology(hpo_file, excluded_hpo_file) ⇒ `Object`

# File 'lib/pets/io.rb', line 4

def load_hpo_ontology(hpo_file, excluded_hpo_file)
  hpo = nil
  if !hpo_file.include?('.json')
    if !excluded_hpo_file.nil?
      hpo = Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(excluded_hpo_file))
    else
      hpo = Ontology.new(file: hpo_file, load_file: true)
    end
  else
    hpo = Ontology.new
    hpo.read(hpo_file)
    if !excluded_hpo_file.nil?
      hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
      hpo.remove_removable()
      hpo.build_index()
    end
  end
  return hpo
end

#load_profiles(file_path, hpo) ⇒ `Object`

# File 'lib/pets/io.rb', line 188

def load_profiles(file_path, hpo)
  profiles = {}
  #count = 0
  File.open(file_path).each do |line|
    id, profile = line.chomp.split("\t")
    hpos = profile.split(',').map{|a| a.to_sym}
    hpos, rejected_hpos = hpo.check_ids(hpos)
    if !hpos.empty?
      hpos = hpo.clean_profile(hpos)
      profiles[id] = hpos if !hpos.empty?
    end
  end
  return profiles
end

#load_tabular_vars(path) ⇒ `Object`

# File 'lib/pets/io.rb', line 217

def load_tabular_vars(path)
  vars = []
  File.open(path).each do |line|
    fields = line.chomp.split("\t")
    chr = fields[0].gsub('chr','')
    start = fields[1].to_i
    vars << [chr, start, start]
  end
  return vars
end

#load_training_file4HPO(training_file, thresold = 0) ⇒ `Object`

Indexing by hpo (code)

prepare training file for analysis using phenotype2region prediction

# File 'lib/pets/io.rb', line 323

def load_training_file4HPO(training_file, thresold=0)
  training_set = {}
  information = loadFile(training_file, thresold)
  information.each do |info|
    hpoCode = info.delete_at(4)
    query = training_set[hpoCode]
    if query.nil?
      training_set[hpoCode] = [info]  
    else
      query << info
    end
  end
  # STDERR.puts training_set.keys.inspect
  return training_set
end

#load_training_file4regions(training_file) ⇒ `Object`

# File 'lib/pets/io.rb', line 306

def load_training_file4regions(training_file)
  training_set = {}
  posInfo = loadFile(training_file)
  posInfo.each do |info|
    chr = info.shift
    query = training_set[chr]
    if query.nil?
      training_set[chr] = [info]
    else
      query << info
    end
  end
  return training_set
end

#load_variants(variant_folder) ⇒ `Object`

# File 'lib/pets/io.rb', line 203

def load_variants(variant_folder)
  variants = {}
  Dir.glob(File.join(variant_folder, '*.{tab,vcf,vcf.gz}')).each do |path|
    profile_id, ext = File.basename(path).split(".", 2)
    if ext == 'tab' || ext == 'txt'
      vars = load_tabular_vars(path)
    elsif ext == 'vcf' || ext == 'vcf.gz'
      vars = load_vcf(path, ext)
    end
    variants[profile_id] = Genomic_Feature.new(vars)
  end
  return variants
end

#load_vcf(path, ext) ⇒ `Object`

Some compressed files are fragmented internally. If so, VCFfile only reads first fragment

# File 'lib/pets/io.rb', line 228

def load_vcf(path, ext) # Some compressed files are fragmented internally. If so, VCFfile only reads first fragment
  vars = []             # Use zcat original.vcf.gz | gzip > new.vcf.gz to obtain a contigous file
  vcf  = BioVcf::VCFfile.new(file: path, is_gz: ext == 'vcf.gz' ? true : false )
  vcf.each do |var|
    vars << [var.chrom.gsub('chr',''), var.pos, var.pos]
  end
  puts vars.length
  return vars
end

#loadBiosistemsInfo(biosystems_info_path, filterDB) ⇒ `Object`

# File 'lib/pets/generalMethods.rb', line 67

def loadBiosistemsInfo(biosystems_info_path, filterDB)
	bsid2attributes = {}
	infile = open(biosystems_info_path)
	gz = Zlib::GzipReader.new(infile)
	gz.each_line do |line|
		line.chomp!
		#STDERR.puts line.inspect
		fields = line.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '').split("\t")
		bsid = fields.shift
		bsid2attributes[bsid] = [fields[1], fields[2]] if filterDB == fields[0]
	end
	return bsid2attributes
end

#loadFile(file, thresold = 0) ⇒ `Object`

Load training info file:

Chr;Start;Stop;HPO;Association;node

# File 'lib/pets/io.rb', line 342

def loadFile(file, thresold=0)
  information = []
  File.open(file).each do |line|
    line.chomp!
    allInfo = line.split("\t")
    associationValue = allInfo[4].to_f
    if associationValue >= thresold
      chr = allInfo[0]
      startPos = allInfo[1].to_i
      stopPos = allInfo[2].to_i
      hpoCode = allInfo[3]
      nodeID = allInfo[5]
      information << [chr, startPos, stopPos, nodeID, hpoCode, associationValue]
    end
  end
  return information
end

#merge_genes_with_kegg_data(gene_list, kegg_data) ⇒ `Object`

# File 'lib/pets/generalMethods.rb', line 98

def merge_genes_with_kegg_data(gene_list, kegg_data)
	merged_data = {}
	gene_list.each do |geneID, values|
		geneName, geneSyn, description = values
		kegg_entry = kegg_data[geneID]	
		kegg_entry = [] if kegg_entry.nil?
		merged_data[geneID] = [geneName, description, kegg_entry, geneSyn]
	end
	return merged_data
end

#parse_clusters_file(clusters_file, patient_data) ⇒ `Object`

# File 'lib/pets/io.rb', line 167

def parse_clusters_file(clusters_file, patient_data)
  clusters_info = {}
  clusters_table = []
  File.open(clusters_file).each do |line|
    line.chomp!
    patientID, clusterID = line.split("\t")
    patientHPOProfile = patient_data.get_profile(patientID)
    query = clusters_info[clusterID]
    if query.nil? 
      clusters_info[clusterID] = {patientID => patientHPOProfile}
    else
      query[patientID] = patientHPOProfile
    end
  end
  clusters_info.each do |clusterID, patients_info|
    patients_per_cluster = patients_info.keys.length
    clusters_table << [clusterID, patients_per_cluster, patients_info.keys, patients_info.values]
  end
  return clusters_table, clusters_info
end

#parse_kegg_data(query_genes) ⇒ `Object`

# File 'lib/pets/io.rb', line 422

def parse_kegg_data(query_genes)
  kegg_data = {} #gene => attb
    while !query_genes.empty?
      gene_set = query_genes.shift(10)
      url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
      uri = URI(url)
      response = Net::HTTP.get(uri) 
    geneID = nil
    gene_names = []
    definition = nil
    pathways = []
    parsing_pathway_field = false
    response.squeeze(' ').each_line do |line|
      line.chomp!
      if line =~ /^ENTRY/
        geneID = line.split(' ')[1]
      elsif line =~ /^NAME/
        gene_names = line.split(' ', 2).last.split(', ')
      elsif line =~ /^DEFINITION/
        definition = line.split(' ', 2)[1]
      elsif line =~ /^PATHWAY/
        pathways << line.split(' ', 3)[1..2]
        parsing_pathway_field = true
      elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
        parsing_pathway_field = false
      elsif parsing_pathway_field
        pathways << line.strip.split(' ', 2)
      elsif line == '///'
        parsing_pathway_field = false
        kegg_data[geneID] = [gene_names, definition, pathways]
        pathways = []
        gene_names = []
      end
    end
  end
  return kegg_data
end

#parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path) ⇒ `Object`

# File 'lib/pets/generalMethods.rb', line 50

def parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path)
	kegg_data = {}
	gene2biosystems = load_biosystem2gene_dictionary(biosystems_gene_path)
	keggAttributes = loadBiosistemsInfo(biosystems_info_path, 'KEGG')
	keggAttributes.select!{|kegg_id, data| data.first =~ /^hsa/}

	gene2biosystems.each do |geneID, pathways|
		kegg_pathways = []
		pathways.each do |biosystem|
			kAttrib = keggAttributes[biosystem]
			kegg_pathways << kAttrib if !kAttrib.nil?
		end
		kegg_data[geneID] = kegg_pathways
	end
	return kegg_data
end

#parse_patient_results(results) ⇒ `Object`

# File 'lib/pets/reg2phen_methods.rb', line 83

def parse_patient_results(results)
  patient_results = []
  patient_results << ['Record', 'Region', 'Chromosome', 'Prediction start', 'Prediction stop', 'Phenotype (HPO)', 'P-value', 'SOR start', 'SOR stop', 'Number of patients sharing SOR', 'Genes']
  counter = 1
  results.each do |k, val|
    val.each_with_index do |v, i|
      if i == 0
        patient_results << v.unshift(counter, k)
      else
        patient_results << v.unshift(counter, '')
      end
      counter += 1 
    end
  end
  return patient_results
end

#predict_patient(predictions, training_set, threshold, transform, genes, genes_dictionary) ⇒ `Object`

# File 'lib/pets/reg2phen_methods.rb', line 3

def predict_patient(predictions, training_set, threshold, transform, genes, genes_dictionary)
  results = {}
  predictions.each do |info|
    if genes
      geneID = info.shift
      info = genes_dictionary[geneID]
    end 
    chr, pt_start, pt_stop, is_GeneSynom = info
    sors = training_set[chr]
    next if sors.nil?
    sors.each do |sor_start, sor_stop, nodeID, hpo_code, association_score|
      next if !coor_overlap?(pt_start, pt_stop, sor_start, sor_stop) 
      patientsNum = nodeID.split('.').last
      if association_score >= threshold 
        association_score = 10**(-association_score) if transform
        genes_in_sor = get_genes_by_coordinates(genes_dictionary, chr, sor_start, sor_stop)
        record2save = [chr, pt_start, pt_stop, hpo_code, association_score, sor_start, sor_stop, patientsNum, genes_in_sor.join(',')]
        key = info[0...3].join(":")
        key.concat(" (#{geneID})") if genes
        query_res = results[key]
        if query_res.nil?
            results[key] = [record2save]
        else
            query_res << record2save
        end
      end
    end
  end
  return results
end

#process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters) ⇒ `Object`

# File 'lib/pets/coPatReporterMethods.rb', line 42

def process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
  chrs = Hash.new(0)
  all_phens = []
  profile_ics = []
  profile_lengths = []
  patient_ids.each do |pat_id|
    phenotypes = patient_data.get_profile(pat_id) 
    profile_ics << get_profile_ic(phenotypes, phenotype_ic)
    profile_lengths << phenotypes.length
    if processed_clusters < options[:clusters2show_detailed_phen_data]
      phen_names, rejected_codes = ont.translate_ids(phenotypes) #optional
      all_phens << phen_names
    end  
    patient_data.get_vars(pat_id).get_chr.each{|chr| chrs[chr] += 1} if !options[:chromosome_col].nil?
  end
  return chrs, all_phens, profile_ics, profile_lengths 
end

#process_dummy_clustered_patients(options, clustered_patients, patient_data, phenotype_ic) ⇒ `Object`

get ic and chromosomes

# File 'lib/pets/coPatReporterMethods.rb', line 16

def process_dummy_clustered_patients(options, clustered_patients, patient_data, phenotype_ic) # get ic and chromosomes
  ont = Cohort.get_ontology(Cohort.act_ont)
  all_ics = []
  all_lengths = []
  top_cluster_phenotypes = []
  cluster_data_by_chromosomes = []
  multi_chromosome_patients = 0
  processed_clusters = 0
  clustered_patients.sort_by{|cl_id, pat_ids| pat_ids.length }.reverse.each do |cluster_id, patient_ids|
    num_of_patients = patient_ids.length
    next if num_of_patients == 1
    chrs, all_phens, profile_ics, profile_lengths = process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
    top_cluster_phenotypes << all_phens if processed_clusters < options[:clusters2show_detailed_phen_data]
    all_ics << profile_ics
    all_lengths << profile_lengths
    if !options[:chromosome_col].nil?
      multi_chromosome_patients += num_of_patients if chrs.length > 1
      chrs.each do |chr, count|
        cluster_data_by_chromosomes << [cluster_id, num_of_patients, chr, count]
      end
    end
    processed_clusters += 1
  end
  return all_ics, all_lengths, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients
end

#read_compressed_json(path) ⇒ `Object`

# File 'lib/pets/io.rb', line 468

def read_compressed_json(path)
  infile = open(path)
  gz = Zlib::GzipReader.new(infile)
  object = JSON.parse(gz.read)
  return object
end

#read_excluded_hpo_file(file) ⇒ `Object`

# File 'lib/pets/io.rb', line 24

def read_excluded_hpo_file(file)
  excluded_hpo = []
  File.open(file).each do |line|
    excluded_hpo << line.chomp
  end
  return excluded_hpo
end

#remove_nested_entries(nested_hash) ⇒ `Object`

# File 'lib/pets/coPatReporterMethods.rb', line 172

def remove_nested_entries(nested_hash)
  empty_root_ids = []
  nested_hash.each do |root_id, entries|
    entries.select!{|id, val| yield(id, val)}
    empty_root_ids << root_id if entries.empty?
  end
  empty_root_ids.each{|id| nested_hash.delete(id)}
end

#report_data(container, html_file) ⇒ `Object`

# File 'lib/pets/phen2reg_methods.rb', line 282

def report_data(characterised_hpos, hpo_associated_regions, html_file, hpo, genes_with_kegg_data, pathway_stats)
	container = {:characterised_hpos => characterised_hpos,
	 	:merged_regions => hpo_associated_regions,
	 	:hpo => hpo,
	 	:genes_with_kegg_data => genes_with_kegg_data,
	 	:pathway_stats => pathway_stats
	 }
	template = File.open(File.join(REPORT_FOLDER, 'patient_report.erb')).read
	report = Report_html.new(container, 'Patient HPO profile summary')
	report.build(template)
	report.write(html_file)
end

#save_patient_matrix(output, patient_hpo_profile, regionAttributes, hpo_region_matrix) ⇒ `Object`

# File 'lib/pets/phen2reg_methods.rb', line 295

def save_patient_matrix(output, patient_hpo_profile, regionAttributes, hpo_region_matrix)
	File.open(output, "w") do |f|
		f.puts "Region\t#{patient_hpo_profile.join("\t")}"
		regionAttributes_array = regionAttributes.values
		hpo_region_matrix.each_with_index do |association_values, i|
		  chr, start, stop = regionAttributes_array[i]
		  f.puts "#{chr}:#{start}-#{stop}\t#{association_values.join("\t")}"
		end
	end
end

#scoring_regions(regionAttributes, hpo_region_matrix, scoring_system, pvalue_cutoff, freedom_degree, null_value = 0) ⇒ `Object`

# File 'lib/pets/phen2reg_methods.rb', line 127

def scoring_regions(regionAttributes, hpo_region_matrix, scoring_system, pvalue_cutoff, freedom_degree, null_value=0)
	#hpo_associated_regions = [[chr, start, stop, [hpos_list], [weighted_association_scores]]]
	#hpo_region_matrix = [[0, 0.4, 0, 0.4], [0, 0, 0.5, 0.4]]
	# STDERR.puts "EH"
	regionAttributes_array = regionAttributes.values
	max_cluster_length = hpo_region_matrix.map{|x| x.count {|i| i != 0}}.max if freedom_degree == 'maxnum'
	hpo_region_matrix.each_with_index do |associations, i|
		sample_length = nil
		if freedom_degree == 'prednum'
			sample_length = associations.length
		elsif freedom_degree == 'phennum'
			sample_length = associations.count{|s| s != 0}
		elsif freedom_degree == 'maxnum'
			sample_length = max_cluster_length
		else
			abort("Invalid freedom degree calculation method: #{freedom_degree}")
		end

		if scoring_system == 'mean'
			mean_association = associations.inject(0){|s,x| s + x } / sample_length
			regionAttributes_array[i] << mean_association
		elsif scoring_system == 'fisher'
			#hyper must be ln not log10 from net analyzer
			#https://en.wikipedia.org/wiki/Fisher%27s_method
			# STDERR.puts associations.inspect
			lns = associations.map{|a| Math.log(10 ** -a)} #hyper values come as log10 values
			sum = lns.inject(0){|s, a| s + a} 
			combined_pvalue = Statistics2.chi2_x(sample_length *2, -2*sum)	
			regionAttributes_array[i] << combined_pvalue
		elsif scoring_system == 'stouffer'
			sum = associations.inject(0){|s,x| s + x}
			combined_z_score = sum/Math.sqrt(sample_length)
			regionAttributes_array[i] << combined_z_score
		elsif scoring_system == 'geommean'
			#NOTE: if troubles with number size, use BigDecimal
			geommean_mult = associations.inject(1){|s,x| s * x}
			geommean_association = geommean_mult.to_f ** ( sample_length ** -1 )
			regionAttributes_array[i] << geommean_association
		elsif scoring_system == 'median'
			median_value = associations.length / 2
			if median_value % 2 == 0
				median_up = associations.sort[median_value]
				median_down = associations.sort[median_value - 1]
				pair_median = ( median_up + median_down ) / 2
				median_association = associations.sort[pair_median.ceil]
			else
				median_association = associations.sort[median_value.ceil]
			end
			regionAttributes_array[i] << median_association
		elsif scoring_system == 'maxnum'
			max_association = associations.max
			regionAttributes_array[i] << max_association
		elsif scoring_system == 'minnum'
			min_association = associations.min
			regionAttributes_array[i] << min_association
		else 
			abort("Invalid ranking method: #{scoring_system}")
		end
	end
	if scoring_system == 'mean' || 
		scoring_system == 'geommean' ||
		scoring_system == 'maxnum' ||
		scoring_system == 'minnum'
		regionAttributes.select!{|regionID, attributes| attributes.last >= pvalue_cutoff}
	elsif scoring_system == 'fisher'
		regionAttributes.select!{|regionID, attributes| attributes.last <= pvalue_cutoff}
	end
	#Combined p-value: less value equals better association -> not due randomly.
end

#search4HPO(info2predict, trainingData) ⇒ `Object`

require ‘bigdecimal’

# File 'lib/pets/phen2reg_methods.rb', line 6

def search4HPO(info2predict, trainingData)
	#search if there are profile HPOs within the association file 
	hpo_regions = {}
	info2predict.each do |hpo|
		regions = trainingData[hpo]
		if !regions.nil?
			hpo_regions[hpo] = regions
		end
	end
	return hpo_regions
end

#system_call(code_folder, script, args_string) ⇒ `Object`

# File 'lib/pets/generalMethods.rb', line 8

def system_call(code_folder, script, args_string)
	cmd = File.join(code_folder, script) + ' ' + args_string
	puts "==> #{cmd}"
	Benchmark.bm do |x|
		x.report{	system(cmd) }
	end
end

#translate_codes(clusters, hpo) ⇒ `Object`

# File 'lib/pets/coPatReporterMethods.rb', line 2

def translate_codes(clusters, hpo)
  translated_clusters = []
  clusters.each do |clusterID, num_of_pats, patientIDs_ary, patient_hpos_ary|
        translate_codes = patient_hpos_ary.map{|patient_hpos| patient_hpos.map{|hpo_code| hpo.translate_id(hpo_code)}}
        translated_clusters << [clusterID, 
          num_of_pats, 
          patientIDs_ary, 
          patient_hpos_ary, 
          translate_codes
        ]
  end
  return translated_clusters
end

#translate_hpos_in_results(results, hpo) ⇒ `Object`

# File 'lib/pets/reg2phen_methods.rb', line 46

def translate_hpos_in_results(results, hpo)
  results.each do |coords, data|
    data.each do |info|
      # hpo_name, rejected = hpo.translate_codes2names([info[3]])
      info[3] = hpo.translate_id(info[3])
      # info[3] = hpo_name.first
    end
  end
end

#write_array(array, file_path) ⇒ `Object`

# File 'lib/pets/io.rb', line 41

def write_array(array, file_path)
  File.open(file_path, 'w') do |handler|
    array.each do |record|
      if record.class == String
        line = record
      else
        line = record.join("\t")
      end
      handler.puts line  
    end
  end
end

#write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name) ⇒ `Object`

# File 'lib/pets/io.rb', line 128

def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
  File.open(filename, 'w') do |f|
    f.puts "#{x_axis_name}\t#{y_axis_name}"
    x_axis_value.each_with_index do |value,i|
      y_value = y_axis_value[i]
      raise("The #{i} position is not presented in y_axis_value") if y_value.nil?
      f.puts [value, y_value].join("\t")
    end
  end
end

#write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit) ⇒ `Object`

# File 'lib/pets/io.rb', line 76

def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
  File.open(cluster_chromosome_data_file, 'w') do |f|
    f.puts %w[cluster_id chr count].join("\t")
    index = 0
    last_id = cluster_data.first.first unless cluster_data.empty?
    cluster_data.each do |cluster_id, patient_number, chr, count|
      index += 1 if cluster_id != last_id 
      break if index == limit
      f.puts ["#{patient_number}_#{index}", chr, count].join("\t")
      last_id = cluster_id
    end
  end
end

#write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit) ⇒ `Object`

# File 'lib/pets/io.rb', line 63

def write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit)
  File.open(cluster_ic_data_file, 'w') do |f|
    f.puts %w[cluster_id ic Plen].join("\t")
    all_ics.each_with_index do |cluster_ics, i|
      break if i == limit
      cluster_length = cluster_ics.length
      cluster_ics.each_with_index do |clust_ic, j|
        f.puts "#{cluster_length}_#{i}\t#{clust_ic}\t#{profile_lengths[i][j]}"
      end
    end
  end
end

#write_compressed_plain_file(data, path) ⇒ `Object`

# File 'lib/pets/io.rb', line 460

def write_compressed_plain_file(data, path)
  File.open(path, 'w') do |f|
    gz = Zlib::GzipWriter.new(f)
    gz.write data.to_json
    gz.close
  end
end

#write_coverage_data(coverage_to_plot, coverage_to_plot_file) ⇒ `Object`

# File 'lib/pets/io.rb', line 90

def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
  File.open(coverage_to_plot_file, 'w') do |f|
    coverage_to_plot.each do |chr, position, freq|
     f.puts "#{chr}\t#{position}\t#{freq}"
   end
  end
end

#write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats) ⇒ `Object`

# File 'lib/pets/io.rb', line 99

def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
  CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
    suggested_childs.each do |pat_id, suggestions|
      warning = nil
      warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
      csv << ["PATIENT #{pat_id}", "#{warning}"]
      csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
      suggestions.each do |parent, childs|
        parent_code, parent_name = parent
        if childs.empty?
          csv << ["#{parent_name} (#{parent_code})", '-']
        else
          parent_writed = false
          childs.each do |child_code, child_name|
            if !parent_writed
              parent_field = "#{parent_name} (#{parent_code})"
              parent_writed = true
            else
              parent_field = ""
            end
            csv << [parent_field, "#{child_name} (#{child_code})"]
          end
        end
      end
      csv << ["", ""]
    end
  end
end

#write_hash(hash, file_path, header = []) ⇒ `Object`

# File 'lib/pets/io.rb', line 32

def write_hash(hash, file_path, header = [])
  File.open(file_path, 'w') do |handler|
    handler.puts header.join("\t") if !header.empty?
    hash.each do |key, array|
      handler.puts "#{key}\t#{array.join("\t")}"
    end
  end
end

#write_matrix_for_R(matrix, x_names, y_names, file) ⇒ `Object`

# File 'lib/pets/io.rb', line 54

def write_matrix_for_R(matrix, x_names, y_names, file)
  File.open(file, 'w') do |f|
    f.puts x_names.join("\t")
    matrix.each_with_index do |row, i|
      f.puts [y_names[i]].concat(row).join("\t")
    end
  end
end

#write_patient_hpo_stat(average_hp_per_pat_distribution, output_file) ⇒ `Object`

# File 'lib/pets/io.rb', line 158

def write_patient_hpo_stat(average_hp_per_pat_distribution, output_file)
  File.open(output_file, 'w') do |f|
    f.puts "#{'PatientsNumber'}\t#{'HPOAverage'}"
    average_hp_per_pat_distribution.each do |patient_num, ave|
      f.puts "#{patient_num}\t#{ave}"
    end
  end
end

#write_profile_pairs(similarity_pairs, filename) ⇒ `Object`

# File 'lib/pets/io.rb', line 148

def write_profile_pairs(similarity_pairs, filename)
  File.open(filename, 'w') do |f|
    similarity_pairs.each do |pairsA, pairsB_and_values|
      pairsB_and_values.each do |pairsB, values|
        f.puts "#{pairsA}\t#{pairsB}\t#{values}"
      end
    end
  end
end

#write_similarity_matrix(similarity_matrix, similarity_matrix_file) ⇒ `Object`

# File 'lib/pets/io.rb', line 140

def write_similarity_matrix(similarity_matrix, similarity_matrix_file)  
  File.open(similarity_matrix_file, 'w') do |f|
    similarity_matrix.each do |row|
      f.puts row.join("\t")
    end
  end
end

Top Level Namespace

Defined Under Namespace

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#add_parentals_of_not_found_hpos_in_regions(patient_hpo_profile, trainingData, region2hpo, regionAttributes, association_scores, hpo_metadata) ⇒ Object

#add_record(hash, key, record, uniq = false) ⇒ Object

#binom(n, k) ⇒ Object

#calculate_coverage(regions_data, delete_thresold = 0) ⇒ Object

#calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, min_hpo_recovery_percentage, patient_number) ⇒ Object

#compute_hyper_prob(a, b, c, d, n) ⇒ Object

#compute_IC_values(patient_data, total_patients) ⇒ Object

#compute_pathway_enrichment(genes_clusters, genes_with_kegg) ⇒ Object

#coor_overlap?(ref_start, ref_stop, start, stop) ⇒ Boolean

#download(ftp_server, path, name) ⇒ Object

#dummy_cluster_patients(patient_data, matrix_file, clust_pat_file) ⇒ Object

#generate_gene_locations(gene_location) ⇒ Object

#generate_genes_dictionary(gene_location, genes_with_kegg) ⇒ Object

#generate_hpo_region_matrix(region2hpo, association_scores, info2predict, null_value = 0) ⇒ Object

#get_and_parse_external_data(all_paths) ⇒ Object

#get_cluster_metadata(clusters_info) ⇒ Object

#get_detailed_similarity(profile, candidates, evidences, hpo) ⇒ Object

#get_final_coverage(raw_coverage, bin_size) ⇒ Object

#get_genes_by_coordinates(genes_dictionary, chr, start, stop) ⇒ Object

#get_mean_size(all_sizes) ⇒ Object

#get_profile_ic(hpo_names, phenotype_ic) ⇒ Object

#get_region_with_parental_hpo(hpo, regionID, trainingData, hpo_metadata) ⇒ Object

#get_semantic_similarity_clustering(options, patient_data, temp_folder) ⇒ Object

#get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit, other_scores = {}, id2label = {}) ⇒ Object

#get_sor_length_distribution(raw_coverage) ⇒ Object

#get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos) ⇒ Object

#get_top_dummy_clusters_stats(top_clust_phen) ⇒ Object

#group_by_region(hpo_regions) ⇒ Object

#hpo_quality_control(prediction_data, hpos_ci_values, hpo) ⇒ Object

#invert_nested_hash(h) ⇒ Object

#join_regions(regions) ⇒ Object

#load_biosystem2gene_dictionary(biosystems_gene_path) ⇒ Object

#load_clustered_patients(file) ⇒ Object

#load_coordinates(file_path) ⇒ Object

#load_evidence_profiles(file_path, hpo) ⇒ Object

#load_evidences(evidences_path, hpo) ⇒ Object

#load_gene_data(gene_data_path) ⇒ Object

#load_hpo_ci_values(information_coefficient_file) ⇒ Object

#load_hpo_ontology(hpo_file, excluded_hpo_file) ⇒ Object

#load_profiles(file_path, hpo) ⇒ Object

#load_tabular_vars(path) ⇒ Object

#load_training_file4HPO(training_file, thresold = 0) ⇒ Object

#load_training_file4regions(training_file) ⇒ Object

#load_variants(variant_folder) ⇒ Object

#load_vcf(path, ext) ⇒ Object

#loadBiosistemsInfo(biosystems_info_path, filterDB) ⇒ Object

#loadFile(file, thresold = 0) ⇒ Object

#merge_genes_with_kegg_data(gene_list, kegg_data) ⇒ Object

#parse_clusters_file(clusters_file, patient_data) ⇒ Object

#parse_kegg_data(query_genes) ⇒ Object

#parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path) ⇒ Object

#parse_patient_results(results) ⇒ Object

#predict_patient(predictions, training_set, threshold, transform, genes, genes_dictionary) ⇒ Object

#process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters) ⇒ Object

#process_dummy_clustered_patients(options, clustered_patients, patient_data, phenotype_ic) ⇒ Object

#read_compressed_json(path) ⇒ Object

#read_excluded_hpo_file(file) ⇒ Object

#remove_nested_entries(nested_hash) ⇒ Object

#report_data(container, html_file) ⇒ Object

#save_patient_matrix(output, patient_hpo_profile, regionAttributes, hpo_region_matrix) ⇒ Object

#scoring_regions(regionAttributes, hpo_region_matrix, scoring_system, pvalue_cutoff, freedom_degree, null_value = 0) ⇒ Object

#search4HPO(info2predict, trainingData) ⇒ Object

#system_call(code_folder, script, args_string) ⇒ Object

#translate_codes(clusters, hpo) ⇒ Object

#translate_hpos_in_results(results, hpo) ⇒ Object

#write_array(array, file_path) ⇒ Object

#write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name) ⇒ Object

#write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit) ⇒ Object

#write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit) ⇒ Object

#write_compressed_plain_file(data, path) ⇒ Object

#write_coverage_data(coverage_to_plot, coverage_to_plot_file) ⇒ Object

#write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats) ⇒ Object

#write_hash(hash, file_path, header = []) ⇒ Object

#write_matrix_for_R(matrix, x_names, y_names, file) ⇒ Object

#write_patient_hpo_stat(average_hp_per_pat_distribution, output_file) ⇒ Object