Top Level Namespace
Defined Under Namespace
Modules: Pets Classes: Cohort, Cohort_Parser, Genomic_Feature, Reference_parser
Constant Summary collapse
- COMMON_OPTPARSE =
Requires the ROOT_PATH constant to be defined in the file requiring this file
File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets', 'common_optparse.rb'))
- REPORT_FOLDER =
File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
- EXTERNAL_DATA =
File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
- EXTERNAL_CODE =
File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
- HPO_FILE =
File.join(EXTERNAL_DATA, 'hp.json')
- MONDO_FILE =
File.join(EXTERNAL_DATA, 'mondo.obo')
- IC_FILE =
File.join(EXTERNAL_DATA, 'uniq_hpo_with_CI.txt')
Instance Method Summary collapse
- #add_parentals_of_not_found_hpos_in_regions(patient_hpo_profile, trainingData, region2hpo, regionAttributes, association_scores, hpo_metadata) ⇒ Object
- #add_record(hash, key, record, uniq = false) ⇒ Object
- #binom(n, k) ⇒ Object
- #calculate_coverage(regions_data, delete_thresold = 0) ⇒ Object
- #calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, min_hpo_recovery_percentage, patient_number) ⇒ Object
- #compute_hyper_prob(a, b, c, d, n) ⇒ Object
- #compute_IC_values(patient_data, total_patients) ⇒ Object
- #compute_pathway_enrichment(genes_clusters, genes_with_kegg) ⇒ Object
-
#coor_overlap?(ref_start, ref_stop, start, stop) ⇒ Boolean
Common methods for predictors Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5 1.
- #download(ftp_server, path, name) ⇒ Object
- #dummy_cluster_patients(patient_data, matrix_file, clust_pat_file) ⇒ Object
- #generate_gene_locations(gene_location) ⇒ Object
- #generate_genes_dictionary(gene_location, genes_with_kegg) ⇒ Object
- #generate_hpo_region_matrix(region2hpo, association_scores, info2predict, null_value = 0) ⇒ Object
- #get_and_parse_external_data(all_paths) ⇒ Object
- #get_cluster_metadata(clusters_info) ⇒ Object
- #get_detailed_similarity(profile, candidates, evidences, hpo) ⇒ Object
- #get_final_coverage(raw_coverage, bin_size) ⇒ Object
- #get_genes_by_coordinates(genes_dictionary, chr, start, stop) ⇒ Object
- #get_mean_size(all_sizes) ⇒ Object
- #get_profile_ic(hpo_names, phenotype_ic) ⇒ Object
- #get_region_with_parental_hpo(hpo, regionID, trainingData, hpo_metadata) ⇒ Object
- #get_semantic_similarity_clustering(options, patient_data, temp_folder) ⇒ Object
- #get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit, other_scores = {}, id2label = {}) ⇒ Object
- #get_sor_length_distribution(raw_coverage) ⇒ Object
- #get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos) ⇒ Object
- #get_top_dummy_clusters_stats(top_clust_phen) ⇒ Object
- #group_by_region(hpo_regions) ⇒ Object
-
#hpo_quality_control(prediction_data, hpos_ci_values, hpo) ⇒ Object
def hpo_quality_control(prediction_data, hpo_metadata_file, information_coefficient_file).
- #invert_nested_hash(h) ⇒ Object
- #join_regions(regions) ⇒ Object
- #load_biosystem2gene_dictionary(biosystems_gene_path) ⇒ Object
- #load_clustered_patients(file) ⇒ Object
- #load_coordinates(file_path) ⇒ Object
- #load_evidence_profiles(file_path, hpo) ⇒ Object
- #load_evidences(evidences_path, hpo) ⇒ Object
- #load_gene_data(gene_data_path) ⇒ Object
- #load_hpo_ci_values(information_coefficient_file) ⇒ Object
- #load_hpo_ontology(hpo_file, excluded_hpo_file) ⇒ Object
- #load_profiles(file_path, hpo) ⇒ Object
- #load_tabular_vars(path) ⇒ Object
-
#load_training_file4HPO(training_file, thresold = 0) ⇒ Object
2.
- #load_training_file4regions(training_file) ⇒ Object
- #load_variants(variant_folder) ⇒ Object
-
#load_vcf(path, ext) ⇒ Object
Some compressed files are fragmented internally.
- #loadBiosistemsInfo(biosystems_info_path, filterDB) ⇒ Object
-
#loadFile(file, thresold = 0) ⇒ Object
3.
- #merge_genes_with_kegg_data(gene_list, kegg_data) ⇒ Object
- #parse_clusters_file(clusters_file, patient_data) ⇒ Object
- #parse_kegg_data(query_genes) ⇒ Object
- #parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path) ⇒ Object
- #parse_patient_results(results) ⇒ Object
- #predict_patient(predictions, training_set, threshold, transform, genes, genes_dictionary) ⇒ Object
- #process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters) ⇒ Object
-
#process_dummy_clustered_patients(options, clustered_patients, patient_data, phenotype_ic) ⇒ Object
get ic and chromosomes.
- #read_compressed_json(path) ⇒ Object
- #read_excluded_hpo_file(file) ⇒ Object
- #remove_nested_entries(nested_hash) ⇒ Object
- #report_data(container, html_file) ⇒ Object
- #save_patient_matrix(output, patient_hpo_profile, regionAttributes, hpo_region_matrix) ⇒ Object
- #scoring_regions(regionAttributes, hpo_region_matrix, scoring_system, pvalue_cutoff, freedom_degree, null_value = 0) ⇒ Object
-
#search4HPO(info2predict, trainingData) ⇒ Object
require ‘bigdecimal’.
- #system_call(code_folder, script, args_string) ⇒ Object
- #translate_codes(clusters, hpo) ⇒ Object
- #translate_hpos_in_results(results, hpo) ⇒ Object
- #write_array(array, file_path) ⇒ Object
- #write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name) ⇒ Object
- #write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit) ⇒ Object
- #write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit) ⇒ Object
- #write_compressed_plain_file(data, path) ⇒ Object
- #write_coverage_data(coverage_to_plot, coverage_to_plot_file) ⇒ Object
- #write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats) ⇒ Object
- #write_hash(hash, file_path, header = []) ⇒ Object
- #write_matrix_for_R(matrix, x_names, y_names, file) ⇒ Object
- #write_patient_hpo_stat(average_hp_per_pat_distribution, output_file) ⇒ Object
- #write_profile_pairs(similarity_pairs, filename) ⇒ Object
- #write_similarity_matrix(similarity_matrix, similarity_matrix_file) ⇒ Object
Instance Method Details
#add_parentals_of_not_found_hpos_in_regions(patient_hpo_profile, trainingData, region2hpo, regionAttributes, association_scores, hpo_metadata) ⇒ Object
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
# File 'lib/pets/phen2reg_methods.rb', line 48
# For each region, find parental HPO terms for the patient's HPO codes that were
# not directly associated with the region, then extend the region's HPO list,
# its association scores and the patient profile with those parental terms.
# NOTE(review): the doc extractor stripped the last parameter name; restored as
# +hpo_metadata+ from the method index (hpo_code => [phenotype, relations]).
# @param patient_hpo_profile [Array<String>] patient HPO codes (mutated: new parentals appended)
# @param trainingData [Hash] hpo_code => list of region records (score is the last field)
# @param region2hpo [Hash] regionID => HPO codes present in the region (mutated)
# @param regionAttributes [Hash] regionID => region attributes (unused here; kept for interface compatibility)
# @param association_scores [Hash] regionID => {hpo_code => score} (mutated)
# @param hpo_metadata [Hash] hpo_code => [phenotype, relations]
# @return [Array<String>] the updated patient profile
def add_parentals_of_not_found_hpos_in_regions(patient_hpo_profile, trainingData, region2hpo, regionAttributes, association_scores, hpo_metadata)
  new_hpos = []
  region2hpo.each do |regionID, hpos|
    hpos_not_found = patient_hpo_profile - hpos
    parental_hpos = []
    hpo_scores = {}
    hpos_not_found.each do |hpo|
      region, parental_hpo = get_region_with_parental_hpo(hpo, regionID, trainingData, hpo_metadata)
      if !region.nil? &&
         !parental_hpos.include?(parental_hpo) &&
         !patient_hpo_profile.include?(parental_hpo)
        parental_hpos << parental_hpo
        hpo_scores[parental_hpo] = region.last # last field of the region record is its score
      end
    end
    hpos.concat(parental_hpos)
    new_hpos.concat(parental_hpos)
    association_scores[regionID].merge!(hpo_scores)
  end
  patient_hpo_profile.concat(new_hpos.uniq)
end
#add_record(hash, key, record, uniq = false) ⇒ Object
16 17 18 19 20 21 22 23 24 25 |
# File 'lib/pets/generalMethods.rb', line 16
# Append +record+ to the array stored under +key+ in +hash+, creating the array
# on first use. Used as the generic accumulator helper throughout the library.
# @param hash [Hash] accumulator mapping keys to arrays of records (mutated)
# @param key [Object] key under which the record is stored
# @param record [Object] value to append
# @param uniq [Boolean] when true, records already present for the key are skipped
def add_record(hash, key, record, uniq = false)
  entry = hash[key]
  if entry.nil?
    hash[key] = [record]          # first record for this key
  elsif !uniq                     # repeated entries allowed
    entry << record
  elsif !entry.include?(record)   # uniq mode: append only unseen records
    entry << record
  end
end
#binom(n, k) ⇒ Object
174 175 176 177 178 179 180 |
# File 'lib/pets/generalMethods.rb', line 174
# Binomial coefficient C(n, k) using exact integer arithmetic.
# The product of k consecutive integers is always divisible by k!, so the
# integer division below is exact. For k <= 0 or k >= n the result is 1
# (matches the original behaviour; callers pass non-degenerate values).
# @param n [Integer]
# @param k [Integer]
# @return [Integer] C(n, k)
def binom(n, k)
  if k > 0 && k < n
    (1 + n - k..n).inject(:*) / (1..k).inject(:*)
  else
    1
  end
end
#calculate_coverage(regions_data, delete_thresold = 0) ⇒ Object
138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
# File 'lib/pets/coPatReporterMethods.rb', line 138
# Build a per-chromosome coverage table from region records and compute summary
# figures. Regions whose patient count is at or below +delete_thresold+ are kept
# in the table with a count of 0 and excluded from the summary figures.
# @param regions_data [Array<Array>] records [start, stop, chr, reg_id] where the
#   last dot-separated field of reg_id encodes the number of patients
# @param delete_thresold [Integer] minimum patient count for a region to count
#   (spelling kept for interface compatibility)
# @return [Array] raw_coverage (chr => [[start, stop, n_patients], ...]),
#   number of retained regions, total covered nucleotides, mean patients per region
def calculate_coverage(regions_data, delete_thresold = 0)
  raw_coverage = {}
  n_regions = 0
  patients = 0
  nt = 0
  regions_data.each do |start, stop, chr, reg_id|
    number_of_patients = reg_id.split('.').last.to_i
    if number_of_patients <= delete_thresold
      number_of_patients = 0 # region dropped from stats but kept in the table
    else
      n_regions += 1
      nt += stop - start
    end
    add_record(raw_coverage, chr, [start, stop, number_of_patients])
    patients += number_of_patients
  end
  return raw_coverage, n_regions, nt, patients.fdiv(n_regions)
end
#calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, min_hpo_recovery_percentage, patient_number) ⇒ Object
262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 |
# File 'lib/pets/phen2reg_methods.rb', line 262
# Compute, for each predicted region, the percentage of the patient's original
# HPO terms recovered by the region, record it per patient, and remove regions
# whose recovery falls below +min_hpo_recovery_percentage+.
# @param adjacent_regions_joined [Array<Array>] records
#   [chr, start, stop, hpo_list, association_values, score] (mutated: low-recovery records deleted)
# @param patient_original_phenotypes [Array<String>] the patient's HPO codes
# @param predicted_hpo_percentage [Hash] patient_number => recovery percentages (mutated)
# @param min_hpo_recovery_percentage [Numeric] minimum recovery kept
# @param patient_number [Object] key under which percentages are recorded
def calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, min_hpo_recovery_percentage, patient_number)
  records_to_delete = []
  counter = 0
  adjacent_regions_joined.each do |chr, start, stop, hpo_list, association_values, score|
    hpo_coincidences = patient_original_phenotypes & hpo_list
    original_hpo_recovery_percentage = hpo_coincidences.length / patient_original_phenotypes.length.to_f * 100
    records_to_delete << counter if original_hpo_recovery_percentage < min_hpo_recovery_percentage
    # Percentages are recorded for ALL regions, including those later removed.
    query = predicted_hpo_percentage[patient_number]
    if query.nil?
      predicted_hpo_percentage[patient_number] = [original_hpo_recovery_percentage]
    else
      query << original_hpo_recovery_percentage
    end
    counter += 1
  end
  # Delete from the end so the recorded indices stay valid.
  records_to_delete.reverse_each do |record_number|
    adjacent_regions_joined.delete_at(record_number)
  end
end
#compute_hyper_prob(a, b, c, d, n) ⇒ Object
167 168 169 170 171 172 |
# File 'lib/pets/generalMethods.rb', line 167
# Probability of one 2x2 contingency table under the hypergeometric
# distribution (one term of Fisher's exact test):
#   C(a+b, a) * C(c+d, c) / C(n, a+c)
# @param a,b,c,d [Integer] cell counts of the 2x2 table
# @param n [Integer] grand total
# @return [Float] table probability
def compute_hyper_prob(a, b, c, d, n)
  binomA = binom(a + b, a)
  binomC = binom(c + d, c)
  divisor = binom(n, a + c)
  return (binomA * binomC).fdiv(divisor)
end
#compute_IC_values(patient_data, total_patients) ⇒ Object
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
# File 'lib/pets/generalMethods.rb', line 28
# Compute an information-content value per HPO term: -log10(fraction of
# patients annotated with the term). Patient IDs are of the form
# "<patient>_i<n>"; only the first record of each patient is counted.
# NOTE(review): the doc extractor stripped the record variable from this
# method; restored assuming each value is [hpos, chr, start, stop], which is
# the destructuring visible in the original — confirm against
# lib/pets/generalMethods.rb.
# @param patient_data [Hash] patient_ID => [hpos, chr, start, stop]
# @param total_patients [Integer] cohort size
# @return [Hash] hpo_code => IC value
def compute_IC_values(patient_data, total_patients)
  patients_per_hpo = Hash.new(0)
  last_patient_ID = ''
  patient_data.each do |patient_ID, patient_record|
    patient, count = patient_ID.split('_i')
    if patient != last_patient_ID # count each patient once (records are grouped)
      hpos, chr, start, stop = patient_record
      hpos.each do |h|
        patients_per_hpo[h] += 1
      end
    end
    last_patient_ID = patient
  end
  patients_per_hpo.each do |hpo, patient_number|
    patients_per_hpo[hpo] = -Math.log10(patient_number.fdiv(total_patients))
  end
  return patients_per_hpo
end
#compute_pathway_enrichment(genes_clusters, genes_with_kegg) ⇒ Object
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
# File 'lib/pets/generalMethods.rb', line 109
# Fisher's exact test (one-sided accumulation of hypergeometric terms) for
# pathway enrichment among predicted gene clusters.
# Fisher => http://www.biostathandbook.com/fishers.html
# @param genes_clusters [Array<Hash>] clusters of geneID => [geneName, description, pathways, geneSyns]
# @param genes_with_kegg [Hash] geneID => annotation whose third field is the pathway list
# @return [Array<Array>] per pathway: [pathway, predicted_genes, contingency_counts, accumulated_prob]
def compute_pathway_enrichment(genes_clusters, genes_with_kegg)
  pathways_genes_in_predictions = {}
  genes_in_predictions = []
  genes_clusters.each do |cluster|
    cluster.each do |geneID, data|
      geneName, description, pathways, geneSyns = data
      pathways.each do |pathway|
        query = pathways_genes_in_predictions[pathway]
        if query.nil?
          pathways_genes_in_predictions[pathway] = [geneID]
        else
          query << geneID if !query.include?(geneID)
        end
      end
      genes_in_predictions << geneID if !genes_in_predictions.include?(geneID)
    end
  end
  genes_out_of_predictions = genes_with_kegg.keys - genes_in_predictions
  gene_number = genes_with_kegg.length
  stats = []
  pathways_genes_in_predictions.each do |pathway, pathway_predicted_genes|
    pathway_id, pathway_name = pathway
    no_pathway_predicted_genes = genes_in_predictions - pathway_predicted_genes
    pathway_no_predicted_genes_count = 0
    no_pathway_no_predicted_genes_count = 0
    genes_out_of_predictions.each do |geneID|
      query = genes_with_kegg[geneID]
      if query[2].map { |pathway_info| pathway_info.first }.include?(pathway_id)
        pathway_no_predicted_genes_count += 1
      else
        no_pathway_no_predicted_genes_count += 1
      end
    end
    no_pathway_predicted_genes_count = no_pathway_predicted_genes.length
    pathway_predicted_genes_count = pathway_predicted_genes.length
    accumulated_prob = 0
    pathway_no_predicted_genes_count.times do |n|
      no_pathway_predicted_genes_count_shifted = no_pathway_predicted_genes_count - n
      pathway_predicted_genes_count_shifted = pathway_predicted_genes_count - n
      if no_pathway_predicted_genes_count_shifted >= 0 && pathway_predicted_genes_count_shifted >= 0
        accumulated_prob += compute_hyper_prob(
          n,
          no_pathway_predicted_genes_count_shifted,
          pathway_predicted_genes_count_shifted,
          no_pathway_no_predicted_genes_count + n,
          gene_number
        )
      else
        break # counts exhausted; remaining terms are invalid
      end
    end
    contigency = [pathway_no_predicted_genes_count, no_pathway_predicted_genes_count,
                  pathway_predicted_genes_count, no_pathway_no_predicted_genes_count]
    stats << [pathway, pathway_predicted_genes, contigency, accumulated_prob]
  end
  return stats
end
#coor_overlap?(ref_start, ref_stop, start, stop) ⇒ Boolean
Common methods for predictors Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5
-
Indexing by chr (region)
295 296 297 298 299 300 301 302 303 304 |
# File 'lib/pets/io.rb', line 295
# True when [start, stop] overlaps the reference interval [ref_start, ref_stop]:
# either endpoint falls inside the reference, or one interval contains the other.
# @param ref_start [Numeric] reference interval start
# @param ref_stop [Numeric] reference interval stop
# @param start [Numeric] query interval start
# @param stop [Numeric] query interval stop
# @return [Boolean]
def coor_overlap?(ref_start, ref_stop, start, stop)
  overlap = false
  if (stop > ref_start && stop <= ref_stop) ||   # query ends inside reference
     (start >= ref_start && start < ref_stop) || # query starts inside reference
     (start <= ref_start && stop >= ref_stop) || # query contains reference
     (start > ref_start && stop < ref_stop)      # reference contains query
    overlap = true
  end
  return overlap
end
#download(ftp_server, path, name) ⇒ Object
475 476 477 478 479 480 481 |
# File 'lib/pets/io.rb', line 475
# Download a file from an FTP server in binary mode.
# Fixed: the connection is now closed even when the transfer raises
# (the original leaked the FTP handle on error).
# @param ftp_server [String] FTP host name
# @param path [String] remote file path
# @param name [String] local destination file name
def download(ftp_server, path, name)
  ftp = Net::FTP.new
  begin
    ftp.connect(ftp_server)
    ftp.login
    ftp.getbinaryfile(path, name)
  ensure
    ftp.close
  end
end
#dummy_cluster_patients(patient_data, matrix_file, clust_pat_file) ⇒ Object
87 88 89 90 91 92 93 94 95 96 97 |
# File 'lib/pets/coPatReporterMethods.rb', line 87
# Cluster patients by their binary patient x HPO matrix: build and save the
# matrix if missing, run the external get_clusters.R script if its output is
# missing, then load the resulting clusters.
# Fixed: File.exists? (alias removed in Ruby 3.2) replaced with File.exist?.
# @param patient_data [Object] cohort responding to #to_bmatrix
# @param matrix_file [String] path of the .npy matrix (with _x/_y .lst axis files)
# @param clust_pat_file [String] path of the clustering output
# @return [Object] result of load_clustered_patients(clust_pat_file)
def dummy_cluster_patients(patient_data, matrix_file, clust_pat_file)
  if !File.exist?(matrix_file)
    pat_hpo_matrix, pat_id, hp_id = patient_data.to_bmatrix
    x_axis_file = matrix_file.gsub('.npy', '_x.lst')
    y_axis_file = matrix_file.gsub('.npy', '_y.lst')
    pat_hpo_matrix.save(matrix_file, hp_id, x_axis_file, pat_id, y_axis_file)
  end
  system_call(EXTERNAL_CODE, 'get_clusters.R', "-d #{matrix_file} -o #{clust_pat_file} -y #{matrix_file.gsub('.npy','')}") if !File.exist?(clust_pat_file)
  clustered_patients = load_clustered_patients(clust_pat_file)
  return(clustered_patients)
end
#generate_gene_locations(gene_location) ⇒ Object
56 57 58 59 60 61 62 63 64 |
# File 'lib/pets/reg2phen_methods.rb', line 56
# Flatten a per-chromosome gene table into a geneID => location lookup.
# @param gene_location [Hash] chr => [[geneID, geneStart, geneStop], ...]
# @return [Hash] geneID => [chr, geneStart, geneStop]
def generate_gene_locations(gene_location)
  gene_locations = {}
  gene_location.each do |chr, genesInfo|
    genesInfo.each do |geneID, geneStart, geneStop|
      gene_locations[geneID] = [chr, geneStart, geneStop]
    end
  end
  return gene_locations
end
#generate_genes_dictionary(gene_location, genes_with_kegg) ⇒ Object
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/pets/reg2phen_methods.rb', line 66
# Build a symbol => location dictionary for annotated genes. Each gene's main
# name maps to its location with a trailing +false+ flag; each synonym maps to
# the same location with +true+ (marking it as a synonym).
# @param gene_location [Hash] chr => [[geneID, geneStart, geneStop], ...]
# @param genes_with_kegg [Hash] geneID => [geneName, description, pathways, geneSyns]
# @return [Hash] symbol => [chr, start, stop, is_gene_synonym]
def generate_genes_dictionary(gene_location, genes_with_kegg)
  reestructured_gene_locations = generate_gene_locations(gene_location)
  genes_dictionary = {}
  genes_with_kegg.each do |geneID, annotInfo|
    gene_location_data = reestructured_gene_locations[geneID]
    unless gene_location_data.nil? # skip genes without a known location
      geneName, description, pathways, geneSyns = annotInfo
      genes_dictionary[geneName] = gene_location_data.dup.concat([false])
      geneSyns.each do |gene_symbol|
        genes_dictionary[gene_symbol] = gene_location_data.dup.concat([true])
      end
    end
  end
  return genes_dictionary
end
#generate_hpo_region_matrix(region2hpo, association_scores, info2predict, null_value = 0) ⇒ Object
107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
# File 'lib/pets/phen2reg_methods.rb', line 107
# Build the region x HPO score matrix used for plotting/scoring: one row per
# region, one column per user HPO term; missing associations get +null_value+.
# @param region2hpo [Hash] regionID => HPO codes (only the keys are used here)
# @param association_scores [Hash] regionID => {hpo_code => score}
# @param info2predict [Array<String>] the user's HPO codes (column order)
# @param null_value [Numeric] value used when a region lacks the HPO association
# @return [Array<Array<Numeric>>] the score matrix
def generate_hpo_region_matrix(region2hpo, association_scores, info2predict, null_value = 0)
  hpo_region_matrix = []
  region2hpo.each do |regionID, hpos_list|
    row = []
    info2predict.each do |user_hpo|
      value = association_scores[regionID][user_hpo]
      if value.nil?
        row << null_value
      else
        row << value
      end
    end
    hpo_region_matrix << row
  end
  return hpo_region_matrix
end
#get_and_parse_external_data(all_paths) ⇒ Object
185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
# File 'lib/pets/generalMethods.rb', line 185
# Ensure the NCBI source files are present (downloading any that are missing),
# then either build and cache the gene location / gene-with-pathway tables or
# load them from the compressed cache.
# Fixed: File.exists? (alias removed in Ruby 3.2) replaced with File.exist?.
# @param all_paths [Hash] keys :gene_data, :biosystems_gene, :biosystems_info,
#   :gene_data_with_pathways, :gene_location => local file paths
# @return [Array(Hash, Hash)] gene_location and genes_with_kegg tables
def get_and_parse_external_data(all_paths)
  sources = [
    ['ftp.ncbi.nlm.nih.gov', 'genomes/H_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz', all_paths[:gene_data]],
    ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/biosystems_gene.gz', all_paths[:biosystems_gene]],
    ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/bsid2info.gz', all_paths[:biosystems_info]]
  ]
  sources.each do |server, path, output|
    download(server, path, output) if !File.exist?(output)
  end
  genes_with_kegg = {}
  gene_location = {}
  if !File.exist?(all_paths[:gene_data_with_pathways]) || !File.exist?(all_paths[:gene_location])
    gene_list, gene_location = load_gene_data(all_paths[:gene_data])
    ### kegg_data = parse_kegg_data(genes_found_attributes.keys)
    kegg_data = parse_kegg_from_biosystems(all_paths[:biosystems_gene], all_paths[:biosystems_info])
    genes_with_kegg = merge_genes_with_kegg_data(gene_list, kegg_data)
    write_compressed_plain_file(genes_with_kegg, all_paths[:gene_data_with_pathways])
    write_compressed_plain_file(gene_location, all_paths[:gene_location])
  else
    gene_location = read_compressed_json(all_paths[:gene_location])
    genes_with_kegg = read_compressed_json(all_paths[:gene_data_with_pathways])
  end
  return gene_location, genes_with_kegg
end
#get_cluster_metadata(clusters_info) ⇒ Object
284 285 286 287 288 289 290 291 292 |
# File 'lib/pets/coPatReporterMethods.rb', line 284
# For each cluster, compute [number of patients, average HPO terms per patient].
# NOTE(review): the doc extractor stripped the method name (`def (clusters_info)`);
# restored as +get_cluster_metadata+ from the method index.
# @param clusters_info [Hash] cluster_id => {patient_id => hpo_terms}
# @return [Array<Array(Integer, Float)>] one [patients, avg_hpos] pair per cluster
def get_cluster_metadata(clusters_info)
  average_hp_per_pat_distribution = []
  clusters_info.each do |cl_id, pat_info|
    hp_per_pat_in_clust = pat_info.values.map { |a| a.length }
    hp_per_pat_ave = hp_per_pat_in_clust.sum.fdiv(hp_per_pat_in_clust.length)
    average_hp_per_pat_distribution << [pat_info.length, hp_per_pat_ave]
  end
  return average_hp_per_pat_distribution
end
#get_detailed_similarity(profile, candidates, evidences, hpo) ⇒ Object
211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 |
# File 'lib/pets/generalMethods.rb', line 211
# Build a term x candidate similarity matrix: for each candidate profile, all
# pairwise Lin similarities against the reference profile are computed, then
# term pairs are greedily matched best-first (each profile term and candidate
# term used at most once).
# @param profile [Array<String>] reference HPO term codes (row order)
# @param candidates [Array<Array>] [candidate_id, similarity] pairs (column order)
# @param evidences [Hash] candidate_id => candidate HPO profile
# @param hpo [Object] ontology object responding to #compare (project type)
# @return [Array<Array<Numeric>>] matrix[profile_index][candidate_index] = similarity
def get_detailed_similarity(profile, candidates, evidences, hpo)
  profile_length = profile.length
  matrix = []
  profile_length.times { matrix << Array.new(candidates.length, 0) }
  cand_number = 0
  candidates.each do |candidate_id, similarity|
    local_sim = []
    candidate_evidence = evidences[candidate_id]
    profile.each do |profile_term|
      candidate_evidence.each do |candidate_term|
        term_sim = hpo.compare([candidate_term], [profile_term], sim_type: :lin, bidirectional: false)
        local_sim << [profile_term, candidate_term, term_sim]
      end
    end
    local_sim.sort! { |s1, s2| s2.last <=> s1.last } # best pairs first
    final_pairs = []
    processed_profile_terms = []
    processed_candidate_terms = []
    local_sim.each do |pr_term, cd_term, sim|
      if !processed_profile_terms.include?(pr_term) && !processed_candidate_terms.include?(cd_term)
        final_pairs << [pr_term, cd_term, sim]
        processed_profile_terms << pr_term
        processed_candidate_terms << cd_term
      end
      break if profile_length == processed_profile_terms.length
    end
    final_pairs.each do |pr_term, cd_term, similarity|
      matrix[profile.index(pr_term)][cand_number] = similarity
    end
    cand_number += 1
  end
  return matrix
end
#get_final_coverage(raw_coverage, bin_size) ⇒ Object
110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
# File 'lib/pets/coPatReporterMethods.rb', line 110
# Discretize raw coverage intervals into fixed-size bins for plotting: each
# interval contributes its coverage value to every bin it spans (bins aligned
# to multiples of bin_size; the bin containing the stop coordinate is excluded).
# @param raw_coverage [Hash] chr => [[start, stop, coverage], ...]
# @param bin_size [Integer] bin width in nucleotides
# @return [Array<Array>] [chr, bin_start, coverage] records
def get_final_coverage(raw_coverage, bin_size)
  coverage_to_plot = []
  raw_coverage.each do |chr, coverages|
    coverages.each do |start, stop, coverage|
      bin_start = start - start % bin_size
      bin_stop = stop - stop % bin_size
      while bin_start < bin_stop
        coverage_to_plot << [chr, bin_start, coverage]
        bin_start += bin_size
      end
    end
  end
  return coverage_to_plot
end
#get_genes_by_coordinates(genes_dictionary, chr, start, stop) ⇒ Object
34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/pets/reg2phen_methods.rb', line 34
# Select the (non-synonym) gene symbols whose coordinates overlap the given
# genomic interval on the same chromosome.
# @param genes_dictionary [Hash] symbol => [chr, start, stop, is_gene_synonym]
# @param chr [String] chromosome
# @param start [Numeric] interval start
# @param stop [Numeric] interval stop
# @return [Array<String>] matching gene symbols
def get_genes_by_coordinates(genes_dictionary, chr, start, stop)
  genes_in_sor = genes_dictionary.select do |gene_sym, attributes|
    gene_chr, gene_start, gene_stop, is_GeneSynom = attributes
    if gene_chr == chr && !is_GeneSynom # synonyms excluded to avoid duplicates
      coor_overlap?(gene_start, gene_stop, start, stop)
    else
      false
    end
  end
  return genes_in_sor.keys
end
#get_mean_size(all_sizes) ⇒ Object
99 100 101 102 103 104 105 106 107 |
# File 'lib/pets/coPatReporterMethods.rb', line 99
# Weighted mean of sizes given [size, occurrences] pairs.
# @param all_sizes [Array<Array(Numeric, Integer)>] size/occurrence pairs
# @return [Float] mean size (NaN for empty input, as in the original)
def get_mean_size(all_sizes)
  accumulated_size = 0
  number = 0
  all_sizes.each do |size, occurrences|
    accumulated_size += size * occurrences
    number += occurrences
  end
  return accumulated_size.fdiv(number)
end
#get_profile_ic(hpo_names, phenotype_ic) ⇒ Object
60 61 62 63 64 65 66 67 68 69 70 71 |
# File 'lib/pets/coPatReporterMethods.rb', line 60
# Average information content of an HPO profile.
# @param hpo_names [Array<String>] HPO codes in the profile
# @param phenotype_ic [Hash] hpo_code => IC value
# @return [Float] mean IC (0 for an empty profile)
# @raise [RuntimeError] when a term is missing from the IC table
def get_profile_ic(hpo_names, phenotype_ic)
  ic = 0
  profile_length = 0
  hpo_names.each do |hpo_id|
    hpo_ic = phenotype_ic[hpo_id]
    raise("The term #{hpo_id} not exists in the given ic table") if hpo_ic.nil?
    ic += hpo_ic
    profile_length += 1
  end
  profile_length = 1 if profile_length == 0 # avoid division by zero
  return ic.fdiv(profile_length)
end
#get_region_with_parental_hpo(hpo, regionID, trainingData, hpo_metadata) ⇒ Object
77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
# File 'lib/pets/phen2reg_methods.rb', line 77
# Walk up the HPO hierarchy (breadth-first via each term's relations) until an
# ancestor term is found that has a training region with the requested regionID.
# NOTE(review): the doc extractor stripped the last parameter name and its use;
# restored as +hpo_metadata+ (hpo_code => [main_hpo_code, phenotype, relations])
# from the method index — confirm against lib/pets/phen2reg_methods.rb.
# @param hpo [String] starting HPO code
# @param regionID [String] target region identifier (field 3 of a region record)
# @param trainingData [Hash] hpo_code => list of region records
# @param hpo_metadata [Hash] hpo_code => [main_hpo_code, phenotype, relations]
# @return [Array(Array, String)] the matching region record and the ancestor HPO
#   code, or [nil, nil] when no ancestor matches
def get_region_with_parental_hpo(hpo, regionID, trainingData, hpo_metadata)
  region = nil
  final_hpo = nil
  hpos = [hpo]
  while !hpos.empty?
    temp = []
    hpos.each do |hp|
      hpo_data = hpo_metadata[hp]
      if !hpo_data.nil?
        main_hpo_code, phenotype, relations = hpo_data
        temp.concat(relations.map { |rel| rel.first }) # parent codes of this level
      end
    end
    temp.each do |temp_hpo|
      regions = trainingData[temp_hpo]
      if !regions.nil?
        final_reg = regions.select { |reg| reg[3] == regionID }
        if !final_reg.empty?
          region = final_reg.first
          final_hpo = temp_hpo
          temp = [] # stop the outer ancestor walk
          break
        end
      end
    end
    hpos = temp
  end
  return region, final_hpo
end
#get_semantic_similarity_clustering(options, patient_data, temp_folder) ⇒ Object
181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 |
# NOTE(review): this method was mangled by the documentation extractor — the
# first parameter (an options hash, read as `[:reference_profiles]`,
# `[:clustering_methods]`, `[:threads]`, `[:sim_thr]`, etc.) had its name
# stripped throughout, and the call `(clusters_info)` lost its receiver
# (the index lists it as get_cluster_metadata). The text below is therefore
# NOT valid Ruby as-is; recover the original from
# lib/pets/coPatReporterMethods.rb, line 181 before editing. It also uses
# File.exists?, which was removed in Ruby 3.2 and should be File.exist? in
# the real source.
# File 'lib/pets/coPatReporterMethods.rb', line 181 def get_semantic_similarity_clustering(, patient_data, temp_folder) template = File.open(File.join(REPORT_FOLDER, 'cluster_report.erb')).read hpo = Cohort.get_ontology(Cohort.act_ont) reference_profiles = nil reference_profiles = load_profiles([:reference_profiles], hpo) if ![:reference_profiles].nil? Parallel.each([:clustering_methods], in_processes: [:threads] ) do |method_name| matrix_filename = File.join(temp_folder, "similarity_matrix_#{method_name}.npy") profiles_similarity_filename = File.join(temp_folder, ['profiles_similarity', method_name].join('_').concat('.txt')) clusters_distribution_filename = File.join(temp_folder, ['clusters_distribution', method_name].join('_').concat('.txt')) if !File.exists?(matrix_filename) if reference_profiles.nil? profiles_similarity = patient_data.compare_profiles(sim_type: method_name.to_sym, external_profiles: reference_profiles) else # AS reference profiles are constant, the sematic comparation will be A => B (A reference). So, we have to invert the elements to perform the comparation ont = Cohort.get_ontology(:hpo) pat_profiles = ont.profiles ont.load_profiles(reference_profiles, reset_stored: true) profiles_similarity = ont.compare_profiles(sim_type: method_name.to_sym, external_profiles: pat_profiles, bidirectional: false) ont.load_profiles(pat_profiles, reset_stored: true) profiles_similarity = invert_nested_hash(profiles_similarity) end remove_nested_entries(profiles_similarity){|id, sim| sim >= [:sim_thr] } if ![:sim_thr].nil? write_profile_pairs(profiles_similarity, profiles_similarity_filename) if reference_profiles.nil? 
axis_file = matrix_filename.gsub('.npy','.lst') similarity_matrix, axis_names = profiles_similarity.to_wmatrix(squared: true, symm: true) similarity_matrix.save(matrix_filename, axis_names, axis_file) else axis_file_x = matrix_filename.gsub('.npy','_x.lst') axis_file_y = matrix_filename.gsub('.npy','_y.lst') similarity_matrix, y_names, x_names = profiles_similarity.to_wmatrix(squared: false, symm: true) similarity_matrix.save(matrix_filename, y_names, axis_file_y, x_names, axis_file_x) end end ext_var = '' if method_name == 'resnik' ext_var = '-m max' elsif method_name == 'lin' ext_var = '-m comp1' end cluster_file = "#{method_name}_clusters.txt" if !reference_profiles.nil? ext_var << ' -s' axis_file = "#{axis_file_y},#{axis_file_x}" cluster_file = "#{method_name}_clusters_rows.txt" end out_file = File.join(temp_folder, method_name) system_call(EXTERNAL_CODE, 'plot_heatmap.R', "-y #{axis_file} -d #{matrix_filename} -o #{out_file} -M #{[:minClusterProportion]} -t dynamic -H #{ext_var}") if !File.exists?(out_file + '_heatmap.png') clusters_codes, clusters_info = parse_clusters_file(File.join(temp_folder, cluster_file), patient_data) write_patient_hpo_stat((clusters_info), clusters_distribution_filename) out_file = File.join(temp_folder, ['clusters_distribution', method_name].join('_')) system_call(EXTERNAL_CODE, 'xyplot_graph.R', "-d #{clusters_distribution_filename} -o #{out_file} -x PatientsNumber -y HPOAverage") if !File.exists?(out_file) sim_mat4cluster = {} if [:detailed_clusters] clusters_codes.each do |cluster| cluster_cohort = Cohort.new clID, patient_number, patient_ids, hpo_codes = cluster patient_ids.each_with_index {|patID, i| cluster_cohort.add_record([patID, hpo_codes[i], []])} cluster_profiles = cluster_cohort.profiles ref_profile = cluster_cohort.get_general_profile hpo.load_profiles({ref: ref_profile}, reset_stored: true) similarities = hpo.compare_profiles(external_profiles: cluster_profiles, sim_type: :lin, bidirectional: false) 
candidate_sim_matrix, candidates, candidates_ids = get_similarity_matrix(ref_profile, similarities[:ref], cluster_profiles, hpo, 100, 100) candidate_sim_matrix.unshift(['HP'] + candidates_ids) sim_mat4cluster[clID] = candidate_sim_matrix end end clusters = translate_codes(clusters_codes, hpo) container = { :temp_folder => temp_folder, :cluster_name => method_name, :clusters => clusters, :hpo => hpo, :sim_mat4cluster => sim_mat4cluster } report = Report_html.new(container, 'Patient clusters report') report.build(template) report.write([:output_file]+"_#{method_name}_clusters.html") system_call(EXTERNAL_CODE, 'generate_boxpot.R', "-i #{temp_folder} -m #{method_name} -o #{File.join(temp_folder, method_name + '_sim_boxplot')}") if !File.exists?(File.join(temp_folder, 'sim_boxplot.png')) end end |
#get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit, other_scores = {}, id2label = {}) ⇒ Object
246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 |
# File 'lib/pets/generalMethods.rb', line 246
# Select the top candidate profiles (by similarity, or by an external score
# list when given) and build a detailed term x candidate similarity matrix,
# with translated reference-term names prepended to each row and rows sorted
# by descending row sum.
# @param reference_prof [Array<String>] reference HPO profile
# @param similarities [Hash] candidate_id => overall similarity
# @param evidence_profiles [Hash] candidate_id => candidate HPO profile
# @param hpo [Object] ontology object responding to #translate_id (project type)
# @param term_limit [Integer] maximum rows kept
# @param candidate_limit [Integer] maximum candidates kept
# @param other_scores [Hash] optional external candidate scores (by label)
# @param id2label [Hash] candidate_id (string) => label used in other_scores
# @return [Array] matrix, selected candidates, and their ids
def get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit, other_scores = {}, id2label = {})
  candidates = similarities.to_a
  if other_scores.empty?
    candidates.sort! { |s1, s2| s2.last <=> s1.last }
    candidates = candidates.first(candidate_limit)
  else
    # Prioritize first by the external list of scores, select the candidates
    # and then prioritize by similarities.
    selected_candidates = []
    candidates.each do |cand|
      cand_id = cand[0]
      cand_lab = id2label[cand_id.to_s]
      next if cand_lab.nil?
      other_score = other_scores[cand_lab]
      next if other_score.nil?
      cand << other_score
      selected_candidates << cand
    end
    selected_candidates.sort! { |e1, e2| e2[2] <=> e1[2] }
    candidates = selected_candidates.first(candidate_limit)
    candidates.sort! { |e1, e2| e2[1] <=> e1[1] }
    candidates.each { |c| c.pop } # drop the temporary external score
  end
  candidates_ids = candidates.map { |c| c.first }
  candidate_similarity_matrix = get_detailed_similarity(reference_prof, candidates, evidence_profiles, hpo)
  candidate_similarity_matrix.each_with_index do |row, i|
    row.unshift(hpo.translate_id(reference_prof[i]))
  end
  candidate_similarity_matrix.sort! do |r1, r2|
    r2[1..r2.length].inject(0) { |sum, n| sum + n } <=> r1[1..r1.length].inject(0) { |sum, n| sum + n }
  end
  candidate_similarity_matrix = candidate_similarity_matrix.first(term_limit)
  return candidate_similarity_matrix, candidates, candidates_ids
end
#get_sor_length_distribution(raw_coverage) ⇒ Object
125 126 127 128 129 130 131 132 133 134 135 136 |
# File 'lib/pets/coPatReporterMethods.rb', line 125
# Collect [region_length, patient_records] pairs from the coverage table,
# sorted by ascending patient count. (Removed the unused +cnvs_count+ local.)
# @param raw_coverage [Hash] chr => [[start, stop, pat_records], ...]
# @return [Array<Array(Integer, Object)>] length/patient-count pairs
def get_sor_length_distribution(raw_coverage)
  all_cnvs_length = []
  raw_coverage.each do |chr, coords_info|
    coords_info.each do |start, stop, pat_records|
      region_length = stop - start + 1 # inclusive coordinates
      all_cnvs_length << [region_length, pat_records]
    end
  end
  all_cnvs_length.sort! { |c1, c2| c1[1] <=> c2[1] }
  return all_cnvs_length
end
#get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos) ⇒ Object
73 74 75 76 77 78 79 80 81 82 83 84 85 |
# File 'lib/pets/coPatReporterMethods.rb', line 73
# Assemble the cohort summary statistics table shown in the report.
# (Removed the `perc=90` inline-assignment smell — the literal is passed directly.)
# @param patient_data [Object] cohort responding to #profiles,
#   #get_profiles_mean_size, #get_profile_length_at_percentile,
#   #get_dataset_specifity_index (project type)
# @param rejected_patients [Array] patients dropped for empty profiles
# @param hpo_stats [Hash, Array] unique HPO term stats (only length used)
# @param fraction_terms_specific_childs [Float] fraction in [0, 1]
# @param rejected_hpos [Array] unknown phenotype codes
# @return [Array<Array(String, Object)>] label/value rows
def get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos)
  stats = []
  stats << ['Unique HPO terms', hpo_stats.length]
  stats << ['Cohort size', patient_data.profiles.length]
  stats << ['Rejected patients by empty profile', rejected_patients.length]
  stats << ['HPOs per patient (average)', patient_data.get_profiles_mean_size]
  stats << ['HPO terms per patient: percentile 90', patient_data.get_profile_length_at_percentile(90)]
  stats << ['Percentage of HPO with more specific children', (fraction_terms_specific_childs * 100).round(4)]
  stats << ['DsI for uniq HP terms', patient_data.get_dataset_specifity_index('uniq')]
  stats << ['DsI for frequency weigthed HP terms', patient_data.get_dataset_specifity_index('weigthed')]
  stats << ['Number of unknown phenotypes', rejected_hpos.length]
  return stats
end
#get_top_dummy_clusters_stats(top_clust_phen) ⇒ Object
157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
# File 'lib/pets/coPatReporterMethods.rb', line 157
# For each top cluster, compute patient count, the distinct phenotypes seen,
# and each phenotype's frequency as a percentage of the cluster's patients.
# @param top_clust_phen [Array<Array<Array<String>>>] per cluster, one
#   phenotype list per patient
# @return [Hash] cluster_index => [total_patients, phenotypes, percentages]
def get_top_dummy_clusters_stats(top_clust_phen)
  new_cluster_phenotypes = {}
  top_clust_phen.each_with_index do |cluster, clusterID|
    phenotypes_frequency = Hash.new(0)
    total_patients = cluster.length
    cluster.each do |phenotypes|
      phenotypes.each do |p|
        phenotypes_frequency[p] += 1
      end
    end
    new_cluster_phenotypes[clusterID] = [
      total_patients,
      phenotypes_frequency.keys,
      phenotypes_frequency.values.map { |v| v.fdiv(total_patients) * 100 }
    ]
  end
  return new_cluster_phenotypes
end
#group_by_region(hpo_regions) ⇒ Object
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
# File 'lib/pets/phen2reg_methods.rb', line 18
# Regroup an hpo => regions table by region: which HPOs each region carries,
# the region's attributes, and the per-HPO association scores.
# @param hpo_regions [Hash] hpo => [[chr, start, stop, regID, score], ...];
#   the 4th dot-separated field of regID encodes the patient count
# @return [Array(Hash, Hash, Hash)] region2hpo (regionID => hpo codes),
#   regionAttributes (regionID => [chr, start, stop, patients, length]),
#   association_scores (regionID => {hpo => score})
def group_by_region(hpo_regions)
  region2hpo = {}
  regionAttributes = {}
  association_scores = {}
  hpo_regions.each do |hpo, regions|
    regions.each do |chr, start, stop, regionID, association_score|
      query = region2hpo[regionID]
      if query.nil?
        region2hpo[regionID] = [hpo]
      else
        query << hpo
      end
      query = regionAttributes[regionID]
      if query.nil? # attributes only need computing once per region
        total_patients_in_region = regionID.split('.')[3].to_i
        region_length = stop - start
        regionAttributes[regionID] = [chr, start, stop, total_patients_in_region, region_length]
      end
      query = association_scores[regionID]
      if query.nil?
        association_scores[regionID] = { hpo => association_score }
      else
        query[hpo] = association_score
      end
    end
  end
  return region2hpo, regionAttributes, association_scores
end
#hpo_quality_control(prediction_data, hpos_ci_values, hpo) ⇒ Object
def hpo_quality_control(prediction_data, hpo_metadata_file, information_coefficient_file)
230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 |
# File 'lib/pets/phen2reg_methods.rb', line 230 def hpo_quality_control(prediction_data, hpos_ci_values, hpo) characterised_hpos = [] ##information_coef_file= hpo_code, ci ##prediction_data = [hpo1, hpo2, hpo3...] # hpos_ci_values = load_hpo_ci_values(information_coefficient_file) prediction_data.each do |hpo_code| # names, rejected = hpo.translate_codes2names([hpo_code]) names, rejected = hpo.translate_ids([hpo_code]) tmp = [names.first, hpo_code] # col hpo name, col hpo code ci = hpos_ci_values[hpo_code] unless ci.nil? # col exists? and ci values tmp.concat(["yes", ci]) else tmp.concat(["no", "-"]) end # parent = prediction_data & hpo.get_parents(hpo_code) parent = prediction_data & hpo.get_ancestors(hpo_code, true) if parent.empty? parent << "-" else # n, r = hpo.translate_codes2names(parent) n, r = hpo.translate_ids(parent) parent = parent.zip(n) # Combine code ids with hpo names end tmp << parent # col parents specific_childs = hpo.get_childs_table([hpo_code], true) tmp << specific_childs.first.last characterised_hpos << tmp end return characterised_hpos end |
#invert_nested_hash(h) ⇒ Object
267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 |
# File 'lib/pets/coPatReporterMethods.rb', line 267
# Invert the two key levels of a nested hash:
# { k1 => { k2 => val } } becomes { k2 => { k1 => val } }.
# Bug fix: the original wrapped the inner iteration in a redundant extra
# `vals1.each` pass, so every inner pair was re-written vals1.size times
# (accidental O(n^2)); a single pass over each inner hash yields the same
# result.
# @param h [Hash{Object=>Hash}] two-level nested hash
# @return [Hash] new nested hash with the key levels swapped
def invert_nested_hash(h)
  new_h = {}
  h.each do |k1, inner|
    inner.each do |k2, val|
      query = new_h[k2]
      if query.nil?
        new_h[k2] = {k1 => val}
      else
        query[k1] = val
      end
    end
  end
  return new_h
end
#join_regions(regions) ⇒ Object
197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 |
# File 'lib/pets/phen2reg_methods.rb', line 197 def join_regions(regions) #[chr, start, stop, association_values.keys, association_values.values, score] merged_regions = [] sorted_regions = regions.sort_by{|reg | [reg[0], reg[1]]} ref_reg = sorted_regions.shift while !sorted_regions.empty? next_region = sorted_regions.shift if ref_reg[0] == next_region[0] && (ref_reg[2] - next_region[1]).abs <= 1 && (ref_reg[5] - next_region[5]).abs.fdiv([ref_reg[5], next_region[5]].max) <= 0.05 && ref_reg[3] == next_region[3] ref_reg[2] = next_region[2] ref_assoc_values = ref_reg[4] next_assoc_values = next_region[4] assoc_values = [] ref_assoc_values.each_with_index do |ref_val, i| #assoc_values << (ref_val + next_assoc_values[i]).fdiv(2) assoc_values << [ref_val, next_assoc_values[i]].max end ref_reg[4] = assoc_values #ref_reg[5] = (ref_reg[5] + next_region[5]).fdiv(2) ref_reg[5] = [ref_reg[5], next_region[5]].max else merged_regions << ref_reg ref_reg = next_region end end merged_regions << ref_reg return merged_regions end |
#load_biosystem2gene_dictionary(biosystems_gene_path) ⇒ Object
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
# File 'lib/pets/generalMethods.rb', line 81
# Load the NCBI biosystems->gene gzipped table and index it by gene.
# Expected line format: biosystem \t gene \t score (score ignored).
# @param biosystems_gene_path [String] path to the gzipped table
# @return [Hash{String=>Array<String>}] gene => list of biosystem ids
def load_biosystem2gene_dictionary(biosystems_gene_path)
  gene2kegg = {}
  Zlib::GzipReader.open(biosystems_gene_path) do |gz|
    gz.each_line do |line|
      biosystem, gene, _score = line.chomp.split("\t")
      (gene2kegg[gene] ||= []) << biosystem
    end
  end
  gene2kegg
end
#load_clustered_patients(file) ⇒ Object
370 371 372 373 374 375 376 377 378 379 380 381 382 383 |
# File 'lib/pets/io.rb', line 370
# Read a patient->cluster assignment file (patient_id \t cluster_id per line)
# and group patient ids by cluster.
# @param file [String] path to the tabular assignment file
# @return [Hash{String=>Array<String>}] cluster_id => patient ids
def load_clustered_patients(file)
  clusters = {}
  File.foreach(file) do |line|
    pat_id, cluster_id = line.chomp.split("\t")
    (clusters[cluster_id] ||= []) << pat_id
  end
  clusters
end
#load_coordinates(file_path) ⇒ Object
256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 |
# File 'lib/pets/io.rb', line 256
# Load entity coordinates from a tabular file with a single header line.
# Columns: entity, chr, strand, start, stop. Records whose chromosome is the
# literal 'NA' are skipped with a warning on STDERR.
# @param file_path [String] path to the coordinates file
# @return [Hash{String=>Array}] entity => [chr, start, stop, strand]
def load_coordinates(file_path)
  coordinates = {}
  header = true
  File.foreach(file_path) do |line|
    fields = line.chomp.split("\t")
    if header
      header = false
      next
    end
    entity, chr, strand, start, stop = fields
    if chr == 'NA'
      STDERR.puts "Warning: Record #{fields.inspect} is undefined"
      next
    end
    coordinates[entity] = [chr, start.to_i, stop.to_i, strand]
  end
  coordinates
end
#load_evidence_profiles(file_path, hpo) ⇒ Object
275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 |
# File 'lib/pets/io.rb', line 275 def load_evidence_profiles(file_path, hpo) profiles = {} id2label = {} #count = 0 File.open(file_path).each do |line| id, label, profile = line.chomp.split("\t") hpos = profile.split(',').map{|a| a.to_sym} hpos, rejected_hpos = hpo.check_ids(hpos) if !hpos.empty? hpos = hpo.clean_profile(hpos) profiles[id] = hpos if !hpos.empty? id2label[id] = label end end return profiles, id2label end |
#load_evidences(evidences_path, hpo) ⇒ Object
238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 |
# File 'lib/pets/io.rb', line 238
# Load all evidence data from a folder: every '*.coords' file becomes a
# coordinate dictionary keyed by its basename, every '*_HP.txt' file a
# profile set keyed by its basename (without '.txt').
# @param evidences_path [String] folder with .coords and *_HP.txt files
# @param hpo [Object] ontology object forwarded to load_evidence_profiles
# @return [Array(Hash, Hash)] evidences (pair => {prof:, id2lab:}) and
#   genomic_coordinates (entity => coordinates)
def load_evidences(evidences_path, hpo)
  genomic_coordinates = {}
  Dir.glob(File.join(evidences_path, '*.coords')).each do |coord_file|
    entity = File.basename(coord_file, '.coords')
    genomic_coordinates[entity] = load_coordinates(coord_file)
  end
  evidences = {}
  Dir.glob(File.join(evidences_path, '*_HP.txt')).each do |evidence_file|
    pair = File.basename(evidence_file, '.txt')
    profiles, id2label = load_evidence_profiles(evidence_file, hpo)
    evidences[pair] = {prof: profiles, id2lab: id2label}
  end
  return evidences, genomic_coordinates
end
#load_gene_data(gene_data_path) ⇒ Object
385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 |
# File 'lib/pets/io.rb', line 385
# Parse a gzipped NCBI-style GFF annotation file into a gene dictionary and
# per-chromosome gene locations.
# Fixes:
# * URI.unescape was removed in Ruby 3.0; percent-decoding now uses
#   URI::DEFAULT_PARSER.unescape (same %XX semantics, no '+'-to-space).
# * A 'gene' record whose Dbxref lacks a GeneID is now skipped instead of
#   storing the entry under a nil key.
# @param gene_data_path [String] path to the gzipped GFF file
# @return [Array(Hash, Hash)]
#   gene_list: geneID => [name, synonyms, description];
#   gene_location: chr => [[geneID, start, stop], ...] (the chromosome name
#   is taken from the 2nd attribute of each 'genome=chromosome' record, per
#   the NCBI layout this parser targets)
def load_gene_data(gene_data_path)
  gene_list = {} #geneID => attr
  gene_location = {} # chr => gene
  current_chr = nil
  genes = []
  Zlib::GzipReader.open(gene_data_path) do |gz|
    gz.each_line do |line|
      line.chomp!
      next if line =~ /^#/
      fields = line.split("\t")
      if fields[8].include?('genome=chromosome')
        # chromosome marker record: flush genes collected so far and switch
        chr = fields[8].split(';')[1].split('=').last
        gene_location[current_chr] = genes
        genes = []
        current_chr = chr
      elsif fields[2] == 'gene'
        attributes = {}
        fields[8].split(';').each do |pair|
          key, value = pair.split('=')
          attributes[key] = value
        end
        geneName = attributes['gene']
        geneSyns = attributes['gene_synonym'].nil? ? [] : attributes['gene_synonym'].split(',')
        description = attributes['description']
        description = URI::DEFAULT_PARSER.unescape(description) if !description.nil?
        if attributes['Dbxref'] =~ /GeneID:(\d+)/
          gene_list[$1] = [geneName, geneSyns, description]
          genes << [$1, fields[3].to_i, fields[4].to_i]
        end
      end
    end
  end
  gene_location[current_chr] = genes
  return gene_list, gene_location
end
#load_hpo_ci_values(information_coefficient_file) ⇒ Object
360 361 362 363 364 365 366 367 368 |
# File 'lib/pets/io.rb', line 360
# Load HPO information-coefficient values from a two-column tabular file
# (hpo_code \t ci per line).
# @param information_coefficient_file [String] path to the CI file
# @return [Hash{Symbol=>Float}] hpo code (as symbol) => CI value
def load_hpo_ci_values(information_coefficient_file)
  File.foreach(information_coefficient_file).each_with_object({}) do |line, acc|
    hpo_code, ci = line.chomp.split("\t")
    acc[hpo_code.to_sym] = ci.to_f
  end
end
#load_hpo_ontology(hpo_file, excluded_hpo_file) ⇒ Object
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 |
# File 'lib/pets/io.rb', line 4
# Build the HPO Ontology object from either an OBO-style file or a
# precomputed JSON dump, optionally removing excluded terms.
# @param hpo_file [String] ontology path; a '.json' path is read as a dump,
#   anything else is parsed as an ontology source file
# @param excluded_hpo_file [String, nil] optional path with HPO codes to drop
# @return [Object] the configured Ontology instance
def load_hpo_ontology(hpo_file, excluded_hpo_file)
  if hpo_file.include?('.json')
    hpo = Ontology.new
    hpo.read(hpo_file)
    unless excluded_hpo_file.nil?
      # JSON dumps are already indexed, so removal requires an explicit
      # remove + rebuild cycle
      hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
      hpo.remove_removable()
      hpo.build_index()
    end
  else
    hpo = if excluded_hpo_file.nil?
      Ontology.new(file: hpo_file, load_file: true)
    else
      Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(excluded_hpo_file))
    end
  end
  hpo
end
#load_profiles(file_path, hpo) ⇒ Object
188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
# File 'lib/pets/io.rb', line 188 def load_profiles(file_path, hpo) profiles = {} #count = 0 File.open(file_path).each do |line| id, profile = line.chomp.split("\t") hpos = profile.split(',').map{|a| a.to_sym} hpos, rejected_hpos = hpo.check_ids(hpos) if !hpos.empty? hpos = hpo.clean_profile(hpos) profiles[id] = hpos if !hpos.empty? end end return profiles end |
#load_tabular_vars(path) ⇒ Object
217 218 219 220 221 222 223 224 225 226 |
# File 'lib/pets/io.rb', line 217
# Load variants from a tabular file; only the first two columns (chr, pos)
# are used and each variant becomes a zero-length interval [chr, pos, pos].
# @param path [String] path to the tabular variant file
# @return [Array<Array>] [chr (without 'chr' prefix), start, start] triplets
def load_tabular_vars(path)
  File.foreach(path).map do |line|
    fields = line.chomp.split("\t")
    chrom = fields[0].gsub('chr', '')
    pos = fields[1].to_i
    [chrom, pos, pos]
  end
end
#load_training_file4HPO(training_file, thresold = 0) ⇒ Object
-
Indexing by hpo (code)
prepare training file for analysis using phenotype2region prediction
323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 |
# File 'lib/pets/io.rb', line 323
# Prepare the training file for phenotype2region prediction by indexing the
# loaded records by HPO code (the code is removed from each record).
# @param training_file [String] path to the training file (see loadFile)
# @param thresold [Numeric] minimum association value to keep a record
# @return [Hash{String=>Array<Array>}] hpo code => records without the code
def load_training_file4HPO(training_file, thresold=0)
  training_set = {}
  loadFile(training_file, thresold).each do |info|
    hpoCode = info.delete_at(4)
    (training_set[hpoCode] ||= []) << info
  end
  # STDERR.puts training_set.keys.inspect
  training_set
end
#load_training_file4regions(training_file) ⇒ Object
306 307 308 309 310 311 312 313 314 315 316 317 318 319 |
# File 'lib/pets/io.rb', line 306
# Prepare the training file for region-based lookups by indexing the loaded
# records by chromosome (the chromosome is removed from each record).
# @param training_file [String] path to the training file (see loadFile)
# @return [Hash{String=>Array<Array>}] chromosome => records without the chr
def load_training_file4regions(training_file)
  training_set = {}
  loadFile(training_file).each do |info|
    chr = info.shift
    (training_set[chr] ||= []) << info
  end
  training_set
end
#load_variants(variant_folder) ⇒ Object
203 204 205 206 207 208 209 210 211 212 213 214 215 |
# File 'lib/pets/io.rb', line 203 def load_variants(variant_folder) variants = {} Dir.glob(File.join(variant_folder, '*.{tab,vcf,vcf.gz}')).each do |path| profile_id, ext = File.basename(path).split(".", 2) if ext == 'tab' || ext == 'txt' vars = load_tabular_vars(path) elsif ext == 'vcf' || ext == 'vcf.gz' vars = load_vcf(path, ext) end variants[profile_id] = Genomic_Feature.new(vars) end return variants end |
#load_vcf(path, ext) ⇒ Object
Some compressed files are fragmented internally. If so, VCFfile only reads first fragment
228 229 230 231 232 233 234 235 236 |
# File 'lib/pets/io.rb', line 228
# Parse a VCF file into simple variant triplets.
# Some compressed files are fragmented internally; if so, BioVcf's VCFfile
# only reads the first fragment. Use `zcat original.vcf.gz | gzip >
# new.vcf.gz` to obtain a contiguous file.
# Fix: removed leftover debug `puts vars.length` that polluted stdout on
# every call.
# @param path [String] path to the .vcf or .vcf.gz file
# @param ext [String] extension ('vcf' or 'vcf.gz'), selects gzip reading
# @return [Array<Array>] [chr (without 'chr' prefix), pos, pos] per variant
def load_vcf(path, ext)
  vars = []
  vcf = BioVcf::VCFfile.new(file: path, is_gz: ext == 'vcf.gz')
  vcf.each do |var|
    vars << [var.chrom.gsub('chr', ''), var.pos, var.pos]
  end
  return vars
end
#loadBiosistemsInfo(biosystems_info_path, filterDB) ⇒ Object
67 68 69 70 71 72 73 74 75 76 77 78 79 |
# File 'lib/pets/generalMethods.rb', line 67
# Load biosystem metadata from the gzipped NCBI biosystems info table,
# keeping only records from the requested source database.
# Line format: bsid \t sourceDB \t accession \t name ...
# Invalid bytes are stripped while re-encoding to UTF-8.
# @param biosystems_info_path [String] path to the gzipped info table
# @param filterDB [String] source database to keep (e.g. 'KEGG')
# @return [Hash{String=>Array}] bsid => [accession, name]
def loadBiosistemsInfo(biosystems_info_path, filterDB)
  bsid2attributes = {}
  Zlib::GzipReader.open(biosystems_info_path) do |gz|
    gz.each_line do |line|
      fields = line.chomp.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '').split("\t")
      bsid = fields.shift
      bsid2attributes[bsid] = [fields[1], fields[2]] if filterDB == fields[0]
    end
  end
  bsid2attributes
end
#loadFile(file, thresold = 0) ⇒ Object
-
Load training info file:
Chr;Start;Stop;HPO;Association;node
342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 |
# File 'lib/pets/io.rb', line 342
# Load the training info file. Input columns: Chr, Start, Stop, HPO,
# Association, node. Records below the association threshold are dropped.
# @param file [String] path to the training file
# @param thresold [Numeric] minimum association value to keep a record
# @return [Array<Array>] [chr, start, stop, nodeID, hpoCode, association]
#   (note the reordering: node comes before the HPO code)
def loadFile(file, thresold=0)
  information = []
  File.foreach(file) do |line|
    allInfo = line.chomp.split("\t")
    associationValue = allInfo[4].to_f
    next if associationValue < thresold
    information << [allInfo[0], allInfo[1].to_i, allInfo[2].to_i, allInfo[5], allInfo[3], associationValue]
  end
  information
end
#merge_genes_with_kegg_data(gene_list, kegg_data) ⇒ Object
98 99 100 101 102 103 104 105 106 107 |
# File 'lib/pets/generalMethods.rb', line 98
# Join the gene dictionary with KEGG pathway data, reordering the fields.
# Genes without KEGG data receive an empty pathway list.
# @param gene_list [Hash] geneID => [name, synonyms, description]
# @param kegg_data [Hash] geneID => kegg entry
# @return [Hash] geneID => [name, description, kegg_entry, synonyms]
def merge_genes_with_kegg_data(gene_list, kegg_data)
  merged_data = {}
  gene_list.each do |geneID, (geneName, geneSyn, description)|
    kegg_entry = kegg_data[geneID] || []
    merged_data[geneID] = [geneName, description, kegg_entry, geneSyn]
  end
  merged_data
end
#parse_clusters_file(clusters_file, patient_data) ⇒ Object
167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
# File 'lib/pets/io.rb', line 167
# Read a patient->cluster file (patient \t cluster per line) and resolve each
# patient's HPO profile from the cohort object.
# @param clusters_file [String] path to the assignment file
# @param patient_data [Object] cohort object exposing #get_profile
# @return [Array(Array, Hash)]
#   clusters_table: [[clusterID, n_patients, patient_ids, profiles], ...];
#   clusters_info: clusterID => {patientID => profile}
def parse_clusters_file(clusters_file, patient_data)
  clusters_info = {}
  File.foreach(clusters_file) do |line|
    patientID, clusterID = line.chomp.split("\t")
    profile = patient_data.get_profile(patientID)
    (clusters_info[clusterID] ||= {})[patientID] = profile
  end
  clusters_table = clusters_info.map do |clusterID, patients_info|
    [clusterID, patients_info.length, patients_info.keys, patients_info.values]
  end
  return clusters_table, clusters_info
end
#parse_kegg_data(query_genes) ⇒ Object
422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 |
# File 'lib/pets/io.rb', line 422 def parse_kegg_data(query_genes) kegg_data = {} #gene => attb while !query_genes.empty? gene_set = query_genes.shift(10) url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}" uri = URI(url) response = Net::HTTP.get(uri) geneID = nil gene_names = [] definition = nil pathways = [] parsing_pathway_field = false response.squeeze(' ').each_line do |line| line.chomp! if line =~ /^ENTRY/ geneID = line.split(' ')[1] elsif line =~ /^NAME/ gene_names = line.split(' ', 2).last.split(', ') elsif line =~ /^DEFINITION/ definition = line.split(' ', 2)[1] elsif line =~ /^PATHWAY/ pathways << line.split(' ', 3)[1..2] parsing_pathway_field = true elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/ parsing_pathway_field = false elsif parsing_pathway_field pathways << line.strip.split(' ', 2) elsif line == '///' parsing_pathway_field = false kegg_data[geneID] = [gene_names, definition, pathways] pathways = [] gene_names = [] end end end return kegg_data end |
#parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path) ⇒ Object
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# File 'lib/pets/generalMethods.rb', line 50
# Build gene => KEGG pathway data from the NCBI biosystems tables, keeping
# only human ('hsa'-prefixed) KEGG pathways.
# @param biosystems_gene_path [String] gzipped biosystem->gene table
# @param biosystems_info_path [String] gzipped biosystem metadata table
# @return [Hash{String=>Array}] geneID => list of [accession, name] pairs
def parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path)
  kegg_data = {}
  gene2biosystems = load_biosystem2gene_dictionary(biosystems_gene_path)
  keggAttributes = loadBiosistemsInfo(biosystems_info_path, 'KEGG')
  keggAttributes.select! { |_kegg_id, data| data.first =~ /^hsa/ }
  gene2biosystems.each do |geneID, pathways|
    kegg_data[geneID] = pathways.map { |biosystem| keggAttributes[biosystem] }.compact
  end
  kegg_data
end
#parse_patient_results(results) ⇒ Object
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
# File 'lib/pets/reg2phen_methods.rb', line 83
# Flatten prediction results into a report table with a header row. Each
# record gets a running counter; the region key is printed only on the first
# record of its group. NOTE: mutates the record arrays in place (unshift).
# @param results [Hash{String=>Array<Array>}] region key => records
# @return [Array<Array>] header row plus one row per record
def parse_patient_results(results)
  patient_results = [['Record', 'Region', 'Chromosome', 'Prediction start', 'Prediction stop',
                      'Phenotype (HPO)', 'P-value', 'SOR start', 'SOR stop',
                      'Number of patients sharing SOR', 'Genes']]
  counter = 1
  results.each do |region_key, records|
    records.each_with_index do |record, idx|
      region_label = idx.zero? ? region_key : ''
      patient_results << record.unshift(counter, region_label)
      counter += 1
    end
  end
  patient_results
end
#predict_patient(predictions, training_set, threshold, transform, genes, genes_dictionary) ⇒ Object
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 |
# File 'lib/pets/reg2phen_methods.rb', line 3 def predict_patient(predictions, training_set, threshold, transform, genes, genes_dictionary) results = {} predictions.each do |info| if genes geneID = info.shift info = genes_dictionary[geneID] end chr, pt_start, pt_stop, is_GeneSynom = info sors = training_set[chr] next if sors.nil? sors.each do |sor_start, sor_stop, nodeID, hpo_code, association_score| next if !coor_overlap?(pt_start, pt_stop, sor_start, sor_stop) patientsNum = nodeID.split('.').last if association_score >= threshold association_score = 10**(-association_score) if transform genes_in_sor = get_genes_by_coordinates(genes_dictionary, chr, sor_start, sor_stop) record2save = [chr, pt_start, pt_stop, hpo_code, association_score, sor_start, sor_stop, patientsNum, genes_in_sor.join(',')] key = info[0...3].join(":") key.concat(" (#{geneID})") if genes query_res = results[key] if query_res.nil? results[key] = [record2save] else query_res << record2save end end end end return results end |
#process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters) ⇒ Object
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/pets/coPatReporterMethods.rb', line 42 def process_cluster(patient_ids, patient_data, phenotype_ic, , ont, processed_clusters) chrs = Hash.new(0) all_phens = [] profile_ics = [] profile_lengths = [] patient_ids.each do |pat_id| phenotypes = patient_data.get_profile(pat_id) profile_ics << get_profile_ic(phenotypes, phenotype_ic) profile_lengths << phenotypes.length if processed_clusters < [:clusters2show_detailed_phen_data] phen_names, rejected_codes = ont.translate_ids(phenotypes) #optional all_phens << phen_names end patient_data.get_vars(pat_id).get_chr.each{|chr| chrs[chr] += 1} if ![:chromosome_col].nil? end return chrs, all_phens, profile_ics, profile_lengths end |
#process_dummy_clustered_patients(options, clustered_patients, patient_data, phenotype_ic) ⇒ Object
get ic and chromosomes
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/pets/coPatReporterMethods.rb', line 16 def process_dummy_clustered_patients(, clustered_patients, patient_data, phenotype_ic) # get ic and chromosomes ont = Cohort.get_ontology(Cohort.act_ont) all_ics = [] all_lengths = [] top_cluster_phenotypes = [] cluster_data_by_chromosomes = [] multi_chromosome_patients = 0 processed_clusters = 0 clustered_patients.sort_by{|cl_id, pat_ids| pat_ids.length }.reverse.each do |cluster_id, patient_ids| num_of_patients = patient_ids.length next if num_of_patients == 1 chrs, all_phens, profile_ics, profile_lengths = process_cluster(patient_ids, patient_data, phenotype_ic, , ont, processed_clusters) top_cluster_phenotypes << all_phens if processed_clusters < [:clusters2show_detailed_phen_data] all_ics << profile_ics all_lengths << profile_lengths if ![:chromosome_col].nil? multi_chromosome_patients += num_of_patients if chrs.length > 1 chrs.each do |chr, count| cluster_data_by_chromosomes << [cluster_id, num_of_patients, chr, count] end end processed_clusters += 1 end return all_ics, all_lengths, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients end |
#read_compressed_json(path) ⇒ Object
468 469 470 471 472 473 |
# File 'lib/pets/io.rb', line 468
# Read and parse a gzip-compressed JSON file.
# @param path [String] path to the .gz file containing JSON text
# @return [Object] the parsed JSON structure
def read_compressed_json(path)
  Zlib::GzipReader.open(path) { |gz| JSON.parse(gz.read) }
end
#read_excluded_hpo_file(file) ⇒ Object
24 25 26 27 28 29 30 |
# File 'lib/pets/io.rb', line 24
# Read the excluded-HPO file: one HPO code per line.
# @param file [String] path to the exclusion list
# @return [Array<String>] chomped lines (HPO codes)
def read_excluded_hpo_file(file)
  File.readlines(file, chomp: true)
end
#remove_nested_entries(nested_hash) ⇒ Object
172 173 174 175 176 177 178 179 |
# File 'lib/pets/coPatReporterMethods.rb', line 172
# Filter a two-level nested hash in place: keep only inner entries for which
# the block returns truthy, then drop root keys whose inner hash became
# empty.
# @param nested_hash [Hash{Object=>Hash}] modified in place
# @yieldparam id inner key
# @yieldparam val inner value
# @return [Array] the root ids that were removed
def remove_nested_entries(nested_hash)
  nested_hash.each do |_root_id, entries|
    entries.keep_if { |id, val| yield(id, val) }
  end
  emptied = nested_hash.select { |_root_id, entries| entries.empty? }.keys
  emptied.each { |root_id| nested_hash.delete(root_id) }
end
#report_data(container, html_file) ⇒ Object
282 283 284 285 286 287 288 289 290 291 292 293 |
# File 'lib/pets/phen2reg_methods.rb', line 282 def report_data(characterised_hpos, hpo_associated_regions, html_file, hpo, genes_with_kegg_data, pathway_stats) container = {:characterised_hpos => characterised_hpos, :merged_regions => hpo_associated_regions, :hpo => hpo, :genes_with_kegg_data => genes_with_kegg_data, :pathway_stats => pathway_stats } template = File.open(File.join(REPORT_FOLDER, 'patient_report.erb')).read report = Report_html.new(container, 'Patient HPO profile summary') report.build(template) report.write(html_file) end |
#save_patient_matrix(output, patient_hpo_profile, regionAttributes, hpo_region_matrix) ⇒ Object
295 296 297 298 299 300 301 302 303 304 |
# File 'lib/pets/phen2reg_methods.rb', line 295
# Write the patient HPO/region association matrix to a tabular file.
# Row labels are built as "chr:start-stop" from the region attributes, whose
# order is assumed to match the matrix rows.
# @param output [String] output file path
# @param patient_hpo_profile [Array] HPO codes used as column headers
# @param regionAttributes [Hash] regionID => [chr, start, stop, ...]
# @param hpo_region_matrix [Array<Array>] association values per region
def save_patient_matrix(output, patient_hpo_profile, regionAttributes, hpo_region_matrix)
  region_attrs = regionAttributes.values
  File.open(output, "w") do |f|
    f.puts "Region\t#{patient_hpo_profile.join("\t")}"
    hpo_region_matrix.each_with_index do |association_values, i|
      chr, start, stop = region_attrs[i]
      f.puts "#{chr}:#{start}-#{stop}\t#{association_values.join("\t")}"
    end
  end
end
#scoring_regions(regionAttributes, hpo_region_matrix, scoring_system, pvalue_cutoff, freedom_degree, null_value = 0) ⇒ Object
127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
# File 'lib/pets/phen2reg_methods.rb', line 127
# Score each region by combining its per-HPO association values and then
# filter the region hash IN PLACE by the cutoff (>= for mean/geommean/
# maxnum/minnum, <= for fisher's combined p-value; other systems keep all).
# Bug fix (median branch): the original tested the parity of length/2
# instead of the length, and then used the AVERAGE OF THE TWO MIDDLE VALUES
# as an array INDEX, returning an arbitrary element. It now computes the
# standard median (mean of the two middle values for even-length lists).
# @param regionAttributes [Hash] regionID => attribute array; the combined
#   score is appended to each array and low-scoring regions are removed
# @param hpo_region_matrix [Array<Array>] per-region association values
# @param scoring_system [String] mean|fisher|stouffer|geommean|median|maxnum|minnum
# @param pvalue_cutoff [Numeric] filter threshold (meaning depends on system)
# @param freedom_degree [String] prednum|phennum|maxnum — how the effective
#   sample size is derived
# @param null_value [Numeric] unused; kept for interface compatibility
def scoring_regions(regionAttributes, hpo_region_matrix, scoring_system, pvalue_cutoff, freedom_degree, null_value=0)
  regionAttributes_array = regionAttributes.values
  max_cluster_length = hpo_region_matrix.map{|x| x.count{|i| i != 0}}.max if freedom_degree == 'maxnum'
  hpo_region_matrix.each_with_index do |associations, i|
    # effective number of samples used to normalise the combined score
    if freedom_degree == 'prednum'
      sample_length = associations.length
    elsif freedom_degree == 'phennum'
      sample_length = associations.count{|s| s != 0}
    elsif freedom_degree == 'maxnum'
      sample_length = max_cluster_length
    else
      abort("Invalid freedom degree calculation method: #{freedom_degree}")
    end

    if scoring_system == 'mean'
      region_score = associations.inject(0){|s, x| s + x} / sample_length
    elsif scoring_system == 'fisher'
      # https://en.wikipedia.org/wiki/Fisher%27s_method
      # association values come as log10; Fisher's method needs ln
      lns = associations.map{|a| Math.log(10 ** -a)}
      sum = lns.inject(0){|s, a| s + a}
      region_score = Statistics2.chi2_x(sample_length * 2, -2 * sum)
    elsif scoring_system == 'stouffer'
      sum = associations.inject(0){|s, x| s + x}
      region_score = sum / Math.sqrt(sample_length)
    elsif scoring_system == 'geommean'
      # NOTE: if troubles with number size, use BigDecimal
      geommean_mult = associations.inject(1){|s, x| s * x}
      region_score = geommean_mult.to_f ** (sample_length ** -1)
    elsif scoring_system == 'median'
      sorted = associations.sort
      mid = sorted.length / 2
      if sorted.length.even?
        region_score = (sorted[mid - 1] + sorted[mid]).fdiv(2)
      else
        region_score = sorted[mid]
      end
    elsif scoring_system == 'maxnum'
      region_score = associations.max
    elsif scoring_system == 'minnum'
      region_score = associations.min
    else
      abort("Invalid ranking method: #{scoring_system}")
    end
    regionAttributes_array[i] << region_score
  end
  if scoring_system == 'mean' || scoring_system == 'geommean' || scoring_system == 'maxnum' || scoring_system == 'minnum'
    regionAttributes.select!{|regionID, attributes| attributes.last >= pvalue_cutoff}
  elsif scoring_system == 'fisher'
    # Combined p-value: smaller means stronger (non-random) association
    regionAttributes.select!{|regionID, attributes| attributes.last <= pvalue_cutoff}
  end
end
#search4HPO(info2predict, trainingData) ⇒ Object
require ‘bigdecimal’
6 7 8 9 10 11 12 13 14 15 16 |
# File 'lib/pets/phen2reg_methods.rb', line 6
# Select the profile HPO terms that exist in the training association data.
# @param info2predict [Array] HPO codes of the patient profile
# @param trainingData [Hash] hpo => region records
# @return [Hash] subset of trainingData restricted to the profile's HPOs
def search4HPO(info2predict, trainingData)
  hpo_regions = {}
  info2predict.each do |hpo|
    regions = trainingData[hpo]
    hpo_regions[hpo] = regions unless regions.nil?
  end
  hpo_regions
end
#system_call(code_folder, script, args_string) ⇒ Object
8 9 10 11 12 13 14 |
# File 'lib/pets/generalMethods.rb', line 8 def system_call(code_folder, script, args_string) cmd = File.join(code_folder, script) + ' ' + args_string puts "==> #{cmd}" Benchmark.bm do |x| x.report{ system(cmd) } end end |
#translate_codes(clusters, hpo) ⇒ Object
2 3 4 5 6 7 8 9 10 11 12 13 14 |
# File 'lib/pets/coPatReporterMethods.rb', line 2
# Append translated phenotype names to each cluster record.
# @param clusters [Array<Array>] rows of
#   [clusterID, num_of_pats, patient_ids, per-patient HPO code lists]
# @param hpo [Object] ontology exposing #translate_id
# @return [Array<Array>] same rows with an extra column of translated names
def translate_codes(clusters, hpo)
  clusters.map do |clusterID, num_of_pats, patientIDs_ary, patient_hpos_ary|
    translated = patient_hpos_ary.map do |patient_hpos|
      patient_hpos.map { |hpo_code| hpo.translate_id(hpo_code) }
    end
    [clusterID, num_of_pats, patientIDs_ary, patient_hpos_ary, translated]
  end
end
#translate_hpos_in_results(results, hpo) ⇒ Object
46 47 48 49 50 51 52 53 54 |
# File 'lib/pets/reg2phen_methods.rb', line 46
# Replace the HPO code at position 3 of every result record with its
# translated name, in place.
# @param results [Hash{Object=>Array<Array>}] coords => records (mutated)
# @param hpo [Object] ontology exposing #translate_id
# @return [Hash] the same results hash
def translate_hpos_in_results(results, hpo)
  results.each do |_coords, data|
    data.each { |info| info[3] = hpo.translate_id(info[3]) }
  end
end
#write_array(array, file_path) ⇒ Object
41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/pets/io.rb', line 41
# Dump an array to a file, one record per line. String records are written
# verbatim; any other record is joined with tabs.
# @param array [Array] records (strings or arrays)
# @param file_path [String] output path
def write_array(array, file_path)
  File.open(file_path, 'w') do |handler|
    array.each do |record|
      line = record.class == String ? record : record.join("\t")
      handler.puts line
    end
  end
end
#write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name) ⇒ Object
128 129 130 131 132 133 134 135 136 137 |
# File 'lib/pets/io.rb', line 128
# Write paired x/y values as a two-column tabular file for scatter plotting.
# @param x_axis_value [Array] x values
# @param y_axis_value [Array] y values, positionally matched to x
# @param filename [String] output path
# @param x_axis_name [String] first column header
# @param y_axis_name [String] second column header
# @raise [RuntimeError] if an x position has no matching y value
def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
  File.open(filename, 'w') do |f|
    f.puts "#{x_axis_name}\t#{y_axis_name}"
    x_axis_value.each_with_index do |x_val, i|
      y_val = y_axis_value[i]
      raise("The #{i} position is not presented in y_axis_value") if y_val.nil?
      f.puts [x_val, y_val].join("\t")
    end
  end
end
#write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit) ⇒ Object
76 77 78 79 80 81 82 83 84 85 86 87 88 |
# File 'lib/pets/io.rb', line 76 def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit) File.open(cluster_chromosome_data_file, 'w') do |f| f.puts %w[cluster_id chr count].join("\t") index = 0 last_id = cluster_data.first.first unless cluster_data.empty? cluster_data.each do |cluster_id, patient_number, chr, count| index += 1 if cluster_id != last_id break if index == limit f.puts ["#{patient_number}_#{index}", chr, count].join("\t") last_id = cluster_id end end end |
#write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit) ⇒ Object
63 64 65 66 67 68 69 70 71 72 73 74 |
# File 'lib/pets/io.rb', line 63 def write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit) File.open(cluster_ic_data_file, 'w') do |f| f.puts %w[cluster_id ic Plen].join("\t") all_ics.each_with_index do |cluster_ics, i| break if i == limit cluster_length = cluster_ics.length cluster_ics.each_with_index do |clust_ic, j| f.puts "#{cluster_length}_#{i}\t#{clust_ic}\t#{profile_lengths[i][j]}" end end end end |
#write_compressed_plain_file(data, path) ⇒ Object
460 461 462 463 464 465 466 |
# File 'lib/pets/io.rb', line 460
# Serialise a data structure as JSON and write it gzip-compressed.
# @param data [Object] any JSON-serialisable structure
# @param path [String] output .gz path
def write_compressed_plain_file(data, path)
  Zlib::GzipWriter.open(path) do |gz|
    gz.write data.to_json
  end
end
#write_coverage_data(coverage_to_plot, coverage_to_plot_file) ⇒ Object
90 91 92 93 94 95 96 |
# File 'lib/pets/io.rb', line 90
# Write coverage triplets (chr, position, frequency) as a tabular file.
# @param coverage_to_plot [Array<Array>] [chr, position, freq] rows
# @param coverage_to_plot_file [String] output path
def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
  File.open(coverage_to_plot_file, 'w') do |f|
    coverage_to_plot.each do |chr, position, freq|
      f.puts [chr, position, freq].join("\t")
    end
  end
end
#write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats) ⇒ Object
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
# File 'lib/pets/io.rb', line 99 def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats) CSV.open(detailed_profile_evaluation_file, "wb") do |csv| suggested_childs.each do |pat_id, suggestions| warning = nil warning = 'WARNING: Very few phenotypes' if suggestions.length < 4 csv << ["PATIENT #{pat_id}", "#{warning}"] csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"] suggestions.each do |parent, childs| parent_code, parent_name = parent if childs.empty? csv << ["#{parent_name} (#{parent_code})", '-'] else parent_writed = false childs.each do |child_code, child_name| if !parent_writed parent_field = "#{parent_name} (#{parent_code})" parent_writed = true else parent_field = "" end csv << [parent_field, "#{child_name} (#{child_code})"] end end end csv << ["", ""] end end end |
#write_hash(hash, file_path, header = []) ⇒ Object
32 33 34 35 36 37 38 39 |
# File 'lib/pets/io.rb', line 32
# Dump a hash of arrays as a tabular file: each line is the key followed by
# the tab-joined array values; an optional header row is written first.
# @param hash [Hash{Object=>Array}] data to write
# @param file_path [String] output path
# @param header [Array] optional column names (omitted when empty)
def write_hash(hash, file_path, header = [])
  File.open(file_path, 'w') do |out|
    out.puts header.join("\t") unless header.empty?
    hash.each { |key, array| out.puts "#{key}\t#{array.join("\t")}" }
  end
end
#write_matrix_for_R(matrix, x_names, y_names, file) ⇒ Object
54 55 56 57 58 59 60 61 |
# File 'lib/pets/io.rb', line 54
# Write a matrix in the tabular layout R expects: column names on the first
# line, then each row prefixed with its row name.
# @param matrix [Array<Array>] matrix rows
# @param x_names [Array] column names
# @param y_names [Array] row names, positionally matched to matrix rows
# @param file [String] output path
def write_matrix_for_R(matrix, x_names, y_names, file)
  File.open(file, 'w') do |f|
    f.puts x_names.join("\t")
    matrix.each_with_index do |row, i|
      f.puts [y_names[i], *row].join("\t")
    end
  end
end
#write_patient_hpo_stat(average_hp_per_pat_distribution, output_file) ⇒ Object
158 159 160 161 162 163 164 165 |
# File 'lib/pets/io.rb', line 158
# Write the distribution of average HPO counts per number of patients as a
# two-column tabular file with a fixed header.
# @param average_hp_per_pat_distribution [Array<Array>] [patients, average]
# @param output_file [String] output path
def write_patient_hpo_stat(average_hp_per_pat_distribution, output_file)
  File.open(output_file, 'w') do |f|
    f.puts "PatientsNumber\tHPOAverage"
    average_hp_per_pat_distribution.each do |patient_num, ave|
      f.puts "#{patient_num}\t#{ave}"
    end
  end
end
#write_profile_pairs(similarity_pairs, filename) ⇒ Object
148 149 150 151 152 153 154 155 156 |
# File 'lib/pets/io.rb', line 148
# Flatten a nested pair-similarity hash into a three-column tabular file:
# profileA \t profileB \t value.
# @param similarity_pairs [Hash{Object=>Hash}] a => {b => value}
# @param filename [String] output path
def write_profile_pairs(similarity_pairs, filename)
  File.open(filename, 'w') do |f|
    similarity_pairs.each do |pairsA, pairsB_and_values|
      pairsB_and_values.each do |pairsB, value|
        f.puts "#{pairsA}\t#{pairsB}\t#{value}"
      end
    end
  end
end
#write_similarity_matrix(similarity_matrix, similarity_matrix_file) ⇒ Object
140 141 142 143 144 145 146 |
# File 'lib/pets/io.rb', line 140
# Write a similarity matrix as tab-separated rows with no header.
# @param similarity_matrix [Array<Array>] matrix rows
# @param similarity_matrix_file [String] output path
def write_similarity_matrix(similarity_matrix, similarity_matrix_file)
  File.open(similarity_matrix_file, 'w') do |f|
    similarity_matrix.each { |row| f.puts row.join("\t") }
  end
end