Top Level Namespace

Defined Under Namespace

Modules: Pets Classes: Cohort, Cohort_Parser, Genomic_Feature, Reference_parser

Constant Summary collapse

COMMON_OPTPARSE =

Needs define ROOT_PATH constant in file requiring this file

File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets', 'common_optparse.rb'))
REPORT_FOLDER =
File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
EXTERNAL_DATA =
File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
EXTERNAL_CODE =
File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
HPO_FILE =
File.join(EXTERNAL_DATA, 'hp.json')
MONDO_FILE =
File.join(EXTERNAL_DATA, 'mondo.obo')
IC_FILE =
File.join(EXTERNAL_DATA, 'uniq_hpo_with_CI.txt')

Instance Method Summary collapse

Instance Method Details

#add_parentals_of_not_found_hpos_in_regions(patient_hpo_profile, trainingData, region2hpo, regionAttributes, association_scores, hpo_metadata) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/pets/phen2reg_methods.rb', line 48

def add_parentals_of_not_found_hpos_in_regions(
	patient_hpo_profile, 
	trainingData, 
	region2hpo, 
	regionAttributes, 
	association_scores,
	 # hpo_code => [phenotype, relations]
	)
	new_hpos = []
	region2hpo.each do |regionID, hpos|
		hpos_not_found = patient_hpo_profile - hpos
		parental_hpos = []
		hpo_scores = {}
		hpos_not_found.each do |hpo|
			region, parental_hpo = get_region_with_parental_hpo(hpo, regionID, trainingData , )
			if !region.nil? && 
				!parental_hpos.include?(parental_hpo) && 
				!patient_hpo_profile.include?(parental_hpo)
				parental_hpos << parental_hpo
				hpo_scores[parental_hpo] = region.last
			end
		end
		hpos.concat(parental_hpos)
		new_hpos.concat(parental_hpos)
		association_scores[regionID].merge!(hpo_scores)
	end
	patient_hpo_profile.concat(new_hpos.uniq)
end

#add_record(hash, key, record, uniq = false) ⇒ Object



16
17
18
19
20
21
22
23
24
25
# File 'lib/pets/generalMethods.rb', line 16

def add_record(hash, key, record, uniq=false)
	query = hash[key]
	if query.nil?
		hash[key] = [record]
	elsif !uniq # We not take care by repeated entries
		query << record
	elsif !query.include?(record) # We want uniq entries
		query << record
	end
end

#binom(n, k) ⇒ Object



174
175
176
177
178
179
180
# File 'lib/pets/generalMethods.rb', line 174

def binom(n,k)
	if k > 0 && k < n
		res = (1+n-k..n).inject(:*)/(1..k).inject(:*)
	else
		res = 1
	end
end

#calculate_coverage(regions_data, delete_thresold = 0) ⇒ Object



138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# File 'lib/pets/coPatReporterMethods.rb', line 138

def calculate_coverage(regions_data, delete_thresold = 0)
	raw_coverage = {}
	n_regions = 0
	patients = 0
	nt = 0
	regions_data.each do |start, stop, chr, reg_id|
		number_of_patients = reg_id.split('.').last.to_i
		if number_of_patients <= delete_thresold
			number_of_patients = 0
		else
			n_regions += 1
			nt += stop - start			
		end
    add_record(raw_coverage, chr, [start, stop, number_of_patients])
		patients += number_of_patients
	end
	return raw_coverage, n_regions, nt, patients.fdiv(n_regions)
end

#calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, min_hpo_recovery_percentage, patient_number) ⇒ Object



262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
# File 'lib/pets/phen2reg_methods.rb', line 262

def calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, min_hpo_recovery_percentage, patient_number)     
  records_to_delete = []
  counter = 0
  adjacent_regions_joined.each do |chr, start, stop, hpo_list, association_values, score|
    hpo_coincidences = patient_original_phenotypes & hpo_list
    original_hpo_recovery_percentage = hpo_coincidences.length / patient_original_phenotypes.length.to_f * 100
    records_to_delete << counter if original_hpo_recovery_percentage < min_hpo_recovery_percentage
    query = predicted_hpo_percentage[patient_number] 
    if query.nil?
     predicted_hpo_percentage[patient_number] = [original_hpo_recovery_percentage]
    else
     query << original_hpo_recovery_percentage
    end   
    counter += 1 
  end
  records_to_delete.reverse_each do |record_number|
    adjacent_regions_joined.delete_at(record_number)
  end
end

#compute_hyper_prob(a, b, c, d, n) ⇒ Object



167
168
169
170
171
172
# File 'lib/pets/generalMethods.rb', line 167

def compute_hyper_prob(a, b, c, d, n)
	binomA = binom(a + b, a)
	binomC = binom(c + d, c)
	divisor = binom(n, a + c)
	return (binomA * binomC).fdiv(divisor)
end

#compute_IC_values(patient_data, total_patients) ⇒ Object



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/pets/generalMethods.rb', line 28

def compute_IC_values(patient_data, total_patients)
	patients_per_hpo = Hash.new(0)
	last_patient_ID = ''
	patient_data.each do |patient_ID, |
		patient, count = patient_ID.split('_i')
		if patient != last_patient_ID
			hpos, chr, start, stop = 
			hpos.each do |h|
				patients_per_hpo[h] += 1
			end
		end
		last_patient_ID = patient
	end
	# STDERR.puts patients_per_hpo.inspect
	# Process.exit
	patients_per_hpo.each do |hpo, patient_number|
		patients_per_hpo[hpo] = -Math.log10(patient_number.fdiv(total_patients))
	end
	return patients_per_hpo
end

#compute_pathway_enrichment(genes_clusters, genes_with_kegg) ⇒ Object



109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# File 'lib/pets/generalMethods.rb', line 109

def compute_pathway_enrichment(genes_clusters, genes_with_kegg)
	pathways_genes_in_predictions = {}
	genes_in_predictions = []
	genes_clusters.each do |cluster|
		cluster.each do |geneID, data|
			geneName, description, pathways, geneSyns = data
			pathways.each do |pathway|
				query = pathways_genes_in_predictions[pathway]
				if query.nil?
					pathways_genes_in_predictions[pathway] = [geneID]
				else
					query << geneID if !query.include?(geneID)
				end
			end
			genes_in_predictions << geneID if !genes_in_predictions.include?(geneID)
		end
	end
	genes_out_of_predictions = genes_with_kegg.keys - genes_in_predictions
	gene_number = genes_with_kegg.length
	stats = []
	pathways_genes_in_predictions.each do |pathway, pathway_predicted_genes|
		pathway_id, pathway_name = pathway
		no_pathway_predicted_genes = genes_in_predictions - pathway_predicted_genes 
		pathway_no_predicted_genes_count = 0
		no_pathway_no_predicted_genes_count = 0
		genes_out_of_predictions.each do |geneID|
			query = genes_with_kegg[geneID]
			if query[2].map{|pathway_info| pathway_info.first}.include?(pathway_id)
				pathway_no_predicted_genes_count += 1
			else
				no_pathway_no_predicted_genes_count += 1
			end
		end
		#Fisher => http://www.biostathandbook.com/fishers.html
		no_pathway_predicted_genes_count = no_pathway_predicted_genes.length
		pathway_predicted_genes_count = pathway_predicted_genes.length
		accumulated_prob = 0
		pathway_no_predicted_genes_count.times do |n|
			no_pathway_predicted_genes_count_shifted = no_pathway_predicted_genes_count - n
			pathway_predicted_genes_count_shifted = pathway_predicted_genes_count - n
			if no_pathway_predicted_genes_count_shifted >= 0 && pathway_predicted_genes_count_shifted >= 0
				accumulated_prob += compute_hyper_prob(
					n, 
					no_pathway_predicted_genes_count_shifted, 
					pathway_predicted_genes_count_shifted, 
					no_pathway_no_predicted_genes_count + n, 
					gene_number
				)
			else
				break
			end
		end
		contigency = [pathway_no_predicted_genes_count, no_pathway_predicted_genes_count, pathway_predicted_genes_count, no_pathway_no_predicted_genes_count]
		stats << [pathway, pathway_predicted_genes, contigency, accumulated_prob]
	end
	return stats
end

#coor_overlap?(ref_start, ref_stop, start, stop) ⇒ Boolean

Common methods for predictors Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5

  1. Indexing by chr (region)

Returns:

  • (Boolean)


295
296
297
298
299
300
301
302
303
304
# File 'lib/pets/io.rb', line 295

def coor_overlap?(ref_start, ref_stop, start, stop)
  overlap = false
  if (stop > ref_start && stop <= ref_stop) ||
    (start >= ref_start && start < ref_stop) ||
    (start <= ref_start && stop >= ref_stop) ||
    (start > ref_start && stop < ref_stop)
    overlap = true
  end
  return overlap
end

#download(ftp_server, path, name) ⇒ Object



475
476
477
478
479
480
481
# File 'lib/pets/io.rb', line 475

def download(ftp_server, path, name)
  ftp = Net::FTP.new()
  ftp.connect(ftp_server)
  ftp.
  ftp.getbinaryfile(path, name)
  ftp.close
end

#dummy_cluster_patients(patient_data, matrix_file, clust_pat_file) ⇒ Object



87
88
89
90
91
92
93
94
95
96
97
# File 'lib/pets/coPatReporterMethods.rb', line 87

def dummy_cluster_patients(patient_data, matrix_file, clust_pat_file)
  if !File.exists?(matrix_file)
    pat_hpo_matrix, pat_id, hp_id  = patient_data.to_bmatrix
    x_axis_file = matrix_file.gsub('.npy','_x.lst')
    y_axis_file = matrix_file.gsub('.npy','_y.lst')
    pat_hpo_matrix.save(matrix_file, hp_id, x_axis_file, pat_id, y_axis_file)
  end
  system_call(EXTERNAL_CODE, 'get_clusters.R', "-d #{matrix_file} -o #{clust_pat_file} -y #{matrix_file.gsub('.npy','')}") if !File.exists?(clust_pat_file)
  clustered_patients = load_clustered_patients(clust_pat_file)
  return(clustered_patients)
end

#generate_gene_locations(gene_location) ⇒ Object



56
57
58
59
60
61
62
63
64
# File 'lib/pets/reg2phen_methods.rb', line 56

def generate_gene_locations(gene_location)
  gene_locations = {}
  gene_location.each do |chr, genesInfo|
    genesInfo.each do |geneID, geneStart, geneStop|
      gene_locations[geneID] = [chr, geneStart, geneStop]
    end
  end
  return gene_locations
end

#generate_genes_dictionary(gene_location, genes_with_kegg) ⇒ Object



66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/pets/reg2phen_methods.rb', line 66

def generate_genes_dictionary(gene_location, genes_with_kegg)
  reestructured_gene_locations = generate_gene_locations(gene_location)
  genes_dictionary = {}
  genes_with_kegg.each do |geneID, annotInfo|
    #STDERR.puts annotInfo.shift.inspect
    gene_location_data = reestructured_gene_locations[geneID]
    unless gene_location_data.nil?
      geneName, description, pathways, geneSyns = annotInfo
      genes_dictionary[geneName] = gene_location_data.dup.concat([false])
      geneSyns.each do |gene_symbol|
        genes_dictionary[gene_symbol] = gene_location_data.dup.concat([true])
      end
    end
  end
  return genes_dictionary
end

#generate_hpo_region_matrix(region2hpo, association_scores, info2predict, null_value = 0) ⇒ Object



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# File 'lib/pets/phen2reg_methods.rb', line 107

def generate_hpo_region_matrix(region2hpo, association_scores, info2predict, null_value=0)
	# #method for creating the hpo to region matrix for plotting
	# #info2predict = hpo list from user
	# #hpo_associated_regions = [[chr, start, stop, [hpos_list], [weighted_association_scores]]]
	hpo_region_matrix = []
	region2hpo.each do |regionID, hpos_list|
		row = []
		info2predict.each do |user_hpo|
			value = association_scores[regionID][user_hpo]
			if value.nil?
				row << null_value
			else
				row << value
			end
		end
		hpo_region_matrix << row
	end
	return hpo_region_matrix
end

#get_and_parse_external_data(all_paths) ⇒ Object



185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# File 'lib/pets/generalMethods.rb', line 185

def get_and_parse_external_data(all_paths)
	sources = [
	  ['ftp.ncbi.nlm.nih.gov', 'genomes/H_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz', all_paths[:gene_data]],
	  ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/biosystems_gene.gz', all_paths[:biosystems_gene]],
	  ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/bsid2info.gz', all_paths[:biosystems_info]]
	]
	sources.each do |server, path, output|
	  download(server, path, output) if !File.exists?(output)
	end

	genes_with_kegg = {}
	gene_location = {}
	if !File.exists?(all_paths[:gene_data_with_pathways]) || !File.exists?(all_paths[:gene_location])
	  gene_list, gene_location = load_gene_data(all_paths[:gene_data])
	  ### kegg_data = parse_kegg_data(genes_found_attributes.keys)
	  kegg_data = parse_kegg_from_biosystems(all_paths[:biosystems_gene], all_paths[:biosystems_info])
	  genes_with_kegg = merge_genes_with_kegg_data(gene_list, kegg_data)
	  write_compressed_plain_file(genes_with_kegg, all_paths[:gene_data_with_pathways])
	  write_compressed_plain_file(gene_location, all_paths[:gene_location])
	else
	  gene_location = read_compressed_json(all_paths[:gene_location])
	  genes_with_kegg = read_compressed_json(all_paths[:gene_data_with_pathways])
	end
	return gene_location, genes_with_kegg
end

#get_cluster_metadata(clusters_info) ⇒ Object



284
285
286
287
288
289
290
291
292
# File 'lib/pets/coPatReporterMethods.rb', line 284

def (clusters_info)
  average_hp_per_pat_distribution = []
  clusters_info.each do |cl_id, pat_info|
      hp_per_pat_in_clust = pat_info.values.map{|a| a.length}
      hp_per_pat_ave = hp_per_pat_in_clust.sum.fdiv(hp_per_pat_in_clust.length)
      average_hp_per_pat_distribution << [pat_info.length, hp_per_pat_ave]
  end
  return average_hp_per_pat_distribution
end

#get_detailed_similarity(profile, candidates, evidences, hpo) ⇒ Object



211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
# File 'lib/pets/generalMethods.rb', line 211

def get_detailed_similarity(profile, candidates, evidences, hpo)
	profile_length = profile.length
	matrix = []
	profile_length.times{ matrix << Array.new(candidates.length, 0)}
	cand_number = 0
	candidates.each do |candidate_id, similarity|
		local_sim = []
		candidate_evidence = evidences[candidate_id]
		profile.each do |profile_term|
			candidate_evidence.each do |candidate_term|
				term_sim = hpo.compare([candidate_term], [profile_term], sim_type: :lin, bidirectional: false)
				local_sim << [profile_term, candidate_term, term_sim]
			end
		end
		local_sim.sort!{|s1, s2| s2.last <=> s1.last}

		final_pairs = []
		processed_profile_terms = []
		processed_candidate_terms = []
		local_sim.each do |pr_term, cd_term, sim|
			if !processed_profile_terms.include?(pr_term) && !processed_candidate_terms.include?(cd_term)
				final_pairs << [pr_term, cd_term, sim]
				processed_profile_terms << pr_term
				processed_candidate_terms << cd_term
			end
			break if profile_length == processed_profile_terms.length
		end
		final_pairs.each do |pr_term, cd_term, similarity|
			matrix[profile.index(pr_term)][cand_number] = similarity
		end
		cand_number += 1
	end
	return matrix
end

#get_final_coverage(raw_coverage, bin_size) ⇒ Object



110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/pets/coPatReporterMethods.rb', line 110

def get_final_coverage(raw_coverage, bin_size)
	coverage_to_plot = []
	raw_coverage.each do |chr, coverages|
		coverages.each do |start, stop, coverage|
			bin_start = start - start % bin_size
			bin_stop = stop - stop%bin_size
			while bin_start < bin_stop
				coverage_to_plot << [chr, bin_start, coverage]
				bin_start += bin_size
			end
		end
	end
	return coverage_to_plot
end

#get_genes_by_coordinates(genes_dictionary, chr, start, stop) ⇒ Object



34
35
36
37
38
39
40
41
42
43
44
# File 'lib/pets/reg2phen_methods.rb', line 34

def get_genes_by_coordinates(genes_dictionary, chr, start, stop)
  genes_in_sor = genes_dictionary.select do |gene_sym, attributes|
    gene_chr, gene_start, gene_stop, is_GeneSynom = attributes
    if gene_chr == chr && !is_GeneSynom
      coor_overlap?(gene_start, gene_stop, start, stop)
    else
      false
    end
  end
  return genes_in_sor.keys
end

#get_mean_size(all_sizes) ⇒ Object



99
100
101
102
103
104
105
106
107
# File 'lib/pets/coPatReporterMethods.rb', line 99

def get_mean_size(all_sizes)
    accumulated_size = 0
    number = 0
    all_sizes.each do |size, occurrences|
      accumulated_size += size *occurrences
      number += occurrences
    end
    return accumulated_size.fdiv(number)
end

#get_profile_ic(hpo_names, phenotype_ic) ⇒ Object



60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/pets/coPatReporterMethods.rb', line 60

def get_profile_ic(hpo_names, phenotype_ic)
  ic = 0
  profile_length = 0
  hpo_names.each do |hpo_id|
    hpo_ic = phenotype_ic[hpo_id]
    raise("The term #{hpo_id} not exists in the given ic table") if hpo_ic.nil?
    ic += hpo_ic 
    profile_length += 1
  end
  profile_length = 1 if profile_length == 0
  return ic.fdiv(profile_length)
end

#get_region_with_parental_hpo(hpo, regionID, trainingData, hpo_metadata) ⇒ Object



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/pets/phen2reg_methods.rb', line 77

def get_region_with_parental_hpo(hpo, regionID, trainingData , )
	region = nil
	final_hpo = nil
	hpos = [hpo]
	while !hpos.empty?
		temp = []
		hpos.each do |hp| 
			hpo_data = [hp]
			if !hpo_data.nil?
				main_hpo_code, phenotype, relations = hpo_data
				temp.concat(relations.map{|rel| rel.first})
			end
		end
		temp.each do |temp_hpo|
			regions = trainingData[temp_hpo]
			if !regions.nil?
				final_reg = regions.select{|reg| reg[3] == regionID}
				if !final_reg.empty?
					region = final_reg.first
					final_hpo = temp_hpo
					temp = []
					break
				end
			end
		end
		hpos = temp
	end
	return region, final_hpo
end

#get_semantic_similarity_clustering(options, patient_data, temp_folder) ⇒ Object



181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
# File 'lib/pets/coPatReporterMethods.rb', line 181

def get_semantic_similarity_clustering(options, patient_data, temp_folder)
  template = File.open(File.join(REPORT_FOLDER, 'cluster_report.erb')).read
  hpo = Cohort.get_ontology(Cohort.act_ont)
  reference_profiles = nil
  reference_profiles = load_profiles(options[:reference_profiles], hpo) if !options[:reference_profiles].nil?
  Parallel.each(options[:clustering_methods], in_processes: options[:threads] ) do |method_name|
    matrix_filename = File.join(temp_folder, "similarity_matrix_#{method_name}.npy")
    profiles_similarity_filename = File.join(temp_folder, ['profiles_similarity', method_name].join('_').concat('.txt'))
    clusters_distribution_filename = File.join(temp_folder, ['clusters_distribution', method_name].join('_').concat('.txt'))
    if !File.exists?(matrix_filename)
      if reference_profiles.nil? 
        profiles_similarity = patient_data.compare_profiles(sim_type: method_name.to_sym, external_profiles: reference_profiles)
      else # AS reference profiles are constant, the sematic comparation will be A => B (A reference). So, we have to invert the elements to perform the comparation
        ont = Cohort.get_ontology(:hpo)
        pat_profiles = ont.profiles
        ont.load_profiles(reference_profiles, reset_stored: true)
        profiles_similarity = ont.compare_profiles(sim_type: method_name.to_sym, 
          external_profiles: pat_profiles, 
          bidirectional: false)
        ont.load_profiles(pat_profiles, reset_stored: true)
        profiles_similarity = invert_nested_hash(profiles_similarity)
      end
      remove_nested_entries(profiles_similarity){|id, sim| sim >= options[:sim_thr] } if !options[:sim_thr].nil?
      write_profile_pairs(profiles_similarity, profiles_similarity_filename)
      if reference_profiles.nil?
        axis_file = matrix_filename.gsub('.npy','.lst')
        similarity_matrix, axis_names = profiles_similarity.to_wmatrix(squared: true, symm: true)
        similarity_matrix.save(matrix_filename, axis_names, axis_file)
      else
        axis_file_x = matrix_filename.gsub('.npy','_x.lst')
        axis_file_y = matrix_filename.gsub('.npy','_y.lst')
        similarity_matrix, y_names, x_names = profiles_similarity.to_wmatrix(squared: false, symm: true)
        similarity_matrix.save(matrix_filename, y_names, axis_file_y, x_names, axis_file_x)
      end
    end
    ext_var = ''
    if method_name == 'resnik'
      ext_var = '-m max'
    elsif method_name == 'lin'
      ext_var = '-m comp1'
    end
    cluster_file = "#{method_name}_clusters.txt"
    if !reference_profiles.nil?
      ext_var << ' -s' 
      axis_file = "#{axis_file_y},#{axis_file_x}"
      cluster_file = "#{method_name}_clusters_rows.txt"
    end
    out_file = File.join(temp_folder, method_name)
    system_call(EXTERNAL_CODE, 'plot_heatmap.R', "-y #{axis_file} -d #{matrix_filename} -o #{out_file} -M #{options[:minClusterProportion]} -t dynamic -H #{ext_var}") if !File.exists?(out_file +  '_heatmap.png')
    clusters_codes, clusters_info = parse_clusters_file(File.join(temp_folder, cluster_file), patient_data)  
    write_patient_hpo_stat((clusters_info), clusters_distribution_filename)
    out_file = File.join(temp_folder, ['clusters_distribution', method_name].join('_'))
    system_call(EXTERNAL_CODE, 'xyplot_graph.R', "-d #{clusters_distribution_filename} -o #{out_file} -x PatientsNumber -y HPOAverage") if !File.exists?(out_file)
    sim_mat4cluster = {}
    if options[:detailed_clusters]
      clusters_codes.each do |cluster|
        cluster_cohort = Cohort.new
        clID, patient_number, patient_ids, hpo_codes = cluster
        patient_ids.each_with_index {|patID, i| cluster_cohort.add_record([patID, hpo_codes[i], []])}
        cluster_profiles = cluster_cohort.profiles
        ref_profile = cluster_cohort.get_general_profile
        hpo.load_profiles({ref: ref_profile}, reset_stored: true)    
        similarities = hpo.compare_profiles(external_profiles: cluster_profiles, sim_type: :lin, bidirectional: false)
        candidate_sim_matrix, candidates, candidates_ids = get_similarity_matrix(ref_profile, similarities[:ref], cluster_profiles, hpo, 100, 100)
        candidate_sim_matrix.unshift(['HP'] + candidates_ids)
        sim_mat4cluster[clID] = candidate_sim_matrix
      end
    end


    clusters = translate_codes(clusters_codes, hpo)
    container = {
      :temp_folder => temp_folder,
      :cluster_name => method_name,
      :clusters => clusters,
      :hpo => hpo,
      :sim_mat4cluster => sim_mat4cluster
     }

    report = Report_html.new(container, 'Patient clusters report')
    report.build(template)
    report.write(options[:output_file]+"_#{method_name}_clusters.html")
    system_call(EXTERNAL_CODE, 'generate_boxpot.R', "-i #{temp_folder} -m #{method_name} -o #{File.join(temp_folder, method_name + '_sim_boxplot')}") if !File.exists?(File.join(temp_folder, 'sim_boxplot.png'))
  end
end

#get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit, other_scores = {}, id2label = {}) ⇒ Object



246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
# File 'lib/pets/generalMethods.rb', line 246

def get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit, other_scores = {}, id2label = {})
		candidates = similarities.to_a
		if other_scores.empty?
			candidates.sort!{|s1, s2| s2.last <=> s1.last}
			candidates = candidates.first(candidate_limit)
		else # Prioritize first by the external list of scores, select the candidates and then rioritize by similarities
			selected_candidates = []
			candidates.each do |cand|
				cand_id = cand[0]
				cand_lab = id2label[cand_id.to_s]
				next if cand_lab.nil?
				other_score = other_scores[cand_lab]
				next if other_score.nil?
				cand << other_score
				selected_candidates << cand
			end
			selected_candidates.sort!{|e1, e2| e2[2] <=> e1[2]}
			candidates = selected_candidates.first(candidate_limit)
			candidates.sort!{|e1, e2| e2[1] <=> e1[1]}
			candidates.each{|c| c.pop}
		end
		candidates_ids = candidates.map{|c| c.first}
		candidate_similarity_matrix = get_detailed_similarity(reference_prof, candidates, evidence_profiles, hpo)
		candidate_similarity_matrix.each_with_index do |row, i|
			row.unshift(hpo.translate_id(reference_prof[i]))
		end
		candidate_similarity_matrix.sort!{|r1,r2| r2[1..r2.length].inject(0){|sum,n| sum +n} <=> r1[1..r1.length].inject(0){|sum,n| sum +n}}
		candidate_similarity_matrix = candidate_similarity_matrix.first(term_limit)
		return candidate_similarity_matrix, candidates, candidates_ids
end

#get_sor_length_distribution(raw_coverage) ⇒ Object



125
126
127
128
129
130
131
132
133
134
135
136
# File 'lib/pets/coPatReporterMethods.rb', line 125

def get_sor_length_distribution(raw_coverage)
	all_cnvs_length = []
	cnvs_count = []
	raw_coverage.each do |chr, coords_info|
		coords_info.each do |start, stop, pat_records|
			region_length = stop - start + 1
			all_cnvs_length << [region_length, pat_records]
		end
	end
	all_cnvs_length.sort!{|c1, c2| c1[1] <=> c2[1]}
	return all_cnvs_length
end

#get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos) ⇒ Object



73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/pets/coPatReporterMethods.rb', line 73

def get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos)
  stats = []
  stats << ['Unique HPO terms', hpo_stats.length]
  stats << ['Cohort size', patient_data.profiles.length]
  stats << ['Rejected patients by empty profile', rejected_patients.length]
  stats << ['HPOs per patient (average)', patient_data.get_profiles_mean_size]
  stats << ['HPO terms per patient: percentile 90', patient_data.get_profile_length_at_percentile(perc=90)]
  stats << ['Percentage of HPO with more specific children', (fraction_terms_specific_childs * 100).round(4)]
  stats << ['DsI for uniq HP terms', patient_data.get_dataset_specifity_index('uniq')]
  stats << ['DsI for frequency weigthed HP terms', patient_data.get_dataset_specifity_index('weigthed')]
  stats << ['Number of unknown phenotypes', rejected_hpos.length]
  return stats
end

#get_top_dummy_clusters_stats(top_clust_phen) ⇒ Object



157
158
159
160
161
162
163
164
165
166
167
168
169
170
# File 'lib/pets/coPatReporterMethods.rb', line 157

def get_top_dummy_clusters_stats(top_clust_phen)
  new_cluster_phenotypes = {}
  top_clust_phen.each_with_index do |cluster, clusterID|
    phenotypes_frequency = Hash.new(0)
    total_patients = cluster.length
    cluster.each do |phenotypes|
      phenotypes.each do |p|
        phenotypes_frequency[p] += 1
      end
    end
    new_cluster_phenotypes[clusterID] = [total_patients, phenotypes_frequency.keys, phenotypes_frequency.values.map{|v| v.fdiv(total_patients) * 100}]
  end
  return new_cluster_phenotypes
end

#group_by_region(hpo_regions) ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/pets/phen2reg_methods.rb', line 18

def group_by_region(hpo_regions)
	#hpo_regions-> hpo => [[chr, start, stop, regID, score], [...]]
	region2hpo = {}
	regionAttributes = {}
	association_scores = {}
	hpo_regions.each do |hpo, regions|
		regions.each do |chr, start, stop, regionID, association_score|
			query = region2hpo[regionID]
			if query.nil?
				region2hpo[regionID] = [hpo]
			else
				query << hpo
			end
			query = regionAttributes[regionID]
			if query.nil?
				total_patients_in_region = regionID.split('.')[3].to_i
				region_length = stop - start
				regionAttributes[regionID] = [chr, start, stop, total_patients_in_region, region_length]
			end
			query = association_scores[regionID]
			if query.nil?
				association_scores[regionID] = {hpo => association_score}
			else
				query[hpo] = association_score 
			end
		end		
	end
	return region2hpo, regionAttributes, association_scores
end

#hpo_quality_control(prediction_data, hpos_ci_values, hpo) ⇒ Object

def hpo_quality_control(prediction_data, hpo_metadata_file, information_coefficient_file)



230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
# File 'lib/pets/phen2reg_methods.rb', line 230

def hpo_quality_control(prediction_data, hpos_ci_values, hpo)
	characterised_hpos = []
	##information_coef_file= hpo_code, ci
	##prediction_data = [hpo1, hpo2, hpo3...]
	# hpos_ci_values = load_hpo_ci_values(information_coefficient_file)
	prediction_data.each do |hpo_code|
		# names, rejected = hpo.translate_codes2names([hpo_code])
		names, rejected = hpo.translate_ids([hpo_code])
		tmp = [names.first, hpo_code] # col hpo name, col hpo code
		ci = hpos_ci_values[hpo_code]
		unless ci.nil? # col exists? and ci values
			tmp.concat(["yes", ci])
		else
			tmp.concat(["no", "-"])
		end
		# parent = prediction_data & hpo.get_parents(hpo_code)
		parent = prediction_data & hpo.get_ancestors(hpo_code, true)
		if parent.empty?
			parent << "-"
		else
			# n, r = hpo.translate_codes2names(parent)
			n, r = hpo.translate_ids(parent)
			parent = parent.zip(n) # Combine code ids with hpo names
		end
		tmp << parent # col parents
		specific_childs = hpo.get_childs_table([hpo_code], true)
		tmp << specific_childs.first.last
		characterised_hpos << tmp
	end
	return characterised_hpos
end

#invert_nested_hash(h) ⇒ Object



267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
# File 'lib/pets/coPatReporterMethods.rb', line 267

def invert_nested_hash(h)
  new_h = {}
  h.each do |k1, vals1|
    vals1.each do |v1|
      vals1.each do |k2, vals2|
        query = new_h[k2]
        if query.nil?
          new_h[k2] = {k1 => vals2}
        else
          query[k1] = vals2
        end
      end
    end
  end
  return new_h
end

#join_regions(regions) ⇒ Object



197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# File 'lib/pets/phen2reg_methods.rb', line 197

def join_regions(regions)
	#[chr, start, stop, association_values.keys, association_values.values, score]
	merged_regions = []
	sorted_regions = regions.sort_by{|reg | [reg[0], reg[1]]}
	ref_reg = sorted_regions.shift
	while !sorted_regions.empty?
		next_region = sorted_regions.shift
		if ref_reg[0] == next_region[0] &&
			(ref_reg[2] - next_region[1]).abs <= 1 &&
			(ref_reg[5] - next_region[5]).abs.fdiv([ref_reg[5], next_region[5]].max) <= 0.05 &&
			ref_reg[3] == next_region[3]

			ref_reg[2] = next_region[2]
			ref_assoc_values = ref_reg[4] 
			next_assoc_values = next_region[4]
			assoc_values = []
			ref_assoc_values.each_with_index do |ref_val, i|
				#assoc_values << (ref_val + next_assoc_values[i]).fdiv(2)
				assoc_values << [ref_val, next_assoc_values[i]].max
			end
			ref_reg[4] = assoc_values
			#ref_reg[5] = (ref_reg[5] + next_region[5]).fdiv(2)
			ref_reg[5] = [ref_reg[5], next_region[5]].max
		else
			merged_regions << ref_reg
			ref_reg = next_region
		end
	end
	merged_regions << ref_reg
	return merged_regions
end

#load_biosystem2gene_dictionary(biosystems_gene_path) ⇒ Object



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/pets/generalMethods.rb', line 81

def load_biosystem2gene_dictionary(biosystems_gene_path)
	gene2kegg = {}
	infile = open(biosystems_gene_path)
	gz = Zlib::GzipReader.new(infile)
	gz.each_line do |line|
		line.chomp!
		biosystem, gene, score = line.split("\t")
		query = gene2kegg[gene]
		if query.nil?
			gene2kegg[gene] = [biosystem]
		else
			query << biosystem
		end
	end
	return gene2kegg
end

#load_clustered_patients(file) ⇒ Object



370
371
372
373
374
375
376
377
378
379
380
381
382
383
# File 'lib/pets/io.rb', line 370

def load_clustered_patients(file)
  clusters = {}
  File.open(file).each do |line|
    line.chomp!
    pat_id, cluster_id = line.split("\t")
    query = clusters[cluster_id]
    if query.nil?
      clusters[cluster_id] = [pat_id]
    else
      query << pat_id
    end
  end
  return clusters
end

#load_coordinates(file_path) ⇒ Object



256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
# File 'lib/pets/io.rb', line 256

def load_coordinates(file_path)
  coordinates = {}
  header = true
  File.open(file_path).each do |line|
    fields = line.chomp.split("\t")
    if header
      header = false
    else
      entity, chr, strand, start, stop = fields
      if chr == 'NA'
        STDERR.puts "Warning: Record #{fields.inspect} is undefined"
        next
      end
      coordinates[entity] = [chr, start.to_i, stop.to_i, strand]
    end
  end
  return coordinates
end

#load_evidence_profiles(file_path, hpo) ⇒ Object



275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
# File 'lib/pets/io.rb', line 275

def load_evidence_profiles(file_path, hpo)
  profiles = {}
  id2label = {}
  #count = 0
  File.open(file_path).each do |line|
    id, label, profile = line.chomp.split("\t")
    hpos = profile.split(',').map{|a| a.to_sym}
    hpos, rejected_hpos = hpo.check_ids(hpos)
    if !hpos.empty?
      hpos = hpo.clean_profile(hpos)
      profiles[id] = hpos if !hpos.empty?
      id2label[id] = label
    end
  end
  return profiles, id2label
end

#load_evidences(evidences_path, hpo) ⇒ Object



238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# File 'lib/pets/io.rb', line 238

def load_evidences(evidences_path, hpo)
  genomic_coordinates = {}
  coord_files = Dir.glob(File.join(evidences_path, '*.coords'))
  coord_files.each do |cd_f|
    entity = File.basename(cd_f, '.coords')
    coordinates = load_coordinates(cd_f)
    genomic_coordinates[entity] = coordinates
  end
  evidences = {}
  evidence_files = Dir.glob(File.join(evidences_path, '*_HP.txt'))
  evidence_files.each do |e_f|
    pair = File.basename(e_f, '.txt')
    profiles, id2label = load_evidence_profiles(e_f, hpo)
    evidences[pair] = {prof: profiles, id2lab: id2label}
  end
  return evidences, genomic_coordinates
end

#load_gene_data(gene_data_path) ⇒ Object



385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
# File 'lib/pets/io.rb', line 385

def load_gene_data(gene_data_path)
  gene_list = {} #geneID => attr
  gene_location = {} # chr => gene
  infile = open(gene_data_path)
  gz = Zlib::GzipReader.new(infile)
  current_chr = nil
  genes = []
  gz.each_line do |line|
    line.chomp!
    next if line =~ /^#/ 
    fields = line.split("\t")
    if fields[8].include?('genome=chromosome')
      chr = fields[8].split(';')[1].split('=').last
      gene_location[current_chr] = genes
      genes = []
      current_chr = chr
    elsif fields[2] == 'gene'
      attributes = {}
      fields[8].split(';').each do |pair| 
        key, value = pair.split('=')
        attributes[key] = value
      end
      geneName = nil
      geneName = attributes['gene'] if !attributes['gene'].nil?
      geneSyns = []
      geneSyns = attributes['gene_synonym'].split(',') if !attributes['gene_synonym'].nil?
      description = attributes['description']
      description = URI.unescape(description) if !description.nil?
      attributes['Dbxref'] =~ /GeneID:(\d+)/
      gene_list[$1] = [geneName, geneSyns, description]
      genes << [$1, fields[3].to_i, fields[4].to_i]
    end
  end
  gene_location[current_chr] = genes
  return gene_list, gene_location
end

#load_hpo_ci_values(information_coefficient_file) ⇒ Object



360
361
362
363
364
365
366
367
368
# File 'lib/pets/io.rb', line 360

def load_hpo_ci_values(information_coefficient_file)
  hpos_ci_values = {}
  File.open(information_coefficient_file).each do |line|
    line.chomp!
    hpo_code, ci = line.split("\t")
    hpos_ci_values[hpo_code.to_sym] = ci.to_f
  end
  return hpos_ci_values
end

#load_hpo_ontology(hpo_file, excluded_hpo_file) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# File 'lib/pets/io.rb', line 4

def load_hpo_ontology(hpo_file, excluded_hpo_file)
  hpo = nil
  if !hpo_file.include?('.json')
    if !excluded_hpo_file.nil?
      hpo = Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(excluded_hpo_file))
    else
      hpo = Ontology.new(file: hpo_file, load_file: true)
    end
  else
    hpo = Ontology.new
    hpo.read(hpo_file)
    if !excluded_hpo_file.nil?
      hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
      hpo.remove_removable()
      hpo.build_index()
    end
  end
  return hpo
end

#load_profiles(file_path, hpo) ⇒ Object



188
189
190
191
192
193
194
195
196
197
198
199
200
201
# File 'lib/pets/io.rb', line 188

def load_profiles(file_path, hpo)
  profiles = {}
  #count = 0
  File.open(file_path).each do |line|
    id, profile = line.chomp.split("\t")
    hpos = profile.split(',').map{|a| a.to_sym}
    hpos, rejected_hpos = hpo.check_ids(hpos)
    if !hpos.empty?
      hpos = hpo.clean_profile(hpos)
      profiles[id] = hpos if !hpos.empty?
    end
  end
  return profiles
end

#load_tabular_vars(path) ⇒ Object



217
218
219
220
221
222
223
224
225
226
# File 'lib/pets/io.rb', line 217

def load_tabular_vars(path)
  vars = []
  File.open(path).each do |line|
    fields = line.chomp.split("\t")
    chr = fields[0].gsub('chr','')
    start = fields[1].to_i
    vars << [chr, start, start]
  end
  return vars
end

#load_training_file4HPO(training_file, thresold = 0) ⇒ Object

  1. Indexing by hpo (code)

prepare training file for analysis using phenotype2region prediction



323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
# File 'lib/pets/io.rb', line 323

def load_training_file4HPO(training_file, thresold=0)
  training_set = {}
  information = loadFile(training_file, thresold)
  information.each do |info|
    hpoCode = info.delete_at(4)
    query = training_set[hpoCode]
    if query.nil?
      training_set[hpoCode] = [info]  
    else
      query << info
    end
  end
  # STDERR.puts training_set.keys.inspect
  return training_set
end

#load_training_file4regions(training_file) ⇒ Object



306
307
308
309
310
311
312
313
314
315
316
317
318
319
# File 'lib/pets/io.rb', line 306

def load_training_file4regions(training_file)
  training_set = {}
  posInfo = loadFile(training_file)
  posInfo.each do |info|
    chr = info.shift
    query = training_set[chr]
    if query.nil?
      training_set[chr] = [info]
    else
      query << info
    end
  end
  return training_set
end

#load_variants(variant_folder) ⇒ Object



203
204
205
206
207
208
209
210
211
212
213
214
215
# File 'lib/pets/io.rb', line 203

def load_variants(variant_folder)
  variants = {}
  Dir.glob(File.join(variant_folder, '*.{tab,vcf,vcf.gz}')).each do |path|
    profile_id, ext = File.basename(path).split(".", 2)
    if ext == 'tab' || ext == 'txt'
      vars = load_tabular_vars(path)
    elsif ext == 'vcf' || ext == 'vcf.gz'
      vars = load_vcf(path, ext)
    end
    variants[profile_id] = Genomic_Feature.new(vars)
  end
  return variants
end

#load_vcf(path, ext) ⇒ Object

Some compressed files are fragmented internally. If so, VCFfile only reads first fragment



228
229
230
231
232
233
234
235
236
# File 'lib/pets/io.rb', line 228

def load_vcf(path, ext) # Some compressed files are fragmented internally. If so, VCFfile only reads first fragment
  vars = []             # Use zcat original.vcf.gz | gzip > new.vcf.gz to obtain a contigous file
  vcf  = BioVcf::VCFfile.new(file: path, is_gz: ext == 'vcf.gz' ? true : false )
  vcf.each do |var|
    vars << [var.chrom.gsub('chr',''), var.pos, var.pos]
  end
  puts vars.length
  return vars
end

#loadBiosistemsInfo(biosystems_info_path, filterDB) ⇒ Object



67
68
69
70
71
72
73
74
75
76
77
78
79
# File 'lib/pets/generalMethods.rb', line 67

def loadBiosistemsInfo(biosystems_info_path, filterDB)
	bsid2attributes = {}
	infile = open(biosystems_info_path)
	gz = Zlib::GzipReader.new(infile)
	gz.each_line do |line|
		line.chomp!
		#STDERR.puts line.inspect
		fields = line.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '').split("\t")
		bsid = fields.shift
		bsid2attributes[bsid] = [fields[1], fields[2]] if filterDB == fields[0]
	end
	return bsid2attributes
end

#loadFile(file, thresold = 0) ⇒ Object

  1. Load training info file:

Chr;Start;Stop;HPO;Association;node



342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
# File 'lib/pets/io.rb', line 342

def loadFile(file, thresold=0)
  information = []
  File.open(file).each do |line|
    line.chomp!
    allInfo = line.split("\t")
    associationValue = allInfo[4].to_f
    if associationValue >= thresold
      chr = allInfo[0]
      startPos = allInfo[1].to_i
      stopPos = allInfo[2].to_i
      hpoCode = allInfo[3]
      nodeID = allInfo[5]
      information << [chr, startPos, stopPos, nodeID, hpoCode, associationValue]
    end
  end
  return information
end

#merge_genes_with_kegg_data(gene_list, kegg_data) ⇒ Object



98
99
100
101
102
103
104
105
106
107
# File 'lib/pets/generalMethods.rb', line 98

def merge_genes_with_kegg_data(gene_list, kegg_data)
	merged_data = {}
	gene_list.each do |geneID, values|
		geneName, geneSyn, description = values
		kegg_entry = kegg_data[geneID]	
		kegg_entry = [] if kegg_entry.nil?
		merged_data[geneID] = [geneName, description, kegg_entry, geneSyn]
	end
	return merged_data
end

#parse_clusters_file(clusters_file, patient_data) ⇒ Object



167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# File 'lib/pets/io.rb', line 167

def parse_clusters_file(clusters_file, patient_data)
  clusters_info = {}
  clusters_table = []
  File.open(clusters_file).each do |line|
    line.chomp!
    patientID, clusterID = line.split("\t")
    patientHPOProfile = patient_data.get_profile(patientID)
    query = clusters_info[clusterID]
    if query.nil? 
      clusters_info[clusterID] = {patientID => patientHPOProfile}
    else
      query[patientID] = patientHPOProfile
    end
  end
  clusters_info.each do |clusterID, patients_info|
    patients_per_cluster = patients_info.keys.length
    clusters_table << [clusterID, patients_per_cluster, patients_info.keys, patients_info.values]
  end
  return clusters_table, clusters_info
end

#parse_kegg_data(query_genes) ⇒ Object



422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
# File 'lib/pets/io.rb', line 422

def parse_kegg_data(query_genes)
  kegg_data = {} #gene => attb
    while !query_genes.empty?
      gene_set = query_genes.shift(10)
      url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
      uri = URI(url)
      response = Net::HTTP.get(uri) 
    geneID = nil
    gene_names = []
    definition = nil
    pathways = []
    parsing_pathway_field = false
    response.squeeze(' ').each_line do |line|
      line.chomp!
      if line =~ /^ENTRY/
        geneID = line.split(' ')[1]
      elsif line =~ /^NAME/
        gene_names = line.split(' ', 2).last.split(', ')
      elsif line =~ /^DEFINITION/
        definition = line.split(' ', 2)[1]
      elsif line =~ /^PATHWAY/
        pathways << line.split(' ', 3)[1..2]
        parsing_pathway_field = true
      elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
        parsing_pathway_field = false
      elsif parsing_pathway_field
        pathways << line.strip.split(' ', 2)
      elsif line == '///'
        parsing_pathway_field = false
        kegg_data[geneID] = [gene_names, definition, pathways]
        pathways = []
        gene_names = []
      end
    end
  end
  return kegg_data
end

#parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path) ⇒ Object



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/pets/generalMethods.rb', line 50

def parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path)
	kegg_data = {}
	gene2biosystems = load_biosystem2gene_dictionary(biosystems_gene_path)
	keggAttributes = loadBiosistemsInfo(biosystems_info_path, 'KEGG')
	keggAttributes.select!{|kegg_id, data| data.first =~ /^hsa/}

	gene2biosystems.each do |geneID, pathways|
		kegg_pathways = []
		pathways.each do |biosystem|
			kAttrib = keggAttributes[biosystem]
			kegg_pathways << kAttrib if !kAttrib.nil?
		end
		kegg_data[geneID] = kegg_pathways
	end
	return kegg_data
end

#parse_patient_results(results) ⇒ Object



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/pets/reg2phen_methods.rb', line 83

def parse_patient_results(results)
  patient_results = []
  patient_results << ['Record', 'Region', 'Chromosome', 'Prediction start', 'Prediction stop', 'Phenotype (HPO)', 'P-value', 'SOR start', 'SOR stop', 'Number of patients sharing SOR', 'Genes']
  counter = 1
  results.each do |k, val|
    val.each_with_index do |v, i|
      if i == 0
        patient_results << v.unshift(counter, k)
      else
        patient_results << v.unshift(counter, '')
      end
      counter += 1 
    end
  end
  return patient_results
end

#predict_patient(predictions, training_set, threshold, transform, genes, genes_dictionary) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/pets/reg2phen_methods.rb', line 3

def predict_patient(predictions, training_set, threshold, transform, genes, genes_dictionary)
  results = {}
  predictions.each do |info|
    if genes
      geneID = info.shift
      info = genes_dictionary[geneID]
    end 
    chr, pt_start, pt_stop, is_GeneSynom = info
    sors = training_set[chr]
    next if sors.nil?
    sors.each do |sor_start, sor_stop, nodeID, hpo_code, association_score|
      next if !coor_overlap?(pt_start, pt_stop, sor_start, sor_stop) 
      patientsNum = nodeID.split('.').last
      if association_score >= threshold 
        association_score = 10**(-association_score) if transform
        genes_in_sor = get_genes_by_coordinates(genes_dictionary, chr, sor_start, sor_stop)
        record2save = [chr, pt_start, pt_stop, hpo_code, association_score, sor_start, sor_stop, patientsNum, genes_in_sor.join(',')]
        key = info[0...3].join(":")
        key.concat(" (#{geneID})") if genes
        query_res = results[key]
        if query_res.nil?
            results[key] = [record2save]
        else
            query_res << record2save
        end
      end
    end
  end
  return results
end

#process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters) ⇒ Object



42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/pets/coPatReporterMethods.rb', line 42

def process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
  chrs = Hash.new(0)
  all_phens = []
  profile_ics = []
  profile_lengths = []
  patient_ids.each do |pat_id|
    phenotypes = patient_data.get_profile(pat_id) 
    profile_ics << get_profile_ic(phenotypes, phenotype_ic)
    profile_lengths << phenotypes.length
    if processed_clusters < options[:clusters2show_detailed_phen_data]
      phen_names, rejected_codes = ont.translate_ids(phenotypes) #optional
      all_phens << phen_names
    end  
    patient_data.get_vars(pat_id).get_chr.each{|chr| chrs[chr] += 1} if !options[:chromosome_col].nil?
  end
  return chrs, all_phens, profile_ics, profile_lengths 
end

#process_dummy_clustered_patients(options, clustered_patients, patient_data, phenotype_ic) ⇒ Object

get ic and chromosomes



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/pets/coPatReporterMethods.rb', line 16

def process_dummy_clustered_patients(options, clustered_patients, patient_data, phenotype_ic) # get ic and chromosomes
  ont = Cohort.get_ontology(Cohort.act_ont)
  all_ics = []
  all_lengths = []
  top_cluster_phenotypes = []
  cluster_data_by_chromosomes = []
  multi_chromosome_patients = 0
  processed_clusters = 0
  clustered_patients.sort_by{|cl_id, pat_ids| pat_ids.length }.reverse.each do |cluster_id, patient_ids|
    num_of_patients = patient_ids.length
    next if num_of_patients == 1
    chrs, all_phens, profile_ics, profile_lengths = process_cluster(patient_ids, patient_data, phenotype_ic, options, ont, processed_clusters)
    top_cluster_phenotypes << all_phens if processed_clusters < options[:clusters2show_detailed_phen_data]
    all_ics << profile_ics
    all_lengths << profile_lengths
    if !options[:chromosome_col].nil?
      multi_chromosome_patients += num_of_patients if chrs.length > 1
      chrs.each do |chr, count|
        cluster_data_by_chromosomes << [cluster_id, num_of_patients, chr, count]
      end
    end
    processed_clusters += 1
  end
  return all_ics, all_lengths, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients
end

#read_compressed_json(path) ⇒ Object



468
469
470
471
472
473
# File 'lib/pets/io.rb', line 468

def read_compressed_json(path)
  infile = open(path)
  gz = Zlib::GzipReader.new(infile)
  object = JSON.parse(gz.read)
  return object
end

#read_excluded_hpo_file(file) ⇒ Object



24
25
26
27
28
29
30
# File 'lib/pets/io.rb', line 24

def read_excluded_hpo_file(file)
  excluded_hpo = []
  File.open(file).each do |line|
    excluded_hpo << line.chomp
  end
  return excluded_hpo
end

#remove_nested_entries(nested_hash) ⇒ Object



172
173
174
175
176
177
178
179
# File 'lib/pets/coPatReporterMethods.rb', line 172

def remove_nested_entries(nested_hash)
  empty_root_ids = []
  nested_hash.each do |root_id, entries|
    entries.select!{|id, val| yield(id, val)}
    empty_root_ids << root_id if entries.empty?
  end
  empty_root_ids.each{|id| nested_hash.delete(id)}
end

#report_data(container, html_file) ⇒ Object



282
283
284
285
286
287
288
289
290
291
292
293
# File 'lib/pets/phen2reg_methods.rb', line 282

def report_data(characterised_hpos, hpo_associated_regions, html_file, hpo, genes_with_kegg_data, pathway_stats)
	container = {:characterised_hpos => characterised_hpos,
	 	:merged_regions => hpo_associated_regions,
	 	:hpo => hpo,
	 	:genes_with_kegg_data => genes_with_kegg_data,
	 	:pathway_stats => pathway_stats
	 }
	template = File.open(File.join(REPORT_FOLDER, 'patient_report.erb')).read
	report = Report_html.new(container, 'Patient HPO profile summary')
	report.build(template)
	report.write(html_file)
end

#save_patient_matrix(output, patient_hpo_profile, regionAttributes, hpo_region_matrix) ⇒ Object



295
296
297
298
299
300
301
302
303
304
# File 'lib/pets/phen2reg_methods.rb', line 295

def save_patient_matrix(output, patient_hpo_profile, regionAttributes, hpo_region_matrix)
	File.open(output, "w") do |f|
		f.puts "Region\t#{patient_hpo_profile.join("\t")}"
		regionAttributes_array = regionAttributes.values
		hpo_region_matrix.each_with_index do |association_values, i|
		  chr, start, stop = regionAttributes_array[i]
		  f.puts "#{chr}:#{start}-#{stop}\t#{association_values.join("\t")}"
		end
	end
end

#scoring_regions(regionAttributes, hpo_region_matrix, scoring_system, pvalue_cutoff, freedom_degree, null_value = 0) ⇒ Object



127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# File 'lib/pets/phen2reg_methods.rb', line 127

def scoring_regions(regionAttributes, hpo_region_matrix, scoring_system, pvalue_cutoff, freedom_degree, null_value=0)
	#hpo_associated_regions = [[chr, start, stop, [hpos_list], [weighted_association_scores]]]
	#hpo_region_matrix = [[0, 0.4, 0, 0.4], [0, 0, 0.5, 0.4]]
	# STDERR.puts "EH"
	regionAttributes_array = regionAttributes.values
	max_cluster_length = hpo_region_matrix.map{|x| x.count {|i| i != 0}}.max if freedom_degree == 'maxnum'
	hpo_region_matrix.each_with_index do |associations, i|
		sample_length = nil
		if freedom_degree == 'prednum'
			sample_length = associations.length
		elsif freedom_degree == 'phennum'
			sample_length = associations.count{|s| s != 0}
		elsif freedom_degree == 'maxnum'
			sample_length = max_cluster_length
		else
			abort("Invalid freedom degree calculation method: #{freedom_degree}")
		end

		if scoring_system == 'mean'
			mean_association = associations.inject(0){|s,x| s + x } / sample_length
			regionAttributes_array[i] << mean_association
		elsif scoring_system == 'fisher'
			#hyper must be ln not log10 from net analyzer
			#https://en.wikipedia.org/wiki/Fisher%27s_method
			# STDERR.puts associations.inspect
			lns = associations.map{|a| Math.log(10 ** -a)} #hyper values come as log10 values
			sum = lns.inject(0){|s, a| s + a} 
			combined_pvalue = Statistics2.chi2_x(sample_length *2, -2*sum)	
			regionAttributes_array[i] << combined_pvalue
		elsif scoring_system == 'stouffer'
			sum = associations.inject(0){|s,x| s + x}
			combined_z_score = sum/Math.sqrt(sample_length)
			regionAttributes_array[i] << combined_z_score
		elsif scoring_system == 'geommean'
			#NOTE: if troubles with number size, use BigDecimal
			geommean_mult = associations.inject(1){|s,x| s * x}
			geommean_association = geommean_mult.to_f ** ( sample_length ** -1 )
			regionAttributes_array[i] << geommean_association
		elsif scoring_system == 'median'
			median_value = associations.length / 2
			if median_value % 2 == 0
				median_up = associations.sort[median_value]
				median_down = associations.sort[median_value - 1]
				pair_median = ( median_up + median_down ) / 2
				median_association = associations.sort[pair_median.ceil]
			else
				median_association = associations.sort[median_value.ceil]
			end
			regionAttributes_array[i] << median_association
		elsif scoring_system == 'maxnum'
			max_association = associations.max
			regionAttributes_array[i] << max_association
		elsif scoring_system == 'minnum'
			min_association = associations.min
			regionAttributes_array[i] << min_association
		else 
			abort("Invalid ranking method: #{scoring_system}")
		end
	end
	if scoring_system == 'mean' || 
		scoring_system == 'geommean' ||
		scoring_system == 'maxnum' ||
		scoring_system == 'minnum'
		regionAttributes.select!{|regionID, attributes| attributes.last >= pvalue_cutoff}
	elsif scoring_system == 'fisher'
		regionAttributes.select!{|regionID, attributes| attributes.last <= pvalue_cutoff}
	end
	#Combined p-value: less value equals better association -> not due randomly.
end

#search4HPO(info2predict, trainingData) ⇒ Object

require ‘bigdecimal’



6
7
8
9
10
11
12
13
14
15
16
# File 'lib/pets/phen2reg_methods.rb', line 6

def search4HPO(info2predict, trainingData)
	#search if there are profile HPOs within the association file 
	hpo_regions = {}
	info2predict.each do |hpo|
		regions = trainingData[hpo]
		if !regions.nil?
			hpo_regions[hpo] = regions
		end
	end
	return hpo_regions
end

#system_call(code_folder, script, args_string) ⇒ Object



8
9
10
11
12
13
14
# File 'lib/pets/generalMethods.rb', line 8

def system_call(code_folder, script, args_string)
	cmd = File.join(code_folder, script) + ' ' + args_string
	puts "==> #{cmd}"
	Benchmark.bm do |x|
		x.report{	system(cmd) }
	end
end

#translate_codes(clusters, hpo) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
# File 'lib/pets/coPatReporterMethods.rb', line 2

def translate_codes(clusters, hpo)
  translated_clusters = []
  clusters.each do |clusterID, num_of_pats, patientIDs_ary, patient_hpos_ary|
        translate_codes = patient_hpos_ary.map{|patient_hpos| patient_hpos.map{|hpo_code| hpo.translate_id(hpo_code)}}
        translated_clusters << [clusterID, 
          num_of_pats, 
          patientIDs_ary, 
          patient_hpos_ary, 
          translate_codes
        ]
  end
  return translated_clusters
end

#translate_hpos_in_results(results, hpo) ⇒ Object



46
47
48
49
50
51
52
53
54
# File 'lib/pets/reg2phen_methods.rb', line 46

def translate_hpos_in_results(results, hpo)
  results.each do |coords, data|
    data.each do |info|
      # hpo_name, rejected = hpo.translate_codes2names([info[3]])
      info[3] = hpo.translate_id(info[3])
      # info[3] = hpo_name.first
    end
  end
end

#write_array(array, file_path) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/pets/io.rb', line 41

def write_array(array, file_path)
  File.open(file_path, 'w') do |handler|
    array.each do |record|
      if record.class == String
        line = record
      else
        line = record.join("\t")
      end
      handler.puts line  
    end
  end
end

#write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name) ⇒ Object



128
129
130
131
132
133
134
135
136
137
# File 'lib/pets/io.rb', line 128

def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
  File.open(filename, 'w') do |f|
    f.puts "#{x_axis_name}\t#{y_axis_name}"
    x_axis_value.each_with_index do |value,i|
      y_value = y_axis_value[i]
      raise("The #{i} position is not presented in y_axis_value") if y_value.nil?
      f.puts [value, y_value].join("\t")
    end
  end
end

#write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit) ⇒ Object



76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/pets/io.rb', line 76

def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
  File.open(cluster_chromosome_data_file, 'w') do |f|
    f.puts %w[cluster_id chr count].join("\t")
    index = 0
    last_id = cluster_data.first.first unless cluster_data.empty?
    cluster_data.each do |cluster_id, patient_number, chr, count|
      index += 1 if cluster_id != last_id 
      break if index == limit
      f.puts ["#{patient_number}_#{index}", chr, count].join("\t")
      last_id = cluster_id
    end
  end
end

#write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit) ⇒ Object



63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/pets/io.rb', line 63

def write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit)
  File.open(cluster_ic_data_file, 'w') do |f|
    f.puts %w[cluster_id ic Plen].join("\t")
    all_ics.each_with_index do |cluster_ics, i|
      break if i == limit
      cluster_length = cluster_ics.length
      cluster_ics.each_with_index do |clust_ic, j|
        f.puts "#{cluster_length}_#{i}\t#{clust_ic}\t#{profile_lengths[i][j]}"
      end
    end
  end
end

#write_compressed_plain_file(data, path) ⇒ Object



460
461
462
463
464
465
466
# File 'lib/pets/io.rb', line 460

def write_compressed_plain_file(data, path)
  File.open(path, 'w') do |f|
    gz = Zlib::GzipWriter.new(f)
    gz.write data.to_json
    gz.close
  end
end

#write_coverage_data(coverage_to_plot, coverage_to_plot_file) ⇒ Object



90
91
92
93
94
95
96
# File 'lib/pets/io.rb', line 90

def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
  File.open(coverage_to_plot_file, 'w') do |f|
    coverage_to_plot.each do |chr, position, freq|
     f.puts "#{chr}\t#{position}\t#{freq}"
   end
  end
end

#write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats) ⇒ Object



99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/pets/io.rb', line 99

def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
  CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
    suggested_childs.each do |pat_id, suggestions|
      warning = nil
      warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
      csv << ["PATIENT #{pat_id}", "#{warning}"]
      csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
      suggestions.each do |parent, childs|
        parent_code, parent_name = parent
        if childs.empty?
          csv << ["#{parent_name} (#{parent_code})", '-']
        else
          parent_writed = false
          childs.each do |child_code, child_name|
            if !parent_writed
              parent_field = "#{parent_name} (#{parent_code})"
              parent_writed = true
            else
              parent_field = ""
            end
            csv << [parent_field, "#{child_name} (#{child_code})"]
          end
        end
      end
      csv << ["", ""]
    end
  end
end

#write_hash(hash, file_path, header = []) ⇒ Object



32
33
34
35
36
37
38
39
# File 'lib/pets/io.rb', line 32

def write_hash(hash, file_path, header = [])
  File.open(file_path, 'w') do |handler|
    handler.puts header.join("\t") if !header.empty?
    hash.each do |key, array|
      handler.puts "#{key}\t#{array.join("\t")}"
    end
  end
end

#write_matrix_for_R(matrix, x_names, y_names, file) ⇒ Object



54
55
56
57
58
59
60
61
# File 'lib/pets/io.rb', line 54

def write_matrix_for_R(matrix, x_names, y_names, file)
  File.open(file, 'w') do |f|
    f.puts x_names.join("\t")
    matrix.each_with_index do |row, i|
      f.puts [y_names[i]].concat(row).join("\t")
    end
  end
end

#write_patient_hpo_stat(average_hp_per_pat_distribution, output_file) ⇒ Object



158
159
160
161
162
163
164
165
# File 'lib/pets/io.rb', line 158

def write_patient_hpo_stat(average_hp_per_pat_distribution, output_file)
  File.open(output_file, 'w') do |f|
    f.puts "#{'PatientsNumber'}\t#{'HPOAverage'}"
    average_hp_per_pat_distribution.each do |patient_num, ave|
      f.puts "#{patient_num}\t#{ave}"
    end
  end
end

#write_profile_pairs(similarity_pairs, filename) ⇒ Object



148
149
150
151
152
153
154
155
156
# File 'lib/pets/io.rb', line 148

def write_profile_pairs(similarity_pairs, filename)
  File.open(filename, 'w') do |f|
    similarity_pairs.each do |pairsA, pairsB_and_values|
      pairsB_and_values.each do |pairsB, values|
        f.puts "#{pairsA}\t#{pairsB}\t#{values}"
      end
    end
  end
end

#write_similarity_matrix(similarity_matrix, similarity_matrix_file) ⇒ Object



140
141
142
143
144
145
146
# File 'lib/pets/io.rb', line 140

def write_similarity_matrix(similarity_matrix, similarity_matrix_file)  
  File.open(similarity_matrix_file, 'w') do |f|
    similarity_matrix.each do |row|
      f.puts row.join("\t")
    end
  end
end