Top Level Namespace
Defined Under Namespace
Modules: Bio, ChimericSeqs, CommonFunctions, FlAnalysis, FlnStats, FullLengtherNext, NcRna, ScbiMapreduce
Classes: Cdhit, ExoBlastHit, ExonerateResult, Mapping, Mpileup, MyWorker, MyWorkerEst, MyWorkerManagerEst, MyWorkerManagerFln, Orf, Seq, Sequence, String, TestCode, UneLosHit
Constant Summary
collapse
- FAILED =
-5
- OTHER =
-4
- UNMAPPED =
-3
- CHIMERA =
-2
- MISASSEMBLED =
-1
- UNKNOWN =
0
- COMPLETE =
1
- N_TERMINAL =
2
- C_TERMINAL =
3
- INTERNAL =
4
- NCRNA =
5
- CODING =
6
- OPERATION =
0
- QUERY =
1
- TARGET =
2
Constants included
from FlnStats
FlnStats::REPORT_FOLDER
ChimericSeqs::BEG, ChimericSeqs::HIT, ChimericSeqs::STOP
Instance Method Summary
collapse
-
#analysis_over_DB_annotated_seqs(seqs_annotation_DB, reptrans_fasta, cluster_file_path, stats_hash, key_stats, pfam_clustering) ⇒ Object
-
#artifact?(seq, query, db_name, db_path, options, new_seqs) ⇒ Boolean
-
#clean_by_identity(blast_result, ident) ⇒ Object
-
#clean_by_query_length_match(blast_result, min_len_nt) ⇒ Object
-
#clean_hsp_by_identity(hit, identity) ⇒ Object
-
#clean_overlapping_hsps(blast_result, keep_if_diff_sense = false) ⇒ Object
-
#clean_subject_overlapping_hsps(complete_hit, cleaned_hits) ⇒ Object
-
#cluster_hsps(hsps) ⇒ Object
-
#clustering_by_annot(seqs_with_hit, annotation_type) ⇒ Object
-
#clustering_by_id(seqs_with_hit) ⇒ Object
-
#count_cpu(options) ⇒ Object
-
#do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash) ⇒ Object
Second server to representative transcriptome.
-
#do_makeblastdb(seqs, output, dbtype) ⇒ Object
-
#filter_hits(query, select_hits = 10) ⇒ Object
-
#find_hit(hit_acc, ar_hits) ⇒ Object
-
#get_coverage_subject(hit) ⇒ Object
-
#go_for_graph(sequences_by_ontologies, fpkm = {}) ⇒ Object
-
#hsps_relationship_subject(hit) ⇒ Object
-
#load_cd_hit_sequences_names(file) ⇒ Object
-
#load_isoform_hash(file) ⇒ Object
-
#load_seq_in_hash(seq_name, seq, isoform_hash) ⇒ Object
-
#misassembled_detection(query) ⇒ Object
-
#multiple_hsps(query, num) ⇒ Object
-
#overlapping_hsps_on_subject(query) ⇒ Object
-
#reduce_pool_sequences(putative_seqs, main_path, cpu) ⇒ Object
-
#report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB) ⇒ Object
-
#reptrans(seqs_annotation_prot, seqs_some_coding, seqs_unknown, options) ⇒ Object
-
#same_query_hsp(hit, second_hit) ⇒ Object
-
#same_sense?(hit, second_hit) ⇒ Boolean
-
#same_subject_hsp(hit, second_hit) ⇒ Object
-
#select_hits_by_coverage_subject(hits) ⇒ Object
-
#select_hits_by_evalue(hits, evalue) ⇒ Object
-
#select_hits_by_identity_query(hits) ⇒ Object
-
#select_hsps_by_id(hits, selected_ids) ⇒ Object
-
#select_representative(clusters_seqs_annot_prot) ⇒ Object
-
#select_seqs_more_500pb(seqs_array) ⇒ Object
-
#select_seqs_with_name(array_seqs, array_names) ⇒ Object
-
#set_thresold_evalue(hits) ⇒ Object
-
#subject_overlapping_hsps(hit) ⇒ Object
-
#write_fasta(seqs_array, file_name, mode) ⇒ Object
Methods included from FlnStats
#add_percentages_by_scalar, #add_percentages_by_vector, #calculate_n50_n90, #coding_stats_reptrans, #get_taxonomy, #handle_data_main_summary, #handle_data_reptrans_summary, #initialize_stats_hash, #initialize_stats_hash_reptrans, #last_stats, #sequence_stats, #summary_stats, #table_title, #write_mapping_report, #write_reptrans_stats, #write_summary_stats, #write_txt
#check_frame_shift, #contenidos_en_prot, #corrige_frame, #extend_match, #match_with_gapped_reference, #match_with_ungapped_reference, #refine_match, #reverse_seq, #scan_sequences
Methods included from NcRna
#find_nc_rna
Methods included from FlAnalysis
#analiza_orf_y_fl, #compare_seq_length_with_subject, #determine_status, #exonerate_fix_frame_shift, #find_end, #find_start, #mark_nt_seqs, #modify_3p_align, #modify_5p_align, #save_annotations, #set_start_codon, #show_nts
#cluster_query_hits, #confirm_chimeras, #define_hit_limits, #define_homology_zones, #delete_zones, #do_clustal, #duplicate_hits, #fragment_chimera, #get_hits, #get_limits, #get_limits_s, #hit_is_in?, #hit_move_limits, #hit_set_q_len, #min_distance_between_homology_zones, #move_limits, #overlapping_zones, #search_chimeras, #search_ident_prots, #set_cut_positions, #set_limits
Instance Method Details
#analysis_over_DB_annotated_seqs(seqs_annotation_DB, reptrans_fasta, cluster_file_path, stats_hash, key_stats, pfam_clustering) ⇒ Object
67
68
69
70
71
72
73
74
75
76
77
|
# File 'lib/full_lengther_next/reptrans.rb', line 67
def analysis_over_DB_annotated_seqs(seqs_annotation_DB, reptrans_fasta, cluster_file_path, stats_hash, key_stats, pfam_clustering)
clusters_seqs_annot_DB = clustering_by_id(seqs_annotation_DB)
representative_seqs_annot_DB = select_representative(clusters_seqs_annot_DB)
if pfam_clustering
clusters_seqs_annot_DB = clustering_by_annot(representative_seqs_annot_DB, :pfam_id) representative_seqs_annot_DB = select_representative(clusters_seqs_annot_DB) end
stats_hash[key_stats] += representative_seqs_annot_DB.length
report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB)
write_fasta(representative_seqs_annot_DB, reptrans_fasta, 'w')
end
|
#artifact?(seq, query, db_name, db_path, options, new_seqs) ⇒ Boolean
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
|
# File 'lib/full_lengther_next/artifacts.rb', line 9
def artifact?(seq, query, db_name, db_path, options, new_seqs)
artifact = false
if query.nil? && seq.unmapped? seq.hit = nil
artifact = true
seq.type = UNMAPPED
end
if !query.nil?
if !artifact && misassembled_detection(query) seq.hit = query.hits.first
artifact = true
seq.type = MISASSEMBLED
seq.warnings('ERROR#1')
end
=begin
if !artifact
hit_reference = query.hits.first.dup
query, overlapping = overlapping_hsps_on_subject(query)
if overlapping
if query.hits.first.nil?
seq.hit = hit_reference
else
seq.hit = query.hits.first
end
artifact = true
seq.type = OTHER
seq.warnings('ERROR#2')
end
end
=end
if !artifact && multiple_hsps(query, 3)
seq.hit = query.hits.first
seq.warnings('ERROR#3')
end
if !artifact && !options[:chimera].include?('d')
chimera = search_chimeras(seq, query, options, db_name, db_path)
if !chimera.nil?
new_seqs.concat(chimera)
seq.db_name = db_name
seq.type = CHIMERA
artifact = true
end
end
end
if artifact
if $verbose > 1
puts seq.prot_annot_calification
end
seq.db_name = db_name
seq.save_fasta = false
seq.ignore = true
end
return artifact
end
|
#clean_by_identity(blast_result, ident) ⇒ Object
137
138
139
140
141
142
143
144
145
146
|
# File 'lib/full_lengther_next/blast_functions.rb', line 137
def clean_by_identity(blast_result, ident)
blast_result.querys.each do |query|
if !query.hits.first.nil?
new_hits = query.hits.select{|hit| hit.ident > ident}
new_hits = [nil] if new_hits.empty? query.hits = new_hits
end
query.full_query_length = query.full_query_length.to_i end
end
|
#clean_by_query_length_match(blast_result, min_len_nt) ⇒ Object
148
149
150
151
152
153
154
155
156
157
158
|
# File 'lib/full_lengther_next/blast_functions.rb', line 148
def clean_by_query_length_match(blast_result, min_len_nt)
blast_result.querys.each do |query|
if !query.hits.first.nil?
new_hits = query.hits.select{|hit| hit.align_len * 3 > min_len_nt}
new_hits = [nil] if new_hits.empty? query.hits = new_hits
end
query.full_query_length = query.full_query_length.to_i
end
end
|
#clean_hsp_by_identity(hit, identity) ⇒ Object
298
299
300
301
|
# File 'lib/full_lengther_next/blast_functions.rb', line 298
def clean_hsp_by_identity(hit, identity)
hit.select!{|hsp| hsp.ident >= identity}
return hit
end
|
#clean_overlapping_hsps(blast_result, keep_if_diff_sense = false) ⇒ Object
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
|
# File 'lib/full_lengther_next/blast_functions.rb', line 161
def clean_overlapping_hsps(blast_result, keep_if_diff_sense = false)
blast_result.querys.each do |query|
if query.hits.length > 1
query.hits.each_with_index do |hit, j|
if hit.nil?
next
end
query.hits.each_with_index do |second_hit, i|
if second_hit.nil? || i == j next
end
if same_query_hsp(hit, second_hit) if keep_if_diff_sense
if same_sense?(hit, second_hit) query.hits[i] = nil
end
else
query.hits[i] = nil
end
end
end
end
query.hits.compact!
end
end
end
|
#clean_subject_overlapping_hsps(complete_hit, cleaned_hits) ⇒ Object
246
247
248
249
250
251
252
|
# File 'lib/full_lengther_next/blast_functions.rb', line 246
def clean_subject_overlapping_hsps(complete_hit, cleaned_hits)
if complete_hit.length > 1
complete_hit, overlapping = subject_overlapping_hsps(complete_hit)
end
cleaned_hits.concat(complete_hit)
return complete_hit, overlapping
end
|
#cluster_hsps(hsps) ⇒ Object
303
304
305
306
307
308
309
310
311
312
313
314
315
|
# File 'lib/full_lengther_next/blast_functions.rb', line 303
def cluster_hsps(hsps)
hits = []
last_acc = ''
hsps.each do |hsp|
if hsp.acc != last_acc
hits << [hsp]
else
hits.last << hsp
end
last_acc = hsp.acc
end
return hits
end
|
#clustering_by_annot(seqs_with_hit, annotation_type) ⇒ Object
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
|
# File 'lib/full_lengther_next/reptrans.rb', line 165
def clustering_by_annot(seqs_with_hit, annotation_type)
clusters = []
annot_id = []
no_annotation_clusters = []
seqs_with_hit.each do |seq|
annot = seq.functional_annotations[annotation_type]
annot = annot.split(';').sort.join(';') if !annot.nil?
if annot == '-' || annot.nil?
no_annotation_clusters << [seq]
else
position = annot_id.index(annot)
if position.nil?
annot_id << annot
clusters << [seq]
else
clusters[position] << seq
end
end
end
clusters.concat(no_annotation_clusters)
return clusters
end
|
#clustering_by_id(seqs_with_hit) ⇒ Object
150
151
152
153
154
155
156
157
158
159
160
161
162
163
|
# File 'lib/full_lengther_next/reptrans.rb', line 150
def clustering_by_id(seqs_with_hit)
clusters=[]
hit_id=[]
seqs_with_hit.each do |seq|
position=hit_id.index(seq.get_acc)
if position.nil?
hit_id << seq.get_acc
clusters << [seq]
else
clusters[position] << seq
end
end
return clusters
end
|
#count_cpu(options) ⇒ Object
206
207
208
209
210
211
212
213
214
|
# File 'lib/full_lengther_next/reptrans.rb', line 206
def count_cpu(options)
cpu = 0
if options[:workers].class.to_s == 'Array'
cpu = options[:workers].length + 1
else
cpu = options[:workers]
end
return cpu
end
|
#do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash) ⇒ Object
Second server to representative transcriptome
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
|
# File 'lib/full_lengther_next/reptrans.rb', line 101
def do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash) $LOG.info 'Starting server for EST analysis'
custom_worker_file = File.join(File.dirname(ROOT_PATH),'lib','full_lengther_next','classes','my_worker_EST.rb')
options[:chimera] = nil
MyWorkerManagerEst.init_work_manager(putative_seqs, options, blast_path)
server_EST = ScbiMapreduce::Manager.new(options[:server_ip], options[:port], options[:workers], MyWorkerManagerEst, custom_worker_file, STDOUT, FULL_LENGTHER_NEXT_INIT)
server_EST.chunk_size = options[:chunk_size]
server_EST.start_server
$LOG.info 'Closing server for EST analysis'
seqs_with_EST, putative_seqs = MyWorkerManagerEst.get_array_seqs
if !seqs_with_EST.empty?
analysis_over_DB_annotated_seqs(seqs_with_EST, reptrans_fasta, cluster_EST_annotated_path, stats_hash, 'est_annotated')
end
return putative_seqs
end
|
#do_makeblastdb(seqs, output, dbtype) ⇒ Object
35
36
37
38
39
40
41
42
43
44
|
# File 'lib/full_lengther_next/handle_db.rb', line 35
def do_makeblastdb(seqs, output, dbtype)
cmd="makeblastdb -in - -out #{output} -title #{File.basename(output)} -dbtype #{dbtype} -parse_seqids"
IO.popen(cmd,'w+') {|makedb|
makedb.sync = true
makedb.write(seqs)
makedb.close_write
puts makedb.readlines
makedb.close_read
}
end
|
#filter_hits(query, select_hits = 10) ⇒ Object
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
|
# File 'lib/full_lengther_next/blast_functions.rb', line 4
def filter_hits(query, select_hits=10) hits = query.hits
if !hits.first.nil?
hits = cluster_hsps(hits)
hits = hits[0..select_hits]
hits = select_hits_by_identity_query(hits)
hits = select_hits_by_coverage_subject(hits)
end
if hits.empty?
if select_hits >= query.hits.length || select_hits >= 100 hits = [cluster_hsps(query.hits).first]
else
hits = filter_hits(query, select_hits+10)
end
end
return hits
end
|
#find_hit(hit_acc, ar_hits) ⇒ Object
317
318
319
320
321
322
323
324
325
326
|
# File 'lib/full_lengther_next/blast_functions.rb', line 317
def find_hit(hit_acc, ar_hits)
selected_hit = nil
ar_hits.each do |hit|
if hit.first.acc == hit_acc
selected_hit = hit
break
end
end
return selected_hit
end
|
#get_coverage_subject(hit) ⇒ Object
22
23
24
25
26
27
28
29
30
31
32
|
# File 'lib/full_lengther_next/blast_functions.rb', line 22
def get_coverage_subject(hit)
perc_identity = hit.align_len*100.0/hit.s_len
if perc_identity > 100 && hit.class.to_s == 'ExoBlastHit' && !hit.q_frameshift.empty?
hit.q_frameshift.length.times do |n| align_len = hit.align_len- (n + 1)
perc_identity = align_len*100.0/hit.s_len
break if perc_identity <= 100
end
end
return perc_identity
end
|
#go_for_graph(sequences_by_ontologies, fpkm = {}) ⇒ Object
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
|
# File 'lib/full_lengther_next/go_methods.rb', line 1
def go_for_graph(sequences_by_ontologies, fpkm = {})
container = {}
go_data = [
[:function_go, 'F:'],
[:component_go, 'C:'],
[:process_go, 'P:']
]
go_data.each do |key, prefix|
go_ontology = sequences_by_ontologies.select{|go, seq_ids| go =~ /^#{prefix}/}
go_names = []
go_vals = []
go_ontology.each do |go_name, seq_names|
go_label = go_name.gsub(prefix, '')
if fpkm.empty?
go_vals << seq_names.length
go_names << go_label
else
sum = seq_names.map{|seq_name| fpkm[seq_name].first }.inject { |sum, n| sum + n }
if sum > 0
go_vals << sum
go_names << go_label
end
end
end
go_table = []
go_names.each_with_index do |name, index|
go_table << [name, go_vals[index]]
end
go_table.sort!{|v1, v2| v2[1] <=> v1[1]}
go_table.unshift([key.to_s, 'GO'])
if !go_names.empty?
container[key] = go_table
else
container[key] = [
[key.to_s, 'GO'],
['No_data', 1]
]
end
end
return container
end
|
#hsps_relationship_subject(hit) ⇒ Object
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
|
# File 'lib/full_lengther_next/blast_functions.rb', line 271
def hsps_relationship_subject(hit)
hsps = []
hit.each_with_index do |hsp, j|
hit.each_with_index do |second_hsp, i|
if i == j next
end
if same_subject_hsp(hsp, second_hsp)
if !hsps.include?([hsp, second_hsp]) && !hsps.include?([second_hsp, hsp]) hsps << [hsp, second_hsp]
end
end
end
end
return hsps
end
|
#load_cd_hit_sequences_names(file) ⇒ Object
120
121
122
123
124
125
126
127
128
129
130
|
# File 'lib/full_lengther_next/reptrans.rb', line 120
def load_cd_hit_sequences_names(file)
names=[]
File.open(file).readlines.each do |line|
if line =~ /^>/
line.chomp!
line.gsub!('>','')
names << line
end
end
return names
end
|
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
|
# File 'lib/full_lengther_next/handle_db.rb', line 3
def load_isoform_hash(file)
isoform_hash = {}
if File.exists?(file)
fasta = ScbiZcatFile.new(file)
filtered_fasta = ''
seq_name = nil
seq = ''
while !fasta.eof
line = fasta.readline.chomp
if line[0] == '>'
load_seq_in_hash(seq_name, seq, isoform_hash) if !seq_name.nil?
seq_name = line
seq = ''
else
seq << line
end
end
load_seq_in_hash(seq_name, seq, isoform_hash)
end
return isoform_hash
end
|
#load_seq_in_hash(seq_name, seq, isoform_hash) ⇒ Object
25
26
27
28
29
30
31
32
33
|
# File 'lib/full_lengther_next/handle_db.rb', line 25
def load_seq_in_hash(seq_name, seq, isoform_hash)
name, desc = seq_name.split(' ', 2)
name =~ /(\w+\|(\w+)\-\d+\|)/
if isoform_hash[$2].nil?
isoform_hash[$2] = ">#{$1}#{desc}\n#{seq}"
else
isoform_hash[$2] += "\n>#{$1}#{desc}\n#{seq}"
end
end
|
#misassembled_detection(query) ⇒ Object
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
|
# File 'lib/full_lengther_next/blast_functions.rb', line 192
def misassembled_detection(query)
miss=false
hits = cluster_hsps(query.hits)
misassembled_hits = []
hits.each do |hit|
if hit.length > 1
negative_frame = hit.select{|hsp| hsp.q_frame < 0}
if negative_frame.length > 0 && negative_frame.length != hit.length
misassembled_hits << hit.first.acc
end
end
end
if misassembled_hits.length*1.0/ hits.length > 0.5
miss = true
else query.hits.reverse_each do |hsp|
if misassembled_hits.include?(hsp.acc)
query.hits.delete(hsp)
end
end
end
return miss
end
|
#multiple_hsps(query, num) ⇒ Object
216
217
218
219
220
221
222
223
|
# File 'lib/full_lengther_next/blast_functions.rb', line 216
def multiple_hsps(query, num)
multiple = false
hsps = query.hits.select{|h| h.acc == query.hits.first.acc}
if hsps.length >= num
multiple = true
end
return multiple
end
|
#overlapping_hsps_on_subject(query) ⇒ Object
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
|
# File 'lib/full_lengther_next/blast_functions.rb', line 225
def overlapping_hsps_on_subject(query)
overlapping = false
current_hit = query.hits.first.acc
complete_hit = []
cleaned_hits = []
query.hits.each do |hit|
if hit.acc != current_hit
complete_hit, overlapping = clean_subject_overlapping_hsps(complete_hit, cleaned_hits)
complete_hit = []
end
complete_hit << hit
current_hit = hit.acc
end
complete_hit, overlapping = clean_subject_overlapping_hsps(complete_hit, cleaned_hits)
query.hits = cleaned_hits
return query, overlapping
end
|
#reduce_pool_sequences(putative_seqs, main_path, cpu) ⇒ Object
88
89
90
91
92
93
94
95
96
97
98
99
|
# File 'lib/full_lengther_next/reptrans.rb', line 88
def reduce_pool_sequences(putative_seqs, main_path, cpu)
temp_fasta = File.join(main_path, 'temp.fasta')
temp_fasta_clean = File.join(main_path, 'temp_cln.fasta')
log_file = File.join(main_path, 'log_cd_hit_Cod_Unk')
write_fasta(putative_seqs, temp_fasta, 'w')
$LOG.info "Start cd-hit with coding and unknow sequences"
system("cd-hit -i #{temp_fasta} -o #{temp_fasta_clean} -c 0.95 -M 0 -T #{cpu} > #{log_file}") if !File.exists?(temp_fasta_clean)
$LOG.info "Ended cd-hit with coding and unknow sequences"
cd_hit_names_putative_seqs = load_cd_hit_sequences_names(temp_fasta_clean)
putative_seqs = select_seqs_with_name(putative_seqs, cd_hit_names_putative_seqs)
return putative_seqs
end
|
#report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB) ⇒ Object
79
80
81
82
83
84
85
86
|
# File 'lib/full_lengther_next/reptrans.rb', line 79
def report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB)
cluster_file = File.open(cluster_file_path, 'w')
representative_seqs_annot_DB.each_with_index do |rep_seq, i|
cluster_seqs = clusters_seqs_annot_DB[i].map{|seq| seq.seq_name}.join(';')
cluster_file.puts "#{rep_seq.seq_name}\t#{cluster_seqs}"
end
cluster_file.close
end
|
#reptrans(seqs_annotation_prot, seqs_some_coding, seqs_unknown, options) ⇒ Object
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
|
# File 'lib/full_lengther_next/reptrans.rb', line 9
def reptrans(seqs_annotation_prot, seqs_some_coding ,seqs_unknown, options)
cpus = count_cpu(options)
stats_hash = initialize_stats_hash_reptrans
main_path = File.join(Dir.pwd, 'fln_results')
reptrans_fasta = File.join(main_path, 'Representative_transcriptome.fasta')
blast_path = File.join(main_path, 'ESTdb')
cluster_prot_annotated_path =File.join(main_path, 'Prot_clusters')
cluster_EST_annotated_path =File.join(main_path, 'EST_clusters')
html_file = File.join(main_path, 'Representative_transcriptome_stats.html')
txt_file = File.join(main_path, 'Representative_transcriptome_stats.txt')
analysis_over_DB_annotated_seqs(seqs_annotation_prot, reptrans_fasta, cluster_prot_annotated_path, stats_hash, 'prot_annotated', options[:high_clustering])
seqs_annotation_prot = nil
putative_seqs = seqs_some_coding
if !options[:est_db].nil? putative_seqs += seqs_unknown putative_seqs = reduce_pool_sequences(putative_seqs, main_path, cpus)
if !File.exists?(blast_path +'.nsq')
$LOG.info "Start makeblastdb over EST DB"
system("makeblastdb -in #{options[:est_db]} -out #{blast_path} -dbtype nucl -parse_seqids > #{File.join(main_path, 'log_makeblast_db')}")
$LOG.info "Ended makeblastdb over EST DB"
end
putative_seqs = do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash)
end
if !putative_seqs.nil? && !putative_seqs.empty?
putative_seqs = select_seqs_more_500pb(putative_seqs)
putative_seqs = reduce_pool_sequences(putative_seqs, main_path, cpus) if options[:est_db].nil? putative_seqs.sort!{|s1, s2| if s2.t_code == s1.t_code
s2.fasta_length <=> s1.fasta_length
else
s2.t_code <=> s1.t_code
end
}
count = 0
putative_seqs.each do |coding_seq|
coding_stats_reptrans(coding_seq, stats_hash)
count +=1
end
write_fasta(putative_seqs, reptrans_fasta, 'a')
end
write_reptrans_stats(stats_hash, html_file, txt_file)
end
|
#same_query_hsp(hit, second_hit) ⇒ Object
117
118
119
120
121
122
123
124
125
|
# File 'lib/full_lengther_next/blast_functions.rb', line 117
def same_query_hsp(hit, second_hit)
same = false
if hit.acc == second_hit.acc
if hit.q_beg <= second_hit.q_beg && hit.q_end >= hit.q_end && (second_hit.q_beg - hit.q_end).abs > 1
same = true
end
end
return same
end
|
#same_sense?(hit, second_hit) ⇒ Boolean
127
128
129
130
131
132
133
134
135
|
# File 'lib/full_lengther_next/blast_functions.rb', line 127
def same_sense?(hit, second_hit)
same= false
hit_sense = hit.q_frame <=> 0
second_hit_sense = second_hit.q_frame <=> 0
if hit_sense == second_hit_sense
same = true
end
return same
end
|
#same_subject_hsp(hit, second_hit) ⇒ Object
107
108
109
110
111
112
113
114
115
|
# File 'lib/full_lengther_next/blast_functions.rb', line 107
def same_subject_hsp(hit, second_hit)
same = false
if hit.acc == second_hit.acc
if hit.s_beg <= second_hit.s_beg && hit.s_end >= hit.s_end && (second_hit.s_beg - hit.s_end).abs > 1
same = true
end
end
return same
end
|
#select_hits_by_coverage_subject(hits) ⇒ Object
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
|
# File 'lib/full_lengther_next/blast_functions.rb', line 34
def select_hits_by_coverage_subject(hits)
selected_hits = []
coverage_thresold = get_coverage_subject(hits.first.first)
coverage_thresold = 100 if coverage_thresold > 100
hits.map{|hit|
hit.each do |hsp|
coverage = get_coverage_subject(hsp)
if coverage > 100
next
end
if coverage >= coverage_thresold
selected_hits << hit
break
end
end
}
return selected_hits
end
|
#select_hits_by_evalue(hits, evalue) ⇒ Object
68
69
70
71
72
73
74
75
76
77
78
|
# File 'lib/full_lengther_next/blast_functions.rb', line 68
def select_hits_by_evalue(hits, evalue)
selected_hits = []
hits.map{|hit|
hit.each do |hsp|
if hsp.e_val <= evalue
selected_hits << hit
end
end
}
return selected_hits
end
|
#select_hits_by_identity_query(hits) ⇒ Object
54
55
56
57
58
59
60
61
62
63
64
65
66
|
# File 'lib/full_lengther_next/blast_functions.rb', line 54
def select_hits_by_identity_query(hits)
selected_hits = []
identity = hits.first.first.ident
hits.map{|hit|
hit.each do |hsp|
if hsp.ident >= identity
selected_hits << hit
break
end
end
}
return selected_hits
end
|
#select_hsps_by_id(hits, selected_ids) ⇒ Object
80
81
82
83
84
85
86
87
88
|
# File 'lib/full_lengther_next/blast_functions.rb', line 80
def select_hsps_by_id(hits, selected_ids)
selected_hits = []
hits.map{|hsp|
if selected_ids.include?(hsp.acc)
selected_hits << hsp
end
}
return selected_hits
end
|
#select_representative(clusters_seqs_annot_prot) ⇒ Object
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
|
# File 'lib/full_lengther_next/reptrans.rb', line 188
def select_representative(clusters_seqs_annot_prot)
seqs = []
clusters_seqs_annot_prot.each do |cluster|
if !cluster.first.coverage_analysis.empty? max_transcript_mean_coverage = cluster.map{|seq| seq.coverage_analysis[3] }.max - 0.05 cluster.select!{|seq| seq.coverage_analysis[3] >= max_transcript_mean_coverage}
end
seq = cluster.select{|s| s.type == COMPLETE}.sort{|fl1, fl2| fl2.seq_fasta <=> fl1.seq_fasta}.first if seq.nil?
cluster.sort!{|cl1, cl2| cl2.get_pident <=> cl1.get_pident}
best_pident = cluster.first.get_pident
seq = cluster.select{|s| s.get_pident == best_pident}.sort{|s1, s2| s2.seq_fasta <=> s1.seq_fasta}.first
end
seqs << seq
end
return seqs
end
|
#select_seqs_more_500pb(seqs_array) ⇒ Object
132
133
134
135
|
# File 'lib/full_lengther_next/reptrans.rb', line 132
def select_seqs_more_500pb(seqs_array)
seqs = seqs_array.select{|seq| seq.fasta_length > 500 }
return seqs
end
|
#select_seqs_with_name(array_seqs, array_names) ⇒ Object
137
138
139
140
|
# File 'lib/full_lengther_next/reptrans.rb', line 137
def select_seqs_with_name(array_seqs, array_names)
seqs = array_seqs.select{|seq| array_names.include?(seq.seq_name)}
return seqs
end
|
#set_thresold_evalue(hits) ⇒ Object
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
|
# File 'lib/full_lengther_next/blast_functions.rb', line 90
def set_thresold_evalue(hits)
evalue = 100
hits.map{|hit|
if hit.e_val != 0 && hit.e_val < evalue
evalue = hit.e_val
end
}
if evalue == 100
evalue = 0
else
exp = Math.log10(evalue).abs.to_i
min_exp = (exp/10.0).ceil
evalue = 10.0**-(exp-min_exp)
end
return evalue
end
|
#subject_overlapping_hsps(hit) ⇒ Object
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
|
# File 'lib/full_lengther_next/blast_functions.rb', line 254
def subject_overlapping_hsps(hit)
overlapping = false
hsp_table = hsps_relationship_subject(hit)
if !hsp_table.empty?
hit = clean_hsp_by_identity(hit, 55)
if hit.empty?
overlapping = true
else
hsp_table = hsps_relationship_subject(hit)
if !hsp_table.empty?
overlapping = true
end
end
end
return hit, overlapping
end
|
#write_fasta(seqs_array, file_name, mode) ⇒ Object
142
143
144
145
146
147
148
|
# File 'lib/full_lengther_next/reptrans.rb', line 142
def write_fasta(seqs_array, file_name, mode)
file=File.open(file_name, mode)
seqs_array.each do |seq|
file.puts ">#{seq.seq_name}\n#{seq.seq_fasta}"
end
file.close
end
|