Module: Aai
- Includes:
- Utils
- Defined in:
- lib/aai/utils.rb,
lib/aai/version.rb,
lib/aai/core_extensions.rb,
lib/aai.rb
Defined Under Namespace
Modules: CoreExtensions, Utils
Constant Summary collapse
- VERSION =
"0.5.5"
- COPYRIGHT =
"2017 Ryan Moore"
- CONTACT =
"[email protected]"
- WEBSITE =
"https://github.com/mooreryan/aai"
- LICENSE =
"MIT"
- VERSION_BANNER =
" # Version: #{VERSION} # Copyright #{COPYRIGHT} # Contact: #{CONTACT} # Website: #{WEBSITE} # License: #{LICENSE}"
- BLAST_DB_SEPARATOR =
"...."
- BLAST_DB_SUFFIX =
"#{BLAST_DB_SEPARATOR}aai"
- PIDENT_CUTOFF =
30
- EVALUE_CUTOFF =
1e-3
- LENGTH_CUTOFF =
actually is 70 percent
70
Instance Method Summary collapse
-
#aai_strings(one_way_aai, two_way_aai) ⇒ Object
Returns an array (enumerable) of aai strings ready to print.
- #blast_permutations!(fastas, blast_dbs, cpus = 4) ⇒ Object
- #get_best_hits(fnames, seq_lengths) ⇒ Object
-
#make_blastdbs!(fnames, cpus = 4) ⇒ Array<String>
Make blast dbs given an array of filenames.
- #one_way_aai(best_hits) ⇒ Object
-
#process_input_seqs!(fnames) ⇒ Object
Returns a hash table with sequence lengths and writes new fasta files with clean headers for blast.
- #two_way_aai(best_hits) ⇒ Object
Methods included from Utils
#check_command, #check_files, #clean_str, #command?, #one_way_combinations, #two_ary_permutations
Instance Method Details
#aai_strings(one_way_aai, two_way_aai) ⇒ Object
Returns an array (enumerable) of aai strings ready to print
274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 |
# File 'lib/aai.rb', line 274 def aai_strings one_way_aai, two_way_aai aai_strings = {} keys = (one_way_aai.keys + two_way_aai.keys). map { |key| key.sort }.uniq keys.each do |key| a_to_b_aai = one_way_aai[key] || "NA" b_to_a_aai = one_way_aai[key.reverse] || "NA" two_way = two_way_aai[key] || "NA" aai_strings[key] = [a_to_b_aai, b_to_a_aai, two_way] end aai_strings.map do |genome_pair, aais| [genome_pair, aais].flatten.join "\t" end end |
#blast_permutations!(fastas, blast_dbs, cpus = 4) ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# File 'lib/aai.rb', line 24 def blast_permutations! fastas, blast_dbs, cpus=4 file_permutations = one_way_combinations fastas, blast_dbs, true file_permutations = file_permutations.select do |f1, f2| genome_from_fname(f1) != genome_from_fname(f2) end first_files = file_permutations.map(&:first) second_files = file_permutations.map(&:last) first_genomes = first_files.map do |fname| ary = fname.split(".") ary.take(ary.length - 1).join end second_genomes = second_files.map do |fname| ary = fname.split(BLAST_DB_SEPARATOR).take(1) AbortIf.abort_unless ary.length == 1, "Bad file name for #{fname}" ary = ary.first.split(".") File.basename ary.take(ary.length - 1).join end outf_names = first_genomes.zip(second_genomes).map do |f1, f2| "#{f1}____#{f2}.aai_blastp" end args = first_files.length.times.map do |idx| [first_files[idx], second_files[idx], outf_names[idx]] end Parallel.each(args, in_processes: cpus) do |infiles| query = infiles[0] db = infiles[1] out = infiles[2] cmd = "diamond blastp --threads 1 --outfmt 6 " + "--query #{query} --db #{db} --out #{out} " + "--evalue #{EVALUE_CUTOFF}" Process.run_and_time_it! "Diamond blast", cmd end outf_names end |
#get_best_hits(fnames, seq_lengths) ⇒ Object
122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
# File 'lib/aai.rb', line 122 def get_best_hits fnames, seq_lengths best_hits = {} num_fnames = fnames.count log_every = num_fnames > 10 ? num_fnames / 10 : 1 fnames.each_with_index do |fname, idx| # blast files if (idx % log_every).zero? AbortIf.logger.debug { "Working on blastp file " + "##{idx} of #{num_fnames}"} end File.open(fname, "rt").each_line do |line| ary = line.chomp.split "\t" query = ary[0] target = ary[1] pident = ary[2].to_f length = ary[3].to_f evalue = ary[10].to_f query_genome = query.split("____").first query_seq = query.split("____").last target_genome = target.split("____").first target_seq = target.split("____").last seq_length_key = "#{query_genome}____#{query_seq}" AbortIf.abort_unless seq_lengths.has_key?(seq_length_key), "#{seq_length_key} is missing from " + "seq_lengths" query_length = seq_lengths[seq_length_key].to_f length_percent = length / query_length * 100 hit_info = { query_name: query_seq, target_name: target_seq, query_genome: query_genome, target_genome: target_genome, pident: pident, length: length_percent, evalue: evalue } # check if it is a best hit candidate if pident >= PIDENT_CUTOFF && evalue <= EVALUE_CUTOFF && length_percent >= LENGTH_CUTOFF if best_hits.has_key? query_genome if best_hits[query_genome].has_key? query_seq if best_hits[query_genome][query_seq]. has_key?(target_genome) # check if we should replace the best hit? current_best_hit = best_hits[query_genome][query_seq][target_genome] if pident >= current_best_hit[:pident] best_hits[query_genome][query_seq][target_genome] = hit_info end else best_hits[query_genome][query_seq][target_genome] = hit_info end else best_hits[query_genome][query_seq] = { target_genome => hit_info } end else best_hits[query_genome] = { query_seq => { target_genome => hit_info } } end else # pass end end end best_hits end |
#make_blastdbs!(fnames, cpus = 4) ⇒ Array<String>
Make blast dbs given an array of filenames.
77 78 79 80 81 82 83 84 85 86 87 88 89 |
# File 'lib/aai.rb', line 77 def make_blastdbs! fnames, cpus=4 suffix = BLAST_DB_SUFFIX outfiles = fnames.map { |fname| fname + suffix } Parallel.each(fnames, in_processes: cpus) do |fname| cmd = "diamond makedb --threads 1 --in #{fname} " + "--db #{fname}#{BLAST_DB_SUFFIX}" Process.run_and_time_it! "Make db", cmd end outfiles end |
#one_way_aai(best_hits) ⇒ Object
208 209 210 211 212 213 |
# File 'lib/aai.rb', line 208 def one_way_aai best_hits one_way_best_hits(best_hits).map do |genome_pair, best_hits| [genome_pair, best_hits.map { |hit| hit[:pident] }.reduce(:+) / best_hits.length.to_f] end.to_h end |
#process_input_seqs!(fnames) ⇒ Object
Returns a hash table with sequence lengths and writes new fasta files with clean headers for blast.
The sequences are annotated with the genome that they came from.
97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
# File 'lib/aai.rb', line 97 def process_input_seqs! fnames seq_lengths = {} clean_fnames = [] fnames.each do |fname| clean_fname = fname + "_aai_clean" clean_fnames << clean_fname File.open(clean_fname, "w") do |f| Object::ParseFasta::SeqFile.open(fname).each_record do |rec| unless bad_seq? rec.seq header = annotate_header clean_header(rec.header), File.basename(fname) seq_lengths[header] = rec.seq.length f.puts ">#{header}\n#{rec.seq}" end end end end [seq_lengths, clean_fnames] end |
#two_way_aai(best_hits) ⇒ Object
215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 |
# File 'lib/aai.rb', line 215 def two_way_aai best_hits # the pair key is the [g1, g2].sort two_way_aai = {} one_way_hits = one_way_best_hits best_hits genome_pair_keys = one_way_hits.keys.map { |pair| pair.sort }.uniq num_genome_pairs = genome_pair_keys.count log_every = num_genome_pairs > 10 ? num_genome_pairs / 10 : 1 genome_pair_keys.each_with_index do |pair_key, idx| if (idx % log_every).zero? AbortIf.logger.debug { "Working on genome pair ##{idx} of " + "#{num_genome_pairs}" } end if one_way_hits.has_key?(pair_key) && one_way_hits.has_key?(pair_key.reverse) forward_hits = one_way_hits[pair_key] reverse_hits = one_way_hits[pair_key.reverse] combinations = one_way_combinations forward_hits, reverse_hits two_way_hits = combinations.select do |h1, h2| two_way_hit? h1, h2 end two_way_hit_info = two_way_hits.map do |h1, h2| { genome_pair: [h1[:query_genome], h1[:target_genome]].sort, pident: (h1[:pident] + h2[:pident]) / 2.0 } end two_way_hit_info.each do |hit| if two_way_aai.has_key? hit[:genome_pair] two_way_aai[hit[:genome_pair]] << hit[:pident] else two_way_aai[hit[:genome_pair]] = [hit[:pident]] end end elsif !one_way_hits.has_key?(pair_key) AbortIf.logger.warn { "No pair info for #{pair_key}. " + "No two way hits possible " + "for #{pair_key}." } elsif !one_way_hits.has_key?(pair_key.reverse) AbortIf.logger.warn { "No pair info for #{pair_key.reverse}. " + "No two way hits possible " + "for #{pair_key}." } end end # outside of genome_pair_keys.each two_way_aai.map do |genome_pair, pidents| [genome_pair, pidents.reduce(:+) / pidents.length.to_f] end.to_h end |