Class: Lederhosen::CLI
- Inherits:
-
Thor
- Object
- Thor
- Lederhosen::CLI
- Includes:
- Thor::Actions
- Defined in:
- lib/lederhosen/cli.rb,
lib/lederhosen/no_tasks.rb,
lib/lederhosen/tasks/cluster.rb,
lib/lederhosen/tasks/version.rb,
lib/lederhosen/tasks/get_reps.rb,
lib/lederhosen/tasks/make_udb.rb,
lib/lederhosen/tasks/otu_table.rb,
lib/lederhosen/tasks/otu_filter.rb,
lib/lederhosen/tasks/split_fasta.rb,
lib/lederhosen/tasks/join_otu_tables.rb,
lib/lederhosen/tasks/count_taxonomies.rb,
lib/lederhosen/tasks/separate_unclassified.rb
Overview
The CLI class holds all of the Thor tasks
Instance Attribute Summary collapse
-
#taxonomy_format ⇒ Object
Returns the value of attribute taxonomy_format.
Instance Method Summary collapse
- #cluster ⇒ Object
- #count_taxonomies ⇒ Object
- #get_reps ⇒ Object
- #join_otu_tables ⇒ Object
- #make_udb ⇒ Object
- #otu_filter ⇒ Object
- #otu_table ⇒ Object
- #separate_unclassified ⇒ Object
- #split_fasta ⇒ Object
- #version ⇒ Object
Instance Attribute Details
#taxonomy_format ⇒ Object
Returns the value of attribute taxonomy_format.
5 6 7 |
# File 'lib/lederhosen/no_tasks.rb', line 5 def taxonomy_format @taxonomy_format end |
Instance Method Details
#cluster ⇒ Object
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
# File 'lib/lederhosen/tasks/cluster.rb', line 16 def cluster input = File.([:input]) database = File.([:database]) threads = [:threads] identity = [:identity] output = File.([:output]) strand = [:strand] dry_run = [:dry_run] query_cov = [:query_cov] ohai "#{'(dry run)' if dry_run} clustering #{input} to #{database} and saving to #{output}" .each_pair do |key, value| ohai "#{key} = #{value}" end cmd = ['usearch', "--usearch_local #{input}", "--id #{identity}", "--uc #{output}", "--db #{database}", "--strand #{strand}", "--query_cov #{query_cov}" ] # threads = False : use all threads (default) if threads != false cmd << "--threads #{threads}" end cmd = cmd.join(' ') unless dry_run run cmd else puts cmd end end |
#count_taxonomies ⇒ Object
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
# File 'lib/lederhosen/tasks/count_taxonomies.rb', line 9 def count_taxonomies input = [:input] output = [:output] ohai "generating #{output} from #{input}" handle = File.open(input) uc = UCParser.new(handle) taxonomy_count = get_taxonomy_count(uc) handle.close out = File.open(output, 'w') out.puts '# taxonomy, number_of_reads' taxonomy_count.each_pair do |taxonomy, count| out.puts "#{taxonomy.tr(',','_')},#{count}" end out.close end |
#get_reps ⇒ Object
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
# File 'lib/lederhosen/tasks/get_reps.rb', line 11 def get_reps inputs = Dir[[:input]] database = File.([:database]) output = File.([:output]) taxa = Set.new ohai "getting representative database sequences from #{database} using #{inputs.size} cluster file(s) and saving to #{output}" # parse uc file, get list of taxa we need to get # full sequences for from the database = ProgressBar.new 'reading uc(s)', inputs.size inputs.each do |input| File.open(input) do |handle| results = UCParser.new(handle) results.each do |result| taxa << result.target if result.hit? end end end .finish ohai "found #{taxa.size} representative sequences" # print representative sequences from database output = File.open(output, 'w') kept = 0 File.open(database) do |handle| Dna.new(handle).each do |record| if taxa.include? record.name output.puts record kept += 1 end end end output.close ohai "saved #{kept} representatives" end |
#join_otu_tables ⇒ Object
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
# File 'lib/lederhosen/tasks/join_otu_tables.rb', line 12 def join_otu_tables input = Dir[[:input]] output = File.([:output]) ohai "combining #{input.size} file(s) and saving to #{output}" all_otu_names = Set.new all_samples = Set.new sample_name_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } } # read all of the csv files input.each do |input_file| File.open(input_file) do |handle| otu_names = handle.gets.strip.split(',')[1..-1] all_otu_names += otu_names.to_set handle.each do |line| line = line.strip.split(',') sample = File.basename(input_file) all_samples << sample read_counts = line[1..-1] otu_names.zip(read_counts) do |name, count| sample_name_count[sample][name] = count end end end end # save to csv File.open(output, 'w') do |handle| header = all_otu_names.to_a.sort handle.puts "-,#{header.join(',')}" all_samples.to_a.sort.each do |sample| handle.print "#{sample}" header.each do |name| handle.print ",#{sample_name_count[sample][name]}" end handle.print "\n" end end end |
#make_udb ⇒ Object
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
# File 'lib/lederhosen/tasks/make_udb.rb', line 9 def make_udb input = [:input] output = [:output] word_length = [:word_length] db_step = [:db_step] ohai "making udb w/ #{input}, saving as #{output}." cmd = ['usearch', "-makeudb_usearch #{input}", "-output #{output}", ] cmd = cmd.join(' ') run cmd end |
#otu_filter ⇒ Object
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
# File 'lib/lederhosen/tasks/otu_filter.rb', line 13 def otu_filter input = File.([:input]) output = File.([:output]) reads = [:reads] min_samples = [:samples] ohai "filtering otu file #{input} (reads = #{reads}, samples = #{min_samples})" # make one pass finding which OTUs to keep # create mask that maps which columns correspond to good OTUs # pass over table again printing only those columns seen = Hash.new { |h, k| h[k] = 0 } otu_order = [] = ProgressBar.new 'counting', File.size(input) total_reads = 0 File.open(input) do |handle| header = handle.gets.strip.split(',') header.each { |x| otu_order << x } handle.each do |line| .set handle.pos line = line.strip.split(',') sample_name = line[0] abunds = line[1..-1].map &:to_i otu_order.zip(abunds) do |o, a| total_reads += a seen[o] += 1 if a >= reads end end end .finish mask = otu_order.map { |x| seen[x] >= min_samples } ohai "found #{otu_order.size} otus, keeping #{mask.count(true)}" output = File.open(output, 'w') = ProgressBar.new 'writing', File.size(input) filtered_reads = 0 File.open(input) do |handle| header = handle.gets.strip.split(',') header = header.zip(mask).map { |k, m| k if m }.compact output.print header.join(',') output.print ",noise\n" # need a "noise" column handle.each do |line| .set handle.pos line = line.strip.split(',') sample_name = line[0] counts = line[1..-1].map &:to_i kept_counts = counts.zip(mask).map { |c, m| c if m }.compact noise = counts.zip(mask).map { |c, m| c unless m }.compact.inject(:+) filtered_reads += noise || 0 output.puts "#{sample_name},#{kept_counts.join(',')},#{noise}" end end .finish ohai "kept #{total_reads - filtered_reads}/#{total_reads} reads (#{100*(total_reads - filtered_reads)/total_reads.to_f}%)" output.close end |
#otu_table ⇒ Object
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
# File 'lib/lederhosen/tasks/otu_table.rb', line 15 def otu_table inputs = Dir[[:files]] level = [:level].downcase output = [:output] ohai "Generating OTU matrix from #{inputs.size} inputs at #{level} level and saving to #{output}." # sanity check fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom original}.include? level fail 'no inputs matched your glob' if inputs.size == 0 sample_cluster_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } } # create a progress bar with the total number of bytes of # the files we're slurping up = ProgressBar.new "loading", inputs.size inputs.each do |input_file| File.open(input_file).each do |line| next if line =~ /^#/ # skip header(s) line = line.strip.split(',') taxonomy, count = line count = count.to_i tax = if taxonomy == 'unclassified_reads' 'unclassified_reads' else if level == 'original' taxonomy else parse_taxonomy(taxonomy)[level] end end sample_cluster_count[input_file][tax] += count end .inc end .finish all_clusters = sample_cluster_count.values.map(&:keys).flatten.uniq.sort out = File.open(output, 'w') out.puts all_clusters.join(',') = ProgressBar.new('saving', inputs.size) inputs.sort.each do |input| .inc out.print "#{input}" all_clusters.each do |c| out.print ",#{sample_cluster_count[input][c]}" end out.print "\n" end .finish end |
#separate_unclassified ⇒ Object
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
# File 'lib/lederhosen/tasks/separate_unclassified.rb', line 15 def separate_unclassified uc_file = [:uc_file] reads = [:reads] output = [:output] strict = [:strict] unclassifieds = Set.new handle = File.open(uc_file) uc = UCParser.new(handle) total_reads = 0 if not strict uc.each do |result| unclassifieds << result.query if result.miss? total_reads += 1 end elsif strict uc.each_slice(2) do |left, right| total_reads += 2 if left.miss? || right.miss? # at least one is a miss unclassifieds << left.query unclassifieds << right.query # both are hits, check taxonomies else ta = parse_taxonomy(right.target) tb = parse_taxonomy(left.target) # inconsistent assignment or at least one is a miss if (ta[strict] != tb[strict]) unclassifieds << left.query unclassifieds << right.query end end end end ohai "found #{unclassifieds.size} unclassified #{'(strict pairing)' if strict} reads." ohai "unclassified: #{'%0.2f' % (100*unclassifieds.size/total_reads.to_f)} %" handle.close # open fasta file, output unclassified reads out = File.open(output, 'w') Dna.new(File.open(reads)).each do |record| if unclassifieds.include? record.name out.puts record end end out.close end |
#split_fasta ⇒ Object
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# File 'lib/lederhosen/tasks/split_fasta.rb', line 18 def split_fasta input = File.([:input]) out_dir = [:out_dir] n = [:n].to_i gzip = [:gzip] ohai "splitting #{input} into files with #{n} reads stored in #{out_dir}" ohai "using gzip" if gzip `mkdir -p #{out_dir}` File.open input do |handle| = ProgressBar.new 'splitting', File.size(handle) Dna.new(handle).each_with_index do |record, i| .set handle.pos # I have to use a class variable here because # if I don't the variable gets set to nil after # after each iteration. @out = if i%n == 0 # start a new file # GzipWriter must be closed explicitly # this raises an exception this first time @out.close rescue nil # create an IO object depending on whether or # not the user wants to use gzip if gzip Zlib::GzipWriter.open(File.join(out_dir, "split_#{i/n}.fasta.gz")) else File.open(File.join(out_dir, "split_#{i/n}.fasta"), 'w') end else # keep using current handle @out end @out.puts record end .finish @out.close end ohai "created #{Dir[File.join(out_dir, '*')].size} files" end |
#version ⇒ Object
10 11 12 |
# File 'lib/lederhosen/tasks/version.rb', line 10 def version puts "lederhosen-#{Lederhosen::Version::STRING} codename #{Lederhosen::Version::CODENAME}" end |