Class: Lederhosen::CLI

Inherits:
Thor
  • Object
show all
Includes:
Thor::Actions
Defined in:
lib/lederhosen/cli.rb,
lib/lederhosen/no_tasks.rb,
lib/lederhosen/tasks/trim.rb,
lib/lederhosen/tasks/cluster.rb,
lib/lederhosen/tasks/version.rb,
lib/lederhosen/tasks/get_reps.rb,
lib/lederhosen/tasks/make_udb.rb,
lib/lederhosen/tasks/otu_table.rb,
lib/lederhosen/tasks/otu_filter.rb,
lib/lederhosen/tasks/split_fasta.rb,
lib/lederhosen/tasks/join_otu_tables.rb

Overview

The CLI class holds all of the Thor tasks

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#taxonomy_formatObject

Returns the value of attribute taxonomy_format.



4
5
6
# File 'lib/lederhosen/no_tasks.rb', line 4

# Returns the value of attribute taxonomy_format.
# (Plain reader; the attribute is set elsewhere — presumably by a
# writer or task in no_tasks.rb — TODO confirm against that file.)
def taxonomy_format
  @taxonomy_format
end

Instance Method Details

#clusterObject



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/lederhosen/tasks/cluster.rb', line 14

# Cluster input reads against a reference database with usearch,
# writing the hit table to a .uc output file. All parameters come
# from the Thor options hash; each option is echoed via ohai.
def cluster
  input    = File.expand_path(options[:input])
  database = File.expand_path(options[:database])
  threads  = options[:threads]
  identity = options[:identity]
  output   = File.expand_path(options[:output])
  strand   = options[:strand]

  ohai "clustering #{input} to #{database} and saving to #{output}"

  options.each_pair { |key, value| ohai "#{key} = #{value}" }

  arguments = [
    "--usearch_local #{input}",
    "--id #{identity}",
    "--uc #{output}",
    "--db #{database}",
    "--strand #{strand}"
  ]

  # threads = False : use all threads (default)
  arguments << "--threads #{threads}" unless threads == false

  run "usearch #{arguments.join(' ')}"
end

#get_repsObject



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/lederhosen/tasks/get_reps.rb', line 11

# Extract representative sequences from a reference database.
#
# Reads one or more usearch .uc cluster files, collects the set of
# database sequence names the reads hit, then writes only those
# records from the database fasta to the output file.
#
# Fixes: the output handle was previously opened with File.open
# without a block, so it leaked if an exception was raised before
# close; the per-line `rescue nil` also swallowed every
# StandardError where a simple nil check is what's needed
# (parse_usearch_line returns nil for unparseable lines).
def get_reps
  inputs   = Dir[options[:input]]
  database = File.expand_path(options[:database])
  output   = File.expand_path(options[:output])

  taxa = Set.new

  ohai "getting representative database sequences from #{database} using #{inputs.size} cluster file(s) and saving to #{output}"

  # parse uc file, get list of taxa we need to get
  # full sequences for from the database
  pbar = ProgressBar.new 'reading uc(s)', inputs.size

  inputs.each do |input|
    File.open(input) do |handle|
      pbar.inc
      handle.each do |line|
        header = parse_usearch_line(line.strip)
        # nil header = unparseable/unclassified line; skip it
        taxa << header['original'] unless header.nil?
      end
    end
  end

  pbar.finish

  ohai "found #{taxa.size} representative sequences"

  # print representative sequences from database; block form
  # guarantees the output handle is closed even if an error occurs
  kept = 0
  File.open(output, 'w') do |out|
    File.open(database) do |handle|
      Dna.new(handle).each do |record|
        if taxa.include? record.name
          out.puts record
          kept += 1
        end
      end
    end
  end

  ohai "saved #{kept} representatives"
end

#join_otu_tablesObject



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/lederhosen/tasks/join_otu_tables.rb', line 12

# Merge several single-sample OTU csv tables into one combined table.
# Rows are samples (named after the input file's basename), columns
# are the union of OTU names across all inputs; missing cells print
# as 0 (via the nested hash default).
def join_otu_tables

  input = Dir[options[:input]]
  output = File.expand_path(options[:output])

  ohai "combining #{input.size} file(s) and saving to #{output}"

  all_otu_names = Set.new
  all_samples = Set.new

  # sample name -> otu name -> count, defaulting to 0
  sample_name_count = Hash.new { |by_sample, s| by_sample[s] = Hash.new { |row, n| row[n] = 0 } }

  # slurp each input csv: header row gives the otu names, data rows
  # give the counts
  input.each do |input_file|
    File.open(input_file) do |handle|
      otu_names = handle.gets.strip.split(',')[1..-1]
      all_otu_names.merge(otu_names)

      sample = File.basename(input_file)
      handle.each do |line|
        fields = line.strip.split(',')
        all_samples << sample
        otu_names.zip(fields[1..-1]) do |name, count|
          sample_name_count[sample][name] = count
        end
      end
    end
  end

  # write the combined table
  File.open(output, 'w') do |handle|
    header = all_otu_names.to_a.sort
    handle.puts "-,#{header.join(',')}"

    all_samples.to_a.sort.each do |sample|
      row = [sample]
      header.each { |name| row << sample_name_count[sample][name] }
      handle.puts row.join(',')
    end
  end

end

#make_udbObject



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/lederhosen/tasks/make_udb.rb', line 9

# Build a usearch UDB index from an input fasta so repeated searches
# don't have to re-index the database each run.
#
# NOTE(review): options[:word_length] was read into a local here but
# never used — the usearch command line never received it. The dead
# local is removed; confirm whether a word-length flag should be
# forwarded to usearch for this index type.
def make_udb
  input  = options[:input]
  output = options[:output]

  ohai "making udb w/ #{input}, saving as #{output}."

  cmd = ['usearch',
         "-makeudb_usearch #{input}",
         "-output #{output}"]

  cmd = cmd.join(' ')

  run cmd
end

#otu_filterObject



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/lederhosen/tasks/otu_filter.rb', line 13

# Filter an OTU csv table, keeping only clusters observed with at
# least `reads` reads in at least `min_samples` samples, and write
# the reduced table plus summary stats.
#
# Fixes: the output handle was opened without a block or ensure and
# leaked if an exception occurred before close; the nested reject
# blocks shadowed their outer block parameters (k, v), which is
# error-prone and triggers shadowing warnings.
def otu_filter
  input       = File.expand_path(options[:input])
  output      = File.expand_path(options[:output])
  reads       = options[:reads]
  min_samples = options[:samples]

  ohai "filtering otu file #{input} (reads = #{reads}, samples = #{min_samples})"

  cluster_sample_count = Hash.new { |h, k| h[k] = Hash.new }

  ohai "loading csv file #{input}"

  # slurp up CSV file: header row = cluster ids, each data row =
  # one sample's counts
  File.open input do |handle|
    header = handle.gets.strip.split(',')
    cluster_ids = header[1..-1]
    handle.each do |line|
      row = line.strip.split(',')
      sample_id = row[0].to_sym
      counts = row[1..-1].map(&:to_i)
      cluster_ids.zip(counts).each do |cluster, count|
        cluster_sample_count[cluster][sample_id] = count
      end
    end
  end

  ohai "filtering"

  # drop clusters seen with >= reads in fewer than min_samples samples
  filtered = cluster_sample_count.reject do |_cluster, sample_counts|
    sample_counts.count { |_sample, count| count >= reads } < min_samples
  end

  ohai "saving to #{output}"

  # save the table; block form ensures the handle is closed
  samples = filtered.values.map(&:keys).flatten.uniq
  clusters = filtered.keys
  File.open(output, 'w') do |out|
    out.puts "-,#{clusters.join(',')}"
    samples.each do |sample|
      out.print "#{sample}"
      clusters.each do |cluster|
        # missing sample/cluster pairs print as empty cells (nil)
        out.print ",#{filtered[cluster][sample]}"
      end
      out.print "\n"
    end
  end

  ohai "kept #{filtered.keys.size} clusters (#{filtered.keys.size/cluster_sample_count.size.to_f})."
  kept_reads = filtered.values.map { |x| x.values.inject(:+) }.inject(:+)
  total_reads = cluster_sample_count.values.map { |x| x.values.inject(:+) }.inject(:+)
  ohai "kept #{kept_reads}/#{total_reads} reads (#{kept_reads/total_reads.to_f})."
end

#otu_tableObject



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/lederhosen/tasks/otu_table.rb', line 21

# Build one csv count table per requested taxonomic level from
# usearch cluster output files: rows are input files (samples),
# columns are taxon names at that level, cells are read counts.
def otu_table
  input  = Dir[options[:files]]
  prefix = options[:prefix]
  levels = options[:levels].map(&:downcase)

  ohai "generating #{levels.join(', ')} table(s) from #{input.size} file(s) and saving to prefix #{prefix}."

  # sanity check: only known taxonomic ranks are allowed
  levels.each do |level|
    fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom original}.include? level
  end

  # level -> input file -> taxon name -> read count (defaults to 0)
  level_sample_cluster_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } } }

  # level -> set of taxon names seen at that level
  all_names = Hash.new { |h, k| h[k] = Set.new }

  # progress bar ticks once per input file
  # (not per byte — its total is input.size)
  pbar = ProgressBar.new "loading", input.size

  # Load cluster table

  input.each do |input_file|
    pbar.inc
    File.open(input_file) do |handle|
      handle.each do |line|

        dat = parse_usearch_line(line.strip)
        levels.each do |level|
          # nil parse = unclassified read; a parsed line missing this
          # level gets the 'unparsed_name' bucket
          name =
            if dat.nil?
              'unclassified_reads'
            else
              dat[level] || 'unparsed_name'
            end

          level_sample_cluster_count[level][input_file][name] += 1
          all_names[level] << name
        end

      end
    end
  end

  pbar.finish

  # save to csv(s), one file per level
  levels.each do |level|

    ohai "saving #{level} table"

    File.open("#{prefix}.#{level}.csv", 'w') do |handle|
      header = all_names[level].to_a.compact.sort
      handle.puts "#{level.capitalize},#{header.join(',')}"

      input.each do |sample|
        handle.print "#{sample}"
        header.each do |name|
          handle.print ",#{level_sample_cluster_count[level][sample][name]}"
        end
        handle.print "\n"
      end
    end
  end
end

#split_fastaObject



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/lederhosen/tasks/split_fasta.rb', line 18

# Split a large fasta file into chunks of n reads apiece, writing
# split_0.fasta, split_1.fasta, ... (gzipped when --gzip is set).
def split_fasta
  input   = File.expand_path(options[:input])
  out_dir = options[:out_dir]
  n       = options[:n].to_i
  gzip    = options[:gzip]

  ohai "splitting #{input} into files with #{n} reads stored in #{out_dir}"
  ohai "using gzip" if gzip

  `mkdir -p #{out_dir}`

  File.open input do |handle|
    pbar = ProgressBar.new 'splitting', File.size(handle)
    Dna.new(handle).each_with_index do |record, i|
      pbar.set handle.pos
      # @out is an instance variable (not a block-local) so the
      # current output handle survives across iterations of this
      # block. (The original comment called it a "class variable",
      # which it is not.)
      @out =
        if i%n == 0 # start a new file
          # GzipWriter must be closed explicitly
          # this raises an exception this first time (rescued below)
          @out.close rescue nil

          # create an IO object depending on whether or
          # not the user wants to use gzip
          if gzip
            Zlib::GzipWriter.open(File.join(out_dir, "split_#{i/n}.fasta.gz"))
          else
            File.open(File.join(out_dir, "split_#{i/n}.fasta"), 'w')
          end
        else # keep using current handle
          @out
        end
      @out.puts record
    end
    pbar.finish
    @out.close
  end

  ohai "created #{Dir[File.join(out_dir, '*')].size} files"
end

#trimObject



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/lederhosen/tasks/trim.rb', line 14

# Quality-trim paired qseq reads, writing one trimmed fasta per pair
# into out_dir.
def trim
  raw_reads = options[:reads_dir]
  out_dir   = options[:out_dir]

  ohai "trimming #{File.dirname(raw_reads)} and saving to #{out_dir}"

  run "mkdir -p #{out_dir}"

  raw_reads = get_grouped_qseq_files raw_reads

  ohai "found #{raw_reads.length} pairs of reads"

  pbar = ProgressBar.new "trimming", raw_reads.length
  raw_reads.each do |pair|
    pbar.inc
    name  = pair[0]
    files = pair[1]
    out = File.join(out_dir, "#{File.basename(name)}.fasta")
    # TODO get total and trimmed
    total, trimmed = trim_pairs files[0], files[1], out, :min_length => 70
  end
  pbar.finish

end

#versionObject



10
11
12
# File 'lib/lederhosen/tasks/version.rb', line 10

# Print the gem's version string and release codename to stdout.
def version
  info = Lederhosen::Version
  puts "lederhosen-#{info::STRING} codename #{info::CODENAME}"
end