Class: Lederhosen::CLI

Inherits:
Thor
  • Object
show all
Includes:
Thor::Actions
Defined in:
lib/lederhosen/cli.rb,
lib/lederhosen/no_tasks.rb,
lib/lederhosen/tasks/trim.rb,
lib/lederhosen/tasks/cluster.rb,
lib/lederhosen/tasks/version.rb,
lib/lederhosen/tasks/get_reps.rb,
lib/lederhosen/tasks/make_udb.rb,
lib/lederhosen/tasks/otu_table.rb,
lib/lederhosen/tasks/otu_filter.rb,
lib/lederhosen/tasks/split_fasta.rb,
lib/lederhosen/tasks/join_otu_tables.rb

Overview

The CLI class holds all of the Thor tasks

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#taxonomy_formatObject

Returns the value of attribute taxonomy_format.



4
5
6
# File 'lib/lederhosen/no_tasks.rb', line 4

# Returns the value of attribute taxonomy_format.
# (Plain reader; the attribute is set elsewhere — presumably by a
# writer or task in no_tasks.rb — TODO confirm against that file.)
def taxonomy_format
  @taxonomy_format
end

Instance Method Details

#clusterObject



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/lederhosen/tasks/cluster.rb', line 14

# Cluster input reads against a reference database with usearch,
# writing the hit table to a .uc output file. All parameters come
# from the Thor options hash; each option is echoed via ohai.
def cluster
  input    = File.expand_path(options[:input])
  database = File.expand_path(options[:database])
  threads  = options[:threads]
  identity = options[:identity]
  output   = File.expand_path(options[:output])
  strand   = options[:strand]

  ohai "clustering #{input} to #{database} and saving to #{output}"

  options.each_pair { |key, value| ohai "#{key} = #{value}" }

  arguments = [
    "--usearch_local #{input}",
    "--id #{identity}",
    "--uc #{output}",
    "--db #{database}",
    "--strand #{strand}"
  ]

  # threads = False : use all threads (default)
  arguments << "--threads #{threads}" unless threads == false

  run "usearch #{arguments.join(' ')}"
end

#get_repsObject



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/lederhosen/tasks/get_reps.rb', line 11

# Extract representative sequences from a reference database.
#
# Reads one or more usearch .uc cluster files, collects the set of
# database sequence names the reads hit, then writes only those
# records from the database fasta to the output file.
#
# Fixes: the output handle was previously opened with File.open
# without a block, so it leaked if an exception was raised before
# close; the per-line `rescue nil` also swallowed every
# StandardError where a simple nil check is what's needed
# (parse_usearch_line returns nil for unparseable lines).
def get_reps
  inputs   = Dir[options[:input]]
  database = File.expand_path(options[:database])
  output   = File.expand_path(options[:output])

  taxa = Set.new

  ohai "getting representative database sequences from #{database} using #{inputs.size} cluster file(s) and saving to #{output}"

  # parse uc file, get list of taxa we need to get
  # full sequences for from the database
  pbar = ProgressBar.new 'reading uc(s)', inputs.size

  inputs.each do |input|
    File.open(input) do |handle|
      pbar.inc
      handle.each do |line|
        header = parse_usearch_line(line.strip)
        # nil header = unparseable/unclassified line; skip it
        taxa << header['original'] unless header.nil?
      end
    end
  end

  pbar.finish

  ohai "found #{taxa.size} representative sequences"

  # print representative sequences from database; block form
  # guarantees the output handle is closed even if an error occurs
  kept = 0
  File.open(output, 'w') do |out|
    File.open(database) do |handle|
      Dna.new(handle).each do |record|
        if taxa.include? record.name
          out.puts record
          kept += 1
        end
      end
    end
  end

  ohai "saved #{kept} representatives"
end

#join_otu_tablesObject



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/lederhosen/tasks/join_otu_tables.rb', line 12

# Merge several single-sample OTU csv tables into one combined table.
# Rows are samples (named after the input file's basename), columns
# are the union of OTU names across all inputs; missing cells print
# as 0 (via the nested hash default).
def join_otu_tables

  input = Dir[options[:input]]
  output = File.expand_path(options[:output])

  ohai "combining #{input.size} file(s) and saving to #{output}"

  all_otu_names = Set.new
  all_samples = Set.new

  # sample name -> otu name -> count, defaulting to 0
  sample_name_count = Hash.new { |by_sample, s| by_sample[s] = Hash.new { |row, n| row[n] = 0 } }

  # slurp each input csv: header row gives the otu names, data rows
  # give the counts
  input.each do |input_file|
    File.open(input_file) do |handle|
      otu_names = handle.gets.strip.split(',')[1..-1]
      all_otu_names.merge(otu_names)

      sample = File.basename(input_file)
      handle.each do |line|
        fields = line.strip.split(',')
        all_samples << sample
        otu_names.zip(fields[1..-1]) do |name, count|
          sample_name_count[sample][name] = count
        end
      end
    end
  end

  # write the combined table
  File.open(output, 'w') do |handle|
    header = all_otu_names.to_a.sort
    handle.puts "-,#{header.join(',')}"

    all_samples.to_a.sort.each do |sample|
      row = [sample]
      header.each { |name| row << sample_name_count[sample][name] }
      handle.puts row.join(',')
    end
  end

end

#make_udbObject



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/lederhosen/tasks/make_udb.rb', line 9

# Build a usearch UDB index from an input fasta so repeated searches
# don't have to re-index the database each run.
#
# NOTE(review): options[:word_length] was read into a local here but
# never used — the usearch command line never received it. The dead
# local is removed; confirm whether a word-length flag should be
# forwarded to usearch for this index type.
def make_udb
  input  = options[:input]
  output = options[:output]

  ohai "making udb w/ #{input}, saving as #{output}."

  cmd = ['usearch',
         "-makeudb_usearch #{input}",
         "-output #{output}"]

  cmd = cmd.join(' ')

  run cmd
end

#otu_filterObject



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/lederhosen/tasks/otu_filter.rb', line 13

# Filter an OTU csv table, keeping only clusters observed with at
# least `reads` reads in at least `min_samples` samples, and write
# the reduced table plus summary stats.
#
# Fixes: the output handle was opened without a block or ensure and
# leaked if an exception occurred before close; the nested reject
# blocks shadowed their outer block parameters (k, v), which is
# error-prone and triggers shadowing warnings.
def otu_filter
  input       = File.expand_path(options[:input])
  output      = File.expand_path(options[:output])
  reads       = options[:reads]
  min_samples = options[:samples]

  ohai "filtering otu file #{input} (reads = #{reads}, samples = #{min_samples})"

  cluster_sample_count = Hash.new { |h, k| h[k] = Hash.new }

  ohai "loading csv file #{input}"

  # slurp up CSV file: header row = cluster ids, each data row =
  # one sample's counts
  File.open input do |handle|
    header = handle.gets.strip.split(',')
    cluster_ids = header[1..-1]
    handle.each do |line|
      row = line.strip.split(',')
      sample_id = row[0].to_sym
      counts = row[1..-1].map(&:to_i)
      cluster_ids.zip(counts).each do |cluster, count|
        cluster_sample_count[cluster][sample_id] = count
      end
    end
  end

  ohai "filtering"

  # drop clusters seen with >= reads in fewer than min_samples samples
  filtered = cluster_sample_count.reject do |_cluster, sample_counts|
    sample_counts.count { |_sample, count| count >= reads } < min_samples
  end

  ohai "saving to #{output}"

  # save the table; block form ensures the handle is closed
  samples = filtered.values.map(&:keys).flatten.uniq
  clusters = filtered.keys
  File.open(output, 'w') do |out|
    out.puts "-,#{clusters.join(',')}"
    samples.each do |sample|
      out.print "#{sample}"
      clusters.each do |cluster|
        # missing sample/cluster pairs print as empty cells (nil)
        out.print ",#{filtered[cluster][sample]}"
      end
      out.print "\n"
    end
  end

  ohai "kept #{filtered.keys.size} clusters (#{filtered.keys.size/cluster_sample_count.size.to_f})."
  kept_reads = filtered.values.map { |x| x.values.inject(:+) }.inject(:+)
  total_reads = cluster_sample_count.values.map { |x| x.values.inject(:+) }.inject(:+)
  ohai "kept #{kept_reads}/#{total_reads} reads (#{kept_reads/total_reads.to_f})."
end

#otu_tableObject



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/lederhosen/tasks/otu_table.rb', line 21

# Build one csv count table per requested taxonomic level from
# usearch cluster output files: rows are input files (samples),
# columns are taxon names at that level, cells are read counts.
def otu_table
  input  = Dir[options[:files]]
  prefix = options[:prefix]
  levels = options[:levels].map(&:downcase)

  ohai "generating #{levels.join(', ')} table(s) from #{input.size} file(s) and saving to prefix #{prefix}."

  # sanity check: only known taxonomic ranks are allowed
  levels.each do |level|
    fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom original}.include? level
  end

  # level -> input file -> taxon name -> read count (defaults to 0)
  level_sample_cluster_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } } }

  # level -> set of taxon names seen at that level
  all_names = Hash.new { |h, k| h[k] = Set.new }

  # progress bar ticks once per input file
  # (not per byte — its total is input.size)
  pbar = ProgressBar.new "loading", input.size

  # Load cluster table

  input.each do |input_file|
    pbar.inc
    File.open(input_file) do |handle|
      handle.each do |line|

        dat = parse_usearch_line(line.strip)
        levels.each do |level|
          # nil parse = unclassified read; a parsed line missing this
          # level gets the 'unparsed_name' bucket
          name =
            if dat.nil?
              'unclassified_reads'
            else
              dat[level] || 'unparsed_name'
            end

          level_sample_cluster_count[level][input_file][name] += 1
          all_names[level] << name
        end

      end
    end
  end

  pbar.finish

  # save to csv(s), one file per level
  levels.each do |level|

    ohai "saving #{level} table"

    File.open("#{prefix}.#{level}.csv", 'w') do |handle|
      header = all_names[level].to_a.compact.sort
      handle.puts "#{level.capitalize},#{header.join(',')}"

      input.each do |sample|
        handle.print "#{sample}"
        header.each do |name|
          handle.print ",#{level_sample_cluster_count[level][sample][name]}"
        end
        handle.print "\n"
      end
    end
  end
end

#split_fastaObject



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/lederhosen/tasks/split_fasta.rb', line 18

# Split a large fasta file into chunks of n reads apiece, writing
# split_0.fasta, split_1.fasta, ... (gzipped when --gzip is set).
def split_fasta
  input   = File.expand_path(options[:input])
  out_dir = options[:out_dir]
  n       = options[:n].to_i
  gzip    = options[:gzip]

  ohai "splitting #{input} into files with #{n} reads stored in #{out_dir}"
  ohai "using gzip" if gzip

  `mkdir -p #{out_dir}`

  File.open input do |handle|
    pbar = ProgressBar.new 'splitting', File.size(handle)
    Dna.new(handle).each_with_index do |record, i|
      pbar.set handle.pos
      # @out is an instance variable (not a block-local) so the
      # current output handle survives across iterations of this
      # block. (The original comment called it a "class variable",
      # which it is not.)
      @out =
        if i%n == 0 # start a new file
          # GzipWriter must be closed explicitly
          # this raises an exception this first time (rescued below)
          @out.close rescue nil

          # create an IO object depending on whether or
          # not the user wants to use gzip
          if gzip
            Zlib::GzipWriter.open(File.join(out_dir, "split_#{i/n}.fasta.gz"))
          else
            File.open(File.join(out_dir, "split_#{i/n}.fasta"), 'w')
          end
        else # keep using current handle
          @out
        end
      @out.puts record
    end
    pbar.finish
    @out.close
  end

  ohai "created #{Dir[File.join(out_dir, '*')].size} files"
end

#trimObject



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/lederhosen/tasks/trim.rb', line 14

# Quality-trim paired qseq reads, writing one trimmed fasta per pair
# into out_dir.
def trim
  raw_reads = options[:reads_dir]
  out_dir   = options[:out_dir]

  ohai "trimming #{File.dirname(raw_reads)} and saving to #{out_dir}"

  run "mkdir -p #{out_dir}"

  raw_reads = get_grouped_qseq_files raw_reads

  ohai "found #{raw_reads.length} pairs of reads"

  pbar = ProgressBar.new "trimming", raw_reads.length
  raw_reads.each do |pair|
    pbar.inc
    name  = pair[0]
    files = pair[1]
    out = File.join(out_dir, "#{File.basename(name)}.fasta")
    # TODO get total and trimmed
    total, trimmed = trim_pairs files[0], files[1], out, :min_length => 70
  end
  pbar.finish

end

#versionObject



10
11
12
# File 'lib/lederhosen/tasks/version.rb', line 10

# Print the gem's version string and release codename to stdout.
def version
  info = Lederhosen::Version
  puts "lederhosen-#{info::STRING} codename #{info::CODENAME}"
end