Class: Lederhosen::CLI

Inherits:
Thor
  • Object
show all
Includes:
Thor::Actions
Defined in:
lib/lederhosen/cli.rb,
lib/lederhosen/no_tasks.rb,
lib/lederhosen/tasks/cluster.rb,
lib/lederhosen/tasks/version.rb,
lib/lederhosen/tasks/get_reps.rb,
lib/lederhosen/tasks/make_udb.rb,
lib/lederhosen/tasks/otu_table.rb,
lib/lederhosen/tasks/otu_filter.rb,
lib/lederhosen/tasks/split_fasta.rb,
lib/lederhosen/tasks/join_otu_tables.rb,
lib/lederhosen/tasks/count_taxonomies.rb,
lib/lederhosen/tasks/separate_unclassified.rb

Overview

The CLI class holds all of the Thor tasks

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#taxonomy_formatObject

Returns the value of attribute taxonomy_format.



5
6
7
# File 'lib/lederhosen/no_tasks.rb', line 5

def taxonomy_format
  @taxonomy_format
end

Instance Method Details

#clusterObject



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/lederhosen/tasks/cluster.rb', line 16

def cluster
  input     = File.expand_path(options[:input])
  database  = File.expand_path(options[:database])
  threads   = options[:threads]
  identity  = options[:identity]
  output    = File.expand_path(options[:output])
  strand    = options[:strand]
  dry_run   = options[:dry_run]
  query_cov = options[:query_cov]

  ohai "#{'(dry run)' if dry_run} clustering #{input} to #{database} and saving to #{output}"

  options.each_pair do |key, value|
    ohai "#{key} = #{value}"
  end

  cmd = ['usearch',
    "--usearch_local #{input}",
    "--id #{identity}",
    "--uc #{output}",
    "--db #{database}",
    "--strand #{strand}",
    "--query_cov #{query_cov}"
  ]

  # threads = False : use all threads (default)
  if threads != false
    cmd << "--threads #{threads}"
  end

  cmd = cmd.join(' ')

  unless dry_run
    run cmd
  else
    puts cmd
  end
end

#count_taxonomiesObject



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/lederhosen/tasks/count_taxonomies.rb', line 9

def count_taxonomies
  input  = options[:input]
  output = options[:output]

  ohai "generating #{output} from #{input}"

  handle = File.open(input)
  uc = UCParser.new(handle)
  taxonomy_count = get_taxonomy_count(uc)
  handle.close

  out = File.open(output, 'w')
  out.puts '# taxonomy, number_of_reads'
  taxonomy_count.each_pair do |taxonomy, count|
    out.puts "#{taxonomy.tr(',','_')},#{count}"
  end
  out.close

end

#get_repsObject



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/lederhosen/tasks/get_reps.rb', line 11

def get_reps
  inputs   = Dir[options[:input]]
  database = File.expand_path(options[:database])
  output   = File.expand_path(options[:output])

  taxa = Set.new

  ohai "getting representative database sequences from #{database} using #{inputs.size} cluster file(s) and saving to #{output}"

  # parse uc file, get list of taxa we need to get
  # full sequences for from the database
  pbar = ProgressBar.new 'reading uc(s)', inputs.size

  inputs.each do |input|
    File.open(input) do |handle|
      results = UCParser.new(handle)
      results.each do |result|
        taxa << result.target if result.hit?
      end
    end
  end

  pbar.finish

  ohai "found #{taxa.size} representative sequences"

  # print representative sequences from database
  output = File.open(output, 'w')
  kept = 0
  File.open(database) do |handle|
    Dna.new(handle).each do |record|
      if taxa.include? record.name
        output.puts record
        kept += 1
      end
    end
  end

  output.close

  ohai "saved #{kept} representatives"

end

#join_otu_tablesObject



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/lederhosen/tasks/join_otu_tables.rb', line 12

def join_otu_tables

  input = Dir[options[:input]]
  output = File.expand_path(options[:output])

  ohai "combining #{input.size} file(s) and saving to #{output}"

  all_otu_names = Set.new
  all_samples = Set.new

  sample_name_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }

  # read all of the csv files
  input.each do |input_file|
    File.open(input_file) do |handle|
      otu_names = handle.gets.strip.split(',')[1..-1]
      all_otu_names += otu_names.to_set

      handle.each do |line|
        line = line.strip.split(',')
        sample = File.basename(input_file)
        all_samples << sample
        read_counts = line[1..-1]
        otu_names.zip(read_counts) do |name, count|
          sample_name_count[sample][name] = count
        end
      end
    end
  end

  # save to csv
  File.open(output, 'w') do |handle|
    header = all_otu_names.to_a.sort
    handle.puts "-,#{header.join(',')}"

    all_samples.to_a.sort.each do |sample|
      handle.print "#{sample}"
      header.each do |name|
        handle.print ",#{sample_name_count[sample][name]}"
      end
      handle.print "\n"
    end
  end


end

#make_udbObject



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# File 'lib/lederhosen/tasks/make_udb.rb', line 9

def make_udb
  input       = options[:input]
  output      = options[:output]
  word_length = options[:word_length]
  db_step     = options[:db_step]

  ohai "making udb w/ #{input}, saving as #{output}."

  cmd = ['usearch',
         "-makeudb_usearch #{input}",
         "-output #{output}",
        ]

  cmd = cmd.join(' ')

  run cmd
end

#otu_filterObject



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/lederhosen/tasks/otu_filter.rb', line 13

def otu_filter
  input       = File.expand_path(options[:input])
  output      = File.expand_path(options[:output])
  reads       = options[:reads]
  min_samples = options[:samples]

  ohai "filtering otu file #{input} (reads = #{reads}, samples = #{min_samples})"

  # make one pass finding which OTUs to keep
  # create mask that maps which columns correspond to good OTUs
  # pass over table again printing only those columns

  seen = Hash.new { |h, k| h[k] = 0 }

  otu_order = []

  pbar = ProgressBar.new 'counting', File.size(input)
  total_reads = 0

  File.open(input) do |handle|
    header = handle.gets.strip.split(',')
    header.each { |x| otu_order << x }

    handle.each do |line|
      pbar.set handle.pos
      line = line.strip.split(',')
      sample_name = line[0]
      abunds = line[1..-1].map &:to_i
      otu_order.zip(abunds) do |o, a|
        total_reads += a
        seen[o] += 1 if a >= reads
      end
    end
  end

  pbar.finish

  mask = otu_order.map { |x| seen[x] >= min_samples }

  ohai "found #{otu_order.size} otus, keeping #{mask.count(true)}"

  output = File.open(output, 'w')

  pbar = ProgressBar.new 'writing', File.size(input)
  filtered_reads = 0
  File.open(input) do |handle|
    header = handle.gets.strip.split(',')
    header = header.zip(mask).map { |k, m| k if m }.compact
    output.print header.join(',')
    output.print ",noise\n" # need a "noise" column

    handle.each do |line|
      pbar.set handle.pos
      line = line.strip.split(',')

      sample_name = line[0]
      counts = line[1..-1].map &:to_i

      kept_counts = counts.zip(mask).map { |c, m| c if m }.compact
      noise = counts.zip(mask).map { |c, m| c unless m }.compact.inject(:+)
      filtered_reads += noise || 0

      output.puts "#{sample_name},#{kept_counts.join(',')},#{noise}"

    end
  end

  pbar.finish

  ohai "kept #{total_reads - filtered_reads}/#{total_reads} reads (#{100*(total_reads - filtered_reads)/total_reads.to_f}%)"

  output.close

end

#otu_tableObject



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/lederhosen/tasks/otu_table.rb', line 15

def otu_table
  inputs = Dir[options[:files]]
  level  = options[:level].downcase
  output = options[:output]

  ohai "Generating OTU matrix from #{inputs.size} inputs at #{level} level and saving to #{output}."

  # sanity check
  fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom original}.include? level      
  fail 'no inputs matched your glob' if inputs.size == 0

  sample_cluster_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }

  # create a progress bar with the total number of bytes of
  # the files we're slurping up
  pbar = ProgressBar.new "loading", inputs.size

  inputs.each do |input_file|
    File.open(input_file).each do |line|
      next if line =~ /^#/ # skip header(s)
      line = line.strip.split(',')
      taxonomy, count = line
      count = count.to_i
      tax =
        if taxonomy == 'unclassified_reads'
          'unclassified_reads'
        else
          if level == 'original'
            taxonomy
          else
            parse_taxonomy(taxonomy)[level]
          end
        end
      sample_cluster_count[input_file][tax] += count
    end
    pbar.inc
  end
  pbar.finish

  all_clusters = sample_cluster_count.values.map(&:keys).flatten.uniq.sort

  out = File.open(output, 'w')

  out.puts all_clusters.join(',')
  pbar = ProgressBar.new('saving', inputs.size)
  inputs.sort.each do |input|
    pbar.inc
    out.print "#{input}"
    all_clusters.each do |c|
      out.print ",#{sample_cluster_count[input][c]}"
    end
    out.print "\n"
  end
  pbar.finish

end

#separate_unclassifiedObject



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/lederhosen/tasks/separate_unclassified.rb', line 15

def separate_unclassified
  uc_file = options[:uc_file]
  reads   = options[:reads]
  output  = options[:output]
  strict  = options[:strict]

  unclassifieds = Set.new
  handle = File.open(uc_file)
  uc = UCParser.new(handle)
  total_reads = 0

  if not strict
    uc.each do |result|
      unclassifieds << result.query if result.miss?
      total_reads += 1
    end

  elsif strict

    uc.each_slice(2) do |left, right|
      total_reads += 2
      if left.miss? || right.miss? # at least one is a miss
        unclassifieds << left.query
        unclassifieds << right.query
      # both are hits, check taxonomies
      else
        ta = parse_taxonomy(right.target)
        tb = parse_taxonomy(left.target)
        # inconsistent assignment or at least one is a miss
        if (ta[strict] != tb[strict])
          unclassifieds << left.query
          unclassifieds << right.query
        end
      end
    end

  end

  ohai "found #{unclassifieds.size} unclassified #{'(strict pairing)' if strict} reads."
  ohai "unclassified: #{'%0.2f' % (100*unclassifieds.size/total_reads.to_f)} %"

  handle.close

  # open fasta file, output unclassified reads
  out = File.open(output, 'w')
  Dna.new(File.open(reads)).each do |record|
    if unclassifieds.include? record.name
      out.puts record
    end
  end
  out.close

end

#split_fastaObject



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/lederhosen/tasks/split_fasta.rb', line 18

def split_fasta
  input   = File.expand_path(options[:input])
  out_dir = options[:out_dir]
  n       = options[:n].to_i
  gzip    = options[:gzip]

  ohai "splitting #{input} into files with #{n} reads stored in #{out_dir}"
  ohai "using gzip" if gzip

  `mkdir -p #{out_dir}`

  File.open input do |handle|
    pbar = ProgressBar.new 'splitting', File.size(handle)
    Dna.new(handle).each_with_index do |record, i|
      pbar.set handle.pos
      # I have to use a class variable here because
      # if I don't the variable gets set to nil after
      # after each iteration.
      @out =
        if i%n == 0 # start a new file
          # GzipWriter must be closed explicitly
          # this raises an exception this first time
          @out.close rescue nil

          # create an IO object depending on whether or
          # not the user wants to use gzip
          if gzip
            Zlib::GzipWriter.open(File.join(out_dir, "split_#{i/n}.fasta.gz"))
          else
            File.open(File.join(out_dir, "split_#{i/n}.fasta"), 'w')
          end
        else # keep using current handle
          @out
        end
      @out.puts record
    end
    pbar.finish
    @out.close
  end

  ohai "created #{Dir[File.join(out_dir, '*')].size} files"
end

#versionObject



10
11
12
# File 'lib/lederhosen/tasks/version.rb', line 10

def version
  puts "lederhosen-#{Lederhosen::Version::STRING} codename #{Lederhosen::Version::CODENAME}"
end