Module: MiGA::Common::Format

Included in:
MiGA
Defined in:
lib/miga/common/format.rb

Overview

General formatting functions shared throughout MiGA.

Instance Method Summary collapse

Instance Method Details

#clean_fasta_file(file) ⇒ Object

Cleans a FastA file in place.



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/miga/common/format.rb', line 27

def clean_fasta_file(file)
  tmp_fh = nil
  tmp_path = nil
  begin
    if file =~ /\.gz/
      tmp_path = Tempfile.new('MiGA.gz').tap(&:close).path
      File.unlink tmp_path
      tmp_path += '.gz'
      tmp_fh = Zlib::GzipWriter.open(tmp_path, 9)
      fh = Zlib::GzipReader.open(file)
    else
      tmp_fh = Tempfile.new('MiGA')
      tmp_path = tmp_fh.path
      fh = File.open(file, 'r')
    end
    buffer = ''.dup
    fh.each_line do |ln|
      ln.chomp!
      if ln =~ /^>\s*(\S+)(.*)/
        id, df = $1, $2
        tmp_fh.print buffer.wrap_width(80)
        buffer = ''.dup
        tmp_fh.puts ">#{id.gsub(/[^A-Za-z0-9_\|\.]/, '_')}#{df}"
      else
        buffer << ln.gsub(/[^A-Za-z\.\-]/, '')
      end
    end
    tmp_fh.print buffer.wrap_width(80)
    tmp_fh.close
    fh.close
    FileUtils.mv(tmp_path, file)
  ensure
    begin
      tmp_fh.close unless tmp_fh.nil?
      File.unlink(tmp_path) unless tmp_path.nil?
    rescue
    end
  end
end

#seqs_length(file, format, opts = {}) ⇒ Object

Calculates the average and standard deviation of the sequence lengths in a FastA or FastQ file (supports gzipped files). The format must be a Symbol, one of :fasta or :fastq. Additional estimations can be controlled via the opts Hash. Supported options include:

  • :n50: Include the N50 and the median (in bp)

  • :gc: Include the G+C content (in %)

  • :x: Include the undetermined bases content (in %)

  • :skew: Include G-C and A-T sequence skew (in %; forces gc: true). See definition used here in DOI:10.1177/117693430700300006



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# File 'lib/miga/common/format.rb', line 77

def seqs_length(file, format, opts = {})
  opts[:gc] = true if opts[:skew]
  fh = file =~ /\.gz/ ? Zlib::GzipReader.open(file) : File.open(file, 'r')
  l = []
  gc = 0
  xn = 0
  t  = 0
  c  = 0
  i  = 0 # <- Zlib::GzipReader doesn't set `$.`
  fh.each_line do |ln|
    i += 1
    if (format == :fasta and ln =~ /^>/) or
       (format == :fastq and (i % 4) == 1)
      l << 0
    elsif format == :fasta or (i % 4) == 2
      l[l.size - 1] += ln.chomp.size
      gc += ln.scan(/[GCgc]/).count if opts[:gc]
      xn += ln.scan(/[XNxn]/).count if opts[:x]
      if opts[:skew]
        t += ln.scan(/[Tt]/).count
        c += ln.scan(/[Cc]/).count
      end
    end
  end
  fh.close

  o = { n: l.size, tot: l.inject(0, :+), max: l.max }
  return o if o[:tot].zero?
  o[:avg] = o[:tot].to_f / l.size
  o[:var] = l.map { |a| a**2 }.inject(:+).to_f / l.size - o[:avg]**2
  o[:sd]  = Math.sqrt o[:var]
  o[:gc]  = 100.0 * gc / o[:tot] if opts[:gc]
  o[:x]   = 100.0 * xn / o[:tot] if opts[:x]
  if opts[:skew]
    at = o[:tot] - gc
    o[:at_skew] = 100.0 * (2 * t - at) / at
    o[:gc_skew] = 100.0 * (2 * c - gc) / gc
  end

  if opts[:n50]
    l.sort!
    thr = o[:tot] / 2
    pos = 0
    l.each do |a|
      pos += a
      o[:n50] = a
      break if pos >= thr
    end
    o[:med] = o[:n].even? ?
          0.5 * l[o[:n] / 2 - 1, 2].inject(:+) :
          l[(o[:n] - 1) / 2]
  end
  o
end

#tabulate(header, values, tabular = false) ⇒ Object

Tabulates an values, and Array of Arrays, all with the same number of entries as header. Returns an Array of String, one per line.



11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/miga/common/format.rb', line 11

def tabulate(header, values, tabular = false)
  fields = []
  fields << header.map(&:to_s) unless tabular && header.all?(&:nil?)
  fields << fields.first.map { |h| h.gsub(/\S/, '-') } unless tabular
  fields += values.map { |r| r.map { |cell| cell.nil? ? '?' : cell.to_s } }
  clen = tabular ? Array.new(header.size, 0) :
        fields.map { |r| r.map(&:length) }.transpose.map(&:max)
  fields.map do |r|
    (0..(clen.size - 1)).map do |col_n|
      col_n == 0 ? r[col_n].rjust(clen[col_n]) : r[col_n].ljust(clen[col_n])
    end.join(tabular ? "\t" : '  ')
  end
end