Module: MiseqRunStats

Defined in:
lib/miseq_run_stats.rb

Defined Under Namespace

Classes: AssemblyRunStats, AssemblySampleStats, ResequencingRunStats, ResequencingSampleStats

Instance Method Summary

Instance Method Details

#parse_assembly_run_stats(xml_file, original_sample_names = nil) ⇒ Object



31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/miseq_run_stats.rb', line 31

# Parses an assembly run-statistics XML file into an AssemblyRunStats object.
#
# Run-level totals are taken from each +RunStats+ element (if several are
# present, the last one wins). Per-sample figures are built by pairing every
# +SampleStatistics+ element with the +AssemblyStatistics+ element found at the
# same position in the document, because the assembly elements are read
# without any sample name of their own.
#
# @param xml_file [String] path of the XML file to read
# @param original_sample_names [Array, nil] optional list of original sample
#   names. When given, each sample name read from the XML is replaced by the
#   first entry of this list that occurs inside it (literal substring match);
#   samples that match no entry are left out of the result.
# @return [AssemblyRunStats] object whose +sample_stats+ is a Hash of
#   sample name => AssemblySampleStats
def parse_assembly_run_stats(xml_file, original_sample_names = nil)
  xml = Nokogiri::XML(File.read(xml_file))
  assembly_run_stats = AssemblyRunStats.new

  xml.search('//RunStats').each do |run_stats|
    # The yield is divided by 1e9 before being stored.
    assembly_run_stats.number_of_bases = run_stats.search('YieldInBasesPF').text.to_f / 1_000_000_000
    assembly_run_stats.number_of_clusters = run_stats.search('NumberOfClustersPF').text.to_i
  end

  # Contig data carries no sample name; it is matched to samples by position below.
  assembly_stats = xml.search('//AssemblyStatistics').map do |assembly_node|
    {
      :number_of_contigs => assembly_node.search('NumberOfContigs').text.to_i,
      # the mean length is parsed as a float and then truncated to an integer
      :mean_contig_size => assembly_node.search('MeanContigLength').text.to_f.to_i,
      :n50 => assembly_node.search('N50').text.to_i,
      :number_of_bases => assembly_node.search('BaseCount').text.to_i
    }
  end

  assembly_run_stats.sample_stats = {}
  xml.search('//SampleStatistics').each_with_index do |sample_stats, index|
    # Look the contig data up by index so that a skipped sample still "uses up"
    # its own entry and later samples are not paired with the wrong figures.
    # A missing entry leaves the assembly fields nil instead of raising.
    contig_stats = assembly_stats[index] || {}

    sample_name = sample_stats.search('SampleName').text
    unless original_sample_names.nil?
      # Swap in the original sample name. A literal substring test is used so
      # that regex metacharacters in a sample name (".", "+", "(" ...) cannot
      # cause false matches or a RegexpError.
      reported_name = sample_name
      sample_name = original_sample_names.find { |original| reported_name.include?(original.to_s) }
    end
    next if sample_name.nil?

    stats = AssemblySampleStats.new
    stats.sample_name = sample_name
    # kept as the raw element text (a String), unlike the run-level cluster count
    stats.number_of_clusters = sample_stats.search('NumberOfClustersPF').text
    stats.number_of_contigs = contig_stats[:number_of_contigs]
    stats.mean_contig_size = contig_stats[:mean_contig_size]
    stats.n50 = contig_stats[:n50]
    stats.number_of_bases = contig_stats[:number_of_bases]
    assembly_run_stats.sample_stats[sample_name] = stats
  end

  assembly_run_stats
end

#parse_resequencing_run_stats(xml_file, original_sample_names = nil) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/miseq_run_stats.rb', line 7

# Parses a resequencing run-statistics XML file into a ResequencingRunStats object.
#
# Run-level totals are taken from each +RunStats+ element (if several are
# present, the last one wins). One ResequencingSampleStats is built for every
# per-sample summary element and stored in +sample_stats+ under its sample name.
#
# @param xml_file [String] path of the XML file to read
# @param original_sample_names [Array, nil] optional list of original sample
#   names. When given, each sample name read from the XML is replaced by the
#   first entry of this list that occurs inside it (literal substring match);
#   samples that match no entry are left out of the result.
# @return [ResequencingRunStats] object whose +sample_stats+ is a Hash of
#   sample name => ResequencingSampleStats
def parse_resequencing_run_stats(xml_file, original_sample_names = nil)
  xml = Nokogiri::XML(File.read(xml_file))
  resequencing_run_stats = ResequencingRunStats.new

  xml.search('//RunStats').each do |run_stats|
    # The yield is divided by 1e9 before being stored.
    resequencing_run_stats.number_of_bases = run_stats.search('YieldInBasesPF').text.to_f / 1_000_000_000
    resequencing_run_stats.number_of_clusters = run_stats.search('NumberOfClustersPF').text.to_i
  end

  resequencing_run_stats.sample_stats = {}
  # NOTE(review): the "Statisics" spelling is kept exactly as it was; presumably
  # it matches the element name used in the XML -- confirm before "correcting" it.
  xml.search('//SummarizedSampleStatisics').each do |summarised_samples_stats|
    sample_name = summarised_samples_stats.search('SampleName').text
    unless original_sample_names.nil?
      # Swap in the original sample name. A literal substring test is used so
      # that regex metacharacters in a sample name (".", "+", "(" ...) cannot
      # cause false matches or a RegexpError.
      reported_name = sample_name
      sample_name = original_sample_names.find { |original| reported_name.include?(original.to_s) }
    end
    # Skip samples that match none of the supplied names instead of filing
    # them all under a shared nil key, where each would overwrite the last.
    next if sample_name.nil?

    stats = ResequencingSampleStats.new
    stats.sample_name = sample_name
    # All per-sample values below are kept as the raw element text (Strings).
    stats.number_of_clusters = summarised_samples_stats.search('NumberOfClustersPF').text
    stats.number_of_forward_reads_aligned = summarised_samples_stats.search('ClustersAlignedR1').text
    stats.number_of_reverse_reads_aligned = summarised_samples_stats.search('ClustersAlignedR2').text
    stats.coverage = summarised_samples_stats.search('WeightedCoverage').text
    stats.number_of_snps = summarised_samples_stats.search('NumberHomozygousSNPs').text
    resequencing_run_stats.sample_stats[sample_name] = stats
  end

  resequencing_run_stats
end