Module: TreeClusters

Defined in:
lib/tree_clusters.rb,
lib/tree_clusters/attrs.rb,
lib/tree_clusters/clade.rb,
lib/tree_clusters/version.rb,
lib/tree_clusters/attr_array.rb

Overview

Top level namespace of the Gem.

Defined Under Namespace

Classes: AttrArray, Attrs, Clade

Constant Summary collapse

PROJ_ROOT =
File.join __dir__, ".."
VERSION =
"0.8.3"

Instance Method Summary collapse

Instance Method Details

#all_clades(tree, metadata = nil) {|clade| ... } ⇒ Enumerator<Clade>

Given a NewickTree, return an array of all Clades in that tree.

Parameters:

Yield Parameters:

  • clade (Clade)

    a clade of the tree

Returns:

  • (Enumerator<Clade>)

    enumerator of Clade objects



169
170
171
172
173
174
175
# File 'lib/tree_clusters.rb', line 169

def all_clades tree,  = nil
  return enum_for(:all_clades, tree, ) unless block_given?

  tree.clade_nodes.reverse.each do |node|
    yield Clade.new node, tree, 
  end
end

#check_ids(tree, mapping, aln) ⇒ Object

Note:

If there are quoted names in the tree file, they are unquoted first.



127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/tree_clusters.rb', line 127

def check_ids tree, mapping, aln
  tree_ids = Set.new(NewickTree.fromFile(tree).unquoted_taxa)

  mapping_ids = Set.new
  File.open(mapping, "rt").each_line.with_index do |line, idx|
    unless idx.zero?
      id, *rest = line.chomp.split

      mapping_ids << id
    end
  end

  aln_ids = Set.new
  ParseFasta::SeqFile.open(aln).each_record do |rec|
    aln_ids << rec.id
  end

  if !(tree_ids == mapping_ids && mapping_ids == aln_ids)
    AbortIf::logger.error { "Seq IDs did not match in all input files" }

    tree_ids    = tree_ids.to_a.sort
    mapping_ids = mapping_ids.to_a.sort
    aln_ids     = aln_ids.to_a.sort

    AbortIf::logger.debug { ["tree_ids", tree_ids].join "\t" }
    AbortIf::logger.debug { ["mapping_ids", mapping_ids].join "\t" }
    AbortIf::logger.debug { ["aln_ids", aln_ids].join "\t" }

    raise AbortIf::Exit
  else
    true
  end
end

#consensus(bases) ⇒ Object

Note:

Each string is upcase’d before frequencies are calculated.

Given an ary of strings, find the most common string in the ary.

Examples:

Upper case and lower case count as the same.

TreeClusters::consensus %w[a A C T] #=> "A"

Ties take the one closest to the end

TreeClusters::consensus %w[a c T t C t g] #=> "T"

Parameters:

  • bases (Array<String>)

    an array of strings



57
58
59
60
61
62
63
64
65
# File 'lib/tree_clusters.rb', line 57

def consensus bases
  bases.
      map(&:upcase).
      group_by(&:itself).
      sort_by { |_, bases| bases.count }.
      reverse.
      first.
      first
end

#low_ent_cols(leaves, leaf2attrs, entropy_cutoff) ⇒ Object



88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/tree_clusters.rb', line 88

def low_ent_cols leaves, leaf2attrs, entropy_cutoff
  low_ent_cols = []
  alns         = leaf2attrs.attrs leaves, :aln
  aln_cols     = alns.transpose

  aln_cols.each_with_index do |aln_col, aln_col_idx|
    has_gaps    = aln_col.any? { |aa| aa == "-" }
    low_entropy =
        Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff

    if !has_gaps && low_entropy
      low_ent_cols << (aln_col_idx + 1)
    end
  end

  Set.new low_ent_cols
end

#low_ent_cols_with_bases(leaves, leaf2attrs, entropy_cutoff) ⇒ Object

Like low_ent_cols method but also returns the bases at the positions.



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/tree_clusters.rb', line 107

def low_ent_cols_with_bases leaves, leaf2attrs, entropy_cutoff
  low_ent_cols = []
  alns         = leaf2attrs.attrs leaves, :aln
  aln_cols     = alns.transpose

  aln_cols.each_with_index do |aln_col, aln_col_idx|
    has_gaps    = aln_col.any? { |aa| aa == "-" }
    low_entropy =
        Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff

    if !has_gaps && low_entropy
      low_ent_cols << [(aln_col_idx + 1), aln_col.map(&:upcase).uniq.sort]
    end
  end

  Set.new low_ent_cols
end

#read_alignment(aln_fname) ⇒ Object



67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/tree_clusters.rb', line 67

def read_alignment aln_fname
  leaf2attrs = TreeClusters::Attrs.new
  aln_len    = nil
  seq_num = 0
  ParseFasta::SeqFile.open(aln_fname).each_record do |rec|
    seq_num += 1
    if ((seq_num + 1) % 1000).zero?
      STDERR.printf("Reading alignment: #{seq_num + 1}\r")
    end

    leaf2attrs[rec.id] = { aln: rec.seq.chars }

    aln_len ||= rec.seq.length

    abort_unless aln_len == rec.seq.length,
                 "Aln len mismatch for #{rec.id}"
  end

  leaf2attrs
end

#read_attrs_file(fname) ⇒ Object



323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
# File 'lib/tree_clusters.rb', line 323

def read_attrs_file fname

  attr_names = Set.new
  File.open(fname, "rt").each_line.with_index do |line, idx|
    unless idx.zero?
      _, attr_name, _ = line.chomp.split "\t"

      attr_names << attr_name
    end
  end

  attr_names = attr_names.to_a.sort

  attrs = TreeClusters::Attrs.new

  File.open(fname, "rt").each_line.with_index do |line, idx|
    unless idx.zero?
      leaf, attr_name, attr_val = line.chomp.split "\t"

      if attrs.has_key? leaf
        if attrs[leaf].has_key? attr_name
          attrs[leaf][attr_name] << attr_val
        else
          attrs[leaf][attr_name] = Set.new([attr_val])
        end
      else
        attrs[leaf] = {}

        attr_names.each do |name|
          attrs[leaf][name] = Set.new
        end
        attrs[leaf][attr_name] << attr_val
      end
    end
  end

  [attr_names, attrs]
end

#read_mapping_file(fname) ⇒ Object



304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
# File 'lib/tree_clusters.rb', line 304

def read_mapping_file fname
  md_cat_names = nil
       = TreeClusters::Attrs.new

  File.open(fname, "rt").each_line.with_index do |line, idx|
    leaf_name, * = line.chomp.split "\t"

    if idx.zero?
      md_cat_names = 
    else
      .each_with_index do |val, val_idx|
        .add md_cat_names[val_idx], leaf_name, val
      end
    end
  end

  
end

#snazzy_clades(tree, metadata) ⇒ Object



177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# File 'lib/tree_clusters.rb', line 177

def snazzy_clades tree, 
  snazzy_clades = {}

  clades = self.
      all_clades(tree, ).
      sort_by { |clade| clade.all_leaves.count }.
      reverse

  .each do |md_cat, leaf2mdtag|
    already_checked   = Set.new
    single_tag_clades = {}

    clades.each do |clade|
      assert clade.all_leaves.count > 1,
             "A clade cannot also be a leaf"

      unless clade.all_leaves.all? do |leaf|
        already_checked.include? leaf
      end
        md_tags = clade.all_leaves.map do |leaf|
          assert leaf2mdtag.has_key?(leaf),
                 "leaf #{leaf} is missing from leaf2mdtag ht"

          leaf2mdtag[leaf]
        end

        # this clade is mono-phyletic w.r.t. this metadata category.
        if md_tags.uniq.count == 1
          clade.all_leaves.each do |leaf|
            already_checked << leaf
          end

          assert !single_tag_clades.has_key?(clade),
                 "clade #{clade.name} is repeated in single_tag_clades for #{md_cat}"

          single_tag_clades[clade] = md_tags.first
        end
      end
    end

    single_tag_clades.each do |clade, md_tag|
      non_clade_leaves = tree.unquoted_taxa - clade.all_leaves

      non_clade_leaves_with_this_md_tag = non_clade_leaves.map do |leaf|
        [leaf, leaf2mdtag[leaf]]
      end.select { |ary| ary.last == md_tag }

      if non_clade_leaves_with_this_md_tag.count.zero?
        if snazzy_clades.has_key? clade
          snazzy_clades[clade][md_cat] = md_tag
        else
          snazzy_clades[clade] = { md_cat => md_tag }
        end
      end
    end
  end

  snazzy_clades
end

#snazzy_info(tree, metadata) ⇒ Object



237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
# File 'lib/tree_clusters.rb', line 237

def snazzy_info tree, 
  snazzy_info = {}

  clades = self.
      all_clades(tree, ).
      sort_by { |clade| clade.all_leaves.count }.
      reverse

  # Non snazzy clades have a value of nil, so set all to nil and the
  # snazzy ones will be overwritten.
  clades.each do |clade|
    snazzy_info[clade] = nil
  end

  .each do |md_cat, leaf2mdtag|
    already_checked   = Set.new
    single_tag_clades = {}

    clades.each do |clade|
      assert clade.all_leaves.count > 1,
             "A clade cannot also be a leaf"

      unless clade.all_leaves.all? do |leaf|
        already_checked.include? leaf
      end
        md_tags = clade.all_leaves.map do |leaf|
          assert leaf2mdtag.has_key?(leaf),
                 "leaf #{leaf} is missing from leaf2mdtag ht"

          leaf2mdtag[leaf]
        end

        # this clade is mono-phyletic w.r.t. this metadata category.
        if md_tags.uniq.count == 1
          clade.all_leaves.each do |leaf|
            already_checked << leaf
          end

          assert !single_tag_clades.has_key?(clade),
                 "clade #{clade.name} is repeated in single_tag_clades for #{md_cat}"

          single_tag_clades[clade] = md_tags.first
        end
      end
    end

    single_tag_clades.each do |clade, md_tag|
      non_clade_leaves = tree.unquoted_taxa - clade.all_leaves

      non_clade_leaves_with_this_md_tag = non_clade_leaves.map do |leaf|
        [leaf, leaf2mdtag[leaf]]
      end.select { |ary| ary.last == md_tag }

      is_snazzy_clade = non_clade_leaves_with_this_md_tag.count.zero?
      if is_snazzy_clade
        if !snazzy_info[clade].nil?
          snazzy_info[clade][md_cat] = md_tag
        else
          snazzy_info[clade] = { md_cat => md_tag }
        end
      end
    end
  end

  snazzy_info
end