Class: Bio::KEGG::Taxonomy

Inherits:
Object show all
Defined in:
lib/bio/db/kegg/taxonomy.rb

Overview

Description

Parse the KEGG ‘taxonomy’ file which describes taxonomic classification of organisms.

References

The KEGG ‘taxonomy’ file is available at

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(filename, orgs = []) ⇒ Taxonomy

Returns a new instance of Taxonomy.



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/bio/db/kegg/taxonomy.rb', line 26

# Parses a KEGG 'taxonomy' file and builds three views of its content:
# the parent => children Hash (@tree), the list of full paths (@path) and
# the node => organism codes Hash (@leaves).
#
# filename:: path of the KEGG 'taxonomy' file to read
# orgs::     organism codes to keep; when nil or empty every organism
#            found in the file is kept
#
# Lines starting with '#' declare a taxonomic rank (the number of '#' marks
# gives the depth).  Every other non-blank line is split on TAB and read as
# (tax, org, name, desc); only org and name are used here.
def initialize(filename, orgs = [])
  # Stores the taxonomic tree as a linked list (implemented in Hash), so
  # every node needs to have a unique name (key) to work correctly.
  @tree = Hash.new

  # Also stores the taxonomic tree as a list of arrays (full paths).
  @path = Array.new

  # Also stores all leaf nodes (organism codes) of every intermediate node.
  @leaves = Hash.new

  # Tentative name for the root node (use the accessor to change it).
  @root = 'Genes'

  hier = Array.new
  level = 0

  # File.foreach closes the file when iteration finishes (or an exception
  # escapes); a bare File.open(filename).each would leave the handle open.
  File.foreach(filename) do |line|
    next if line.strip.empty?

    # Line for the taxonomic hierarchy (depth == number of leading # marks).
    if line.start_with?('#')
      level = line[/^#+/].length
      # NOTE(review): [A-z] also matches the punctuation that sits between
      # 'Z' and 'a' in ASCII ([ \ ] ^ _ `); kept as-is to preserve behaviour.
      label = line[/[A-z].*/]
      hier[level] = sanitize(label)
      next
    end

    # Line for an organism name (unify different strains of a species).
    _tax, org, name, _desc = line.chomp.split("\t")
    next unless orgs.nil? || orgs.empty? || orgs.include?(org)

    species = name.split('_').first

    # (0) Grouping of the strains of the same species.
    #  If the name of the species is the same as on the previous line,
    #  add the organism to the same species group.
    #   ex. Gamma/enterobacteria has a large number of organisms,
    #       so sub grouping of strains is needed for E.coli strains etc.
    #
    # However, if the species name is already used, a collision of species
    # names has to be avoided because the tree is stored as a Hash, and a
    # duplicated key may cause an infinite loop.
    #
    # (1) If species name == the intermediate node of another lineage
    #  Add '_sp' to the species name to avoid the conflict (1-1), and if
    #  'species_sp' is already taken, use 'species_strain' instead (1-2).
    #   ex. Bacteria/Proteobacteria/Beta/T.denitrificans/tbd
    #       Bacteria/Proteobacteria/Epsilon/T.denitrificans_ATCC33889/tdn
    #    -> Bacteria/Proteobacteria/Beta/T.denitrificans/tbd
    #       Bacteria/Proteobacteria/Epsilon/T.denitrificans_sp/tdn
    #
    # (2) If species name == the intermediate node of the same lineage
    #  Add '_sp' to the species name to avoid the conflict.
    #   ex. Bacteria/Cyanobacteria/Cyanobacteria_CYA/cya
    #       Bacteria/Cyanobacteria/Cyanobacteria_CYB/cya
    #       Bacteria/Proteobacteria/Magnetococcus/Magnetococcus_MC1/mgm
    #    -> Bacteria/Cyanobacteria/Cyanobacteria_sp/cya
    #       Bacteria/Cyanobacteria/Cyanobacteria_sp/cya
    #       Bacteria/Proteobacteria/Magnetococcus/Magnetococcus_sp/mgm
    sp_group = "#{species}_sp"
    if @tree[species]
      # case (0): same species as the previous line -> nothing to rename
      unless hier[level + 1] == species
        # case (1-1)
        species = sp_group
        # case (1-2): 'species_sp' is taken by another lineage as well
        species = name if @tree[sp_group] && hier[level + 1] != species
      end
    elsif hier[level] == species
      # case (2)
      species = sp_group
    end

    # 'hier' is an array of the taxonomic tree + species and strain name.
    #  ex. [nil, Eukaryotes, Fungi, Ascomycetes, Saccharomycetes] +
    #      [S_cerevisiae, sce]
    hier[level + 1] = species # sanitize(species)
    hier[level + 2] = org
    ary = hier[1, level + 2]
    warn ary.inspect if $DEBUG
    add_to_tree(ary)
    add_to_leaves(ary)
    add_to_path(ary)
  end

  tree
end

Instance Attribute Details

#leavesObject (readonly)

Returns the value of attribute leaves.



120
121
122
# File 'lib/bio/db/kegg/taxonomy.rb', line 120

# Returns the Hash stored in @leaves, which maps a node name to the Array
# of leaf nodes (organism codes) recorded for it by add_to_leaves.
def leaves
  @leaves
end

#pathObject (readonly)

Returns the value of attribute path.



119
120
121
# File 'lib/bio/db/kegg/taxonomy.rb', line 119

# Returns the Array stored in @path; each element is one full path
# (an Array of node names) appended by add_to_path.
def path
  @path
end

#rootObject

Returns the value of attribute root.



121
122
123
# File 'lib/bio/db/kegg/taxonomy.rb', line 121

# Returns the name used for the root node of the tree (@root), which the
# constructor initially sets to 'Genes'.
def root
  @root
end

#treeObject (readonly)

Returns the value of attribute tree.



118
119
120
# File 'lib/bio/db/kegg/taxonomy.rb', line 118

# Returns the Hash stored in @tree, which maps a parent node name to a Hash
# whose keys are the names of its child nodes.
def tree
  @tree
end

Instance Method Details

#add_to_leaves(ary) ⇒ Object

Adds a new path [node, subnode, subsubnode, …, leaf] under the root node and stores the leaf node in the Array kept for every node on that path.



140
141
142
143
144
145
146
# File 'lib/bio/db/kegg/taxonomy.rb', line 140

# Records the last element of +ary+ (the leaf) under every node named in
# +ary+, creating the per-node Array in @leaves on first use.
# Returns +ary+.
def add_to_leaves(ary)
  organism = ary.last
  ary.each { |name| (@leaves[name] ||= Array.new) << organism }
end

#add_to_path(ary) ⇒ Object

Adds a new path [node, subnode, subsubnode, …, leaf] under the root node and stores the path itself in an Array.



150
151
152
# File 'lib/bio/db/kegg/taxonomy.rb', line 150

# Appends the full path +ary+ to the list of paths kept in @path.
# Returns @path.
def add_to_path(ary)
  @path.push(ary)
end

#add_to_tree(ary) ⇒ Object

Adds a new path [node, subnode, subsubnode, …, leaf] under the root node; every intermediate node stores its child nodes as a Hash.



129
130
131
132
133
134
135
136
# File 'lib/bio/db/kegg/taxonomy.rb', line 129

# Links the nodes of +ary+ into @tree as a chain hanging off @root: each
# node is registered as a key (with a nil value) in the Hash of the node
# that precedes it, the first one under @root.
# Returns +ary+.
def add_to_tree(ary)
  ary.inject(@root) do |parent, node|
    (@tree[parent] ||= Hash.new)[node] = nil
    # the current node becomes the parent of the next element
    node
  end
  ary
end

#compact(node = root) ⇒ Object

Compaction of intermediate nodes of the resulted taxonomic tree.

- If a child node has only one child node (grandchild), make the children of
  that grandchild the children of the child node.
ex.
  Plants / Monocotyledons / grass family / osa
  --> Plants / Monocotyledons / osa


161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# File 'lib/bio/db/kegg/taxonomy.rb', line 161

# Compaction of intermediate nodes of the resulting taxonomic tree.
#
# For every child of +node+: while that child has exactly one child of its
# own (the grandchild) and the grandchild has children too, the child adopts
# the grandchild's children, which removes the grandchild level.  The same
# procedure is then applied recursively to each child.
#
# node:: name of the node to start from (defaults to the root node)
#
# Returns the Array of child names of +node+, or nil when +node+ has no
# children.
def compact(node = root)
  subnodes = @tree[node]
  return unless subnodes

  subnodes.keys.each do |subnode|
    # An explicit loop replaces the former bare `retry`, which is a syntax
    # error outside of a rescue clause on Ruby 1.9 and later.
    while (subsubnodes = @tree[subnode]) && subsubnodes.keys.size == 1
      # obtain the name of the only grandchild node
      subsubnode = subsubnodes.keys.first
      # obtain the children of the grandchild node; stop when it is a leaf
      subsubsubnodes = @tree[subsubnode]
      break unless subsubsubnodes

      # make the children of the grandchild node the children of the child
      @tree[subnode] = subsubsubnodes
      # drop a self reference, if any, so the loop cannot spin on it
      @tree[subnode].delete(subsubnode)
      warn "--- compact: #{subsubnode} is replaced by #{subsubsubnodes}" if $DEBUG
    end
    # repeat recursively
    compact(subnode)
  end
end

#dfs(parent, &block) ⇒ Object

Traverse the taxonomic tree by the depth first search method under the given (root or intermediate) node.



224
225
226
227
228
229
230
231
# File 'lib/bio/db/kegg/taxonomy.rb', line 224

# Depth first traversal starting at +parent+.  For every visited node that
# has an entry in @tree, yields the node name and its Hash of children, then
# descends into each child in key order.  Nodes without children are not
# yielded.
def dfs(parent, &block)
  children = @tree[parent]
  return unless children

  yield parent, children
  children.keys.each { |child| dfs(child, &block) }
end

#dfs_with_level(parent, &block) ⇒ Object

Similar to the dfs method but also passes the current level of the nest to the iterator.



235
236
237
238
239
240
241
242
243
244
245
# File 'lib/bio/db/kegg/taxonomy.rb', line 235

# Similar to #dfs, but also yields the current nesting level (0 for the
# node the traversal was started from) as a third block argument.
#
# The level is tracked in @level across the recursive calls.  It is
# restored in an ensure clause so that an exception or a non-local exit
# from the block does not leave a stale level behind for the next
# traversal.
#
# Returns the value of @level after the traversal of +parent+, or nil when
# +parent+ has no children.
def dfs_with_level(parent, &block)
  @level ||= 0
  children = @tree[parent]
  return unless children

  yield parent, children, @level
  @level += 1
  begin
    children.keys.each { |child| dfs_with_level(child, &block) }
  ensure
    @level -= 1
  end
  @level
end

#organisms(group) ⇒ Object



123
124
125
# File 'lib/bio/db/kegg/taxonomy.rb', line 123

# Returns the entry stored in @leaves for +group+, i.e. the Array of leaf
# nodes (organism codes) collected by add_to_leaves for that node name.
def organisms(group)
  @leaves[group]
end

#reduce(node = root) ⇒ Object

Reduction of the leaf node of the resulted taxonomic tree.

- If the parent node has only one leaf node, replace the parent node
  with the leaf node.
ex.
 Plants / Monocotyledons / osa
 --> Plants / osa


196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# File 'lib/bio/db/kegg/taxonomy.rb', line 196

# Reduction of leaf nodes of the resulting taxonomic tree.
#
# For every child of +node+ whose only child is a leaf (a name with no
# entry in @tree), the leaf is added directly under +node+ and the child is
# removed from +node+.  The procedure then recurses into each original
# child.
#
# node:: name of the node to start from (defaults to the root node)
#
# Returns the Array of child names +node+ had on entry, or nil when +node+
# has no children.
def reduce(node = root)
  children = @tree[node]
  return unless children

  # iterate over a snapshot of the keys: the Hash is modified below
  children.keys.each do |child|
    grandchildren = @tree[child]
    if grandchildren && grandchildren.keys.size == 1
      only = grandchildren.keys.first
      # hoist the grandchild only when it is a leaf node
      unless @tree[only]
        @tree[node].update(grandchildren)
        @tree[node].delete(child)
        warn "--- reduce: #{child} is replaced by #{only}" if $DEBUG
      end
    end
    # repeat recursively
    reduce(child)
  end
end

#to_sObject

Convert the taxonomic tree structure to a simple ascii art.



248
249
250
251
252
253
254
# File 'lib/bio/db/kegg/taxonomy.rb', line 248

def to_s
  result = "#{@root}\n"
  @tree[@root].keys.each do |node|
    result += ascii_tree(node, "  ")
  end
  return result
end