Class: GeneValidator::Cluster

Inherits:
Object
  • Object
show all
Defined in:
lib/genevalidator/clusterization.rb

Overview

Stores the values belonging to one cluster. Used for clustering a vector of values.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(lengths) ⇒ Cluster



157
158
159
# File 'lib/genevalidator/clusterization.rb', line 157

# Builds a cluster around the given collection of lengths.
# Params:
# +lengths+: the cluster contents; stored as-is in @lengths (no copy is made).
#            Documented on the attribute as a hash map of (length, occurrences) pairs.
def initialize(lengths)
  @lengths = lengths
end

Instance Attribute Details

#lengths ⇒ Object

A hash map of (length, number of occurrences) pairs.



155
156
157
# File 'lib/genevalidator/clusterization.rb', line 155

# Reader for the cluster contents held in @lengths.
# Returns the very object that was stored, not a copy.
def lengths
  @lengths
end

Instance Method Details

#add(cluster) ⇒ Object

Merges the cluster given as parameter into the current cluster. Params: cluster: the Cluster object to merge in.



264
265
266
267
268
# File 'lib/genevalidator/clusterization.rb', line 264

# Merges the given cluster into the current one (the receiver is modified in place).
# Params:
# +cluster+: the Cluster object whose (length, occurrences) pairs are merged in
# Output:
# the +lengths+ collection of the cluster that was passed in
def add(cluster)
  cluster.lengths.each do |length, count|
    # Add to any count already recorded for this length instead of replacing
    # it, so a length present in both clusters keeps all of its occurrences.
    lengths[length] = lengths.fetch(length, 0) + count
  end
end

#density ⇒ Object

Returns the density of the cluster: how many values it contains



176
177
178
179
180
181
182
# File 'lib/genevalidator/clusterization.rb', line 176

# Returns the density of the cluster, i.e. the total number of occurrences
# recorded across all of its lengths.
# Output:
# the sum of the occurrence counts (0 for an empty cluster)
def density
  lengths.reduce(0) { |total, (_length, count)| total + count }
end

#deviation(clusters, queryLength) ⇒ Object

Returns the deviation of a value from the values in all clusters. Params: clusters: a list of Cluster objects; queryLength: the reference length, a number (the cluster mean is subtracted from it). Output: real number.



252
253
254
255
256
257
258
259
# File 'lib/genevalidator/clusterization.rb', line 252

# Returns how far a query length lies from the mean of this cluster, expressed
# in units of the standard deviation of the values pooled from all clusters.
# Params:
# +clusters+: a list of Cluster objects whose values are pooled
# +queryLength+: the numeric length that is compared against the cluster mean
# Output:
# Real number
def deviation(clusters, queryLength)
  # Expand every (length, occurrences) pair into that many copies of the length.
  hits = clusters.flat_map do |c|
    c.lengths.flat_map { |length, count| Array.new(count, length) }
  end

  # The spread comes from standard_deviation alone. The earlier round-trip
  # through R produced a value that was overwritten before being used, so it
  # only added a second expansion of the hits and a dependency on an R session.
  sd = standard_deviation(hits)
  (queryLength - mean).abs / sd
end

#distance(cluster, method = 0) ⇒ Object

Returns the Euclidean distance between the current cluster and the one given as parameter. Params: cluster: Cluster object; method: 0 or 1. method = 0: do not take duplicate values into consideration; method = 1: average linkage clusterization.



190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# File 'lib/genevalidator/clusterization.rb', line 190

# Returns the group average distance between the current cluster and the one
# given as parameter, rounded to 4 decimals.
# Params:
# +cluster+: Cluster object
# +method+: 1 weights every pair of lengths by the product of their occurrence
#           counts (average linkage); any other value counts each pair once
# Output:
# Real number
def distance(cluster, method = 0)
  theirs = cluster.lengths
  weighted = method == 1

  total = 0
  # Unweighted: the normaliser is simply the number of (theirs, ours) pairs.
  # Weighted: it is accumulated below from the pair weights.
  norm = weighted ? 0 : theirs.length * lengths.length

  theirs.each do |their_length, their_count|
    lengths.each do |own_length, own_count|
      gap = (their_length - own_length).abs
      if weighted
        pair_weight = their_count * own_count
        total += pair_weight * gap
        norm += pair_weight
      else
        total += gap
      end
    end
  end

  # Adding 0.0 forces floating-point division.
  (total / (norm + 0.0)).round(4)
end

#get_limits ⇒ Object

Returns the interval limits of the current cluster



282
283
284
# File 'lib/genevalidator/clusterization.rb', line 282

# Returns the interval limits of the current cluster.
# Output:
# a two-element array [smallest length, largest length]
def get_limits
  observed = lengths.map { |length, _count| length }
  observed.minmax
end

#inside_cluster(value) ⇒ Object

Returns whether the value is inside the cluster Params: value: value to compare Output: :ok or :shorter or :longer



292
293
294
295
296
297
298
299
300
# File 'lib/genevalidator/clusterization.rb', line 292

# Reports where a value falls relative to the interval covered by the cluster.
# Params:
# +value+: value to compare
# Output:
# :ok when the value lies within the limits (both limits included),
# :shorter when it is below the lower limit,
# :longer when it is above the upper limit
def inside_cluster(value)
  left, right = get_limits

  # Each outcome must be returned explicitly: a bare `:ok if ...` statement is
  # evaluated and then discarded, leaving only the last line as the result.
  return :shorter if left > value
  return :longer if right < value

  :ok
end

#mean ⇒ Object

Returns the weighted mean value of the cluster



163
164
165
166
167
168
169
170
171
172
# File 'lib/genevalidator/clusterization.rb', line 163

# Returns the weighted mean value of the cluster: every length contributes
# once per recorded occurrence.
# Output:
# Real number
# Raises:
# ZeroDivisionError when the cluster holds no occurrences
def mean
  total = 0
  weight = 0

  lengths.each do |length, n|
    total += length * n
    weight += n
  end

  raise ZeroDivisionError, 'divided by 0' if weight.zero?

  # fdiv divides in floating point; `/` on two integers would truncate the
  # mean (3 / 2 => 1) and skew everything computed from it.
  total.fdiv(weight)
end

#print ⇒ Object

Prints the current cluster



272
273
274
275
276
277
278
# File 'lib/genevalidator/clusterization.rb', line 272

# Prints the current cluster to standard error: a summary line with the mean
# and density, one "length, occurrences" line per entry in ascending order,
# and a closing separator.
def print
  summary = "Cluster: mean = #{mean}, density = #{density}"
  $stderr.puts summary
  lengths.sort.each { |length, count| $stderr.puts "#{length}, #{count}" }
  $stderr.puts '--------------------------'
end

#standard_deviation(lengths = nil) ⇒ Object

Returns the standard deviation of a set of values Params: lengths: a vector of values (optional, by default it takes the values in the cluster) Output: Real number



232
233
234
235
236
237
238
239
240
241
242
243
# File 'lib/genevalidator/clusterization.rb', line 232

# Returns the sample standard deviation (n - 1 denominator) of a set of
# values, measured around the mean of this cluster.
# Params:
# +lengths+: a vector of values (optional; by default the cluster contents are
#            expanded into one value per occurrence)
# Output:
# Real number
def standard_deviation(lengths = nil)
  values =
    if lengths.nil?
      @lengths.flat_map { |length, count| Array.new(count, length) }
    else
      lengths
    end

  centre = mean
  squared_sum = values.reduce(0) do |acc, value|
    offset = centre - value
    acc + offset * offset
  end

  Math.sqrt(squared_sum.to_f / (values.length - 1))
end

#wss(lengths = nil) ⇒ Object

Returns within cluster sum of squares



213
214
215
216
217
218
219
220
221
222
223
224
# File 'lib/genevalidator/clusterization.rb', line 213

# Returns the within-cluster sum of squares: the sum of the squared
# differences between each value and the mean of this cluster.
# Params:
# +lengths+: a vector of values (optional; by default the cluster contents are
#            expanded into one value per occurrence)
# Output:
# the sum of squares (0 when there are no values)
def wss(lengths = nil)
  values = lengths
  values = @lengths.flat_map { |length, count| Array.new(count, length) } if values.nil?

  centre = mean
  values.inject(0) do |sum_of_squares, value|
    offset = centre - value
    sum_of_squares + offset * offset
  end
end