Class: Clusterer::Algorithms

Inherits:
Object
  • Object
show all
Defined in:
lib/clusterer/algorithms.rb

Class Method Summary collapse

Class Method Details

.bisecting_kmeans(documents, k, options = { }) ⇒ Object



65
66
67
68
69
70
71
72
73
# File 'lib/clusterer/algorithms.rb', line 65

# Builds +k+ clusters by starting from a single cluster holding every
# document and repeatedly replacing the largest cluster with the two
# clusters returned by a 2-way kmeans over its documents.
#
# Note that the 2-way split is called without +options+; they are only
# forwarded to the final refinement pass.
#
# @param documents [Array] the documents to cluster
# @param k [Integer] number of clusters to produce
# @param options [Hash] when +:refined+ is truthy, the bisected clusters are
#   handed to kmeans as +:seeds+ (together with the rest of +options+) and
#   that result is returned instead
# @return [Array] the bisected clusters, or the kmeans result when refined
def bisecting_kmeans(documents, k, options = { })
  partition = [Cluster.new(documents)]
  until partition.size >= k
    biggest = partition.max_by { |cluster| cluster.documents.size }
    partition.delete(biggest)
    partition.concat(kmeans(biggest.documents, 2))
  end
  return partition unless options[:refined]

  kmeans(documents, k, options.merge(:seeds => partition))
end

.hierarchical(documents, k, options = { }) ⇒ Object



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/clusterer/algorithms.rb', line 75

# Agglomerative clustering: every document starts in its own Cluster and the
# highest-scoring pair of clusters is merged until only +k+ clusters remain.
#
# The caller's +options+ hash is never modified.
#
# @param documents [Array] the documents to cluster
# @param k [Integer] number of clusters to stop at; for k < 1 merging stops
#   once a single cluster is left
# @param options [Hash]
#   +:similarity_function+ - name of the method sent to one cluster with the
#   other cluster as argument to score a pair (default +:upgma+);
#   +:refined+ - when truthy, the merged clusters are handed to kmeans as
#   +:seeds+ and that result is returned instead
# @return [Array] the remaining clusters, or the kmeans result when refined
def hierarchical(documents, k, options = { })
  clusters = documents.collect { |d| Cluster.new([d]) }
  sim_fun = options[:similarity_function] || :upgma
  iter = 0
  # A merge needs two clusters, so stop at one even if k asks for fewer.
  while clusters.size > k && clusters.size > 1
    puts "Iteration ....#{iter}"

    # Score every unordered pair exactly once and keep the best one.
    keeper, absorbed = clusters.combination(2).max_by { |a, b| a.send(sim_fun, b) }
    # Remove by identity: a value-based Cluster#== must not drop look-alikes
    # (possibly including the cluster we are about to merge into).
    clusters.delete_if { |c| c.equal?(absorbed) }
    keeper.merge!(absorbed)

    iter += 1
  end
  return clusters unless options[:refined]

  # The cluster-level similarity function is not forwarded, so kmeans falls
  # back to its own default.
  kmeans(documents, k, options.merge(:seeds => clusters, :similarity_function => nil))
end

.kmeans(documents, k, options = { }) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/clusterer/algorithms.rb', line 41

# Iteratively assigns each document to the cluster whose centroid scores
# highest under the similarity function, rebuilding the clusters after each
# pass, until the clusters stop changing (per Cluster#==) or the iteration
# limit is reached.
#
# The +:seeds+ array supplied by the caller is read but never modified; a
# fresh array of clusters is built on every pass.
#
# @param documents [Array] the documents to cluster
# @param k [Integer] number of clusters
# @param options [Hash]
#   +:maximum_iterations+ - upper bound on passes (default 10);
#   +:seeds+ - initial clusters (default: random_cluster_seeds(documents, k));
#   +:similarity_function+ - name of the method sent to a document with a
#   cluster centroid as argument (default +:cosine_similarity+)
# @return [Array] the clusters after the last pass, or the seeds themselves
#   when no pass was run
def kmeans(documents, k, options = { })
  max_iter = options[:maximum_iterations] || 10
  sim_fun = options[:similarity_function] || :cosine_similarity
  clusters = options[:seeds] || random_cluster_seeds(documents, k)
  old_clusters = Array.new(k)

  iter = 0
  while iter < max_iter && clusters != old_clusters
    puts "Iteration ....#{iter}"
    old_clusters = clusters

    # Collect members in fresh arrays so the previous clusters (and any
    # caller-owned seeds array) stay untouched.
    members = Array.new(k) { [] }
    documents.each do |document|
      nearest = (0...k).max_by { |i| document.send(sim_fun, old_clusters[i].centroid) }
      members[nearest] << document
    end

    clusters = members.collect { |docs| Cluster.new(docs) }
    iter += 1
  end
  clusters
end