Module: PdfExtract::Kmeans

Defined in:
lib/kmeans.rb

Class Method Summary collapse

Class Method Details

.cluster_centre(cluster) ⇒ Object



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/kmeans.rb', line 20

# Computes the mean point of the items currently assigned to a cluster.
#
# The dimensions that are averaged are the keys of the cluster's current
# centre; any other keys present on the items are ignored.
#
# @param cluster [Hash] a hash with a :centre entry (a Hash whose keys name
#   the dimensions) and an :items entry (an Array of Hash-like items)
# @return [Hash] dimension => Float mean over the items; an empty Hash when
#   the cluster has no items
def self.cluster_centre(cluster)
  dimensions = cluster[:centre].keys
  members = cluster[:items]

  # Accumulate a running total per dimension across every member.
  totals = members.each_with_object({}) do |member, acc|
    dimensions.each do |dim|
      acc[dim] = (acc[dim] || 0) + member[dim]
    end
  end

  # Divide by a Float count so integer totals do not truncate.
  count = members.length.to_f
  totals.each_with_object({}) do |(dim, total), mean|
    mean[dim] = total / count
  end
end

.clusters(items, keys, options = {}) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/kmeans.rb', line 41

# Groups items into clusters with the k-means algorithm.
#
# Each pass assigns every item to the cluster whose centre is nearest
# (by ndist over +keys+), drops clusters that received no items, and then
# moves each remaining centre to the mean of its items (cluster_centre).
# Passes repeat until no centre moves by more than the :delta option.
#
# @param items [Array<Hash>] the items to cluster; each must respond to
#   [] for every entry of +keys+ with a numeric value
# @param keys [Array] the item keys used as dimensions
# @param options [Hash] overrides for the defaults below
# @option options [Integer] :k (10) number of randomly chosen initial
#   centres, used only when :random is truthy
# @option options [Numeric] :delta (0.001) stop once the largest centre
#   movement in a pass is <= this value; 0 means "until centres stop moving"
# @option options [Array<Hash>] :init ([]) explicit initial centre points;
#   these are used in addition to any random centres
# @option options [Boolean] :random (true) whether to add :k centres picked
#   at random from +items+
# @return [Array<Hash>] the surviving clusters, each a hash with :centre
#   (Hash of key => Float) and :items (Array); empty when +items+ is empty
# @raise [ArgumentError] if +items+ is non-empty but there are no initial
#   centres (no :init and either :random is falsy or :k is 0)
def self.clusters(items, keys, options = {})
  options = {
    :k => 10,
    :delta => 0.001,
    :init => [],
    :random => true
  }.merge(options)

  # Nothing to cluster: every candidate cluster would be dropped as empty,
  # and there is no item to draw a random centre from.
  return [] if items.empty?

  cs = options[:init].map { |centre| {:centre => centre, :items => []} }

  # Make k clusters with random centre points
  if options[:random]
    options[:k].times do
      idx = (items.length * rand).to_i
      cs << {:centre => take_keys(items[idx], keys), :items => []}
    end
  end

  if cs.empty?
    raise ArgumentError,
          "no initial centres: supply :init or enable :random with :k > 0"
  end

  loop do
    # Add each item to its nearest cluster (first cluster wins a tie).
    items.each do |item|
      nearest = cs.min_by { |cluster| ndist(item, cluster[:centre], keys) }
      nearest[:items] << item
    end

    # Drop clusters with no items (often because of duplicate
    # initial centre points)
    cs = cs.reject { |cluster| cluster[:items].empty? }

    # Distances are never negative, so 0.0 is the correct floor. Seeding
    # with Float::MIN (a tiny positive number) would make :delta => 0
    # unreachable.
    max_delta = 0.0

    # Recalculate centre points and max delta
    cs.each do |cluster|
      old_centre = cluster[:centre]
      cluster[:centre] = cluster_centre(cluster)
      max_delta = [ndist(old_centre, cluster[:centre], keys), max_delta].max
    end

    break if max_delta <= options[:delta]

    # Not converged yet: clear the assignments for the next pass.
    cs.each { |cluster| cluster[:items] = [] }
  end

  cs
end

.ndist(a, b, keys) ⇒ Object



12
13
14
15
16
17
18
# File 'lib/kmeans.rb', line 12

# Euclidean distance between two points, measured over the given keys only.
#
# @param a [Hash] first point; a[key] must be numeric for every key
# @param b [Hash] second point; b[key] must be numeric for every key
# @param keys [Array] the dimensions to include in the distance
# @return [Float] square root of the summed squared differences
#   (0.0 when +keys+ is empty)
def self.ndist(a, b, keys)
  squared_total = keys.inject(0) do |acc, dim|
    acc + (a[dim] - b[dim]) ** 2
  end
  Math.sqrt(squared_total)
end

.take_keys(item, keys) ⇒ Object



4
5
6
7
8
9
10
# File 'lib/kmeans.rb', line 4

# Builds a new hash holding only the requested keys of an item.
#
# @param item [#[]] the source object, read with item[key]
# @param keys [Array] the keys to copy, in the order they should appear
# @return [Hash] key => item[key] for every entry of +keys+; a key the
#   item lacks maps to whatever item[key] returns (nil for a plain Hash)
def self.take_keys(item, keys)
  keys.each_with_object({}) do |wanted, picked|
    picked[wanted] = item[wanted]
  end
end