Class: DecisionTree::ID3Tree
- Inherits:
-
Object
- Object
- DecisionTree::ID3Tree
- Defined in:
- lib/decision_tree/id3_tree.rb
Defined Under Namespace
Classes: Node
Instance Method Summary collapse
-
#entropy(p, n) ⇒ Object
Calculates information (entropy) based on probabilities.
-
#entropy_num(p, n) ⇒ Object
Calculates information based on the number of positive and negative classifications.
- #graph(filename) ⇒ Object
-
#id3_continuous(data, attributes, attribute) ⇒ Object
ID3 for binary classification of continuous variables (e.g. healthy / sick based on temperature thresholds).
-
#id3_discrete(data, attributes, attribute) ⇒ Object
ID3 for discrete label cases.
-
#initialize(attributes, data, default, type) ⇒ ID3Tree
constructor
A new instance of ID3Tree.
- #predict(test) ⇒ Object
- #train(data = @data, attributes = @attributes, default = @default) ⇒ Object
Constructor Details
#initialize(attributes, data, default, type) ⇒ ID3Tree
Returns a new instance of ID3Tree.
20 21 22 23 |
# File 'lib/decision_tree/id3_tree.rb', line 20

# Builds a new ID3 tree trainer.
#
# @param attributes [Array] ordered attribute names matching each data row
# @param data [Array<Array>] training rows; the last element of each row is the label
# @param default [Object] classification returned when no data is available
# @param type [Symbol] attribute handling mode, :discrete or :continuous
def initialize(attributes, data, default, type)
  @used       = {}
  @tree       = {}
  @type       = type
  @data       = data
  @attributes = attributes
  @default    = default
end
Instance Method Details
#entropy(p, n) ⇒ Object
Calculates information (entropy) based on probabilities.
87 88 89 90 91 92 93 94 95 96 97 |
# File 'lib/decision_tree/id3_tree.rb', line 87

# Calculates information (entropy, in bits) from two probabilities.
# NaN inputs are coerced to 0; near-zero terms are short-circuited to
# avoid taking the log of zero.
#
# @param p [Float] positive-class probability (NaN treated as 0)
# @param n [Float] negative-class probability (NaN treated as 0)
# @return [Numeric] entropy in bits; 0 when both inputs are ~0
def entropy(p, n)
  p = 0 if p.nan?
  n = 0 if n.nan?
  eps   = 0.00000001
  total = p + n
  # Single -x*log2(x) term, with x normalized by the combined mass.
  term = lambda do |x|
    share = x.to_f / total
    -share * Math.log(share) / Math.log(2.0)
  end
  return 0 if n < eps && p < eps
  return term.call(n) if p < eps
  return term.call(p) if n < eps
  term.call(p) + term.call(n)
end
#entropy_num(p, n) ⇒ Object
Calculates information based on the number of positive and negative classifications.
84 |
# File 'lib/decision_tree/id3_tree.rb', line 84

# Calculates information from raw positive/negative counts by
# normalizing them to probabilities and delegating to #entropy.
#
# @param p [Numeric] count of positive classifications
# @param n [Numeric] count of negative classifications
# @return [Numeric] entropy in bits
def entropy_num(p, n)
  total = p + n
  entropy(p.to_f / total, n.to_f / total)
end
#graph(filename) ⇒ Object
101 102 103 104 |
# File 'lib/decision_tree/id3_tree.rb', line 101

# Renders the trained tree to a PNG image via a DOT graph.
#
# @param filename [String] output path without the ".png" extension
# @return [Object] whatever DotGraphPrinter#write_to_file returns
def graph(filename)
  dgp = DotGraphPrinter.new(build_tree)
  # Bug fix: the caller-supplied filename was previously ignored (the
  # literal string contained a mangled interpolation). Interpolate it
  # so the image is written where the caller asked.
  dgp.write_to_file("#{filename}.png", "png")
end
#id3_continuous(data, attributes, attribute) ⇒ Object
ID3 for binary classification of continuous variables (e.g. healthy / sick based on temperature thresholds)
60 61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/decision_tree/id3_tree.rb', line 60

# ID3 for binary classification of continuous variables
# (e.g. healthy / sick based on temperature thresholds).
#
# Candidate thresholds are the midpoints between adjacent distinct
# values (the largest value pairs with itself, yielding itself).
# Thresholds already consumed for this attribute (@used) are skipped.
#
# @return [Array(Numeric, Numeric), nil] best [gain, threshold] pair,
#   or nil when no candidate thresholds remain
def id3_continuous(data, attributes, attribute)
  col    = attributes.index(attribute)
  sorted = data.map { |row| row[col] }.uniq.sort

  candidates = []
  sorted.each_index do |i|
    upper = sorted[i + 1].nil? ? sorted[i] : sorted[i + 1]
    candidates << (sorted[i] + upper).to_f / 2
  end
  candidates -= @used[attribute] if @used.has_key? attribute

  scored = candidates.map do |threshold|
    above, below = data.partition { |row| row[col] > threshold }
    w_above = above.size.to_f / data.size
    w_below = below.size.to_f / data.size
    # Information gain of splitting at this threshold.
    gain = entropy_num(data.count_p, data.count_n) -
           w_above * entropy_num(above.count_p, above.count_n) -
           w_below * entropy_num(below.count_p, below.count_n)
    [gain, threshold]
  end
  scored.max { |a, b| a[0] <=> b[0] }
end
#id3_discrete(data, attributes, attribute) ⇒ Object
ID3 for discrete label cases
75 76 77 78 79 80 81 |
# File 'lib/decision_tree/id3_tree.rb', line 75

# ID3 for discrete label cases: computes the information gain of
# splitting `data` on `attribute`.
#
# @return [Array(Numeric, Integer)] [gain, column index of the attribute]
def id3_discrete(data, attributes, attribute)
  col      = attributes.index(attribute)
  distinct = data.map { |row| row[col] }.uniq.sort
  subsets  = distinct.map { |v| data.select { |row| row[col] == v } }
  # Weighted entropy remaining after the split.
  remainder = subsets.inject(0) do |acc, subset|
    acc + (subset.size.to_f / data.size) * entropy_num(subset.count_p, subset.count_n)
  end
  [entropy_num(data.count_p, data.count_n) - remainder, col]
end
#predict(test) ⇒ Object
99 |
# File 'lib/decision_tree/id3_tree.rb', line 99

# Classifies a single test row by walking the trained tree,
# dispatching on the tree's attribute type.
#
# @param test [Array] one observation row
# @return [Object] the predicted classification
def predict(test)
  if @type == :discrete
    descend_discrete(@tree, test)
  else
    descend_continuous(@tree, test)
  end
end
#train(data = @data, attributes = @attributes, default = @default) ⇒ Object
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
# File 'lib/decision_tree/id3_tree.rb', line 25 def train(data=@data, attributes=@attributes, default=@default) # Choose a fitness algorithm case @type when :discrete; fitness = proc{|a,b,c| id3_discrete(a,b,c)} when :continuous; fitness = proc{|a,b,c| id3_continuous(a,b,c)} end return default if data.empty? # return classification if all examples have the same classification return data.first.last if data.classification.uniq.size == 1 # Choose best attribute (1. enumerate all attributes / 2. Pick best attribute) performance = attributes.collect { |attribute| fitness.call(data, attributes, attribute) } max = performance.max { |a,b| a[0] <=> b[0] } best = Node.new(attributes[performance.index(max)], max[1], max[0]) @used.has_key?(best.attribute) ? @used[best.attribute] += [best.threshold] : @used[best.attribute] = [best.threshold] tree, l = {best => {}}, ['gt', 'lt'] case @type when :continuous data.partition { |d| d[attributes.index(best.attribute)] > best.threshold }.each_with_index { |examples, i| tree[best][String.new(l[i])] = train(examples, attributes, (data.classification.mode rescue 0), &fitness) } when :discrete values = data.collect { |d| d[attributes.index(best.attribute)] }.uniq.sort partitions = values.collect { |val| data.select { |d| d[attributes.index(best.attribute)] == val } } partitions.each_with_index { |examples, i| tree[best][values[i]] = train(examples, attributes-[values[i]], (data.classification.mode rescue 0), &fitness) } end @tree = tree end |