Class: Idhja22::Dataset
- Inherits:
-
Object
show all
- Includes:
- TreeMethods
- Defined in:
- lib/idhja22/dataset.rb,
lib/idhja22/dataset/datum.rb,
lib/idhja22/dataset/errors.rb,
lib/idhja22/dataset/tree_methods.rb
Defined Under Namespace
Modules: TreeMethods
Classes: BadData, Datum, Example, InsufficientData, NonUniqueAttributeLabels
Instance Attribute Summary collapse
Class Method Summary
collapse
Instance Method Summary
collapse
#entropy, #partition, #terminating?
Constructor Details
#initialize(data, attr_labels, category_label) ⇒ Dataset
Returns a new instance of Dataset.
30
31
32
33
34
35
|
# File 'lib/idhja22/dataset.rb', line 30
# Builds a dataset from already-assembled data and its labelling metadata.
#
# @param data [Object] stored unchanged in @data
#   (presumably an Array of examples -- confirm against callers)
# @param attr_labels [Array] attribute labels; must not contain duplicates
# @param category_label [Object] label identifying the category
# @raise [NonUniqueAttributeLabels] when attr_labels differs from attr_labels.uniq
def initialize(data, attr_labels, category_label)
  @category_label = category_label

  # Duplicate labels are rejected before the labels and data are stored.
  unless attr_labels == attr_labels.uniq
    raise NonUniqueAttributeLabels, "repeated attributes in #{attr_labels}"
  end

  @attribute_labels = attr_labels
  @data = data
end
|
Instance Attribute Details
#attribute_labels ⇒ Object
Returns the value of attribute attribute_labels.
8
9
10
|
# File 'lib/idhja22/dataset.rb', line 8
# Reader for the attribute labels held by this dataset.
#
# @return [Object] the current value of @attribute_labels
def attribute_labels
@attribute_labels
end
|
#category_label ⇒ Object
Returns the value of attribute category_label.
8
9
10
|
# File 'lib/idhja22/dataset.rb', line 8
# Reader for the category label held by this dataset.
#
# @return [Object] the current value of @category_label
def category_label
@category_label
end
|
#data ⇒ Object
Returns the value of attribute data.
8
9
10
|
# File 'lib/idhja22/dataset.rb', line 8
# Reader for the underlying data collection held by this dataset.
#
# @return [Object] the current value of @data
def data
@data
end
|
Class Method Details
.from_csv(filename) ⇒ Object
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
|
# File 'lib/idhja22/dataset.rb', line 13
# Builds a dataset from a CSV file.
#
# The first row is treated as the header: its last cell becomes the category
# label and the remaining cells become the attribute labels. Every following
# row is wrapped in an Example and appended to the new dataset with <<.
#
# @param filename [String] path handed to CSV.read
# @return [Object] the dataset created by +new+, populated with one Example per row
def from_csv(filename)
  rows = CSV.read(filename)

  # Header row: pop removes the trailing category label in place, leaving
  # only the attribute labels behind.
  attribute_labels = rows.shift
  category_label = attribute_labels.pop

  rows.each_with_object(new([], attribute_labels, category_label)) do |row, dataset|
    dataset << Example.new(row, attribute_labels, category_label)
  end
end
|
Instance Method Details
#category_counts ⇒ Object
37
38
39
40
41
42
43
44
|
# File 'lib/idhja22/dataset.rb', line 37
# Counts how many entries fall under each category.
#
# @return [Hash] category => size of that category's partition; categories
#   that are absent read as 0 because the hash is built with a default of 0
def category_counts
  partition_by_category.each_with_object(Hash.new(0)) do |(category, subset), counts|
    counts[category] = subset.size
  end
end
|
#empty? ⇒ Boolean
50
51
52
|
# File 'lib/idhja22/dataset.rb', line 50
# Tells whether the dataset holds no entries.
#
# @return [Boolean] whatever data.empty? reports
def empty?
  data.empty?
end
|
#m_estimate(prior) ⇒ Object
58
59
60
61
|
# File 'lib/idhja22/dataset.rb', line 58
# Smoothed estimate of the 'Y' category proportion:
#
#   (count('Y') + prior * m) / (size + m)
#
# where m is Idhja22.config.equivalent_sample_size.
#
# @param prior [Numeric, nil] prior probability; when nil (or false) it falls
#   back to Idhja22.config.default_probability
# @return [Float] the smoothed estimate
def m_estimate(prior)
  prior ||= Idhja22.config.default_probability
  sample_size = Idhja22.config.equivalent_sample_size

  weighted_hits = category_counts['Y'] + (prior * sample_size)
  weighted_hits.to_f / (size + sample_size).to_f
end
|
#partition_by_category ⇒ Object
82
83
84
85
86
87
88
89
90
|
# File 'lib/idhja22/dataset.rb', line 82
# Groups the entries of this dataset by their category.
#
# Each group is a fresh instance of this object's class, created with empty
# data and the same attribute labels and category label, and filled via <<.
#
# @return [Hash] category => dataset of the entries in that category. The
#   hash keeps its default block, so reading an unseen category creates and
#   stores an empty dataset under that key.
def partition_by_category
  groups = Hash.new do |by_category, category|
    by_category[category] = self.class.new([], attribute_labels, category_label)
  end

  data.each_with_object(groups) { |datum, acc| acc[datum.category] << datum }
end
|
#probability ⇒ Object
54
55
56
|
# File 'lib/idhja22/dataset.rb', line 54
# Fraction of the dataset that falls in the 'Y' category.
#
# @return [Float] count of 'Y' entries divided by size, both converted to
#   Float first (so a size of zero produces NaN rather than raising)
def probability
  positives = category_counts['Y'].to_f
  positives / size.to_f
end
|
#size ⇒ Object
46
47
48
|
# File 'lib/idhja22/dataset.rb', line 46
# Number of entries in the dataset.
#
# @return [Integer] whatever data.size reports
def size
  data.size
end
|
#split(training_proportion) ⇒ Object
63
64
65
66
67
68
69
70
71
72
73
74
|
# File 'lib/idhja22/dataset.rb', line 63
# Randomly divides the dataset into a training set and a validation set.
#
# The data is shuffled, the first (training_proportion * size).to_i entries
# go to the training set and the remainder to the validation set. Both sets
# are new instances of this object's class sharing the same attribute labels
# and category label.
#
# @param training_proportion [#to_f] share of entries for the training set;
#   must convert to a Float within 0..1
# @return [Array] two-element array: [training_set, validation_set]
# @raise [ArgumentError] if training_proportion is outside 0..1 (or NaN)
def split(training_proportion)
  proportion = training_proportion.to_f

  # Out-of-range proportions used to slip through: a value above 1 made the
  # validation slice start past the end of the array, which Array#[] answers
  # with nil, and a negative value turned into a negative-index slice.
  unless proportion.between?(0.0, 1.0)
    raise ArgumentError,
          "training_proportion must be between 0 and 1, got #{training_proportion.inspect}"
  end

  cutoff_point = (proportion * size).to_i
  shuffled_data = data.shuffle

  training_set = self.class.new(shuffled_data.take(cutoff_point), attribute_labels, category_label)
  validation_set = self.class.new(shuffled_data.drop(cutoff_point), attribute_labels, category_label)
  [training_set, validation_set]
end
|