Class: Idhja22::Dataset

Inherits:
Object
  • Object
show all
Includes:
TreeMethods
Defined in:
lib/idhja22/dataset.rb,
lib/idhja22/dataset/datum.rb,
lib/idhja22/dataset/errors.rb,
lib/idhja22/dataset/tree_methods.rb

Defined Under Namespace

Modules: TreeMethods Classes: BadData, Datum, Example, InsufficientData, NonUniqueAttributeLabels

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from TreeMethods

#entropy, #partition, #terminating?

Constructor Details

#initialize(data, attr_labels, category_label) ⇒ Dataset

Returns a new instance of Dataset.



30
31
32
33
34
35
# File 'lib/idhja22/dataset.rb', line 30

# Builds a dataset around an existing collection of examples.
#
# @param data [Object] the examples; stored as given, without copying
# @param attr_labels [Array] attribute labels; must not contain repeats
# @param category_label [Object] label of the category column
# @raise [NonUniqueAttributeLabels] if attr_labels contains a repeated entry
def initialize(data, attr_labels, category_label)
  @category_label = category_label

  # Comparing against the de-duplicated list detects any repeated label.
  if attr_labels != attr_labels.uniq
    raise NonUniqueAttributeLabels, "repeated attributes in #{attr_labels}"
  end

  @attribute_labels = attr_labels
  @data = data
end

Instance Attribute Details

#attribute_labels ⇒ Object (readonly)

Returns the value of attribute attribute_labels.



8
9
10
# File 'lib/idhja22/dataset.rb', line 8

# Reader for the attribute labels held in @attribute_labels.
#
# @return [Object] the current value of @attribute_labels
#   (presumably an Array of labels, since the constructor calls #uniq on it)
def attribute_labels
  @attribute_labels
end

#category_label ⇒ Object (readonly)

Returns the value of attribute category_label.



8
9
10
# File 'lib/idhja22/dataset.rb', line 8

# Reader for the category label held in @category_label.
#
# @return [Object] the current value of @category_label
def category_label
  @category_label
end

#data ⇒ Object (readonly)

Returns the value of attribute data.



8
9
10
# File 'lib/idhja22/dataset.rb', line 8

# Reader for the underlying example collection held in @data.
#
# The object is returned directly, not copied, so mutating the result mutates
# the dataset's own storage.
#
# @return [Object] the current value of @data
def data
  @data
end

Class Method Details

.from_csv(filename) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/idhja22/dataset.rb', line 13

# Builds a dataset from a CSV file whose first row holds the column labels.
#
# The last entry of the header row becomes the category label and the
# remaining entries become the attribute labels. Each later row is wrapped in
# an Example and appended to the new set with #<<.
#
# @param filename [String] path passed straight to CSV.read
# @return [Object] the set created via +new+, holding one Example per data row
def from_csv(filename)
  header, *rows = CSV.read(filename)

  # pop removes the category label in place, leaving only attribute labels.
  category_label = header.pop
  attribute_labels = header

  rows.each_with_object(new([], attribute_labels, category_label)) do |row, set|
    set << Example.new(row, attribute_labels, category_label)
  end
end

Instance Method Details

#<<(example) ⇒ Object



76
77
78
79
80
# File 'lib/idhja22/dataset.rb', line 76

# Appends an example after checking that its labels match this dataset's.
#
# @param example [Object] must respond to #category_label and #attribute_labels
# @return [Object] whatever +data << example+ returns
# @raise [Idhja22::Dataset::Datum::UnknownCategoryLabel] if the example's
#   category label differs from this dataset's
# @raise [Idhja22::Dataset::Datum::UnknownAttributeLabel] if the example's
#   attribute labels differ from this dataset's
def <<(example)
  if example.category_label != category_label
    raise Idhja22::Dataset::Datum::UnknownCategoryLabel
  end

  if example.attribute_labels != attribute_labels
    raise Idhja22::Dataset::Datum::UnknownAttributeLabel
  end

  data << example
end

#category_counts ⇒ Object



37
38
39
40
41
42
43
44
# File 'lib/idhja22/dataset.rb', line 37

# Counts the examples in each category.
#
# @return [Hash] maps each category found by #partition_by_category to the
#   size of its partition; any other key reads as 0
def category_counts
  partition_by_category.each_with_object(Hash.new(0)) do |(category, subset), tally|
    tally[category] = subset.size
  end
end

#empty? ⇒ Boolean

Returns:

  • (Boolean)


50
51
52
# File 'lib/idhja22/dataset.rb', line 50

# Reports whether the dataset holds no examples.
#
# @return [Boolean] the result of +data.empty?+
def empty?
  data.empty?
end

#m_estimate(prior) ⇒ Object



58
59
60
61
# File 'lib/idhja22/dataset.rb', line 58

# Smoothed estimate of the proportion of 'Y' examples:
#
#   (count('Y') + prior * m) / (size + m)
#
# where m is Idhja22.config.equivalent_sample_size.
#
# @param prior [Numeric, nil] prior probability; when nil or false,
#   Idhja22.config.default_probability is used instead
# @return [Float] the smoothed estimate
def m_estimate(prior)
  prior ||= Idhja22.config.default_probability
  sample_size = Idhja22.config.equivalent_sample_size

  weighted_positives = category_counts['Y'] + (prior * sample_size)
  weighted_total = size + sample_size

  weighted_positives.to_f / weighted_total.to_f
end

#partition_by_category ⇒ Object



82
83
84
85
86
87
88
89
90
# File 'lib/idhja22/dataset.rb', line 82

# Groups the examples into one dataset per category.
#
# @return [Hash] maps each datum's #category to a new instance of this class
#   (same attribute and category labels) holding the data of that category.
#   The hash keeps its default block, so reading an unseen key creates and
#   stores an empty dataset for it.
def partition_by_category
  partitions = Hash.new do |groups, category|
    groups[category] = self.class.new([], attribute_labels, category_label)
  end

  data.each_with_object(partitions) do |datum, groups|
    groups[datum.category] << datum
  end
end

#probability ⇒ Object



54
55
56
# File 'lib/idhja22/dataset.rb', line 54

# Fraction of the examples whose category is 'Y'.
#
# @return [Float] count of 'Y' examples divided by #size; NaN when both are
#   zero, since this is float division
def probability
  positives = category_counts['Y'].to_f
  total = size.to_f

  positives / total
end

#size ⇒ Object



46
47
48
# File 'lib/idhja22/dataset.rb', line 46

# Number of examples in the dataset.
#
# @return [Integer] the result of +data.size+
def size
  data.size
end

#split(training_proportion) ⇒ Object



63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/idhja22/dataset.rb', line 63

# Randomly divides the dataset into a training set and a validation set.
#
# The data is shuffled into a new array (the dataset itself is not modified).
# The first +training_proportion * size+ examples, truncated to an integer,
# go to the training set and the rest to the validation set. Both are new
# instances of this class with the same attribute and category labels.
#
# @param training_proportion [Numeric, #to_f] fraction of examples to train
#   on, normally between 0 and 1. Values outside that range are clamped, so
#   both returned sets always hold an Array.
# @return [Array] a two-element array: [training_set, validation_set]
def split(training_proportion)
  shuffled_data = data.shuffle

  # Clamp so an out-of-range proportion cannot push the slice start outside
  # the array; an out-of-range start would give nil instead of an empty
  # array for one of the two sets.
  cutoff_point = (training_proportion.to_f * size).to_i.clamp(0, size)

  training_set = self.class.new(shuffled_data.take(cutoff_point), attribute_labels, category_label)
  validation_set = self.class.new(shuffled_data.drop(cutoff_point), attribute_labels, category_label)

  [training_set, validation_set]
end