Class: Idhja22::Dataset

Inherits: Object

Inheritance hierarchy: Object → Idhja22::Dataset

Includes: TreeMethods

Defined in: lib/idhja22/dataset.rb,
lib/idhja22/dataset/datum.rb,
lib/idhja22/dataset/errors.rb,
lib/idhja22/dataset/tree_methods.rb

Defined Under Namespace

Modules: TreeMethods

Classes: BadData, Datum, Example, InsufficientData, NonUniqueAttributeLabels

Instance Attribute Summary collapse

#attribute_labels ⇒ Object readonly

Returns the value of attribute attribute_labels.
#category_label ⇒ Object readonly

Returns the value of attribute category_label.
#data ⇒ Object readonly

Returns the value of attribute data.

Class Method Summary collapse

.from_csv(filename) ⇒ Object

Instance Method Summary collapse

#<<(example) ⇒ Object
#category_counts ⇒ Object
#empty? ⇒ Boolean
#initialize(data, attr_labels, category_label) ⇒ Dataset constructor

A new instance of Dataset.
#m_estimate(prior) ⇒ Object
#partition_by_category ⇒ Object
#probability ⇒ Object
#size ⇒ Object
#split(training_proportion) ⇒ Object

Methods included from TreeMethods

#entropy, #partition, #terminating?

Constructor Details

#initialize(data, attr_labels, category_label) ⇒ `Dataset`

Returns a new instance of Dataset.

Raises:

(NonUniqueAttributeLabels)

# File 'lib/idhja22/dataset.rb', line 30

# Builds a dataset around an existing collection of examples.
#
# @param data [Object] collection of examples, stored as given (not copied);
#   the visible callers pass an Array
# @param attr_labels [Array] labels of the attribute columns; must be free of repeats
# @param category_label [Object] label of the category column
# @raise [NonUniqueAttributeLabels] if attr_labels contains the same label more than once
def initialize(data, attr_labels, category_label)
  @category_label = category_label

  # uniq drops repeated labels, so any difference from the input means a duplicate.
  if attr_labels != attr_labels.uniq
    raise NonUniqueAttributeLabels, "repeated attributes in #{attr_labels}"
  end

  @attribute_labels = attr_labels
  @data = data
end

Instance Attribute Details

#attribute_labels ⇒ `Object` (readonly)

Returns the value of attribute attribute_labels.



8
9
10

# File 'lib/idhja22/dataset.rb', line 8

# Reader for the attribute labels held by this dataset.
#
# @return [Object] the value of @attribute_labels, which the constructor
#   assigns from its attr_labels argument
def attribute_labels
  @attribute_labels
end

#category_label ⇒ `Object` (readonly)

Returns the value of attribute category_label.



8
9
10

# File 'lib/idhja22/dataset.rb', line 8

# Reader for the category label held by this dataset.
#
# @return [Object] the value of @category_label, which the constructor
#   assigns from its category_label argument
def category_label
  @category_label
end

#data ⇒ `Object` (readonly)

Returns the value of attribute data.



8
9
10

# File 'lib/idhja22/dataset.rb', line 8

# Reader for the examples held by this dataset.
#
# @return [Object] the value of @data, which the constructor assigns from its
#   data argument; the same object is returned each time, not a copy
def data
  @data
end

Class Method Details

.from_csv(filename) ⇒ `Object`

# File 'lib/idhja22/dataset.rb', line 13

# Loads a dataset from a CSV file whose first row holds the column labels.
#
# The last label of the header row is used as the category label and the
# labels before it as the attribute labels. Every remaining row is wrapped in
# an Example and appended to the new set with <<.
#
# @param filename [String] path handed to CSV.read
# @return [Object] the set built by `new`, holding one Example per data row
def from_csv(filename)
  rows = CSV.read(filename)

  header = rows.shift
  category_label = header.pop
  # pop removed the category label in place, so header now holds only attribute labels.
  attribute_labels = header

  dataset = new([], attribute_labels, category_label)
  rows.each { |row| dataset << Example.new(row, attribute_labels, category_label) }
  dataset
end

Instance Method Details

#<<(example) ⇒ `Object`

Raises:

(Idhja22::Dataset::Datum::UnknownCategoryLabel)
(Idhja22::Dataset::Datum::UnknownAttributeLabel)

# File 'lib/idhja22/dataset.rb', line 76

# Appends an example to the dataset after checking that its labels match.
#
# @param example [Object] object responding to category_label and attribute_labels
# @return [Object] the data collection, as returned by its own <<
# @raise [Idhja22::Dataset::Datum::UnknownCategoryLabel] if the example's
#   category label differs from this set's
# @raise [Idhja22::Dataset::Datum::UnknownAttributeLabel] if the example's
#   attribute labels differ from this set's
def <<(example)
  # The category label is checked first, so it wins when both labels mismatch.
  if example.category_label != category_label
    raise Idhja22::Dataset::Datum::UnknownCategoryLabel
  end

  if example.attribute_labels != attribute_labels
    raise Idhja22::Dataset::Datum::UnknownAttributeLabel
  end

  data << example
end

#category_counts ⇒ `Object`

# File 'lib/idhja22/dataset.rb', line 37

# Counts how many examples fall under each category.
#
# @return [Hash] category => number of examples, taken from the size of each
#   group returned by partition_by_category; categories that are absent read as 0
def category_counts
  partition_by_category.each_with_object(Hash.new(0)) do |(category, subset), tally|
    tally[category] = subset.size
  end
end

#empty? ⇒ `Boolean`

Returns:

(Boolean)



50
51
52

# File 'lib/idhja22/dataset.rb', line 50

# Tells whether the dataset holds no examples.
#
# @return [Boolean] whatever data.empty? reports
def empty?
  data.empty?
end

#m_estimate(prior) ⇒ `Object`

# File 'lib/idhja22/dataset.rb', line 58

# Smoothed estimate of the proportion of 'Y' examples:
#
#   (count of 'Y' + prior * m) / (size + m)
#
# where m is Idhja22.config.equivalent_sample_size.
#
# @param prior [Numeric, nil] prior probability; when nil (or false),
#   Idhja22.config.default_probability is used instead
# @return [Float] the smoothed proportion
def m_estimate(prior)
  prior ||= Idhja22.config.default_probability
  sample_weight = Idhja22.config.equivalent_sample_size

  weighted_positives = category_counts['Y'] + (prior * sample_weight)
  weighted_total = size + sample_weight

  weighted_positives.to_f / weighted_total.to_f
end

#partition_by_category ⇒ `Object`

# File 'lib/idhja22/dataset.rb', line 82

# Groups the examples by their category.
#
# Each group is a new instance of the receiver's class carrying the same
# attribute and category labels, filled via << with the examples of that category.
#
# @return [Hash] category => set of examples in that category. The hash keeps
#   its default block, so reading an unseen category creates and stores an
#   empty set for it.
def partition_by_category
  groups = Hash.new do |by_category, category|
    by_category[category] = self.class.new([], attribute_labels, category_label)
  end

  data.each_with_object(groups) { |datum, by_category| by_category[datum.category] << datum }
end

#probability ⇒ `Object`



54
55
56

# File 'lib/idhja22/dataset.rb', line 54

# Proportion of the examples counted under category 'Y'.
#
# @return [Float] count of 'Y' divided by size. On an empty set this is a
#   float division by zero, which yields NaN when the 'Y' count is 0.
def probability
  positives = category_counts['Y']
  positives.to_f / size.to_f
end

#size ⇒ `Object`



46
47
48

# File 'lib/idhja22/dataset.rb', line 46

# Number of examples in the dataset.
#
# @return [Integer] whatever data.size reports
def size
  data.size
end

#split(training_proportion) ⇒ `Object`

# File 'lib/idhja22/dataset.rb', line 63

# Randomly divides the examples into a training set and a validation set.
#
# The data is shuffled; the first (training_proportion * size).to_i examples
# form the training set and the remaining ones the validation set. Both are
# new instances of the receiver's class with the same attribute and category
# labels. The receiver's own data is not modified.
#
# @param training_proportion [#to_f] fraction of the examples to use for
#   training, between 0 and 1 inclusive
# @return [Array] two elements: the training set, then the validation set
# @raise [ArgumentError] if training_proportion is outside 0..1 (or NaN)
def split(training_proportion)
  proportion = training_proportion.to_f

  # Reject out-of-range values up front: a cutoff past the end of the data makes
  # range slicing return nil (a validation set with nil data), and a negative
  # cutoff silently counts back from the end.
  unless proportion.between?(0.0, 1.0)
    raise ArgumentError, "training_proportion must be between 0 and 1, got #{training_proportion.inspect}"
  end

  shuffled_data = data.shuffle
  cutoff_point = (proportion * size).to_i

  # take/drop always return an Array, including at the 0 and size boundaries.
  training_set = self.class.new(shuffled_data.take(cutoff_point), attribute_labels, category_label)
  validation_set = self.class.new(shuffled_data.drop(cutoff_point), attribute_labels, category_label)

  [training_set, validation_set]
end

Class: Idhja22::Dataset

Defined Under Namespace

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from TreeMethods

Constructor Details

#initialize(data, attr_labels, category_label) ⇒ Dataset

Instance Attribute Details

#attribute_labels ⇒ Object (readonly)

#category_label ⇒ Object (readonly)

#data ⇒ Object (readonly)

Class Method Details

.from_csv(filename) ⇒ Object

Instance Method Details

#<<(example) ⇒ Object

#category_counts ⇒ Object

#empty? ⇒ Boolean

#m_estimate(prior) ⇒ Object

#partition_by_category ⇒ Object

#probability ⇒ Object

#size ⇒ Object

#split(training_proportion) ⇒ Object