Class: FlatKit::FieldStats

Inherits:
Object
  • Object
show all
Defined in:
lib/flat_kit/field_stats.rb

Overview

Internal: Collect stats on a single field.

We may not know what the field data type is to start with, so collect a bunch of values until we reach the threshold, and then calculate stats based upon the data type determined by the guess.

Constant Summary collapse

# Default for the guess_threshold: argument of #initialize; #update calls
# resolve_guess once this many values have been buffered.
DEFAULT_GUESS_THRESHOLD =
1000
# Stats set used as the default for the stats_to_collect: argument.
CORE_STATS =
:core
# Stats set whose presence makes #collecting_frequencies? return true.
CARDINALITY_STATS =
:cardinality
# Every stats set #initialize accepts; anything else raises ArgumentError.
ALL_STATS =
[CORE_STATS, CARDINALITY_STATS].freeze
# Method names that #to_hash sends to itself, in this order, to build the
# exported Hash (each name becomes a String key).
EXPORT_FIELDS =
%w[
  name
  type
  count
  max
  mean
  min
  stddev
  sum
  mode
  unique_count

  max_length
  mean_length
  min_length
  stddev_length
  mode_length
  unique_count_lengths

  null_count
  unknown_count
  out_of_type_count
  total_count
  null_percent
  unknown_percent
].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(name:, stats_to_collect: CORE_STATS, type: ::FlatKit::FieldType::GuessType, guess_threshold: DEFAULT_GUESS_THRESHOLD) ⇒ FieldStats

Returns a new instance of FieldStats.

Raises:

  • (ArgumentError)


47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/flat_kit/field_stats.rb', line 47

# Create the stats collector for one field.
#
# name             - The name of the field; returned by #name.
# stats_to_collect - A single stats set or an Array of them. Every entry
#                    must be a member of ALL_STATS (default: CORE_STATS).
# type             - The Class describing the field's type. It must descend
#                    from ::FlatKit::FieldType (default: GuessType, in which
#                    case #update buffers values until the type is resolved).
# guess_threshold  - How many buffered values #update accumulates before it
#                    calls resolve_guess (default: DEFAULT_GUESS_THRESHOLD).
#
# Raises ArgumentError if a stats set is not in ALL_STATS, or if type is not
# a Class descending from ::FlatKit::FieldType.
def initialize(name:, stats_to_collect: CORE_STATS,
               type: ::FlatKit::FieldType::GuessType,
               guess_threshold: DEFAULT_GUESS_THRESHOLD)
  @name              = name
  @field_type        = type
  @guess_threshold   = guess_threshold
  @type_counts       = Hash.new(0)
  @out_of_type_count = 0
  @values            = []
  @stats             = nil
  @length_stats      = nil
  # Accept either a single set or a (possibly nested) list of sets.
  @stats_to_collect  = [stats_to_collect].flatten

  @stats_to_collect.each do |collection_set|
    next if ALL_STATS.include?(collection_set)

    valid_sets = ALL_STATS.map(&:to_s).join(", ")

    raise ArgumentError, "#{collection_set} is not a valid stats collection set, must be one of #{valid_sets}"
  end

  # Module#< is true for every descendant, so indirect subclasses of
  # FieldType are accepted too. It is nil for unrelated classes and false
  # for FieldType itself, both of which fall through to the error below.
  return if type.is_a?(Class) && type < ::FlatKit::FieldType

  raise ArgumentError, "type: must be FieldType subclasses - not #{type}"
end

Instance Attribute Details

#field_typeObject (readonly)

Returns the value of attribute field_type.



45
46
47
# File 'lib/flat_kit/field_stats.rb', line 45

# Returns the value held in @field_type, which #initialize sets from its
# type: argument.
def field_type
  @field_type
end

#nameObject (readonly)

Returns the value of attribute name.



45
46
47
# File 'lib/flat_kit/field_stats.rb', line 45

# Returns the field name given to #initialize via name:.
def name
  @name
end

#out_of_type_countObject (readonly)

Returns the value of attribute out_of_type_count.



45
46
47
# File 'lib/flat_kit/field_stats.rb', line 45

# Returns the @out_of_type_count counter. It starts at 0 in #initialize and
# is added to the stats count by #total_count.
def out_of_type_count
  @out_of_type_count
end

#type_countsObject (readonly)

Returns the value of attribute type_counts.



45
46
47
# File 'lib/flat_kit/field_stats.rb', line 45

# Returns the @type_counts Hash, created in #initialize with a default of 0.
# #null_count and #unknown_count look it up by FieldType class.
def type_counts
  @type_counts
end

Instance Method Details

#collecting_frequencies?Boolean

Returns:

  • (Boolean)


88
89
90
# File 'lib/flat_kit/field_stats.rb', line 88

# Whether the cardinality stats set was requested for this field.
#
# Returns true if CARDINALITY_STATS is among the configured stats sets,
# false otherwise.
def collecting_frequencies?
  @stats_to_collect.any? { |collection_set| collection_set == CARDINALITY_STATS }
end

#countObject



96
97
98
# File 'lib/flat_kit/field_stats.rb', line 96

# Returns whatever the stats object reports as its count.
def count
  collected = stats
  collected.count
end

#field_type_determined?Boolean

Returns:

  • (Boolean)


72
73
74
# File 'lib/flat_kit/field_stats.rb', line 72

# Whether the field type is something other than the GuessType placeholder.
#
# Returns false while @field_type is ::FlatKit::FieldType::GuessType, true
# otherwise.
def field_type_determined?
  guess_type = ::FlatKit::FieldType::GuessType
  @field_type != guess_type
end

#frequenciesObject



132
133
134
# File 'lib/flat_kit/field_stats.rb', line 132

# Returns the stats object's frequencies, or nil when frequencies are not
# being collected.
def frequencies
  return unless collecting_frequencies?

  stats.frequencies
end

#length_frequenciesObject



164
165
166
# File 'lib/flat_kit/field_stats.rb', line 164

# Returns the length stats' frequencies, or nil when there are no length
# stats or frequencies are not being collected.
def length_frequencies
  return unless @length_stats
  return unless collecting_frequencies?

  length_stats.frequencies
end

#maxObject



100
101
102
# File 'lib/flat_kit/field_stats.rb', line 100

# Returns the stats object's max, or nil when the stats object does not
# respond to :max.
def max
  return unless stats.respond_to?(:max)

  stats.max
end

#max_lengthObject



140
141
142
# File 'lib/flat_kit/field_stats.rb', line 140

# Returns the length stats' max, or nil when no length stats exist.
def max_length
  return unless @length_stats

  length_stats.max
end

#meanObject



104
105
106
# File 'lib/flat_kit/field_stats.rb', line 104

# Returns the stats object's mean, or nil when the stats object does not
# respond to :mean.
def mean
  return unless stats.respond_to?(:mean)

  stats.mean
end

#mean_lengthObject



144
145
146
# File 'lib/flat_kit/field_stats.rb', line 144

# Returns the length stats' mean, or nil when no length stats exist.
def mean_length
  return unless @length_stats

  length_stats.mean
end

#minObject



108
109
110
# File 'lib/flat_kit/field_stats.rb', line 108

# Returns the stats object's min, or nil when the stats object does not
# respond to :min.
def min
  return unless stats.respond_to?(:min)

  stats.min
end

#min_lengthObject



136
137
138
# File 'lib/flat_kit/field_stats.rb', line 136

# Returns the length stats' min, or nil when no length stats exist.
def min_length
  return unless @length_stats

  length_stats.min
end

#modeObject



120
121
122
# File 'lib/flat_kit/field_stats.rb', line 120

# Returns the stats object's mode, or nil when frequencies are not being
# collected.
def mode
  return unless collecting_frequencies?

  stats.mode
end

#mode_lengthObject



152
153
154
# File 'lib/flat_kit/field_stats.rb', line 152

# Returns the length stats' mode, or nil when there are no length stats or
# frequencies are not being collected.
def mode_length
  return unless @length_stats
  return unless collecting_frequencies?

  length_stats.mode
end

#null_countObject



168
169
170
# File 'lib/flat_kit/field_stats.rb', line 168

# Returns the entry of #type_counts stored under FieldType::NullType.
def null_count
  null_type = FieldType::NullType
  type_counts[null_type]
end

#null_percentObject



176
177
178
179
180
# File 'lib/flat_kit/field_stats.rb', line 176

# Share of #total_count made up by #null_count, as a percentage.
#
# Returns the Integer 0 when #total_count is zero; otherwise a Float
# truncated (not rounded) to two decimal places.
def null_percent
  if total_count.zero?
    0
  else
    fraction = null_count.to_f / total_count
    (fraction * 100.0).truncate(2)
  end
end

#stddevObject



112
113
114
# File 'lib/flat_kit/field_stats.rb', line 112

# Returns the stats object's stddev, or nil when the stats object does not
# respond to :stddev.
def stddev
  return unless stats.respond_to?(:stddev)

  stats.stddev
end

#stddev_lengthObject



148
149
150
# File 'lib/flat_kit/field_stats.rb', line 148

# Returns the length stats' stddev, or nil when no length stats exist.
def stddev_length
  return unless @length_stats

  length_stats.stddev
end

#sumObject



116
117
118
# File 'lib/flat_kit/field_stats.rb', line 116

# Returns the stats object's sum, or nil when the stats object does not
# respond to :sum.
def sum
  return unless stats.respond_to?(:sum)

  stats.sum
end

#to_hashObject



192
193
194
195
196
197
198
199
200
# File 'lib/flat_kit/field_stats.rb', line 192

# Export the field's stats as a Hash.
#
# Calls resolve_guess first, then sends every name in EXPORT_FIELDS to self
# and stores the result under that name, preserving EXPORT_FIELDS order.
#
# Returns a Hash keyed by the EXPORT_FIELDS strings.
def to_hash
  resolve_guess

  EXPORT_FIELDS.each_with_object({}) do |field, exported|
    exported[field] = send(field)
  end
end

#total_countObject



172
173
174
# File 'lib/flat_kit/field_stats.rb', line 172

# Returns the stats object's count plus the out-of-type counter.
def total_count
  in_type = stats.count
  in_type + @out_of_type_count
end

#typeObject



92
93
94
# File 'lib/flat_kit/field_stats.rb', line 92

# Returns the type_name reported by the current field type.
def type
  current_type = @field_type
  current_type.type_name
end

#unique_countObject



124
125
126
# File 'lib/flat_kit/field_stats.rb', line 124

# Returns the stats object's unique_count, or nil when frequencies are not
# being collected.
def unique_count
  return unless collecting_frequencies?

  stats.unique_count
end

#unique_count_lengthsObject



156
157
158
# File 'lib/flat_kit/field_stats.rb', line 156

# Returns the length stats' unique_count, or nil when there are no length
# stats or frequencies are not being collected.
def unique_count_lengths
  return unless @length_stats
  return unless collecting_frequencies?

  length_stats.unique_count
end

#unique_valuesObject



128
129
130
# File 'lib/flat_kit/field_stats.rb', line 128

# Returns the stats object's unique_values, or nil when frequencies are not
# being collected.
def unique_values
  return unless collecting_frequencies?

  stats.unique_values
end

#unique_values_lengthsObject



160
161
162
# File 'lib/flat_kit/field_stats.rb', line 160

# Returns the length stats' unique_values, or nil when there are no length
# stats or frequencies are not being collected.
def unique_values_lengths
  return unless @length_stats
  return unless collecting_frequencies?

  length_stats.unique_values
end

#unknown_countObject



182
183
184
# File 'lib/flat_kit/field_stats.rb', line 182

# Returns the entry of #type_counts stored under FieldType::UnknownType.
def unknown_count
  unknown_type = FieldType::UnknownType
  type_counts[unknown_type]
end

#unknown_percentObject



186
187
188
189
190
# File 'lib/flat_kit/field_stats.rb', line 186

# Share of #total_count made up by #unknown_count, as a percentage.
#
# Returns the Integer 0 when #total_count is zero; otherwise a Float
# truncated (not rounded) to two decimal places.
def unknown_percent
  if total_count.zero?
    0
  else
    fraction = unknown_count.to_f / total_count
    (fraction * 100.0).truncate(2)
  end
end

#update(value) ⇒ Object



76
77
78
79
80
81
82
83
84
85
86
# File 'lib/flat_kit/field_stats.rb', line 76

# Feed one value into the collector.
#
# The type count is always updated. Once the field type is determined the
# value goes straight to update_stats; until then it is buffered in @values,
# and resolve_guess runs as soon as the buffer reaches @guess_threshold.
#
# value - the value observed for this field.
#
# Returns the result of update_stats or resolve_guess, or nil when the
# value was only buffered.
def update(value)
  update_type_count(value)
  return update_stats(value) if field_type_determined?

  @values << value
  resolve_guess if @values.size >= @guess_threshold
end