Class: Daru::Core::GroupBy
- Extended by:
- Gem::Deprecate
- Defined in:
- lib/daru/core/group_by.rb
Constant Summary collapse
# Comparator used to order group-key tuples (arrays that may contain nils).
#
# * a missing (nil/false) right-hand tuple sorts after everything: returns -1
# * otherwise a missing left-hand tuple returns 1
# * nils are removed from both tuples before comparing
# * tuples of equal compacted length are compared element-wise; if they are
#   not comparable (<=> returns nil) they are treated as equal (0)
# * tuples of different compacted length are ordered by that length
TUPLE_SORTER =
  lambda do |left, right|
    return -1 unless right
    return 1 unless left

    left = left.compact
    right = right.compact
    return (left <=> right) || 0 if left.length == right.length

    left.length <=> right.length
  end
Instance Attribute Summary collapse
-
#group_vectors ⇒ Object
readonly
The group_by was done over the vectors in group_vectors; the remaining vectors are the non_group_vectors.
-
#non_group_vectors ⇒ Object
readonly
The group_by was done over the vectors in group_vectors; the remaining vectors are the non_group_vectors.
Class Method Summary collapse
- .df_from_group_map(df, group_map, remaining_vectors, from_position: true) ⇒ Object
- .get_positions_group_for_aggregation(multi_index, level = -1) ⇒ Object
- .get_positions_group_map_for_df(df, group_by_keys, sort: true) ⇒ Object
- .group_by_index_to_positions(indexes_with_positions, sort: false) ⇒ Object (also: get_positions_group_map_on)
- .group_map_from_positions_to_indexes(positions_group_map, index) ⇒ Object
Instance Method Summary collapse
-
#aggregate(options = {}) ⇒ Daru::DataFrame
Function to use for aggregating the data.
-
#count ⇒ Object
Count groups, excludes missing values.
-
#df ⇒ Object
(also: #grouped_df)
lazy accessor/attr_reader for the attribute df.
-
#each_group ⇒ Object
Iterate over each group created by group_by.
-
#first ⇒ Object
Get the first group.
-
#get_group(group) ⇒ Object
Returns one of the selected groups as a DataFrame.
-
#groups ⇒ Object
(also: #groups_by_idx)
lazy accessor/attr_reader for the attribute groups.
-
#head(quantity = 5) ⇒ Object
Get the top ‘n’ groups.
-
#initialize(context, names) ⇒ GroupBy
constructor
A new instance of GroupBy.
- #inspect ⇒ Object
-
#last ⇒ Object
Get the last group.
-
#max ⇒ Object
Find the max element of each numeric vector group.
-
#mean ⇒ Object
Calculate mean of numeric groups, excluding missing values.
-
#median ⇒ Object
Calculate the median of numeric groups, excluding missing values.
-
#min ⇒ Object
Find the min element of each numeric vector group.
-
#reduce(init = nil) {|block| ... } ⇒ Object
Iteratively applies a function to the values in a group and accumulates the result.
-
#size ⇒ Object
Get a Daru::Vector of the size of each group.
-
#std ⇒ Object
Calculate sample standard deviation of numeric vector groups, excluding missing values.
-
#sum ⇒ Object
Calculate sum of numeric groups, excluding missing values.
-
#tail(quantity = 5) ⇒ Object
Get the bottom ‘n’ groups.
Constructor Details
#initialize(context, names) ⇒ GroupBy
Returns a new instance of GroupBy.
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
# File 'lib/daru/core/group_by.rb', line 99

# Builds a GroupBy over +context+, grouping on the vectors listed in +names+.
#
# @param context the object being grouped; it must respond to #vectors
#   (presumably a Daru::DataFrame -- confirm against callers)
# @param names [Array] names of the vectors to group by
def initialize(context, names)
  @group_vectors = names
  # Every vector of the context that is not a grouping key.
  @non_group_vectors = context.vectors.to_a - names

  @context = context # TODO: maybe rename in @original_df

  # FIXME: It feels like we don't want to sort here. Ruby's #group_by
  # never sorts:
  #
  #   ['test', 'me', 'please'].group_by(&:size)
  #   #  => {4=>["test"], 2=>["me"], 6=>["please"]}
  #
  # - zverok, 2016-09-12
  @groups_by_pos = GroupBy.get_positions_group_map_for_df(@context, @group_vectors, sort: true)
end
Instance Attribute Details
#group_vectors ⇒ Object (readonly)
The group_by was done over the vectors in group_vectors; the remaining vectors are the non_group_vectors
65 66 67 |
# File 'lib/daru/core/group_by.rb', line 65

# The group_by was done over the vectors in group_vectors; the remaining
# vectors are the non_group_vectors.
#
# @return [Object] the stored @group_vectors value
def group_vectors
  @group_vectors
end
#non_group_vectors ⇒ Object (readonly)
The group_by was done over the vectors in group_vectors; the remaining vectors are the non_group_vectors
65 66 67 |
# File 'lib/daru/core/group_by.rb', line 65

# The group_by was done over the vectors in group_vectors; the remaining
# vectors are the non_group_vectors.
#
# @return [Object] the stored @non_group_vectors value
def non_group_vectors
  @non_group_vectors
end
Class Method Details
.df_from_group_map(df, group_map, remaining_vectors, from_position: true) ⇒ Object
48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/daru/core/group_by.rb', line 48

# Builds a data frame whose rows are ordered group by group and whose index
# is a MultiIndex of "group key + original row reference" tuples.
#
# @param df the source frame; must support df[*names].to_df.get_sub_dataframe
# @param group_map [Hash] group key (Array) => Array of row references
#   (positions when +from_position+ is true, otherwise index labels)
# @param remaining_vectors [Array] vectors to keep in the result
# @param from_position [Boolean] forwarded as +by_position:+ to
#   #get_sub_dataframe
# @return [nil] when +group_map+ is empty
# @return [Daru::DataFrame] an empty frame carrying only the new index when
#   +remaining_vectors+ is empty, otherwise the re-ordered sub frame
def df_from_group_map(df, group_map, remaining_vectors, from_position: true)
  return nil if group_map == {}

  new_index = group_map.flat_map { |group, values| values.map { |val| group + [val] } }
  new_index = Daru::MultiIndex.from_tuples(new_index)

  return Daru::DataFrame.new({}, index: new_index) if remaining_vectors == []

  # Flatten exactly one level: the values are arrays of row references, and a
  # row reference may itself be an Array (e.g. a tuple label) that a recursive
  # flatten would break apart.
  new_rows_order = group_map.values.flatten(1)
  new_df = df[*remaining_vectors].to_df.get_sub_dataframe(new_rows_order, by_position: from_position)
  new_df.index = new_index

  new_df
end
.get_positions_group_for_aggregation(multi_index, level = -1) ⇒ Object
26 27 28 29 30 31 32 33 |
# File 'lib/daru/core/group_by.rb', line 26

# Groups the positions of a MultiIndex by its tuples after one layer has
# been removed from a copy of it.
#
# @param multi_index [Daru::MultiIndex] index to group; it is duplicated, so
#   the #remove_layer call is made on the copy
# @param level [Integer] layer passed to #remove_layer (default -1)
# @return [Hash] result of group_by_index_to_positions (unsorted)
# @raise [RuntimeError] if +multi_index+ is not a Daru::MultiIndex
def get_positions_group_for_aggregation(multi_index, level=-1)
  # An explicit message is given on purpose: a bare `raise` carries no
  # explanation and, when called inside a rescue block, would re-raise the
  # unrelated exception currently stored in $!.
  unless multi_index.is_a?(Daru::MultiIndex)
    raise "multi_index must be a Daru::MultiIndex, got #{multi_index.class}"
  end

  new_index = multi_index.dup
  new_index.remove_layer(level) # TODO: recheck code of Daru::MultiIndex#remove_layer

  group_by_index_to_positions(new_index.each_with_index)
end
.get_positions_group_map_for_df(df, group_by_keys, sort: true) ⇒ Object
36 37 38 39 40 |
# File 'lib/daru/core/group_by.rb', line 36

# Builds the "group key => row positions" map for a frame.
#
# Each row of the sub frame made of the +group_by_keys+ vectors is turned
# into an Array (the group key) and paired with its position.
#
# @param df the frame to group; must support df[*keys].to_df.each_row
# @param group_by_keys [Array] names of the vectors to group by
# @param sort [Boolean] forwarded to group_by_index_to_positions
# @return [Hash] result of group_by_index_to_positions
def get_positions_group_map_for_df(df, group_by_keys, sort: true)
  key_rows = df[*group_by_keys].to_df.each_row.map(&:to_a)

  group_by_index_to_positions(key_rows.each_with_index, sort: sort)
end
.group_by_index_to_positions(indexes_with_positions, sort: false) ⇒ Object Also known as: get_positions_group_map_on
8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
# File 'lib/daru/core/group_by.rb', line 8

# Collects, for every distinct index value, the positions at which it occurs.
#
# @param indexes_with_positions [#each] yields (index value, position) pairs
# @param sort [Boolean] when true the keys of the result are ordered with
#   Daru::Core::GroupBy::TUPLE_SORTER; otherwise first-seen order is kept
# @return [Hash] index value => Array of positions, in encounter order
def group_by_index_to_positions(indexes_with_positions, sort: false)
  index_to_positions = {}

  indexes_with_positions.each do |idx, position|
    (index_to_positions[idx] ||= []) << position
  end

  return index_to_positions unless sort

  # TODO: maybe add a more "stable" sorting option?
  sorted_keys = index_to_positions.keys.sort(&Daru::Core::GroupBy::TUPLE_SORTER)
  sorted_keys.map { |k| [k, index_to_positions[k]] }.to_h
end
.group_map_from_positions_to_indexes(positions_group_map, index) ⇒ Object
43 44 45 |
# File 'lib/daru/core/group_by.rb', line 43

# Translates a "group key => positions" map into "group key => index values".
#
# @param positions_group_map [Hash] group key => Array of positions
# @param index [#at] object whose #at(position) returns the index value
#   stored at that position
# @return [Hash] same keys, each position replaced by index.at(position)
def group_map_from_positions_to_indexes(positions_group_map, index)
  positions_group_map.each_with_object({}) do |(key, positions), labels_by_group|
    labels_by_group[key] = positions.map { |pos| index.at(pos) }
  end
end
Instance Method Details
#aggregate(options = {}) ⇒ Daru::DataFrame
Function to use for aggregating the data. `group_by` delegates to Daru::DataFrame#aggregate.
349 350 351 352 353 |
# File 'lib/daru/core/group_by.rb', line 349

# Aggregates the grouped data by delegating to the context's #aggregate.
#
# The block handed to @context.aggregate returns a two-element Array: the
# row positions of every group and the grouped index.
#
# @param options [Hash] forwarded unchanged to @context.aggregate
# @return [Daru::DataFrame] whatever @context.aggregate returns
def aggregate(options={})
  new_index = get_grouped_index

  @context.aggregate(options) { [@groups_by_pos.values, new_index] }
end
#count ⇒ Object
Count groups, excludes missing values.
228 229 230 231 |
# File 'lib/daru/core/group_by.rb', line 228

# Builds a data frame that repeats the #size result once per non-group
# vector, using the non-group vector names as the order.
#
# NOTE(review): the page describes this as "excludes missing values", but the
# code shown here only reuses #size and does no filtering itself -- confirm.
#
# @return [Daru::DataFrame]
def count
  width = @non_group_vectors.size

  # The same #size result object is reused for every column.
  Daru::DataFrame.new([size] * width, order: @non_group_vectors)
end
#df ⇒ Object Also known as: grouped_df
lazy accessor/attr_reader for the attribute df
74 75 76 |
# File 'lib/daru/core/group_by.rb', line 74

# Lazily built, memoized grouped frame.
#
# Computed on first use via GroupBy.df_from_group_map from the context, the
# position group map and the non-group vectors. A nil/false result is not
# cached by `||=`, so it is recomputed on the next call.
#
# @return [Object] result of GroupBy.df_from_group_map
def df
  @df ||= GroupBy.df_from_group_map(@context, @groups_by_pos, @non_group_vectors)
end
#each_group ⇒ Object
Iterate over each group created by group_by. A DataFrame is yielded in block.
81 82 83 84 85 86 87 |
# File 'lib/daru/core/group_by.rb', line 81

# Iterates over each group created by group_by, yielding get_group(key) for
# every key of #groups.
#
# @yield [group] the result of get_group for one group key
# @return [Enumerator] when no block is given
# @return [Array] the list of group keys when a block is given
def each_group
  return to_enum(:each_group) unless block_given?

  # `keys.each` (rather than each_key) keeps the return value an Array of keys.
  groups.keys.each { |key| yield get_group(key) }
end
#first ⇒ Object
Get the first group
124 125 126 |
# File 'lib/daru/core/group_by.rb', line 124

# Get the first group. Shorthand for head(1).
#
# @return [Object] whatever head(1) returns
def first
  head(1)
end
#get_group(group) ⇒ Object
Returns one of the selected groups as a DataFrame.
265 266 267 268 269 270 271 272 273 274 |
# File 'lib/daru/core/group_by.rb', line 265

# Returns one of the selected groups as a DataFrame.
#
# Rows are fetched by their *positions* in the context (taken from
# @groups_by_pos), while the resulting frame is indexed by the corresponding
# index *labels* (taken from groups_by_idx). Using the labels as array
# offsets only works when the index happens to be 0..n-1.
#
# @param group the group key, as found among the keys of #groups
# @return [Daru::DataFrame] built with Daru::DataFrame.rows, keeping the
#   vector order of the context
def get_group(group)
  positions = @groups_by_pos[group]
  labels    = groups_by_idx[group]
  columns   = @context.each_vector.map(&:to_a)

  # Pick only the requested rows instead of transposing the whole frame.
  rows = positions.map { |pos| columns.map { |column| column[pos] } }

  Daru::DataFrame.rows(
    rows, index: labels, order: @context.vectors
  )
end
#groups ⇒ Object Also known as: groups_by_idx
lazy accessor/attr_reader for the attribute groups
68 69 70 |
# File 'lib/daru/core/group_by.rb', line 68

# Lazily built, memoized map of the groups.
#
# Computed on first use by GroupBy.group_map_from_positions_to_indexes from
# the position group map and the index of the context.
#
# @return [Object] result of GroupBy.group_map_from_positions_to_indexes
def groups
  @groups ||= GroupBy.group_map_from_positions_to_indexes(@groups_by_pos, @context.index)
end
#head(quantity = 5) ⇒ Object
Get the top ‘n’ groups
152 153 154 |
# File 'lib/daru/core/group_by.rb', line 152

# Get the top 'n' groups.
#
# @param quantity [Integer] passed to select_groups_from (default 5)
# @return [Object] whatever select_groups_from(:first, quantity) returns
def head(quantity=5)
  select_groups_from :first, quantity
end
#inspect ⇒ Object
309 310 311 |
# File 'lib/daru/core/group_by.rb', line 309 def inspect grouped_df.inspect end |
#last ⇒ Object
Get the last group
129 130 131 |
# File 'lib/daru/core/group_by.rb', line 129

# Get the last group. Shorthand for tail(1).
#
# @return [Object] whatever tail(1) returns
def last
  tail(1)
end
#max ⇒ Object
Find the max element of each numeric vector group.
240 241 242 |
# File 'lib/daru/core/group_by.rb', line 240

# Find the max element of each numeric vector group.
# Delegates to apply_method(:numeric, :max).
#
# @return [Object] whatever apply_method returns
def max
  apply_method :numeric, :max
end
#mean ⇒ Object
Calculate mean of numeric groups, excluding missing values.
196 197 198 |
# File 'lib/daru/core/group_by.rb', line 196

# Calculate mean of numeric groups.
# Delegates to apply_method(:numeric, :mean).
#
# @return [Object] whatever apply_method returns
def mean
  apply_method :numeric, :mean
end
#median ⇒ Object
Calculate the median of numeric groups, excluding missing values.
201 202 203 |
# File 'lib/daru/core/group_by.rb', line 201

# Calculate the median of numeric groups.
# Delegates to apply_method(:numeric, :median).
#
# @return [Object] whatever apply_method returns
def median
  apply_method :numeric, :median
end
#min ⇒ Object
Find the min element of each numeric vector group.
245 246 247 |
# File 'lib/daru/core/group_by.rb', line 245

# Find the min element of each numeric vector group.
# Delegates to apply_method(:numeric, :min).
#
# @return [Object] whatever apply_method returns
def min
  apply_method :numeric, :min
end
#reduce(init = nil) {|block| ... } ⇒ Object
Iteratively applies a function to the values in a group and accumulates the result.
292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 |
# File 'lib/daru/core/group_by.rb', line 292

# Iteratively applies a function to the rows of each group and accumulates
# the result.
#
# For every group the accumulator starts at +init+; the block is then called
# once per row of the group with (accumulator, row) and its return value
# becomes the new accumulator.
#
# The values of groups_by_idx are already index labels, so they are passed
# to @context.row[] directly. Mapping them through @context.index.to_a[label]
# treated labels as positions, which is only an identity for a 0..n-1 index,
# and rebuilt the index array for every row.
#
# @param init initial accumulator value used for every group (default nil)
# @yield [memo, row] current accumulator and the row @context.row[label]
# @yieldreturn the next accumulator value
# @return [Daru::Vector] one accumulated value per group, indexed by
#   get_grouped_index(group keys)
def reduce(init=nil)
  result_hash = groups_by_idx.each_with_object({}) do |(group, labels), results|
    grouped_result = init

    labels.each do |label|
      grouped_result = yield(grouped_result, @context.row[label])
    end

    results[group] = grouped_result
  end

  index = get_grouped_index(result_hash.keys)
  Daru::Vector.new(result_hash.values, index: index)
end
#size ⇒ Object
Get a Daru::Vector of the size of each group.
116 117 118 119 120 121 |
# File 'lib/daru/core/group_by.rb', line 116

# Get a Daru::Vector of the size of each group.
#
# The sizes are the number of row positions recorded for every group in
# @groups_by_pos; the vector is indexed by get_grouped_index and named :size.
#
# @return [Daru::Vector]
def size
  index = get_grouped_index
  values = @groups_by_pos.values.map(&:size)

  Daru::Vector.new(values, index: index, name: :size)
end
#std ⇒ Object
Calculate sample standard deviation of numeric vector groups, excluding missing values.
235 236 237 |
# File 'lib/daru/core/group_by.rb', line 235

# Calculate sample standard deviation of numeric vector groups.
# Delegates to apply_method(:numeric, :std).
#
# @return [Object] whatever apply_method returns
def std
  apply_method :numeric, :std
end
#sum ⇒ Object
Calculate sum of numeric groups, excluding missing values.
206 207 208 |
# File 'lib/daru/core/group_by.rb', line 206

# Calculate sum of numeric groups.
# Delegates to apply_method(:numeric, :sum).
#
# @return [Object] whatever apply_method returns
def sum
  apply_method :numeric, :sum
end
#tail(quantity = 5) ⇒ Object
Get the bottom ‘n’ groups
175 176 177 |
# File 'lib/daru/core/group_by.rb', line 175

# Get the bottom 'n' groups.
#
# @param quantity [Integer] passed to select_groups_from (default 5)
# @return [Object] whatever select_groups_from(:last, quantity) returns
def tail(quantity=5)
  select_groups_from :last, quantity
end