Class: Daru::Core::GroupBy
Constant Summary collapse
# Comparator used to order group-key tuples.
#
# * a missing right operand makes the left one sort first (-1);
# * a missing left operand makes it sort last (1);
# * otherwise nils are stripped from both tuples: results of different
#   length are ordered by length, results of equal length are compared
#   with Array#<=>, and incomparable tuples are treated as equal (0).
TUPLE_SORTER =
  lambda do |left, right|
    return -1 unless right
    return 1 unless left

    lhs = left.compact
    rhs = right.compact
    return lhs.length <=> rhs.length unless lhs.length == rhs.length

    (lhs <=> rhs) || 0
  end
Instance Attribute Summary collapse
-
#df ⇒ Object
readonly
Returns the value of attribute df.
-
#groups ⇒ Object
readonly
Returns the value of attribute groups.
Class Method Summary collapse
- .df_from_group_map(df, group_map, remaining_vectors, from_position: true) ⇒ Object
- .get_positions_group_for_aggregation(multi_index, level = -1) ⇒ Object
- .get_positions_group_map_for_df(df, group_by_keys, sort: true) ⇒ Object
- .get_positions_group_map_on(indexes_with_positions, sort: false) ⇒ Object
- .group_map_from_positions_to_indexes(positions_group_map, index) ⇒ Object
Instance Method Summary collapse
-
#aggregate(options = {}) ⇒ Daru::DataFrame
Function to use for aggregating the data.
-
#count ⇒ Object
Count groups, excludes missing values.
-
#each_group ⇒ Object
Iterate over each group created by group_by.
-
#first ⇒ Object
Get the first group.
-
#get_group(group) ⇒ Object
Returns one of the selected groups as a DataFrame.
-
#head(quantity = 5) ⇒ Object
Get the top ‘n’ groups.
-
#initialize(context, names) ⇒ GroupBy
constructor
A new instance of GroupBy.
- #inspect ⇒ Object
-
#last ⇒ Object
Get the last group.
-
#max ⇒ Object
Find the max element of each numeric vector group.
-
#mean ⇒ Object
Calculate mean of numeric groups, excluding missing values.
-
#median ⇒ Object
Calculate the median of numeric groups, excluding missing values.
-
#min ⇒ Object
Find the min element of each numeric vector group.
-
#reduce(init = nil) {|block| ... } ⇒ Object
Iteratively applies a function to the values in a group and accumulates the result.
-
#size ⇒ Object
Get a Daru::Vector of the size of each group.
-
#std ⇒ Object
Calculate sample standard deviation of numeric vector groups, excluding missing values.
-
#sum ⇒ Object
Calculate sum of numeric groups, excluding missing values.
-
#tail(quantity = 5) ⇒ Object
Get the bottom ‘n’ groups.
Constructor Details
#initialize(context, names) ⇒ GroupBy
Returns a new instance of GroupBy.
77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# File 'lib/daru/core/group_by.rb', line 77

# Builds the grouping of +context+ by the vectors listed in +names+.
#
# @param context the object being grouped; it must respond to #vectors and
#   #index (presumably a Daru::DataFrame — confirm against callers)
# @param names [Array] names of the vectors to group by; every other vector
#   of +context+ is kept as a non-group vector
def initialize(context, names)
  @context = context
  @non_group_vectors = context.vectors.to_a - names

  # FIXME: It feels like we don't want to sort here. Ruby's #group_by
  # never sorts:
  #
  #   ['test', 'me', 'please'].group_by(&:size)
  #   # => {4=>["test"], 2=>["me"], 6=>["please"]}
  #
  # - zverok, 2016-09-12
  position_map = GroupBy.get_positions_group_map_for_df(@context, names, sort: true)

  # @groups maps each group key to index entries, @df is the regrouped frame.
  @groups = GroupBy.group_map_from_positions_to_indexes(position_map, @context.index)
  @df = GroupBy.df_from_group_map(@context, position_map, @non_group_vectors)
end
Instance Attribute Details
#df ⇒ Object (readonly)
Returns the value of attribute df.
55 56 57 |
# File 'lib/daru/core/group_by.rb', line 55

# Read-only accessor for the regrouped frame stored in @df by the constructor.
#
# @return [Object] the value of @df
def df
  @df
end
#groups ⇒ Object (readonly)
Returns the value of attribute groups.
55 56 57 |
# File 'lib/daru/core/group_by.rb', line 55

# Read-only accessor for the group map stored in @groups by the constructor.
#
# @return [Object] the value of @groups
def groups
  @groups
end
Class Method Details
.df_from_group_map(df, group_map, remaining_vectors, from_position: true) ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/daru/core/group_by.rb', line 39

# Builds a frame whose rows follow the order of +group_map+ and whose index
# is a Daru::MultiIndex made of "group key + row entry" tuples.
#
# @param df source frame; indexed with +remaining_vectors+
# @param group_map [Hash] group key (Array) => list of row entries
# @param remaining_vectors [Array] vectors to keep in the result
# @param from_position [Boolean] forwarded as +by_position:+ to
#   #get_sub_dataframe
# @return [nil] when +group_map+ is empty
# @return [Daru::DataFrame] otherwise (an empty frame carrying only the new
#   index when +remaining_vectors+ is empty)
def df_from_group_map(df, group_map, remaining_vectors, from_position: true)
  return nil if group_map == {}

  tuples = group_map.flat_map do |group, values|
    values.map { |val| group + [val] }
  end
  multi_index = Daru::MultiIndex.from_tuples(tuples)
  return Daru::DataFrame.new({}, index: multi_index) if remaining_vectors == []

  row_order = group_map.values.flatten
  df[*remaining_vectors]
    .to_df
    .get_sub_dataframe(row_order, by_position: from_position)
    .tap { |sub_frame| sub_frame.index = multi_index }
end
.get_positions_group_for_aggregation(multi_index, level = -1) ⇒ Object
20 21 22 23 24 25 26 27 |
# File 'lib/daru/core/group_by.rb', line 20

# Groups the positions of +multi_index+ entries after dropping one layer.
#
# @param multi_index [Daru::MultiIndex] index to regroup; it is duplicated
#   before the layer is removed
# @param level [Integer] layer handed to Daru::MultiIndex#remove_layer
#   (defaults to -1)
# @return [Hash] the result of get_positions_group_map_on for the reduced index
# @raise [RuntimeError] when +multi_index+ is not a Daru::MultiIndex
def get_positions_group_for_aggregation(multi_index, level=-1)
  # Same exception class as the former bare `raise`, but with a message that
  # tells the caller what went wrong.
  unless multi_index.is_a?(Daru::MultiIndex)
    raise "expected a Daru::MultiIndex, got #{multi_index.class}"
  end

  new_index = multi_index.dup
  new_index.remove_layer(level) # TODO: recheck code of Daru::MultiIndex#remove_layer

  get_positions_group_map_on(new_index.each_with_index)
end
.get_positions_group_map_for_df(df, group_by_keys, sort: true) ⇒ Object
29 30 31 32 33 |
# File 'lib/daru/core/group_by.rb', line 29

# Groups row positions of +df+ by the values found in the +group_by_keys+
# vectors.
#
# @param df frame to read; indexed with +group_by_keys+
# @param group_by_keys [Array] names of the vectors forming the group key
# @param sort [Boolean] forwarded to get_positions_group_map_on
# @return [Hash] key tuple (Array) => list of row positions
def get_positions_group_map_for_df(df, group_by_keys, sort: true)
  key_rows = df[*group_by_keys].to_df.each_row.map(&:to_a)
  get_positions_group_map_on(key_rows.each_with_index, sort: sort)
end
.get_positions_group_map_on(indexes_with_positions, sort: false) ⇒ Object
5 6 7 8 9 10 11 12 13 14 15 16 17 18 |
# File 'lib/daru/core/group_by.rb', line 5

# Collects positions under their key, preserving first-seen key order unless
# sorting is requested.
#
# @param indexes_with_positions [Enumerable] yields (key, position) pairs
# @param sort [Boolean] when true, keys are reordered with
#   Daru::Core::GroupBy::TUPLE_SORTER
# @return [Hash] key => list of positions
def get_positions_group_map_on(indexes_with_positions, sort: false)
  positions_by_key = indexes_with_positions.each_with_object({}) do |(key, pos), acc|
    (acc[key] ||= []) << pos
  end
  return positions_by_key unless sort

  # TODO: maybe add a more "stable" sorting option?
  ordered_keys = positions_by_key.keys.sort(&Daru::Core::GroupBy::TUPLE_SORTER)
  ordered_keys.each_with_object({}) do |key, ordered|
    ordered[key] = positions_by_key[key]
  end
end
.group_map_from_positions_to_indexes(positions_group_map, index) ⇒ Object
35 36 37 |
# File 'lib/daru/core/group_by.rb', line 35

# Translates every position in +positions_group_map+ into the entry that
# +index+ holds at that position (via +index.at+).
#
# @param positions_group_map [Hash] key => list of positions
# @param index object responding to #at(position)
# @return [Hash] key => list of index entries, same key order as the input
def group_map_from_positions_to_indexes(positions_group_map, index)
  positions_group_map.each_with_object({}) do |(key, positions), translated|
    translated[key] = positions.map { |pos| index.at(pos) }
  end
end
Instance Method Details
#aggregate(options = {}) ⇒ Daru::DataFrame
Function to use for aggregating the data. `group_by` delegates to Daru::DataFrame#aggregate.
337 338 339 |
# File 'lib/daru/core/group_by.rb', line 337

# Aggregates the grouped data by delegating to @df.aggregate.
#
# @param options [Hash] aggregation options, forwarded unchanged
# @return [Daru::DataFrame] whatever @df.aggregate returns
def aggregate(options={})
  @df.aggregate(options)
end
#count ⇒ Object
Count groups, excludes missing values.
211 212 213 214 |
# File 'lib/daru/core/group_by.rb', line 211

# Builds a frame with one column per non-group vector, each column being the
# per-group size vector returned by #size.
#
# NOTE(review): the summary says missing values are excluded, but nothing in
# this body filters them — confirm the intended behaviour.
#
# @return [Daru::DataFrame]
def count
  group_sizes = size
  columns = Array.new(@non_group_vectors.size, group_sizes)
  Daru::DataFrame.new(columns, order: @non_group_vectors)
end
#each_group ⇒ Object
Iterate over each group created by group_by. A DataFrame is yielded in block.
59 60 61 62 63 64 65 |
# File 'lib/daru/core/group_by.rb', line 59

# Iterates over every group, yielding the result of #get_group for each key.
#
# @yield [group] the frame for one group
# @return [Enumerator] when called without a block
def each_group
  if block_given?
    groups.keys.each { |key| yield get_group(key) }
  else
    to_enum(:each_group)
  end
end
#first ⇒ Object
Get the first group
107 108 109 |
# File 'lib/daru/core/group_by.rb', line 107

# Gets the first group; shorthand for head(1).
def first
  head(1)
end
#get_group(group) ⇒ Object
Returns one of the selected groups as a DataFrame.
248 249 250 251 252 253 254 255 256 257 |
# File 'lib/daru/core/group_by.rb', line 248

# Returns one of the selected groups as a DataFrame.
#
# @param group [Array] key of the group, as found in #groups
# @return [Daru::DataFrame] rows of the original frame belonging to +group+,
#   indexed by their original index entries
def get_group(group)
  indexes = @groups[group]
  row_values = @context.each_vector.map(&:to_a).transpose

  # @groups stores index *entries* (see the constructor), while row_values is
  # addressed by position, so translate entry -> position first. Entries that
  # are not found fall back to being used as a position, which is exactly the
  # previous behaviour (and is what a default 0..n-1 index yields anyway).
  position_of = @context.index.to_a.each_with_index.to_h
  rows = indexes.map { |idx| row_values[position_of.fetch(idx, idx)] }

  Daru::DataFrame.rows(rows, index: indexes, order: @context.vectors)
end
#head(quantity = 5) ⇒ Object
Get the top ‘n’ groups
135 136 137 |
# File 'lib/daru/core/group_by.rb', line 135

# Gets the top +quantity+ groups by delegating to select_groups_from with
# :first.
#
# @param quantity [Integer] how many to take (default 5)
def head(quantity=5)
  select_groups_from(:first, quantity)
end
#inspect ⇒ Object
297 298 299 |
# File 'lib/daru/core/group_by.rb', line 297

# Human-readable representation: the #inspect output of the regrouped frame
# held in @df.
def inspect
  @df.inspect
end
#last ⇒ Object
Get the last group
112 113 114 |
# File 'lib/daru/core/group_by.rb', line 112

# Gets the last group; shorthand for tail(1).
def last
  tail(1)
end
#max ⇒ Object
Find the max element of each numeric vector group.
223 224 225 |
# File 'lib/daru/core/group_by.rb', line 223

# Finds the max element of each numeric vector group; delegates to
# apply_method with :numeric and :max.
def max
  apply_method(:numeric, :max)
end
#mean ⇒ Object
Calculate mean of numeric groups, excluding missing values.
179 180 181 |
# File 'lib/daru/core/group_by.rb', line 179

# Calculates the mean of numeric groups; delegates to apply_method with
# :numeric and :mean.
def mean
  apply_method(:numeric, :mean)
end
#median ⇒ Object
Calculate the median of numeric groups, excluding missing values.
184 185 186 |
# File 'lib/daru/core/group_by.rb', line 184

# Calculates the median of numeric groups; delegates to apply_method with
# :numeric and :median.
def median
  apply_method(:numeric, :median)
end
#min ⇒ Object
Find the min element of each numeric vector group.
228 229 230 |
# File 'lib/daru/core/group_by.rb', line 228

# Finds the min element of each numeric vector group; delegates to
# apply_method with :numeric and :min.
def min
  apply_method(:numeric, :min)
end
#reduce(init = nil) {|block| ... } ⇒ Object
Iteratively applies a function to the values in a group and accumulates the result.
275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 |
# File 'lib/daru/core/group_by.rb', line 275

# Iteratively applies a function to the rows of each group and accumulates
# the result.
#
# @param init initial accumulator value used for every group (default nil)
# @yield [accumulator, row] called once per row of a group; its return value
#   becomes the next accumulator
# @return [Daru::Vector] one accumulated value per group, indexed by group key
def reduce(init=nil)
  result_hash = @groups.each_with_object({}) do |(group, indices), h|
    # @groups already holds index entries (see the constructor), so they are
    # passed to row[] directly. Looking them up again in index.to_a treated
    # them as positions, which only worked for a default 0..n-1 index and
    # rebuilt the array for every element.
    h[group] = indices.inject(init) do |accumulated, idx|
      yield(accumulated, @context.row[idx])
    end
  end

  index =
    if multi_indexed_grouping?
      Daru::MultiIndex.from_tuples result_hash.keys
    else
      Daru::Index.new result_hash.keys.flatten
    end

  Daru::Vector.new(result_hash.values, index: index)
end
#size ⇒ Object
Get a Daru::Vector of the size of each group.
94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/daru/core/group_by.rb', line 94

# Gets a Daru::Vector holding the number of entries in each group.
#
# The vector is named :size. Its index is a Daru::MultiIndex built from the
# group keys when multi_indexed_grouping? is true, otherwise a Daru::Index
# built from the flattened keys.
#
# @return [Daru::Vector]
def size
  group_keys = @groups.keys
  index =
    if multi_indexed_grouping?
      Daru::MultiIndex.from_tuples(group_keys)
    else
      Daru::Index.new(group_keys.flatten)
    end

  member_counts = @groups.values.map(&:size)
  Daru::Vector.new(member_counts, index: index, name: :size)
end
#std ⇒ Object
Calculate sample standard deviation of numeric vector groups, excluding missing values.
218 219 220 |
# File 'lib/daru/core/group_by.rb', line 218

# Calculates the sample standard deviation of numeric vector groups;
# delegates to apply_method with :numeric and :std.
def std
  apply_method(:numeric, :std)
end
#sum ⇒ Object
Calculate sum of numeric groups, excluding missing values.
189 190 191 |
# File 'lib/daru/core/group_by.rb', line 189

# Calculates the sum of numeric groups; delegates to apply_method with
# :numeric and :sum.
def sum
  apply_method(:numeric, :sum)
end
#tail(quantity = 5) ⇒ Object
Get the bottom ‘n’ groups
158 159 160 |
# File 'lib/daru/core/group_by.rb', line 158

# Gets the bottom +quantity+ groups by delegating to select_groups_from with
# :last.
#
# @param quantity [Integer] how many to take (default 5)
def tail(quantity=5)
  select_groups_from(:last, quantity)
end