Class: Polars::GroupBy

Inherits:
Object
  • Object
show all
Defined in:
lib/polars/group_by.rb

Overview

Starts a new GroupBy operation.

Instance Method Summary collapse

Instance Method Details

#agg(*aggs, **named_aggs) ⇒ DataFrame

Compute aggregations for each group of a group by operation.

Examples:

Compute the aggregation of the columns for each group.

df = Polars::DataFrame.new(
  {
    "a" => ["a", "b", "a", "b", "c"],
    "b" => [1, 2, 1, 3, 3],
    "c" => [5, 4, 3, 2, 1]
  }
)
df.group_by("a").agg(Polars.col("b"), Polars.col("c"))
# =>
# shape: (3, 3)
# ┌─────┬───────────┬───────────┐
# │ a   ┆ b         ┆ c         │
# │ --- ┆ ---       ┆ ---       │
# │ str ┆ list[i64] ┆ list[i64] │
# ╞═════╪═══════════╪═══════════╡
# │ a   ┆ [1, 1]    ┆ [5, 3]    │
# │ b   ┆ [2, 3]    ┆ [4, 2]    │
# │ c   ┆ [3]       ┆ [1]       │
# └─────┴───────────┴───────────┘

Compute the sum of a column for each group.

df.group_by("a").agg(Polars.col("b").sum)
# =>
# shape: (3, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ str ┆ i64 │
# ╞═════╪═════╡
# │ a   ┆ 2   │
# │ b   ┆ 5   │
# │ c   ┆ 3   │
# └─────┴─────┘

Compute multiple aggregates at once by passing a list of expressions.

df.group_by("a").agg([Polars.sum("b"), Polars.mean("c")])
# =>
# shape: (3, 3)
# ┌─────┬─────┬─────┐
# │ a   ┆ b   ┆ c   │
# │ --- ┆ --- ┆ --- │
# │ str ┆ i64 ┆ f64 │
# ╞═════╪═════╪═════╡
# │ c   ┆ 3   ┆ 1.0 │
# │ a   ┆ 2   ┆ 4.0 │
# │ b   ┆ 5   ┆ 3.0 │
# └─────┴─────┴─────┘

Or use positional arguments to compute multiple aggregations in the same way.

df.group_by("a").agg(
  Polars.sum("b").name.suffix("_sum"),
  (Polars.col("c") ** 2).mean.name.suffix("_mean_squared")
)
# =>
# shape: (3, 3)
# ┌─────┬───────┬────────────────┐
# │ a   ┆ b_sum ┆ c_mean_squared │
# │ --- ┆ ---   ┆ ---            │
# │ str ┆ i64   ┆ f64            │
# ╞═════╪═══════╪════════════════╡
# │ a   ┆ 2     ┆ 17.0           │
# │ c   ┆ 3     ┆ 1.0            │
# │ b   ┆ 5     ┆ 10.0           │
# └─────┴───────┴────────────────┘

Use keyword arguments to easily name your expression inputs.

df.group_by("a").agg(
  b_sum: Polars.sum("b"),
  c_mean_squared: (Polars.col("c") ** 2).mean
)
# =>
# shape: (3, 3)
# ┌─────┬───────┬────────────────┐
# │ a   ┆ b_sum ┆ c_mean_squared │
# │ --- ┆ ---   ┆ ---            │
# │ str ┆ i64   ┆ f64            │
# ╞═════╪═══════╪════════════════╡
# │ a   ┆ 2     ┆ 17.0           │
# │ c   ┆ 3     ┆ 1.0            │
# │ b   ┆ 5     ┆ 10.0           │
# └─────┴───────┴────────────────┘

Parameters:

  • aggs (Array)

    Aggregations to compute for each group of the group by operation, specified as positional arguments. Accepts expression input. Strings are parsed as column names.

  • named_aggs (Hash)

    Additional aggregations, specified as keyword arguments. The resulting columns will be renamed to the keyword used.

Returns:

  • (DataFrame)

203
204
205
206
207
208
# File 'lib/polars/group_by.rb', line 203

# Aggregate every group by routing the request through the lazy API.
#
# @param aggs [Array] positional aggregations, forwarded unchanged
# @param named_aggs [Hash] keyword aggregations, forwarded unchanged
# @return [DataFrame] the collected result of the lazy query
def agg(*aggs, **named_aggs)
  grouped = @df.lazy.group_by(@by, maintain_order: @maintain_order)
  aggregated = grouped.agg(*aggs, **named_aggs)
  # The query is collected with `no_optimization: true`.
  aggregated.collect(no_optimization: true)
end

#count ⇒ DataFrame

Count the number of values in each group.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 2, 3, 4, 5],
    "b" => [0.5, 0.5, 4, 10, 13, 14],
    "c" => [true, true, true, false, false, true],
    "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
  }
)
df.group_by("d", maintain_order: true).count
# =>
# shape: (3, 2)
# ┌────────┬───────┐
# │ d      ┆ count │
# │ ---    ┆ ---   │
# │ str    ┆ u32   │
# ╞════════╪═══════╡
# │ Apple  ┆ 3     │
# │ Orange ┆ 1     │
# │ Banana ┆ 2     │
# └────────┴───────┘

Returns:

  • (DataFrame)

482
483
484
# File 'lib/polars/group_by.rb', line 482

# Count the rows of each group, reported in a column named "count".
#
# @return [DataFrame] whatever `agg` returns for the length expression
def count
  group_size = Polars.len.alias("count")
  agg(group_size)
end

#each ⇒ Object

Allows iteration over the groups of the group by operation.

Examples:

df = Polars::DataFrame.new({"foo" => ["a", "a", "b"], "bar" => [1, 2, 3]})
df.group_by("foo", maintain_order: true).each.to_h
# =>
# {"a"=>shape: (2, 2)
# ┌─────┬─────┐
# │ foo ┆ bar │
# │ --- ┆ --- │
# │ str ┆ i64 │
# ╞═════╪═════╡
# │ a   ┆ 1   │
# │ a   ┆ 2   │
# └─────┴─────┘, "b"=>shape: (1, 2)
# ┌─────┬─────┐
# │ foo ┆ bar │
# │ --- ┆ --- │
# │ str ┆ i64 │
# ╞═════╪═════╡
# │ b   ┆ 3   │
# └─────┴─────┘}

Returns:

  • (Object)

35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/polars/group_by.rb', line 35

# Iterate over the groups, yielding each group's name and its rows.
#
# Without a block, an enumerator over the same pairs is returned.
#
# @yield [group_name, group_data] one pair per group
# @return [Enumerator, nil] an enumerator when no block is given, otherwise nil
def each
  return to_enum(:each) unless block_given?

  index_col = "__POLARS_GB_GROUP_INDICES"

  # Add a row-index column under a temporary name and aggregate it per group.
  indexed = @df.lazy.with_row_index(name: index_col)
  grouped = indexed.group_by(@by, maintain_order: @maintain_order)
  groups_df = grouped.agg(Polars.col(index_col)).collect(no_optimization: true)

  key_frame = groups_df.select(Polars.all.exclude(index_col))

  # A single String/Expr key gives one value per group name;
  # any other `@by` gives one row of values per group name.
  single_key = @by.is_a?(::String) || @by.is_a?(Expr)
  names = single_key ? key_frame.to_series.each : key_frame.iter_rows

  row_indices = groups_df.select(index_col).to_series

  row_indices.length.times do |position|
    name = names.next
    yield name, @df[row_indices[position]]
  end

  # Keep the nil result a plain `while` loop would produce.
  nil
end

#first ⇒ DataFrame

Aggregate the first values in the group.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 2, 3, 4, 5],
    "b" => [0.5, 0.5, 4, 10, 13, 14],
    "c" => [true, true, true, false, false, true],
    "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
  }
)
df.group_by("d", maintain_order: true).first
# =>
# shape: (3, 4)
# ┌────────┬─────┬──────┬───────┐
# │ d      ┆ a   ┆ b    ┆ c     │
# │ ---    ┆ --- ┆ ---  ┆ ---   │
# │ str    ┆ i64 ┆ f64  ┆ bool  │
# ╞════════╪═════╪══════╪═══════╡
# │ Apple  ┆ 1   ┆ 0.5  ┆ true  │
# │ Orange ┆ 2   ┆ 0.5  ┆ true  │
# │ Banana ┆ 4   ┆ 13.0 ┆ false │
# └────────┴─────┴──────┴───────┘

Returns:

  • (DataFrame)

337
338
339
# File 'lib/polars/group_by.rb', line 337

# Aggregate each group with `Polars.all.first`.
#
# @return [DataFrame] whatever `agg` returns for that expression
def first
  expr = Polars.all.first
  agg(expr)
end

#head(n = 5) ⇒ DataFrame

Get the first n rows of each group.

Examples:

df = Polars::DataFrame.new(
  {
    "letters" => ["c", "c", "a", "c", "a", "b"],
    "nrs" => [1, 2, 3, 4, 5, 6]
  }
)
# =>
# shape: (6, 2)
# ┌─────────┬─────┐
# │ letters ┆ nrs │
# │ ---     ┆ --- │
# │ str     ┆ i64 │
# ╞═════════╪═════╡
# │ c       ┆ 1   │
# │ c       ┆ 2   │
# │ a       ┆ 3   │
# │ c       ┆ 4   │
# │ a       ┆ 5   │
# │ b       ┆ 6   │
# └─────────┴─────┘
df.group_by("letters").head(2).sort("letters")
# =>
# shape: (5, 2)
# ┌─────────┬─────┐
# │ letters ┆ nrs │
# │ ---     ┆ --- │
# │ str     ┆ i64 │
# ╞═════════╪═════╡
# │ a       ┆ 3   │
# │ a       ┆ 5   │
# │ b       ┆ 6   │
# │ c       ┆ 1   │
# │ c       ┆ 2   │
# └─────────┴─────┘

Parameters:

  • n (Integer) (defaults to: 5)

    Number of rows to return.

Returns:

  • (DataFrame)

254
255
256
257
258
259
# File 'lib/polars/group_by.rb', line 254

# Fetch the leading rows of every group through the lazy API.
#
# @param n [Integer] row count forwarded to the lazy `head` (defaults to 5)
# @return [DataFrame] the collected result of the lazy query
def head(n = 5)
  grouped = @df.lazy.group_by(@by, maintain_order: @maintain_order)
  leading = grouped.head(n)
  # The query is collected with `no_optimization: true`.
  leading.collect(no_optimization: true)
end

#last ⇒ DataFrame

Aggregate the last values in the group.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 2, 3, 4, 5],
    "b" => [0.5, 0.5, 4, 10, 13, 14],
    "c" => [true, true, true, false, false, true],
    "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
  }
)
df.group_by("d", maintain_order: true).last
# =>
# shape: (3, 4)
# ┌────────┬─────┬──────┬───────┐
# │ d      ┆ a   ┆ b    ┆ c     │
# │ ---    ┆ --- ┆ ---  ┆ ---   │
# │ str    ┆ i64 ┆ f64  ┆ bool  │
# ╞════════╪═════╪══════╪═══════╡
# │ Apple  ┆ 3   ┆ 10.0 ┆ false │
# │ Orange ┆ 2   ┆ 0.5  ┆ true  │
# │ Banana ┆ 5   ┆ 14.0 ┆ true  │
# └────────┴─────┴──────┴───────┘

Returns:

  • (DataFrame)

366
367
368
# File 'lib/polars/group_by.rb', line 366

# Aggregate each group with `Polars.all.last`.
#
# @return [DataFrame] whatever `agg` returns for that expression
def last
  expr = Polars.all.last
  agg(expr)
end

#max ⇒ DataFrame

Reduce the groups to the maximal value.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 2, 3, 4, 5],
    "b" => [0.5, 0.5, 4, 10, 13, 14],
    "c" => [true, true, true, false, false, true],
    "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
  }
)
df.group_by("d", maintain_order: true).max
# =>
# shape: (3, 4)
# ┌────────┬─────┬──────┬──────┐
# │ d      ┆ a   ┆ b    ┆ c    │
# │ ---    ┆ --- ┆ ---  ┆ ---  │
# │ str    ┆ i64 ┆ f64  ┆ bool │
# ╞════════╪═════╪══════╪══════╡
# │ Apple  ┆ 3   ┆ 10.0 ┆ true │
# │ Orange ┆ 2   ┆ 0.5  ┆ true │
# │ Banana ┆ 5   ┆ 14.0 ┆ true │
# └────────┴─────┴──────┴──────┘

Returns:

  • (DataFrame)

453
454
455
# File 'lib/polars/group_by.rb', line 453

# Aggregate each group with `Polars.all.max`.
#
# @return [DataFrame] whatever `agg` returns for that expression
def max
  expr = Polars.all.max
  agg(expr)
end

#mean ⇒ DataFrame

Reduce the groups to the mean values.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 2, 3, 4, 5],
    "b" => [0.5, 0.5, 4, 10, 13, 14],
    "c" => [true, true, true, false, false, true],
    "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
  }
)
df.group_by("d", maintain_order: true).mean
# =>
# shape: (3, 4)
# ┌────────┬─────┬──────────┬──────────┐
# │ d      ┆ a   ┆ b        ┆ c        │
# │ ---    ┆ --- ┆ ---      ┆ ---      │
# │ str    ┆ f64 ┆ f64      ┆ f64      │
# ╞════════╪═════╪══════════╪══════════╡
# │ Apple  ┆ 2.0 ┆ 4.833333 ┆ 0.666667 │
# │ Orange ┆ 2.0 ┆ 0.5      ┆ 1.0      │
# │ Banana ┆ 4.5 ┆ 13.5     ┆ 0.5      │
# └────────┴─────┴──────────┴──────────┘

Returns:

  • (DataFrame)

511
512
513
# File 'lib/polars/group_by.rb', line 511

# Aggregate each group with `Polars.all.mean`.
#
# @return [DataFrame] whatever `agg` returns for that expression
def mean
  expr = Polars.all.mean
  agg(expr)
end

#median ⇒ DataFrame

Return the median per group.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 2, 3, 4, 5],
    "b" => [0.5, 0.5, 4, 10, 13, 14],
    "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
  }
)
df.group_by("d", maintain_order: true).median
# =>
# shape: (2, 3)
# ┌────────┬─────┬──────┐
# │ d      ┆ a   ┆ b    │
# │ ---    ┆ --- ┆ ---  │
# │ str    ┆ f64 ┆ f64  │
# ╞════════╪═════╪══════╡
# │ Apple  ┆ 2.0 ┆ 4.0  │
# │ Banana ┆ 4.0 ┆ 13.0 │
# └────────┴─────┴──────┘

Returns:

  • (DataFrame)

598
599
600
# File 'lib/polars/group_by.rb', line 598

# Aggregate each group with `Polars.all.median`.
#
# @return [DataFrame] whatever `agg` returns for that expression
def median
  expr = Polars.all.median
  agg(expr)
end

#min ⇒ DataFrame

Reduce the groups to the minimal value.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 2, 3, 4, 5],
    "b" => [0.5, 0.5, 4, 10, 13, 14],
    "c" => [true, true, true, false, false, true],
    "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
  }
)
df.group_by("d", maintain_order: true).min
# =>
# shape: (3, 4)
# ┌────────┬─────┬──────┬───────┐
# │ d      ┆ a   ┆ b    ┆ c     │
# │ ---    ┆ --- ┆ ---  ┆ ---   │
# │ str    ┆ i64 ┆ f64  ┆ bool  │
# ╞════════╪═════╪══════╪═══════╡
# │ Apple  ┆ 1   ┆ 0.5  ┆ false │
# │ Orange ┆ 2   ┆ 0.5  ┆ true  │
# │ Banana ┆ 4   ┆ 13.0 ┆ false │
# └────────┴─────┴──────┴───────┘

Returns:

  • (DataFrame)

424
425
426
# File 'lib/polars/group_by.rb', line 424

# Aggregate each group with `Polars.all.min`.
#
# @return [DataFrame] whatever `agg` returns for that expression
def min
  expr = Polars.all.min
  agg(expr)
end

#n_unique ⇒ DataFrame

Count the unique values per group.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 1, 3, 4, 5],
    "b" => [0.5, 0.5, 0.5, 10, 13, 14],
    "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
  }
)
df.group_by("d", maintain_order: true).n_unique
# =>
# shape: (2, 3)
# ┌────────┬─────┬─────┐
# │ d      ┆ a   ┆ b   │
# │ ---    ┆ --- ┆ --- │
# │ str    ┆ u32 ┆ u32 │
# ╞════════╪═════╪═════╡
# │ Apple  ┆ 2   ┆ 2   │
# │ Banana ┆ 3   ┆ 3   │
# └────────┴─────┴─────┘

Returns:

  • (DataFrame)

538
539
540
# File 'lib/polars/group_by.rb', line 538

# Aggregate each group with `Polars.all.n_unique`.
#
# @return [DataFrame] whatever `agg` returns for that expression
def n_unique
  expr = Polars.all.n_unique
  agg(expr)
end

#plot(*args, **options) ⇒ Vega::LiteChart

Plot data.

Returns:

  • (Vega::LiteChart)

Raises:

  • (ArgumentError)


605
606
607
608
609
610
611
# File 'lib/polars/group_by.rb', line 605

# Plot the underlying frame, passing the grouping key as the `group:` option.
#
# @param args [Array] positional arguments forwarded to `@df.plot`
# @param options [Hash] keyword options forwarded to `@df.plot`; must not contain `:group`
# @return [Vega::LiteChart]
# @raise [ArgumentError] when `@by` is an Array with more than one entry,
#   or when the caller supplies a `:group` option
def plot(*args, **options)
  # The multi-group check runs first, so it wins when both conditions hold.
  if @by.is_a?(::Array) && @by.size > 1
    raise ArgumentError, "Multiple groups not supported"
  end

  if options.key?(:group)
    # same message as Ruby
    raise ArgumentError, "unknown keyword: :group"
  end

  @df.plot(*args, **options, group: @by)
end

#quantile(quantile, interpolation: "nearest") ⇒ DataFrame

Compute the quantile per group.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 2, 3, 4, 5],
    "b" => [0.5, 0.5, 4, 10, 13, 14],
    "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
  }
)
df.group_by("d", maintain_order: true).quantile(1)
# =>
# shape: (3, 3)
# ┌────────┬─────┬──────┐
# │ d      ┆ a   ┆ b    │
# │ ---    ┆ --- ┆ ---  │
# │ str    ┆ f64 ┆ f64  │
# ╞════════╪═════╪══════╡
# │ Apple  ┆ 3.0 ┆ 10.0 │
# │ Orange ┆ 2.0 ┆ 0.5  │
# │ Banana ┆ 5.0 ┆ 14.0 │
# └────────┴─────┴──────┘

Parameters:

  • quantile (Float)

    Quantile between 0.0 and 1.0.

  • interpolation ("nearest", "higher", "lower", "midpoint", "linear") (defaults to: "nearest")

    Interpolation method.

Returns:

  • (DataFrame)

571
572
573
# File 'lib/polars/group_by.rb', line 571

# Aggregate each group with `Polars.all.quantile`.
#
# @param quantile [Float] quantile value, forwarded unchanged
# @param interpolation [String] interpolation method, forwarded unchanged (defaults to "nearest")
# @return [DataFrame] whatever `agg` returns for that expression
def quantile(quantile, interpolation: "nearest")
  expr = Polars.all.quantile(quantile, interpolation: interpolation)
  agg(expr)
end

#sum ⇒ DataFrame

Reduce the groups to the sum.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 2, 3, 4, 5],
    "b" => [0.5, 0.5, 4, 10, 13, 14],
    "c" => [true, true, true, false, false, true],
    "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
  }
)
df.group_by("d", maintain_order: true).sum
# =>
# shape: (3, 4)
# ┌────────┬─────┬──────┬─────┐
# │ d      ┆ a   ┆ b    ┆ c   │
# │ ---    ┆ --- ┆ ---  ┆ --- │
# │ str    ┆ i64 ┆ f64  ┆ u32 │
# ╞════════╪═════╪══════╪═════╡
# │ Apple  ┆ 6   ┆ 14.5 ┆ 2   │
# │ Orange ┆ 2   ┆ 0.5  ┆ 1   │
# │ Banana ┆ 9   ┆ 27.0 ┆ 1   │
# └────────┴─────┴──────┴─────┘

Returns:

  • (DataFrame)

395
396
397
# File 'lib/polars/group_by.rb', line 395

# Aggregate each group with `Polars.all.sum`.
#
# @return [DataFrame] whatever `agg` returns for that expression
def sum
  expr = Polars.all.sum
  agg(expr)
end

#tail(n = 5) ⇒ DataFrame

Get the last n rows of each group.

Examples:

df = Polars::DataFrame.new(
  {
    "letters" => ["c", "c", "a", "c", "a", "b"],
    "nrs" => [1, 2, 3, 4, 5, 6]
  }
)
# =>
# shape: (6, 2)
# ┌─────────┬─────┐
# │ letters ┆ nrs │
# │ ---     ┆ --- │
# │ str     ┆ i64 │
# ╞═════════╪═════╡
# │ c       ┆ 1   │
# │ c       ┆ 2   │
# │ a       ┆ 3   │
# │ c       ┆ 4   │
# │ a       ┆ 5   │
# │ b       ┆ 6   │
# └─────────┴─────┘
df.group_by("letters").tail(2).sort("letters")
# =>
# shape: (5, 2)
# ┌─────────┬─────┐
# │ letters ┆ nrs │
# │ ---     ┆ --- │
# │ str     ┆ i64 │
# ╞═════════╪═════╡
# │ a       ┆ 3   │
# │ a       ┆ 5   │
# │ b       ┆ 6   │
# │ c       ┆ 2   │
# │ c       ┆ 4   │
# └─────────┴─────┘

Parameters:

  • n (Integer) (defaults to: 5)

    Number of rows to return.

Returns:

  • (DataFrame)

305
306
307
308
309
310
# File 'lib/polars/group_by.rb', line 305

# Fetch the trailing rows of every group through the lazy API.
#
# @param n [Integer] row count forwarded to the lazy `tail` (defaults to 5)
# @return [DataFrame] the collected result of the lazy query
def tail(n = 5)
  grouped = @df.lazy.group_by(@by, maintain_order: @maintain_order)
  trailing = grouped.tail(n)
  # The query is collected with `no_optimization: true`.
  trailing.collect(no_optimization: true)
end