Module: Polars

Extended by:
Convert, Functions, IO
Defined in:
lib/polars.rb,
lib/polars/expr.rb,
lib/polars/slice.rb,
lib/polars/utils.rb,
lib/polars/config.rb,
lib/polars/io/csv.rb,
lib/polars/io/ipc.rb,
lib/polars/schema.rb,
lib/polars/series.rb,
lib/polars/catalog.rb,
lib/polars/convert.rb,
lib/polars/io/avro.rb,
lib/polars/io/json.rb,
lib/polars/testing.rb,
lib/polars/version.rb,
lib/polars/cat_expr.rb,
lib/polars/group_by.rb,
lib/polars/io/cloud.rb,
lib/polars/io/delta.rb,
lib/polars/io/utils.rb,
lib/polars/selector.rb,
lib/polars/whenthen.rb,
lib/polars/io/ndjson.rb,
lib/polars/list_expr.rb,
lib/polars/meta_expr.rb,
lib/polars/name_expr.rb,
lib/polars/selectors.rb,
lib/polars/array_expr.rb,
lib/polars/data_frame.rb,
lib/polars/data_types.rb,
lib/polars/exceptions.rb,
lib/polars/io/iceberg.rb,
lib/polars/io/parquet.rb,
lib/polars/lazy_frame.rb,
lib/polars/utils/wrap.rb,
lib/polars/binary_expr.rb,
lib/polars/io/database.rb,
lib/polars/series_plot.rb,
lib/polars/sql_context.rb,
lib/polars/string_expr.rb,
lib/polars/struct_expr.rb,
lib/polars/utils/parse.rb,
lib/polars/utils/serde.rb,
lib/polars/string_cache.rb,
lib/polars/expr_dispatch.rb,
lib/polars/functions/col.rb,
lib/polars/functions/len.rb,
lib/polars/functions/lit.rb,
lib/polars/lazy_group_by.rb,
lib/polars/utils/convert.rb,
lib/polars/utils/various.rb,
lib/polars/cat_name_space.rb,
lib/polars/data_type_expr.rb,
lib/polars/date_time_expr.rb,
lib/polars/extension_expr.rb,
lib/polars/functions/lazy.rb,
lib/polars/utils/unstable.rb,
lib/polars/collect_batches.rb,
lib/polars/data_frame_plot.rb,
lib/polars/data_type_group.rb,
lib/polars/functions/eager.rb,
lib/polars/iceberg_dataset.rb,
lib/polars/io/scan_options.rb,
lib/polars/io/sink_options.rb,
lib/polars/list_name_space.rb,
lib/polars/query_opt_flags.rb,
lib/polars/utils/constants.rb,
lib/polars/array_name_space.rb,
lib/polars/dynamic_group_by.rb,
lib/polars/functions/random.rb,
lib/polars/functions/repeat.rb,
lib/polars/in_process_query.rb,
lib/polars/rolling_group_by.rb,
lib/polars/binary_name_space.rb,
lib/polars/scan_cast_options.rb,
lib/polars/string_name_space.rb,
lib/polars/struct_name_space.rb,
lib/polars/utils/deprecation.rb,
lib/polars/batched_csv_reader.rb,
lib/polars/functions/business.rb,
lib/polars/functions/datatype.rb,
lib/polars/functions/whenthen.rb,
lib/polars/date_time_name_space.rb,
lib/polars/extension_name_space.rb,
lib/polars/functions/as_datatype.rb,
lib/polars/functions/escape_regex.rb,
lib/polars/catalog/unity/table_info.rb,
lib/polars/utils/construction/utils.rb,
lib/polars/catalog/unity/column_info.rb,
lib/polars/functions/range/int_range.rb,
lib/polars/utils/construction/series.rb,
lib/polars/catalog/unity/catalog_info.rb,
lib/polars/functions/range/date_range.rb,
lib/polars/functions/range/time_range.rb,
lib/polars/catalog/unity/namespace_info.rb,
lib/polars/functions/range/linear_space.rb,
lib/polars/utils/construction/data_frame.rb,
lib/polars/functions/aggregation/vertical.rb,
lib/polars/functions/range/datetime_range.rb,
lib/polars/functions/aggregation/horizontal.rb

Defined Under Namespace

Modules: Convert, Functions, IO, Selectors, Testing

Classes: Array, ArrayExpr, ArrayNameSpace, Binary, BinaryExpr, BinaryNameSpace, Boolean, CatExpr, CatNameSpace, Catalog, Categorical, Categories, Config, DataFrame, DataFramePlot, DataType, DataTypeExpr, Date, DateTimeExpr, DateTimeNameSpace, Datetime, Decimal, Duration, DynamicGroupBy, Enum, Expr, ExtensionExpr, ExtensionNameSpace, Field, Float16, Float32, Float64, FloatType, GroupBy, InProcessQuery, Int128, Int16, Int32, Int64, Int8, IntegerType, LazyFrame, LazyGroupBy, List, ListExpr, ListNameSpace, MetaExpr, NameExpr, NestedType, Null, NumericType, Object, QueryOptFlags, RollingGroupBy, SQLContext, ScanCastOptions, Schema, Selector, Series, SeriesPlot, SignedIntegerType, String, StringCache, StringExpr, StringNameSpace, Struct, StructExpr, StructNameSpace, TemporalType, Time, UInt128, UInt16, UInt32, UInt64, UInt8, Unknown, UnsignedIntegerType

Constant Summary collapse

SIGNED_INTEGER_DTYPES =
DataTypeGroup.new(
  [
    Int8,
    Int16,
    Int32,
    Int64
  ]
)
UNSIGNED_INTEGER_DTYPES =
DataTypeGroup.new(
  [
    UInt8,
    UInt16,
    UInt32,
    UInt64
  ]
)
INTEGER_DTYPES =
(
  SIGNED_INTEGER_DTYPES | UNSIGNED_INTEGER_DTYPES
)
FLOAT_DTYPES =
DataTypeGroup.new([Float32, Float64])
NUMERIC_DTYPES =
DataTypeGroup.new(
  FLOAT_DTYPES + INTEGER_DTYPES | [Decimal]
)
SinkOptions =
IO::SinkOptions
DEFAULT_QUERY_OPT_FLAGS =
QueryOptFlags.new

Class Method Summary collapse

Class Method Details

.align_frames(*frames, on:, how: nil, select: nil, descending: false) ⇒ Object Originally defined in module Functions

Align an array of frames using the unique values from one or more columns as a key.

Frames that do not contain the given key values have rows injected (with nulls filling the non-key columns), and each resulting frame is sorted by the key.

The original column order of input frames is not changed unless select is specified (in which case the final column order is determined from that).

Note that this does not result in a joined frame - you receive the same number of frames back that you passed in, but each is now aligned by key and has the same number of rows.

Examples:

df1 = Polars::DataFrame.new(
  {
    "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
    "x" => [3.5, 4.0, 1.0],
    "y" => [10.0, 2.5, 1.5]
  }
)
df2 = Polars::DataFrame.new(
  {
    "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
    "x" => [8.0, 1.0, 3.5],
    "y" => [1.5, 12.0, 5.0]
  }
)
df3 = Polars::DataFrame.new(
  {
    "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
    "x" => [2.0, 5.0],
    "y" => [2.5, 2.0]
  }
)
af1, af2, af3 = Polars.align_frames(
  df1, df2, df3, on: "dt", how: "left", select: ["x", "y"]
)
(af1 * af2 * af3).fill_null(0).select(Polars.sum_horizontal("*").alias("dot"))
# =>
# shape: (3, 1)
# ┌───────┐
# │ dot   │
# │ ---   │
# │ f64   │
# ╞═══════╡
# │ 0.0   │
# │ 167.5 │
# │ 47.0  │
# └───────┘

Parameters:

  • Array of DataFrames or LazyFrames.

  • One or more columns whose unique values will be used to align the frames.

  • (defaults to: nil)

    Optional post-alignment column select to constrain and/or order the columns returned from the newly aligned frames.

  • (defaults to: false)

    Sort the alignment column values in descending order; can be a single boolean or a list of booleans associated with each column in on.

Returns:

.all(*names, ignore_nulls: true) ⇒ Expr Originally defined in module Functions

Either return an expression representing all columns, or evaluate a bitwise AND operation.

If no arguments are passed, this function is syntactic sugar for col("*"). Otherwise, this function is syntactic sugar for col(names).all.

Examples:

Selecting all columns.

df = Polars::DataFrame.new(
  {
    "a" => [true, false, true],
    "b" => [false, false, false]
  }
)
df.select(Polars.all.sum)
# =>
# shape: (1, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ u32 ┆ u32 │
# ╞═════╪═════╡
# │ 2   ┆ 0   │
# └─────┴─────┘

Evaluate bitwise AND for a column.

df.select(Polars.all("a"))
# =>
# shape: (1, 1)
# ┌───────┐
# │ a     │
# │ ---   │
# │ bool  │
# ╞═══════╡
# │ false │
# └───────┘

Parameters:

  • Name(s) of the columns to use in the aggregation.

  • (defaults to: true)

    Ignore null values (default).

Returns:

.all_horizontal(*exprs) ⇒ Expr Originally defined in module Functions

Compute the bitwise AND horizontally across columns.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [false, false, true, true, false, nil],
    "b" => [false, true, true, nil, nil, nil],
    "c" => ["u", "v", "w", "x", "y", "z"]
  }
)
df.with_columns(all: Polars.all_horizontal("a", "b"))
# =>
# shape: (6, 4)
# ┌───────┬───────┬─────┬───────┐
# │ a     ┆ b     ┆ c   ┆ all   │
# │ ---   ┆ ---   ┆ --- ┆ ---   │
# │ bool  ┆ bool  ┆ str ┆ bool  │
# ╞═══════╪═══════╪═════╪═══════╡
# │ false ┆ false ┆ u   ┆ false │
# │ false ┆ true  ┆ v   ┆ false │
# │ true  ┆ true  ┆ w   ┆ true  │
# │ true  ┆ null  ┆ x   ┆ null  │
# │ false ┆ null  ┆ y   ┆ false │
# │ null  ┆ null  ┆ z   ┆ null  │
# └───────┴───────┴─────┴───────┘

Parameters:

  • Column(s) to use in the aggregation. Accepts expression input. Strings are parsed as column names, other non-expression inputs are parsed as literals.

Returns:

.any(*names, ignore_nulls: true) ⇒ Expr Originally defined in module Functions

Evaluate a bitwise OR operation.

Syntactic sugar for col(names).any.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [true, false, true],
    "b" => [false, false, false]
  }
)
df.select(Polars.any("a"))
# =>
# shape: (1, 1)
# ┌──────┐
# │ a    │
# │ ---  │
# │ bool │
# ╞══════╡
# │ true │
# └──────┘

Parameters:

  • Name(s) of the columns to use in the aggregation.

  • (defaults to: true)

    Ignore null values (default).

Returns:

.any_horizontal(*exprs) ⇒ Expr Originally defined in module Functions

Compute the bitwise OR horizontally across columns.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [false, false, true, true, false, nil],
    "b" => [false, true, true, nil, nil, nil],
    "c" => ["u", "v", "w", "x", "y", "z"]
  }
)
df.with_columns(any: Polars.any_horizontal("a", "b"))
# =>
# shape: (6, 4)
# ┌───────┬───────┬─────┬───────┐
# │ a     ┆ b     ┆ c   ┆ any   │
# │ ---   ┆ ---   ┆ --- ┆ ---   │
# │ bool  ┆ bool  ┆ str ┆ bool  │
# ╞═══════╪═══════╪═════╪═══════╡
# │ false ┆ false ┆ u   ┆ false │
# │ false ┆ true  ┆ v   ┆ true  │
# │ true  ┆ true  ┆ w   ┆ true  │
# │ true  ┆ null  ┆ x   ┆ true  │
# │ false ┆ null  ┆ y   ┆ null  │
# │ null  ┆ null  ┆ z   ┆ null  │
# └───────┴───────┴─────┴───────┘

Parameters:

  • Column(s) to use in the aggregation. Accepts expression input. Strings are parsed as column names, other non-expression inputs are parsed as literals.

Returns:

.approx_n_unique(*columns) ⇒ Expr Originally defined in module Functions

Approximate count of unique values.

This function is syntactic sugar for col(columns).approx_n_unique, and uses the HyperLogLog++ algorithm for cardinality estimation.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 1],
    "b" => [4, 5, 2],
    "c" => ["foo", "bar", "foo"]
  }
)
df.select(Polars.approx_n_unique("a"))
# =>
# shape: (1, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ u32 │
# ╞═════╡
# │ 2   │
# └─────┘
df.select(Polars.approx_n_unique("b", "c"))
# =>
# shape: (1, 2)
# ┌─────┬─────┐
# │ b   ┆ c   │
# │ --- ┆ --- │
# │ u32 ┆ u32 │
# ╞═════╪═════╡
# │ 3   ┆ 2   │
# └─────┴─────┘

Parameters:

  • One or more column names.

Returns:

.arctan2(y, x) ⇒ Expr Originally defined in module Functions

Compute two argument arctan in radians.

Returns the angle (in radians) in the plane between the positive x-axis and the ray from the origin to (x,y).

Examples:

c = Math.sqrt(2) / 2
df = Polars::DataFrame.new(
  {
    "y" => [c, -c, c, -c],
    "x" => [c, c, -c, -c]
  }
)
df.with_columns(Polars.arctan2("y", "x").alias("atan2"))
# =>
# shape: (4, 3)
# ┌───────────┬───────────┬───────────┐
# │ y         ┆ x         ┆ atan2     │
# │ ---       ┆ ---       ┆ ---       │
# │ f64       ┆ f64       ┆ f64       │
# ╞═══════════╪═══════════╪═══════════╡
# │ 0.707107  ┆ 0.707107  ┆ 0.785398  │
# │ -0.707107 ┆ 0.707107  ┆ -0.785398 │
# │ 0.707107  ┆ -0.707107 ┆ 2.356194  │
# │ -0.707107 ┆ -0.707107 ┆ -2.356194 │
# └───────────┴───────────┴───────────┘

Parameters:

  • Column name or Expression.

  • Column name or Expression.

Returns:

.arg_sort_by(exprs, *more_exprs, descending: false, nulls_last: false, multithreaded: true, maintain_order: false) ⇒ Expr Originally defined in module Functions

Find the indexes that would sort the columns.

Argsort by multiple columns. The first column will be used for the ordering. If there are duplicates in the first column, the second column will be used to determine the ordering and so on.

Examples:

Pass a single column name to compute the arg sort by that column.

df = Polars::DataFrame.new(
  {
    "a" => [0, 1, 1, 0],
    "b" => [3, 2, 3, 2],
    "c" => [1, 2, 3, 4]
  }
)
df.select(Polars.arg_sort_by("a"))
# =>
# shape: (4, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ u32 │
# ╞═════╡
# │ 0   │
# │ 3   │
# │ 1   │
# │ 2   │
# └─────┘

Compute the arg sort by multiple columns by either passing a list of columns, or by specifying each column as a positional argument.

df.select(Polars.arg_sort_by(["a", "b"], descending: true))
# =>
# shape: (4, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ u32 │
# ╞═════╡
# │ 2   │
# │ 1   │
# │ 0   │
# │ 3   │
# └─────┘

Use gather to apply the arg sort to other columns.

df.select(Polars.col("c").gather(Polars.arg_sort_by("a")))
# =>
# shape: (4, 1)
# ┌─────┐
# │ c   │
# │ --- │
# │ i64 │
# ╞═════╡
# │ 1   │
# │ 4   │
# │ 2   │
# │ 3   │
# └─────┘

Parameters:

  • Columns used to determine the ordering.

  • Additional columns to arg sort by, specified as positional arguments.

  • (defaults to: false)

    Default is ascending.

  • (defaults to: false)

    Place null values last.

  • (defaults to: true)

    Sort using multiple threads.

  • (defaults to: false)

    Whether the order should be maintained if elements are equal.

Returns:

.arg_where(condition, eager: false) ⇒ Expr, Series Originally defined in module Functions

Return indices where condition evaluates true.

Examples:

df = Polars::DataFrame.new({"a" => [1, 2, 3, 4, 5]})
df.select(
  [
    Polars.arg_where(Polars.col("a") % 2 == 0)
  ]
).to_series
# =>
# shape: (2,)
# Series: 'a' [u32]
# [
#         1
#         3
# ]

Parameters:

  • Boolean expression to evaluate

  • (defaults to: false)

    Whether to apply this function eagerly (as opposed to lazily).

Returns:

.build_info ⇒ Hash

Return detailed Polars build information.

Examples:

Polars.build_info

Returns:



159
160
161
# File 'lib/polars.rb', line 159

def self.build_info
  {"version" => VERSION}
end

.business_day_count(start, stop, week_mask: [true, true, true, true, true, false, false], holidays: []) ⇒ Expr Originally defined in module Functions

Note:

This functionality is considered unstable. It may be changed at any point without it being considered a breaking change.

Count the number of business days between start and end (not including end).

Examples:

df = Polars::DataFrame.new(
  {
    "start" => [Date.new(2020, 1, 1), Date.new(2020, 1, 2)],
    "end" => [Date.new(2020, 1, 2), Date.new(2020, 1, 10)]
  }
)
df.with_columns(
  business_day_count: Polars.business_day_count("start", "end")
)
# =>
# shape: (2, 3)
# ┌────────────┬────────────┬────────────────────┐
# │ start      ┆ end        ┆ business_day_count │
# │ ---        ┆ ---        ┆ ---                │
# │ date       ┆ date       ┆ i32                │
# ╞════════════╪════════════╪════════════════════╡
# │ 2020-01-01 ┆ 2020-01-02 ┆ 1                  │
# │ 2020-01-02 ┆ 2020-01-10 ┆ 6                  │
# └────────────┴────────────┴────────────────────┘

You can pass a custom weekend - for example, if you only take Sunday off:

week_mask = [true, true, true, true, true, true, false]
df.with_columns(
  business_day_count: Polars.business_day_count(
    "start", "end", week_mask: week_mask
  )
)
# =>
# shape: (2, 3)
# ┌────────────┬────────────┬────────────────────┐
# │ start      ┆ end        ┆ business_day_count │
# │ ---        ┆ ---        ┆ ---                │
# │ date       ┆ date       ┆ i32                │
# ╞════════════╪════════════╪════════════════════╡
# │ 2020-01-01 ┆ 2020-01-02 ┆ 1                  │
# │ 2020-01-02 ┆ 2020-01-10 ┆ 7                  │
# └────────────┴────────────┴────────────────────┘

You can also pass a list of holidays to exclude from the count:

holidays = [Date.new(2020, 1, 1), Date.new(2020, 1, 2)]
df.with_columns(
  business_day_count: Polars.business_day_count("start", "end", holidays: holidays)
)
# =>
# shape: (2, 3)
# ┌────────────┬────────────┬────────────────────┐
# │ start      ┆ end        ┆ business_day_count │
# │ ---        ┆ ---        ┆ ---                │
# │ date       ┆ date       ┆ i32                │
# ╞════════════╪════════════╪════════════════════╡
# │ 2020-01-01 ┆ 2020-01-02 ┆ 0                  │
# │ 2020-01-02 ┆ 2020-01-10 ┆ 5                  │
# └────────────┴────────────┴────────────────────┘

Parameters:

  • Start dates.

  • End dates.

  • (defaults to: [true, true, true, true, true, false, false])

    Which days of the week to count. The default is Monday to Friday. If you wanted to count only Monday to Thursday, you would pass [true, true, true, true, false, false, false].

  • (defaults to: [])

    Holidays to exclude from the count.

Returns:

.coalesce(exprs, *more_exprs, eager: false) ⇒ Expr Originally defined in module Functions

Folds the columns from left to right, keeping the first non-null value.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, nil, nil, nil],
    "b" => [1, 2, nil, nil],
    "c" => [5, nil, 3, nil]
  }
)
df.with_columns(Polars.coalesce(["a", "b", "c", 10]).alias("d"))
# =>
# shape: (4, 4)
# ┌──────┬──────┬──────┬─────┐
# │ a    ┆ b    ┆ c    ┆ d   │
# │ ---  ┆ ---  ┆ ---  ┆ --- │
# │ i64  ┆ i64  ┆ i64  ┆ i64 │
# ╞══════╪══════╪══════╪═════╡
# │ 1    ┆ 1    ┆ 5    ┆ 1   │
# │ null ┆ 2    ┆ null ┆ 2   │
# │ null ┆ null ┆ 3    ┆ 3   │
# │ null ┆ null ┆ null ┆ 10  │
# └──────┴──────┴──────┴─────┘
df.with_columns(Polars.coalesce(Polars.col(["a", "b", "c"]), 10.0).alias("d"))
# =>
# shape: (4, 4)
# ┌──────┬──────┬──────┬──────┐
# │ a    ┆ b    ┆ c    ┆ d    │
# │ ---  ┆ ---  ┆ ---  ┆ ---  │
# │ i64  ┆ i64  ┆ i64  ┆ f64  │
# ╞══════╪══════╪══════╪══════╡
# │ 1    ┆ 1    ┆ 5    ┆ 1.0  │
# │ null ┆ 2    ┆ null ┆ 2.0  │
# │ null ┆ null ┆ 3    ┆ 3.0  │
# │ null ┆ null ┆ null ┆ 10.0 │
# └──────┴──────┴──────┴──────┘
s1 = Polars::Series.new("a", [nil, 2, nil])
s2 = Polars::Series.new("b", [1, nil, 3])
Polars.coalesce(s1, s2, eager: true)
# =>
# shape: (3,)
# Series: 'a' [i64]
# [
#         1
#         2
#         3
# ]

Parameters:

  • Columns to coalesce. Accepts expression input. Strings are parsed as column names, other non-expression inputs are parsed as literals.

  • Additional columns to coalesce, specified as positional arguments.

  • (defaults to: false)

    Evaluate immediately and return a Series; this requires that at least one of the given arguments is a Series. If set to false (default), return an expression instead.

Returns:

.col(name, *more_names) ⇒ Expr Originally defined in module Functions

Return an expression representing a column in a DataFrame.

Returns:

.collect_all(lazy_frames, optimizations: DEFAULT_QUERY_OPT_FLAGS, engine: "auto", lazy: false) ⇒ Array Originally defined in module Functions

Collect multiple LazyFrames at the same time.

This runs all the computation graphs in parallel on the Polars thread pool.

Parameters:

  • A list of LazyFrames to collect.

  • (defaults to: DEFAULT_QUERY_OPT_FLAGS)

    The optimization passes done during query optimization.

    This has no effect if lazy is set to true.

  • (defaults to: "auto")

    Select the engine used to process the query, optional. At the moment, if set to "auto" (default), the query is run using the polars streaming engine. Polars will also attempt to use the engine set by the POLARS_ENGINE_AFFINITY environment variable. If it cannot run the query using the selected engine, the query is run using the polars streaming engine.

  • (defaults to: false)

    Return as LazyFrame that can be collected later. This is only correct if all inputs sink to disk.

    This functionality is considered unstable. It may be changed at any point without it being considered a breaking change.

Returns:

.concat(items, rechunk: false, how: "vertical", parallel: true, strict: false) ⇒ Object Originally defined in module Functions

Aggregate multiple DataFrames/Series to a single DataFrame/Series.

Examples:

df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
df2 = Polars::DataFrame.new({"a" => [2], "b" => [4]})
Polars.concat([df1, df2])  # default is 'vertical' strategy
# =>
# shape: (2, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 1   ┆ 3   │
# │ 2   ┆ 4   │
# └─────┴─────┘
df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
df2 = Polars::DataFrame.new({"a" => [2.5], "b" => [4]})
Polars.concat([df1, df2], how: "vertical_relaxed")  # 'a' coerced into f64
# =>
# shape: (2, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ f64 ┆ i64 │
# ╞═════╪═════╡
# │ 1.0 ┆ 3   │
# │ 2.5 ┆ 4   │
# └─────┴─────┘
df_h1 = Polars::DataFrame.new({"l1" => [1, 2], "l2" => [3, 4]})
df_h2 = Polars::DataFrame.new({"r1" => [5, 6], "r2" => [7, 8], "r3" => [9, 10]})
Polars.concat([df_h1, df_h2], how: "horizontal")
# =>
# shape: (2, 5)
# ┌─────┬─────┬─────┬─────┬─────┐
# │ l1  ┆ l2  ┆ r1  ┆ r2  ┆ r3  │
# │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
# ╞═════╪═════╪═════╪═════╪═════╡
# │ 1   ┆ 3   ┆ 5   ┆ 7   ┆ 9   │
# │ 2   ┆ 4   ┆ 6   ┆ 8   ┆ 10  │
# └─────┴─────┴─────┴─────┴─────┘
df_d1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
df_d2 = Polars::DataFrame.new({"a" => [2], "c" => [4]})
Polars.concat([df_d1, df_d2], how: "diagonal")
# =>
# shape: (2, 3)
# ┌─────┬──────┬──────┐
# │ a   ┆ b    ┆ c    │
# │ --- ┆ ---  ┆ ---  │
# │ i64 ┆ i64  ┆ i64  │
# ╞═════╪══════╪══════╡
# │ 1   ┆ 3    ┆ null │
# │ 2   ┆ null ┆ 4    │
# └─────┴──────┴──────┘
df_a1 = Polars::DataFrame.new({"id" => [1, 2], "x" => [3, 4]})
df_a2 = Polars::DataFrame.new({"id" => [2, 3], "y" => [5, 6]})
df_a3 = Polars::DataFrame.new({"id" => [1, 3], "z" => [7, 8]})
Polars.concat([df_a1, df_a2, df_a3], how: "align")
# =>
# shape: (3, 4)
# ┌─────┬──────┬──────┬──────┐
# │ id  ┆ x    ┆ y    ┆ z    │
# │ --- ┆ ---  ┆ ---  ┆ ---  │
# │ i64 ┆ i64  ┆ i64  ┆ i64  │
# ╞═════╪══════╪══════╪══════╡
# │ 1   ┆ 3    ┆ null ┆ 7    │
# │ 2   ┆ 4    ┆ 5    ┆ null │
# │ 3   ┆ null ┆ 6    ┆ 8    │
# └─────┴──────┴──────┴──────┘

Parameters:

  • DataFrames/Series/LazyFrames to concatenate.

  • (defaults to: false)

    Make sure that all data is in contiguous memory.

  • (defaults to: "vertical")
    • Vertical: applies multiple vstack operations.
    • Diagonal: finds a union between the column schemas and fills missing column values with null.
    • Horizontal: stacks Series horizontally and fills with nulls if the lengths don't match.
  • (defaults to: true)

    Only relevant for LazyFrames. This determines if the concatenated lazy computations may be executed in parallel.

  • (defaults to: false)

    When how=horizontal, require all DataFrames to be the same height, raising an error if not.

Returns:

.concat_arr(exprs, *more_exprs) ⇒ Expr Originally defined in module Functions

Note:

This functionality is considered unstable. It may be changed at any point without it being considered a breaking change.

Horizontally concatenate columns into a single array column.

Non-array columns are reshaped to a unit-width array. All columns must have a dtype of either Polars::Array.new(<DataType>, width) or Polars::<DataType>.

Examples:

Concatenate 2 array columns:

Polars.select(
  a: Polars::Series.new([[1], [3], nil], dtype: Polars::Array.new(Polars::Int64, 1)),
  b: Polars::Series.new([[3], [nil], [5]], dtype: Polars::Array.new(Polars::Int64, 1))
).with_columns(
  Polars.concat_arr("a", "b").alias("concat_arr(a, b)"),
  Polars.concat_arr("a", Polars.first("b")).alias("concat_arr(a, first(b))")
)
# =>
# shape: (3, 4)
# ┌───────────────┬───────────────┬──────────────────┬─────────────────────────┐
# │ a             ┆ b             ┆ concat_arr(a, b) ┆ concat_arr(a, first(b)) │
# │ ---           ┆ ---           ┆ ---              ┆ ---                     │
# │ array[i64, 1] ┆ array[i64, 1] ┆ array[i64, 2]    ┆ array[i64, 2]           │
# ╞═══════════════╪═══════════════╪══════════════════╪═════════════════════════╡
# │ [1]           ┆ [3]           ┆ [1, 3]           ┆ [1, 3]                  │
# │ [3]           ┆ [null]        ┆ [3, null]        ┆ [3, 3]                  │
# │ null          ┆ [5]           ┆ null             ┆ null                    │
# └───────────────┴───────────────┴──────────────────┴─────────────────────────┘

Parameters:

  • Columns to concatenate into a single array column. Accepts expression input. Strings are parsed as column names, other non-expression inputs are parsed as literals.

  • Additional columns to concatenate into a single array column, specified as positional arguments.

Returns:

.concat_list(exprs, *more_exprs) ⇒ Expr Originally defined in module Functions

Concatenate the arrays in a Series of dtype List in linear time.

Examples:

Concatenate two existing list columns. Null values are propagated.

df = Polars::DataFrame.new({"a" => [[1, 2], [3], [4, 5]], "b" => [[4], [], nil]})
df.with_columns(concat_list: Polars.concat_list("a", "b"))
# =>
# shape: (3, 3)
# ┌───────────┬───────────┬─────────────┐
# │ a         ┆ b         ┆ concat_list │
# │ ---       ┆ ---       ┆ ---         │
# │ list[i64] ┆ list[i64] ┆ list[i64]   │
# ╞═══════════╪═══════════╪═════════════╡
# │ [1, 2]    ┆ [4]       ┆ [1, 2, 4]   │
# │ [3]       ┆ []        ┆ [3]         │
# │ [4, 5]    ┆ null      ┆ null        │
# └───────────┴───────────┴─────────────┘

Non-list columns are cast to a list before concatenation. The output data type is the supertype of the concatenated columns.

df.select("a", concat_list: Polars.concat_list("a", Polars.lit("x")))
# =>
# shape: (3, 2)
# ┌───────────┬─────────────────┐
# │ a         ┆ concat_list     │
# │ ---       ┆ ---             │
# │ list[i64] ┆ list[str]       │
# ╞═══════════╪═════════════════╡
# │ [1, 2]    ┆ ["1", "2", "x"] │
# │ [3]       ┆ ["3", "x"]      │
# │ [4, 5]    ┆ ["4", "5", "x"] │
# └───────────┴─────────────────┘

Create lagged columns and collect them into a list. This mimics a rolling window.

df = Polars::DataFrame.new({"A" => [1.0, 2.0, 9.0, 2.0, 13.0]})
df = df.select(3.times.map { |i| Polars.col("A").shift(i).alias("A_lag_#{i}") })
df.select(
  Polars.concat_list(3.times.map { |i| "A_lag_#{i}" }.reverse).alias("A_rolling")
)
# =>
# shape: (5, 1)
# ┌───────────────────┐
# │ A_rolling         │
# │ ---               │
# │ list[f64]         │
# ╞═══════════════════╡
# │ [null, null, 1.0] │
# │ [null, 1.0, 2.0]  │
# │ [1.0, 2.0, 9.0]   │
# │ [2.0, 9.0, 2.0]   │
# │ [9.0, 2.0, 13.0]  │
# └───────────────────┘

Returns:

.concat_str(exprs, *more_exprs, separator: "", ignore_nulls: false) ⇒ Expr Originally defined in module Functions

Horizontally concat Utf8 Series in linear time. Non-Utf8 columns are cast to Utf8.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 3],
    "b" => ["dogs", "cats", nil],
    "c" => ["play", "swim", "walk"]
  }
)
df.with_columns(
  [
    Polars.concat_str(
      [
        Polars.col("a") * 2,
        Polars.col("b"),
        Polars.col("c")
      ],
      separator: " "
    ).alias("full_sentence")
  ]
)
# =>
# shape: (3, 4)
# ┌─────┬──────┬──────┬───────────────┐
# │ a   ┆ b    ┆ c    ┆ full_sentence │
# │ --- ┆ ---  ┆ ---  ┆ ---           │
# │ i64 ┆ str  ┆ str  ┆ str           │
# ╞═════╪══════╪══════╪═══════════════╡
# │ 1   ┆ dogs ┆ play ┆ 2 dogs play   │
# │ 2   ┆ cats ┆ swim ┆ 4 cats swim   │
# │ 3   ┆ null ┆ walk ┆ null          │
# └─────┴──────┴──────┴───────────────┘

Parameters:

  • Columns to concat into a Utf8 Series.

  • Additional columns to concatenate into a single string column, specified as positional arguments.

  • (defaults to: "")

    String value that will be used to separate the values.

  • (defaults to: false)

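    Ignore null values (default is false). If set to false, null values are propagated: if a row contains any null values, the output is null.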

Returns:

.config ⇒ Object



531
532
533
# File 'lib/polars/config.rb', line 531

def self.config(...)
  Config.new(...)
end

.corr(a, b, method: "pearson", ddof: nil, propagate_nans: false, eager: false) ⇒ Expr Originally defined in module Functions

Compute the Pearson's or Spearman rank correlation between two columns.

Examples:

Pearson's correlation:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, 2],
    "c" => ["foo", "bar", "foo"]
  }
)
df.select(Polars.corr("a", "b"))
# =>
# shape: (1, 1)
# ┌──────────┐
# │ a        │
# │ ---      │
# │ f64      │
# ╞══════════╡
# │ 0.544705 │
# └──────────┘

Spearman rank correlation:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, 2],
    "c" => ["foo", "bar", "foo"]
  }
)
df.select(Polars.corr("a", "b", method: "spearman"))
# =>
# shape: (1, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ f64 │
# ╞═════╡
# │ 0.5 │
# └─────┘

Eager evaluation:

s1 = Polars::Series.new("a", [1, 8, 3])
s2 = Polars::Series.new("b", [4, 5, 2])
Polars.corr(s1, s2, eager: true)
# =>
# shape: (1,)
# Series: 'a' [f64]
# [
#         0.544705
# ]
Polars.corr(s1, s2, method: "spearman", eager: true)
# =>
# shape: (1,)
# Series: 'a' [f64]
# [
#         0.5
# ]

Parameters:

  • Column name or Expression.

  • Column name or Expression.

  • (defaults to: "pearson")

    Correlation method.

  • (defaults to: nil)

    "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1.

  • (defaults to: false)

    If true any NaN encountered will lead to NaN in the output. Defaults to false where NaN are regarded as larger than any finite number and thus lead to the highest rank.

  • (defaults to: false)

    Evaluate immediately and return a Series; this requires that at least one of the given arguments is a Series. If set to false (default), return an expression instead.

Returns:

.count(*columns) ⇒ Expr Originally defined in module Functions

Return the number of non-null values in the column.

This function is syntactic sugar for col(columns).count.

Calling this function without any arguments returns the number of rows in the context. This way of using the function is deprecated. Please use len instead.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, nil],
    "b" => [3, nil, nil],
    "c" => ["foo", "bar", "foo"]
  }
)
df.select(Polars.count("a"))
# =>
# shape: (1, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ u32 │
# ╞═════╡
# │ 2   │
# └─────┘

Return the number of non-null values in multiple columns.

df.select(Polars.count("b", "c"))
# =>
# shape: (1, 2)
# ┌─────┬─────┐
# │ b   ┆ c   │
# │ --- ┆ --- │
# │ u32 ┆ u32 │
# ╞═════╪═════╡
# │ 1   ┆ 3   │
# └─────┴─────┘

Parameters:

  • One or more column names.

Returns:

.cov(a, b, ddof: 1, eager: false) ⇒ Expr Originally defined in module Functions

Compute the covariance between two columns/expressions.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, 2],
    "c" => ["foo", "bar", "foo"]
  }
)
df.select(Polars.cov("a", "b"))
# =>
# shape: (1, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ f64 │
# ╞═════╡
# │ 3.0 │
# └─────┘

Eager evaluation:

s1 = Polars::Series.new("a", [1, 8, 3])
s2 = Polars::Series.new("b", [4, 5, 2])
Polars.cov(s1, s2, eager: true)
# =>
# shape: (1,)
# Series: 'a' [f64]
# [
#         3.0
# ]

Parameters:

  • Column name or Expression.

  • Column name or Expression.

  • (defaults to: 1)

    "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1.

  • (defaults to: false)

    Evaluate immediately and return a Series; this requires that at least one of the given arguments is a Series. If set to false (default), return an expression instead.

Returns:

.cs ⇒ Object



1988
1989
1990
# File 'lib/polars/selectors.rb', line 1988

# Returns the `Selectors` constant, giving callers a short `Polars.cs`
# handle to it.
def self.cs
  Selectors
end

.cum_count(*columns, reverse: false) ⇒ Expr Originally defined in module Functions

Return the cumulative count of the non-null values in the column.

This function is syntactic sugar for col(columns).cum_count.

If no arguments are passed, returns the cumulative count of a context. Rows containing null values count towards the result.

Examples:

df = Polars::DataFrame.new({"a" => [1, 2, nil], "b" => [3, nil, nil]})
df.select(Polars.cum_count("a"))
# =>
# shape: (3, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ u32 │
# ╞═════╡
# │ 1   │
# │ 2   │
# │ 2   │
# └─────┘

Parameters:

  • Name(s) of the columns to use.

  • (defaults to: false)

    Reverse the operation.

Returns:

.cum_fold(acc, exprs, returns_scalar: false, return_dtype: nil, include_init: false, &function) ⇒ Object Originally defined in module Functions

Note:

If you simply want the first encountered expression as accumulator, consider using cum_reduce.

Cumulatively accumulate over multiple columns horizontally/row wise with a left fold.

Every cumulative result is added as a separate field in a Struct column.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 3],
    "b" => [3, 4, 5],
    "c" => [5, 6, 7]
  }
)
df.with_columns(
  Polars.cum_fold(Polars.lit(1), Polars.all) { |acc, x| acc + x }
)
# =>
# shape: (3, 4)
# ┌─────┬─────┬─────┬───────────┐
# │ a   ┆ b   ┆ c   ┆ cum_fold  │
# │ --- ┆ --- ┆ --- ┆ ---       │
# │ i64 ┆ i64 ┆ i64 ┆ struct[3] │
# ╞═════╪═════╪═════╪═══════════╡
# │ 1   ┆ 3   ┆ 5   ┆ {2,5,10}  │
# │ 2   ┆ 4   ┆ 6   ┆ {3,7,13}  │
# │ 3   ┆ 5   ┆ 7   ┆ {4,9,16}  │
# └─────┴─────┴─────┴───────────┘

Parameters:

  • Accumulator Expression. This is the value that will be initialized when the fold starts. For a sum this could for instance be lit(0).

  • Expressions to aggregate over. May also be a wildcard expression.

  • (defaults to: false)

    Whether or not the applied function returns a scalar. This must be set correctly by the user.

  • (defaults to: nil)

    Output datatype. If not set, the dtype will be inferred based on the dtype of the accumulator.

  • (defaults to: false)

    Include the initial accumulator state as struct field.

Returns:

.cum_reduce(exprs, returns_scalar: false, return_dtype: nil, &function) ⇒ Expr Originally defined in module Functions

Cumulatively reduce horizontally across columns with a left fold.

Every cumulative result is added as a separate field in a Struct column.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 3],
    "b" => [3, 4, 5],
    "c" => [5, 6, 7]
  }
)
df.with_columns(Polars.cum_reduce(Polars.all) { |acc, x| acc + x })
# =>
# shape: (3, 4)
# ┌─────┬─────┬─────┬────────────┐
# │ a   ┆ b   ┆ c   ┆ cum_reduce │
# │ --- ┆ --- ┆ --- ┆ ---        │
# │ i64 ┆ i64 ┆ i64 ┆ struct[3]  │
# ╞═════╪═════╪═════╪════════════╡
# │ 1   ┆ 3   ┆ 5   ┆ {1,4,9}    │
# │ 2   ┆ 4   ┆ 6   ┆ {2,6,12}   │
# │ 3   ┆ 5   ┆ 7   ┆ {3,8,15}   │
# └─────┴─────┴─────┴────────────┘

Parameters:

  • Expressions to aggregate over. May also be a wildcard expression.

  • (defaults to: false)

    Whether or not the applied function returns a scalar. This must be set correctly by the user.

  • (defaults to: nil)

    Output datatype. If not set, the dtype will be inferred based on the dtype of the input expressions.

Returns:

.cum_sum(*names) ⇒ Expr Originally defined in module Functions

Cumulatively sum all values.

Syntactic sugar for col(names).cum_sum.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 3],
    "b" => [4, 5, 6]
  }
)
df.select(Polars.cum_sum("a"))
# =>
# shape: (3, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ i64 │
# ╞═════╡
# │ 1   │
# │ 3   │
# │ 6   │
# └─────┘

Parameters:

  • Name(s) of the columns to use in the aggregation.

Returns:

.cum_sum_horizontal(*exprs) ⇒ Expr Originally defined in module Functions

Cumulatively sum all values horizontally across columns.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, nil],
    "c" => ["x", "y", "z"]
  }
)
df.with_columns(Polars.cum_sum_horizontal("a", "b"))
# =>
# shape: (3, 4)
# ┌─────┬──────┬─────┬───────────┐
# │ a   ┆ b    ┆ c   ┆ cum_sum   │
# │ --- ┆ ---  ┆ --- ┆ ---       │
# │ i64 ┆ i64  ┆ str ┆ struct[2] │
# ╞═════╪══════╪═════╪═══════════╡
# │ 1   ┆ 4    ┆ x   ┆ {1,5}     │
# │ 8   ┆ 5    ┆ y   ┆ {8,13}    │
# │ 3   ┆ null ┆ z   ┆ {3,null}  │
# └─────┴──────┴─────┴───────────┘

Parameters:

  • Column(s) to use in the aggregation. Accepts expression input. Strings are parsed as column names, other non-expression inputs are parsed as literals.

Returns:

.date(year, month, day) ⇒ Expr Originally defined in module Functions

Create a Polars literal expression of type Date.

Examples:

df = Polars::DataFrame.new(
  {
    "month" => [1, 2, 3],
    "day" => [4, 5, 6]
  }
)
df.with_columns(Polars.date(2024, Polars.col("month"), Polars.col("day")))
# =>
# shape: (3, 3)
# ┌───────┬─────┬────────────┐
# │ month ┆ day ┆ date       │
# │ ---   ┆ --- ┆ ---        │
# │ i64   ┆ i64 ┆ date       │
# ╞═══════╪═════╪════════════╡
# │ 1     ┆ 4   ┆ 2024-01-04 │
# │ 2     ┆ 5   ┆ 2024-02-05 │
# │ 3     ┆ 6   ┆ 2024-03-06 │
# └───────┴─────┴────────────┘

We can also use Polars.date for filtering:

df = Polars::DataFrame.new(
  {
    "start" => [Date.new(2024, 1, 1), Date.new(2024, 1, 1), Date.new(2024, 1, 1)],
    "end" => [Date.new(2024, 5, 1), Date.new(2024, 7, 1), Date.new(2024, 9, 1)]
  }
)
df.filter(Polars.col("end") > Polars.date(2024, 6, 1))
# =>
# shape: (2, 2)
# ┌────────────┬────────────┐
# │ start      ┆ end        │
# │ ---        ┆ ---        │
# │ date       ┆ date       │
# ╞════════════╪════════════╡
# │ 2024-01-01 ┆ 2024-07-01 │
# │ 2024-01-01 ┆ 2024-09-01 │
# └────────────┴────────────┘

Parameters:

  • column or literal.

  • column or literal, ranging from 1-12.

  • column or literal, ranging from 1-31.

Returns:

.date_range(start, stop, interval = "1d", closed: "both", eager: false) ⇒ Object Originally defined in module Functions

Note:

If both start and stop are passed as date types (not datetime), and the interval granularity is no finer than 1d, the returned range is also of type date. All other permutations return a datetime Series.

Create a range of type Datetime (or Date).

Examples:

Using polars duration string to specify the interval

Polars.date_range(Date.new(2022, 1, 1), Date.new(2022, 3, 1), "1mo", eager: true).alias(
  "date"
)
# =>
# shape: (3,)
# Series: 'date' [date]
# [
#         2022-01-01
#         2022-02-01
#         2022-03-01
# ]

Parameters:

  • Lower bound of the date range.

  • Upper bound of the date range.

  • (defaults to: "1d")

    Interval periods. It can be a polars duration string, such as 3d12h4m25s representing 3 days, 12 hours, 4 minutes, and 25 seconds.

  • (defaults to: "both")

    Define whether the temporal window interval is closed or not.

  • (defaults to: false)

    Evaluate immediately and return a Series. If set to false (default), return an expression instead.

Returns:

.date_ranges(start, stop, interval = "1d", closed: "both", eager: false) ⇒ Object Originally defined in module Functions

Note:

interval is created according to the following string language:

  • 1ns (1 nanosecond)
  • 1us (1 microsecond)
  • 1ms (1 millisecond)
  • 1s (1 second)
  • 1m (1 minute)
  • 1h (1 hour)
  • 1d (1 calendar day)
  • 1w (1 calendar week)
  • 1mo (1 calendar month)
  • 1q (1 calendar quarter)
  • 1y (1 calendar year)

Or combine them: "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds

By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year".

Create a column of date ranges.

Examples:

df = Polars::DataFrame.new(
  {
    "start" => [Date.new(2022, 1, 1), Date.new(2022, 1, 2)],
    "end" => Date.new(2022, 1, 3)
  }
)
df.with_columns(date_range: Polars.date_ranges("start", "end"))
# =>
# shape: (2, 3)
# ┌────────────┬────────────┬─────────────────────────────────┐
# │ start      ┆ end        ┆ date_range                      │
# │ ---        ┆ ---        ┆ ---                             │
# │ date       ┆ date       ┆ list[date]                      │
# ╞════════════╪════════════╪═════════════════════════════════╡
# │ 2022-01-01 ┆ 2022-01-03 ┆ [2022-01-01, 2022-01-02, 2022-… │
# │ 2022-01-02 ┆ 2022-01-03 ┆ [2022-01-02, 2022-01-03]        │
# └────────────┴────────────┴─────────────────────────────────┘

Parameters:

  • Lower bound of the date range.

  • Upper bound of the date range.

  • (defaults to: "1d")

    Interval of the range periods, specified using the Polars duration string language (see the note on this method).

  • (defaults to: "both")

    Define which sides of the range are closed (inclusive).

  • (defaults to: false)

    Evaluate immediately and return a Series. If set to false (default), return an expression instead.

Returns:

.datetime(year, month, day, hour = nil, minute = nil, second = nil, microsecond = nil, time_unit: "us", time_zone: nil, ambiguous: "raise") ⇒ Expr Originally defined in module Functions

Create a Polars literal expression of type Datetime.

Examples:

df = Polars::DataFrame.new(
  {
    "month" => [1, 2, 3],
    "day" => [4, 5, 6],
    "hour" => [12, 13, 14],
    "minute" => [15, 30, 45]
  }
)
df.with_columns(
  Polars.datetime(
    2024,
    Polars.col("month"),
    Polars.col("day"),
    Polars.col("hour"),
    Polars.col("minute"),
    time_zone: "Australia/Sydney"
  )
)
# =>
# shape: (3, 5)
# ┌───────┬─────┬──────┬────────┬────────────────────────────────┐
# │ month ┆ day ┆ hour ┆ minute ┆ datetime                       │
# │ ---   ┆ --- ┆ ---  ┆ ---    ┆ ---                            │
# │ i64   ┆ i64 ┆ i64  ┆ i64    ┆ datetime[μs, Australia/Sydney] │
# ╞═══════╪═════╪══════╪════════╪════════════════════════════════╡
# │ 1     ┆ 4   ┆ 12   ┆ 15     ┆ 2024-01-04 12:15:00 AEDT       │
# │ 2     ┆ 5   ┆ 13   ┆ 30     ┆ 2024-02-05 13:30:00 AEDT       │
# │ 3     ┆ 6   ┆ 14   ┆ 45     ┆ 2024-03-06 14:45:00 AEDT       │
# └───────┴─────┴──────┴────────┴────────────────────────────────┘

We can also use Polars.datetime for filtering:

df = Polars::DataFrame.new(
  {
    "start" => [
      DateTime.new(2024, 1, 1, 0, 0, 0),
      DateTime.new(2024, 1, 1, 0, 0, 0),
      DateTime.new(2024, 1, 1, 0, 0, 0)
    ],
    "end" => [
      DateTime.new(2024, 5, 1, 20, 15, 10),
      DateTime.new(2024, 7, 1, 21, 25, 20),
      DateTime.new(2024, 9, 1, 22, 35, 30)
    ]
  }
)
df.filter(Polars.col("end") > Polars.datetime(2024, 6, 1))
# =>
# shape: (2, 2)
# ┌─────────────────────┬─────────────────────┐
# │ start               ┆ end                 │
# │ ---                 ┆ ---                 │
# │ datetime[ns]        ┆ datetime[ns]        │
# ╞═════════════════════╪═════════════════════╡
# │ 2024-01-01 00:00:00 ┆ 2024-07-01 21:25:20 │
# │ 2024-01-01 00:00:00 ┆ 2024-09-01 22:35:30 │
# └─────────────────────┴─────────────────────┘

Parameters:

  • Column or literal.

  • Column or literal, ranging from 1-12.

  • Column or literal, ranging from 1-31.

  • (defaults to: nil)

    Column or literal, ranging from 0-23.

  • (defaults to: nil)

    Column or literal, ranging from 0-59.

  • (defaults to: nil)

    Column or literal, ranging from 0-59.

  • (defaults to: nil)

    Column or literal, ranging from 0-999999.

  • (defaults to: "us")

    Time unit of the resulting expression.

  • (defaults to: nil)

    Time zone of the resulting expression.

  • (defaults to: "raise")

    Determine how to deal with ambiguous datetimes:

    • 'raise' (default): raise
    • 'earliest': use the earliest datetime
    • 'latest': use the latest datetime
    • 'null': set to null

Returns:

.datetime_range(start, stop, interval = "1d", closed: "both", time_unit: nil, time_zone: nil, eager: false) ⇒ Object Originally defined in module Functions

Generate a datetime range.

Examples:

Using Polars duration string to specify the interval:

Polars.datetime_range(
  DateTime.new(2022, 1, 1), DateTime.new(2022, 3, 1), "1mo", eager: true
).alias("datetime")
# =>
# shape: (3,)
# Series: 'datetime' [datetime[ns]]
# [
#         2022-01-01 00:00:00
#         2022-02-01 00:00:00
#         2022-03-01 00:00:00
# ]

Specifying a time zone:

Polars.datetime_range(
  DateTime.new(2022, 1, 1),
  DateTime.new(2022, 3, 1),
  "1mo",
  time_zone: "America/New_York",
  eager: true
).alias("datetime")
# =>
# shape: (3,)
# Series: 'datetime' [datetime[ns, America/New_York]]
# [
#         2022-01-01 00:00:00 EST
#         2022-02-01 00:00:00 EST
#         2022-03-01 00:00:00 EST
# ]

Parameters:

  • Lower bound of the datetime range.

  • Upper bound of the datetime range.

  • (defaults to: "1d")

    Interval of the range periods, specified using the Polars duration string language.

  • (defaults to: "both")

    Define which sides of the range are closed (inclusive).

  • (defaults to: nil)

    Time unit of the resulting Datetime data type.

  • (defaults to: nil)

    Time zone of the resulting Datetime data type.

  • (defaults to: false)

    Evaluate immediately and return a Series. If set to false (default), return an expression instead.

Returns:

.datetime_ranges(start, stop, interval: "1d", closed: "both", time_unit: nil, time_zone: nil, eager: false) ⇒ Object Originally defined in module Functions

Create a column of datetime ranges.

Examples:

df = Polars::DataFrame.new(
  {
    "start" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
    "end" => DateTime.new(2022, 1, 3),
  }
)
df.select(datetime_range: Polars.datetime_ranges("start", "end"))
# =>
# shape: (2, 1)
# ┌─────────────────────────────────┐
# │ datetime_range                  │
# │ ---                             │
# │ list[datetime[ns]]              │
# ╞═════════════════════════════════╡
# │ [2022-01-01 00:00:00, 2022-01-… │
# │ [2022-01-02 00:00:00, 2022-01-… │
# └─────────────────────────────────┘

Parameters:

  • Lower bound of the datetime range.

  • Upper bound of the datetime range.

  • (defaults to: "1d")

    Interval of the range periods, specified using the Polars duration string language.

  • (defaults to: "both")

    Define which sides of the range are closed (inclusive).

  • (defaults to: nil)

    Time unit of the resulting Datetime data type.

  • (defaults to: nil)

    Time zone of the resulting Datetime data type.

  • (defaults to: false)

    Evaluate immediately and return a Series. If set to false (default), return an expression instead.

Returns:

.disable_string_cache ⇒ nil Originally defined in module Functions

Disable and clear the global string cache.

Examples:

Construct two Series using the same global string cache.

Polars.enable_string_cache
s1 = Polars::Series.new("color", ["red", "green", "red"], dtype: Polars::Categorical)
s2 = Polars::Series.new("color", ["blue", "red", "green"], dtype: Polars::Categorical)
Polars.disable_string_cache

As both Series are constructed under the same global string cache, they can be concatenated.

Polars.concat([s1, s2])
# =>
# shape: (6,)
# Series: 'color' [cat]
# [
#         "red"
#         "green"
#         "red"
#         "blue"
#         "red"
#         "green"
# ]

Returns:

.dtype_of(col_or_expr) ⇒ DataTypeExpr Originally defined in module Functions

Note:

This functionality is considered unstable. It may be changed at any point without it being considered a breaking change.

Get a lazily evaluated DataType of a column or expression.

Returns:

.duration(weeks: nil, days: nil, hours: nil, minutes: nil, seconds: nil, milliseconds: nil, microseconds: nil, nanoseconds: nil, time_unit: nil) ⇒ Expr Originally defined in module Functions

Create polars Duration from distinct time components.

Examples:

df = Polars::DataFrame.new(
  {
    "datetime" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
    "add" => [1, 2]
  }
)
df.select(
  [
    (Polars.col("datetime") + Polars.duration(weeks: "add")).alias("add_weeks"),
    (Polars.col("datetime") + Polars.duration(days: "add")).alias("add_days"),
    (Polars.col("datetime") + Polars.duration(seconds: "add")).alias("add_seconds"),
    (Polars.col("datetime") + Polars.duration(milliseconds: "add")).alias(
      "add_milliseconds"
    ),
    (Polars.col("datetime") + Polars.duration(hours: "add")).alias("add_hours")
  ]
)
# =>
# shape: (2, 5)
# ┌─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────────┬─────────────────────┐
# │ add_weeks           ┆ add_days            ┆ add_seconds         ┆ add_milliseconds        ┆ add_hours           │
# │ ---                 ┆ ---                 ┆ ---                 ┆ ---                     ┆ ---                 │
# │ datetime[ns]        ┆ datetime[ns]        ┆ datetime[ns]        ┆ datetime[ns]            ┆ datetime[ns]        │
# ╞═════════════════════╪═════════════════════╪═════════════════════╪═════════════════════════╪═════════════════════╡
# │ 2022-01-08 00:00:00 ┆ 2022-01-02 00:00:00 ┆ 2022-01-01 00:00:01 ┆ 2022-01-01 00:00:00.001 ┆ 2022-01-01 01:00:00 │
# │ 2022-01-16 00:00:00 ┆ 2022-01-04 00:00:00 ┆ 2022-01-02 00:00:02 ┆ 2022-01-02 00:00:00.002 ┆ 2022-01-02 02:00:00 │
# └─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┴─────────────────────┘

Returns:

.element ⇒ Expr Originally defined in module Functions

Alias for an element being evaluated in an eval expression.

Examples:

A horizontal rank computation by taking the elements of a list

df = Polars::DataFrame.new({"a" => [1, 8, 3], "b" => [4, 5, 2]})
df.with_columns(
  Polars.concat_list(["a", "b"]).list.eval(Polars.element.rank).alias("rank")
)
# =>
# shape: (3, 3)
# ┌─────┬─────┬────────────┐
# │ a   ┆ b   ┆ rank       │
# │ --- ┆ --- ┆ ---        │
# │ i64 ┆ i64 ┆ list[f64]  │
# ╞═════╪═════╪════════════╡
# │ 1   ┆ 4   ┆ [1.0, 2.0] │
# │ 8   ┆ 5   ┆ [2.0, 1.0] │
# │ 3   ┆ 2   ┆ [2.0, 1.0] │
# └─────┴─────┴────────────┘

Returns:

.enable_string_cache ⇒ nil Originally defined in module Functions

Enable the global string cache.

Categorical columns created under the same global string cache have the same underlying physical value when string values are equal. This allows the columns to be concatenated or used in a join operation, for example.

Examples:

Construct two Series using the same global string cache.

Polars.enable_string_cache
s1 = Polars::Series.new("color", ["red", "green", "red"], dtype: Polars::Categorical)
s2 = Polars::Series.new("color", ["blue", "red", "green"], dtype: Polars::Categorical)
Polars.disable_string_cache

As both Series are constructed under the same global string cache, they can be concatenated.

Polars.concat([s1, s2])
# =>
# shape: (6,)
# Series: 'color' [cat]
# [
#         "red"
#         "green"
#         "red"
#         "blue"
#         "red"
#         "green"
# ]

Returns:

.escape_regex(s) ⇒ String Originally defined in module Functions

Escapes string regex meta characters.

Parameters:

  • The string whose meta characters will be escaped.

Returns:

.exclude(columns, *more_columns) ⇒ Object Originally defined in module Functions

Exclude certain columns from a wildcard/regex selection.

Examples:

df = Polars::DataFrame.new(
  {
    "aa" => [1, 2, 3],
    "ba" => ["a", "b", nil],
    "cc" => [nil, 2.5, 1.5]
  }
)
# =>
# shape: (3, 3)
# ┌─────┬──────┬──────┐
# │ aa  ┆ ba   ┆ cc   │
# │ --- ┆ ---  ┆ ---  │
# │ i64 ┆ str  ┆ f64  │
# ╞═════╪══════╪══════╡
# │ 1   ┆ a    ┆ null │
# │ 2   ┆ b    ┆ 2.5  │
# │ 3   ┆ null ┆ 1.5  │
# └─────┴──────┴──────┘

Exclude by column name(s):

df.select(Polars.exclude("ba"))
# =>
# shape: (3, 2)
# ┌─────┬──────┐
# │ aa  ┆ cc   │
# │ --- ┆ ---  │
# │ i64 ┆ f64  │
# ╞═════╪══════╡
# │ 1   ┆ null │
# │ 2   ┆ 2.5  │
# │ 3   ┆ 1.5  │
# └─────┴──────┘

Exclude by regex, e.g. removing all columns whose names end with the letter "a":

df.select(Polars.exclude("^.*a$"))
# =>
# shape: (3, 1)
# ┌──────┐
# │ cc   │
# │ ---  │
# │ f64  │
# ╞══════╡
# │ null │
# │ 2.5  │
# │ 1.5  │
# └──────┘

Parameters:

  • The name or datatype of the column(s) to exclude. Accepts regular expression input. Regular expressions should start with ^ and end with $.

  • Additional names or datatypes of columns to exclude, specified as positional arguments.

Returns:

.field(name) ⇒ Expr Originally defined in module Functions

Select a field in the current struct.with_fields scope.

Examples:

df = Polars::DataFrame.new({"a" => [{"x" => 5, "y" => 2}, {"x" => 3, "y" => 4}]})
df.select(Polars.col("a").struct.with_fields(Polars.field("x") ** 2))
# =>
# shape: (2, 1)
# ┌───────────┐
# │ a         │
# │ ---       │
# │ struct[2] │
# ╞═══════════╡
# │ {25,2}    │
# │ {9,4}     │
# └───────────┘

Parameters:

  • Name of the field(s) to select.

Returns:

.first(*columns) ⇒ Expr Originally defined in module Functions

Get the first value.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, 2],
    "c" => ["foo", "bar", "baz"]
  }
)
df.select(Polars.first)
# =>
# shape: (3, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ i64 │
# ╞═════╡
# │ 1   │
# │ 8   │
# │ 3   │
# └─────┘
df.select(Polars.first("b"))
# =>
# shape: (1, 1)
# ┌─────┐
# │ b   │
# │ --- │
# │ i64 │
# ╞═════╡
# │ 4   │
# └─────┘
df.select(Polars.first("a", "c"))
# =>
# shape: (1, 2)
# ┌─────┬─────┐
# │ a   ┆ c   │
# │ --- ┆ --- │
# │ i64 ┆ str │
# ╞═════╪═════╡
# │ 1   ┆ foo │
# └─────┴─────┘

Parameters:

  • One or more column names. If not provided (default), returns an expression to take the first column of the context instead.

Returns:

.fold(acc, exprs, returns_scalar: false, return_dtype: nil, &function) ⇒ Expr Originally defined in module Functions

Accumulate over multiple columns horizontally/row wise with a left fold.

Examples:

Horizontally sum over all columns and add 1.

df = Polars::DataFrame.new(
 {
   "a" => [1, 2, 3],
   "b" => [3, 4, 5],
   "c" => [5, 6, 7]
 }
)
df.select(
  Polars.fold(Polars.lit(1), Polars.col("*")) { |acc, x| acc + x }.alias("sum")
)
# =>
# shape: (3, 1)
# ┌─────┐
# │ sum │
# │ --- │
# │ i32 │
# ╞═════╡
# │ 10  │
# │ 13  │
# │ 16  │
# └─────┘

You can also apply a condition/predicate on all columns:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 3],
    "b" => [0, 1, 2]
  }
)
df.filter(
  Polars.fold(Polars.lit(true), Polars.col("*") > 1) { |acc, x| acc & x }
)
# =>
# shape: (1, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 3   ┆ 2   │
# └─────┴─────┘

Returns:

.format(f_string, *args) ⇒ Expr Originally defined in module Functions

Format expressions as a string.

Examples:

df = Polars::DataFrame.new(
  {
    "a": ["a", "b", "c"],
    "b": [1, 2, 3]
  }
)
df.select(
  [
    Polars.format("foo_{}_bar_{}", Polars.col("a"), "b").alias("fmt")
  ]
)
# =>
# shape: (3, 1)
# ┌─────────────┐
# │ fmt         │
# │ ---         │
# │ str         │
# ╞═════════════╡
# │ foo_a_bar_1 │
# │ foo_b_bar_2 │
# │ foo_c_bar_3 │
# └─────────────┘

Parameters:

  • A string with placeholders. For example: "hello_{}" or "{}_world".

  • Expression(s) that fill the placeholders

Returns:

.from_epoch(column, time_unit: "s") ⇒ Object Originally defined in module Functions

Utility function that parses an epoch timestamp (or Unix time) to Polars Date(time).

Depending on the unit provided, this function will return a different dtype:

  • time_unit: "d" returns pl.Date
  • time_unit: "s" returns pl.Datetime["us"]
  • time_unit: "ms" returns pl.Datetime["ms"]
  • time_unit: "us" returns pl.Datetime["us"]
  • time_unit: "ns" returns pl.Datetime["ns"]

Examples:

df = Polars::DataFrame.new({"timestamp" => [1666683077, 1666683099]}).lazy
df.select(Polars.from_epoch(Polars.col("timestamp"), time_unit: "s")).collect
# =>
# shape: (2, 1)
# ┌─────────────────────┐
# │ timestamp           │
# │ ---                 │
# │ datetime[μs]        │
# ╞═════════════════════╡
# │ 2022-10-25 07:31:17 │
# │ 2022-10-25 07:31:39 │
# └─────────────────────┘

Parameters:

  • Series or expression to parse integers to pl.Datetime.

  • (defaults to: "s")

    The unit of the timesteps since epoch time.

Returns:

.from_hash(data, schema: nil, schema_overrides: nil, strict: true) ⇒ DataFrame Originally defined in module Convert

Construct a DataFrame from a hash of arrays.

This operation clones data, unless you pass in a Hash<String, Series>.

Examples:

data = {"a" => [1, 2], "b" => [3, 4]}
Polars.from_hash(data)
# =>
# shape: (2, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 1   ┆ 3   │
# │ 2   ┆ 4   │
# └─────┴─────┘

Parameters:

  • Two-dimensional data represented as a hash. Hash must contain arrays.

  • (defaults to: nil)

    The DataFrame schema may be declared in several ways:

    • As a hash of {name:type} pairs; if type is nil, it will be auto-inferred.
    • As an array of column names; in this case types are automatically inferred.
    • As an array of [name,type] pairs; this is equivalent to the hash form.

    If you supply an array of column names that does not match the names in the underlying data, the names given here will overwrite them. The number of names given in the schema should match the underlying data dimensions.

  • (defaults to: nil)

    Support type specification or override of one or more columns; note that any dtypes inferred from the schema param will be overridden.

  • (defaults to: true)

    Throw an error if any data value does not exactly match the given or inferred data type for that column. If set to false, values that do not match the data type are cast to that data type or, if casting is not possible, set to null instead.

Returns:

.from_hashes(data, schema: nil, schema_overrides: nil, strict: true, infer_schema_length: N_INFER_DEFAULT) ⇒ DataFrame Originally defined in module Convert

Construct a DataFrame from an array of hashes. This operation clones data.

Examples:

data = [{"a" => 1, "b" => 4}, {"a" => 2, "b" => 5}, {"a" => 3, "b" => 6}]
Polars.from_hashes(data)
# =>
# shape: (3, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 1   ┆ 4   │
# │ 2   ┆ 5   │
# │ 3   ┆ 6   │
# └─────┴─────┘

Declaring a partial schema will drop the omitted columns.

Polars.from_hashes(data, schema: {"a" => Polars::Int32})
# =>
# shape: (3, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ i32 │
# ╞═════╡
# │ 1   │
# │ 2   │
# │ 3   │
# └─────┘

Parameters:

  • Array with hashes mapping column name to value

  • (defaults to: nil)

    The DataFrame schema may be declared in several ways:

    • As a hash of {name:type} pairs; if type is nil, it will be auto-inferred.
    • As an array of column names; in this case types are automatically inferred.
    • As an array of [name,type] pairs; this is equivalent to the hash form.

    If a list of column names is supplied that does NOT match the names in the underlying data, the names given here will overwrite the actual fields in the order that they appear - however, in this case it is typically clearer to rename after loading the frame.

    If you want to drop some of the fields found in the input hashes, a partial schema can be declared, in which case omitted fields will not be loaded. Similarly, you can extend the loaded frame with empty columns by adding them to the schema.

  • (defaults to: nil)

    Support override of inferred types for one or more columns.

  • (defaults to: true)

    Throw an error if any data value does not exactly match the given or inferred data type for that column. If set to false, values that do not match the data type are cast to that data type or, if casting is not possible, set to null instead.

  • (defaults to: N_INFER_DEFAULT)

    The maximum number of rows to scan for schema inference. If set to nil, the full data may be scanned (this is slow).

Returns:

.from_numo(data, schema: nil, schema_overrides: nil, orient: nil) ⇒ DataFrame Originally defined in module Convert

Construct a DataFrame from a Numo::NArray. This operation clones data.

Note that this is slower than creating from columnar memory.

Examples:

data = Numo::NArray.cast([[1, 2, 3], [4, 5, 6]])
Polars.from_numo(data, schema: ["a", "b"], orient: "col")
# =>
# shape: (3, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 1   ┆ 4   │
# │ 2   ┆ 5   │
# │ 3   ┆ 6   │
# └─────┴─────┘

Parameters:

  • Two-dimensional data represented as a Numo::NArray.

  • (defaults to: nil)

    The DataFrame schema may be declared in several ways:

    • As a hash of {name:type} pairs; if type is nil, it will be auto-inferred.
    • As an array of column names; in this case types are automatically inferred.
    • As an array of [name,type] pairs; this is equivalent to the hash form.

    If you supply a list of column names that does not match the names in the underlying data, the names given here will overwrite them. The number of names given in the schema should match the underlying data dimensions.

  • (defaults to: nil)

    Support type specification or override of one or more columns; note that any dtypes inferred from the schema param will be overridden.

  • (defaults to: nil)

    Whether to interpret two-dimensional data as columns or as rows. If nil, the orientation is inferred by matching the columns and data dimensions. If this does not yield conclusive results, column orientation is used.

Returns:

Raises:

.from_records(data, schema: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: N_INFER_DEFAULT) ⇒ DataFrame Originally defined in module Convert

Construct a DataFrame from an array of arrays. This operation clones data.

Note that this is slower than creating from columnar memory.

Examples:

data = [[1, 2, 3], [4, 5, 6]]
Polars.from_records(data, schema: ["a", "b"])
# =>
# shape: (3, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 1   ┆ 4   │
# │ 2   ┆ 5   │
# │ 3   ┆ 6   │
# └─────┴─────┘

Parameters:

  • Two-dimensional data represented as an array of arrays.

  • (defaults to: nil)

    The DataFrame schema may be declared in several ways:

    • As a hash of {name:type} pairs; if type is nil, it will be auto-inferred.
    • As an array of column names; in this case types are automatically inferred.
    • As an array of [name,type] pairs; this is equivalent to the hash form.

    If you supply a list of column names that does not match the names in the underlying data, the names given here will overwrite them. The number of names given in the schema should match the underlying data dimensions.

  • (defaults to: nil)

    Support type specification or override of one or more columns; note that any dtypes inferred from the schema param will be overridden.

  • (defaults to: true)

    Throw an error if any data value does not exactly match the given or inferred data type for that column. If set to false, values that do not match the data type are cast to that data type or, if casting is not possible, set to null instead.

  • (defaults to: nil)

    Whether to interpret two-dimensional data as columns or as rows. If nil, the orientation is inferred by matching the columns and data dimensions. If this does not yield conclusive results, column orientation is used.

  • (defaults to: N_INFER_DEFAULT)

    The maximum number of rows to scan for schema inference. If set to nil, the full data may be scanned (this is slow).

Returns:

.get_index_type ⇒ Object

Return the data type used for Polars indexing.

Examples:

Polars.get_index_type
# => Polars::UInt32

Returns:



149
150
151
# File 'lib/polars.rb', line 149

# Return the data type used for Polars indexing.
#
# Thin wrapper: the result is whatever Plr.get_index_type returns
# (the documented example shows Polars::UInt32).
#
# @return [Object]
def self.get_index_type
  Plr.get_index_type
end

.groups(column) ⇒ Object Originally defined in module Functions

Syntactic sugar for Polars.col("foo").agg_groups.

Returns:

.head(column, n = 10) ⇒ Expr Originally defined in module Functions

Get the first n rows.

This function is syntactic sugar for col(column).head(n).

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, 2],
    "c" => ["foo", "bar", "foo"]
  }
)
df.select(Polars.head("a"))
# =>
# shape: (3, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ i64 │
# ╞═════╡
# │ 1   │
# │ 8   │
# │ 3   │
# └─────┘
df.select(Polars.head("a", 2))
# =>
# shape: (2, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ i64 │
# ╞═════╡
# │ 1   │
# │ 8   │
# └─────┘

Parameters:

  • Column name.

  • (defaults to: 10)

    Number of rows to return.

Returns:

.implode(*columns) ⇒ Expr Originally defined in module Functions

Aggregate all column values into a list.

This function is syntactic sugar for col(name).implode.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 3],
    "b" => [9, 8, 7],
    "c" => ["foo", "bar", "foo"]
  }
)
df.select(Polars.implode("a"))
# =>
# shape: (1, 1)
# ┌───────────┐
# │ a         │
# │ ---       │
# │ list[i64] │
# ╞═══════════╡
# │ [1, 2, 3] │
# └───────────┘
df.select(Polars.implode("b", "c"))
# =>
# shape: (1, 2)
# ┌───────────┬───────────────────────┐
# │ b         ┆ c                     │
# │ ---       ┆ ---                   │
# │ list[i64] ┆ list[str]             │
# ╞═══════════╪═══════════════════════╡
# │ [9, 8, 7] ┆ ["foo", "bar", "foo"] │
# └───────────┴───────────────────────┘

Parameters:

  • One or more column names.

Returns:

.int_range(start = 0, stop = nil, step: 1, eager: false, dtype: Int64) ⇒ Expr, Series Also known as: arange Originally defined in module Functions

Create a range expression (or Series).

This can be used in a select, with_columns, etc. Be sure that the resulting range size is equal to the length of the DataFrame you are collecting.

Examples:

Polars.arange(0, 3, eager: true)
# =>
# shape: (3,)
# Series: 'arange' [i64]
# [
#         0
#         1
#         2
# ]

Parameters:

  • (defaults to: 0)

    Lower bound of range.

  • (defaults to: nil)

    Upper bound of range.

  • (defaults to: 1)

    Step size of the range.

  • (defaults to: false)

    If eager evaluation is true, a Series is returned instead of an Expr.

  • (defaults to: Int64)

    Apply an explicit integer dtype to the resulting expression (default is Int64).

Returns:

.int_ranges(start = 0, stop = nil, step: 1, dtype: Int64, eager: false) ⇒ Expr, Series Originally defined in module Functions

Generate a range of integers for each row of the input columns.

Examples:

df = Polars::DataFrame.new({"start" => [1, -1], "end" => [3, 2]})
df.with_columns(int_range: Polars.int_ranges("start", "end"))
# =>
# shape: (2, 3)
# ┌───────┬─────┬────────────┐
# │ start ┆ end ┆ int_range  │
# │ ---   ┆ --- ┆ ---        │
# │ i64   ┆ i64 ┆ list[i64]  │
# ╞═══════╪═════╪════════════╡
# │ 1     ┆ 3   ┆ [1, 2]     │
# │ -1    ┆ 2   ┆ [-1, 0, 1] │
# └───────┴─────┴────────────┘

end can be omitted for a shorter syntax.

df.select("end", int_range: Polars.int_ranges("end"))
# =>
# shape: (2, 2)
# ┌─────┬───────────┐
# │ end ┆ int_range │
# │ --- ┆ ---       │
# │ i64 ┆ list[i64] │
# ╞═════╪═══════════╡
# │ 3   ┆ [0, 1, 2] │
# │ 2   ┆ [0, 1]    │
# └─────┴───────────┘

Parameters:

  • (defaults to: 0)

    Start of the range (inclusive). Defaults to 0.

  • (defaults to: nil)

    End of the range (exclusive). If set to nil (default), the value of start is used and start is set to 0.

  • (defaults to: 1)

    Step size of the range.

  • (defaults to: Int64)

    Integer data type of the ranges. Defaults to Int64.

  • (defaults to: false)

    Evaluate immediately and return a Series. If set to false (default), return an expression instead.

Returns:

.last(*columns) ⇒ Expr Originally defined in module Functions

Get the last value.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, 2],
    "c" => ["foo", "bar", "baz"]
  }
)
df.select(Polars.last)
# =>
# shape: (3, 1)
# ┌─────┐
# │ c   │
# │ --- │
# │ str │
# ╞═════╡
# │ foo │
# │ bar │
# │ baz │
# └─────┘
df.select(Polars.last("a"))
# =>
# shape: (1, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ i64 │
# ╞═════╡
# │ 3   │
# └─────┘
df.select(Polars.last("b", "c"))
# =>
# shape: (1, 2)
# ┌─────┬─────┐
# │ b   ┆ c   │
# │ --- ┆ --- │
# │ i64 ┆ str │
# ╞═════╪═════╡
# │ 2   ┆ baz │
# └─────┴─────┘

Parameters:

  • One or more column names. If set to nil (default), returns an expression to take the last column of the context instead.

Returns:

.len ⇒ Expr Also known as: length Originally defined in module Functions

Return the number of rows in the context.

This is similar to COUNT(*) in SQL.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, nil],
    "b" => [3, nil, nil],
    "c" => ["foo", "bar", "foo"]
  }
)
df.select(Polars.len)
# =>
# shape: (1, 1)
# ┌─────┐
# │ len │
# │ --- │
# │ u32 │
# ╞═════╡
# │ 3   │
# └─────┘

Generate an index column by using len in conjunction with int_range.

df.select([
  Polars.int_range(Polars.len, dtype: Polars::UInt32).alias("index"),
  Polars.all
])
# =>
# shape: (3, 4)
# ┌───────┬──────┬──────┬─────┐
# │ index ┆ a    ┆ b    ┆ c   │
# │ ---   ┆ ---  ┆ ---  ┆ --- │
# │ u32   ┆ i64  ┆ i64  ┆ str │
# ╞═══════╪══════╪══════╪═════╡
# │ 0     ┆ 1    ┆ 3    ┆ foo │
# │ 1     ┆ 2    ┆ null ┆ bar │
# │ 2     ┆ null ┆ null ┆ foo │
# └───────┴──────┴──────┴─────┘

Returns:

.linear_spaces(start, stop, num_samples, closed: "both", as_array: false, eager: false) ⇒ Expr, Series Originally defined in module Functions

Note:

This functionality is experimental. It may be changed at any point without it being considered a breaking change.

Generate a sequence of evenly-spaced values for each row between start and end.

The number of values in each sequence is determined by num_samples.

Examples:

df = Polars::DataFrame.new({"start" => [1, -1], "end" => [3, 2], "num_samples" => [4, 5]})
df.with_columns(ls: Polars.linear_spaces("start", "end", "num_samples"))
# =>
# shape: (2, 4)
# ┌───────┬─────┬─────────────┬────────────────────────┐
# │ start ┆ end ┆ num_samples ┆ ls                     │
# │ ---   ┆ --- ┆ ---         ┆ ---                    │
# │ i64   ┆ i64 ┆ i64         ┆ list[f64]              │
# ╞═══════╪═════╪═════════════╪════════════════════════╡
# │ 1     ┆ 3   ┆ 4           ┆ [1.0, 1.666667, … 3.0] │
# │ -1    ┆ 2   ┆ 5           ┆ [-1.0, -0.25, … 2.0]   │
# └───────┴─────┴─────────────┴────────────────────────┘
df.with_columns(ls: Polars.linear_spaces("start", "end", 3, as_array: true))
# =>
# shape: (2, 4)
# ┌───────┬─────┬─────────────┬──────────────────┐
# │ start ┆ end ┆ num_samples ┆ ls               │
# │ ---   ┆ --- ┆ ---         ┆ ---              │
# │ i64   ┆ i64 ┆ i64         ┆ array[f64, 3]    │
# ╞═══════╪═════╪═════════════╪══════════════════╡
# │ 1     ┆ 3   ┆ 4           ┆ [1.0, 2.0, 3.0]  │
# │ -1    ┆ 2   ┆ 5           ┆ [-1.0, 0.5, 2.0] │
# └───────┴─────┴─────────────┴──────────────────┘

Parameters:

  • Lower bound of the range.

  • Upper bound of the range.

  • Number of samples in the output sequence.

  • (defaults to: "both")

    Define which sides of the interval are closed (inclusive).

  • (defaults to: false)

    Return result as a fixed-length Array. num_samples must be a constant.

  • (defaults to: false)

    Evaluate immediately and return a Series. If set to false (default), return an expression instead.

Returns:

.lit(value, dtype: nil, allow_object: false) ⇒ Expr Originally defined in module Functions

Return an expression representing a literal value.

Examples:

Literal scalar values:

Polars.lit(1)
Polars.lit(5.5)
Polars.lit(nil)
Polars.lit("foo_bar")
Polars.lit(Date.new(2021, 1, 20))
Polars.lit(DateTime.new(2023, 3, 31, 10, 30, 45))

Literal list/Series data (1D):

Polars.lit([1, 2, 3])
Polars.lit(Polars::Series.new("x", [1, 2, 3]))

Literal list/Series data (2D):

Polars.lit([[1, 2], [3, 4]])
Polars.lit(Polars::Series.new("y", [[1, 2], [3, 4]]))

Returns:

.map_batches(exprs, return_dtype: nil, is_elementwise: false, returns_scalar: false, &function) ⇒ Expr Originally defined in module Functions

Note:

This method is much slower than the native expressions API. Only use it if you cannot implement your logic otherwise.

Note:

A UDF passed to map_batches must be pure, meaning that it cannot modify or depend on state other than its arguments. We may call the function with arbitrary input data.

Map a custom function over multiple columns/expressions.

Produces a single Series result.

Examples:

test_func = lambda do |a, b, c|
  a + b + c
end
df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 3, 4],
    "b" => [4, 5, 6, 7]
  }
)

df.with_columns(
  (
    Polars.struct(["a", "b"]).map_batches { |x| test_func.(x.struct.field("a"), x.struct.field("b"), 1) }
  ).alias("a+b+c")
)
# =>
# shape: (4, 3)
# ┌─────┬─────┬───────┐
# │ a   ┆ b   ┆ a+b+c │
# │ --- ┆ --- ┆ ---   │
# │ i64 ┆ i64 ┆ i64   │
# ╞═════╪═════╪═══════╡
# │ 1   ┆ 4   ┆ 6     │
# │ 2   ┆ 5   ┆ 8     │
# │ 3   ┆ 6   ┆ 10    │
# │ 4   ┆ 7   ┆ 12    │
# └─────┴─────┴───────┘

Parameters:

  • Expression(s) representing the input Series to the function.

  • (defaults to: nil)

    Datatype of the output Series.

    It is recommended to set this whenever possible. If this is nil, it tries to infer the datatype by calling the function with dummy data and looking at the output.

  • (defaults to: false)

    Set to true if the operation is elementwise for better performance and optimization.

    An elementwise operation has unit or equal length for all inputs and can be run sequentially on slices without results being affected.

  • (defaults to: false)

    If the function returns a scalar, by default it will be wrapped in a list in the output, since the assumption is that the function always returns something Series-like. If you want to keep the result as a scalar, set this argument to true.

Returns:

.map_groups(exprs, return_dtype: nil, is_elementwise: false, returns_scalar: false, &function) ⇒ Expr Originally defined in module Functions

Note:

This method is much slower than the native expressions API. Only use it if you cannot implement your logic otherwise.

Apply a custom/user-defined function (UDF) in a GroupBy context.

Examples:

df = Polars::DataFrame.new(
  {
    "group" => [1, 1, 2],
    "a" => [1, 3, 3],
    "b" => [5, 6, 7]
  }
)
(
  df.group_by("group").agg(
    Polars.map_groups(["a", "b"], return_dtype: Polars::Float64) { |list_of_series| list_of_series[0] / list_of_series[0].sum + list_of_series[1] }
    .alias("my_custom_aggregation")
  )
).sort("group")
# =>
# shape: (2, 2)
# ┌───────┬───────────────────────┐
# │ group ┆ my_custom_aggregation │
# │ ---   ┆ ---                   │
# │ i64   ┆ list[f64]             │
# ╞═══════╪═══════════════════════╡
# │ 1     ┆ [5.25, 6.75]          │
# │ 2     ┆ [8.0]                 │
# └───────┴───────────────────────┘

Parameters:

  • Expression(s) representing the input Series to the function.

  • (defaults to: nil)

    Datatype of the output Series.

    It is recommended to set this whenever possible. If this is nil, it tries to infer the datatype by calling the function with dummy data and looking at the output.

  • (defaults to: false)

    Set to true if the operation is elementwise for better performance and optimization.

    An elementwise operation has unit or equal length for all inputs and can be run sequentially on slices without results being affected.

  • (defaults to: false)

    If the function returns a single scalar as output.

Returns:

.max(*names) ⇒ Expr Originally defined in module Functions

Get the maximum value.

Syntactic sugar for col(names).max.

Examples:

Get the maximum value of a column.

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, 2],
    "c" => ["foo", "bar", "foo"]
  }
)
df.select(Polars.max("a"))
# =>
# shape: (1, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ i64 │
# ╞═════╡
# │ 8   │
# └─────┘

Get the maximum value of multiple columns.

df.select(Polars.max("^a|b$"))
# =>
# shape: (1, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 8   ┆ 5   │
# └─────┴─────┘
df.select(Polars.max("a", "b"))
# =>
# shape: (1, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 8   ┆ 5   │
# └─────┴─────┘

Parameters:

  • Name(s) of the columns to use in the aggregation.

Returns:

.max_horizontal(*exprs) ⇒ Expr Originally defined in module Functions

Get the maximum value horizontally across columns.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, nil],
    "c" => ["x", "y", "z"]
  }
)
df.with_columns(max: Polars.max_horizontal("a", "b"))
# =>
# shape: (3, 4)
# ┌─────┬──────┬─────┬─────┐
# │ a   ┆ b    ┆ c   ┆ max │
# │ --- ┆ ---  ┆ --- ┆ --- │
# │ i64 ┆ i64  ┆ str ┆ i64 │
# ╞═════╪══════╪═════╪═════╡
# │ 1   ┆ 4    ┆ x   ┆ 4   │
# │ 8   ┆ 5    ┆ y   ┆ 8   │
# │ 3   ┆ null ┆ z   ┆ 3   │
# └─────┴──────┴─────┴─────┘

Parameters:

  • Column(s) to use in the aggregation. Accepts expression input. Strings are parsed as column names, other non-expression inputs are parsed as literals.

Returns:

.mean(*columns) ⇒ Expr Originally defined in module Functions

Get the mean value.

This function is syntactic sugar for col(columns).mean.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, 2],
    "c" => ["foo", "bar", "foo"]
  }
)
df.select(Polars.mean("a"))
# =>
# shape: (1, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ f64 │
# ╞═════╡
# │ 4.0 │
# └─────┘
df.select(Polars.mean("a", "b"))
# =>
# shape: (1, 2)
# ┌─────┬──────────┐
# │ a   ┆ b        │
# │ --- ┆ ---      │
# │ f64 ┆ f64      │
# ╞═════╪══════════╡
# │ 4.0 ┆ 3.666667 │
# └─────┴──────────┘

Parameters:

  • One or more column names.

Returns:

.mean_horizontal(*exprs, ignore_nulls: true) ⇒ Expr Originally defined in module Functions

Compute the mean of all values horizontally across columns.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, nil],
    "c" => ["x", "y", "z"]
  }
)
df.with_columns(mean: Polars.mean_horizontal("a", "b"))
# =>
# shape: (3, 4)
# ┌─────┬──────┬─────┬──────┐
# │ a   ┆ b    ┆ c   ┆ mean │
# │ --- ┆ ---  ┆ --- ┆ ---  │
# │ i64 ┆ i64  ┆ str ┆ f64  │
# ╞═════╪══════╪═════╪══════╡
# │ 1   ┆ 4    ┆ x   ┆ 2.5  │
# │ 8   ┆ 5    ┆ y   ┆ 6.5  │
# │ 3   ┆ null ┆ z   ┆ 3.0  │
# └─────┴──────┴─────┴──────┘

Parameters:

  • Column(s) to use in the aggregation. Accepts expression input. Strings are parsed as column names, other non-expression inputs are parsed as literals.

  • (defaults to: true)

    Ignore null values (default). If set to false, any null value in the input will lead to a null output.

Returns:

.median(*columns) ⇒ Expr Originally defined in module Functions

Get the median value.

This function is syntactic sugar for col(columns).median.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, 2],
    "c" => ["foo", "bar", "foo"]
  }
)
df.select(Polars.median("a"))
# =>
# shape: (1, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ f64 │
# ╞═════╡
# │ 3.0 │
# └─────┘
df.select(Polars.median("a", "b"))
# =>
# shape: (1, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ f64 ┆ f64 │
# ╞═════╪═════╡
# │ 3.0 ┆ 4.0 │
# └─────┴─────┘

Parameters:

  • One or more column names.

Returns:

.min(*names) ⇒ Expr Originally defined in module Functions

Get the minimum value.

Syntactic sugar for col(names).min.

Examples:

Get the minimum value of a column.

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, 2],
    "c" => ["foo", "bar", "foo"]
  }
)
df.select(Polars.min("a"))
# =>
# shape: (1, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ i64 │
# ╞═════╡
# │ 1   │
# └─────┘

Get the minimum value of multiple columns.

df.select(Polars.min("^a|b$"))
# =>
# shape: (1, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 1   ┆ 2   │
# └─────┴─────┘
df.select(Polars.min("a", "b"))
# =>
# shape: (1, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 1   ┆ 2   │
# └─────┴─────┘

Parameters:

  • Name(s) of the columns to use in the aggregation.

Returns:

.min_horizontal(*exprs) ⇒ Expr Originally defined in module Functions

Get the minimum value horizontally across columns.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, nil],
    "c" => ["x", "y", "z"]
  }
)
df.with_columns(min: Polars.min_horizontal("a", "b"))
# =>
# shape: (3, 4)
# ┌─────┬──────┬─────┬─────┐
# │ a   ┆ b    ┆ c   ┆ min │
# │ --- ┆ ---  ┆ --- ┆ --- │
# │ i64 ┆ i64  ┆ str ┆ i64 │
# ╞═════╪══════╪═════╪═════╡
# │ 1   ┆ 4    ┆ x   ┆ 1   │
# │ 8   ┆ 5    ┆ y   ┆ 5   │
# │ 3   ┆ null ┆ z   ┆ 3   │
# └─────┴──────┴─────┴─────┘

Parameters:

  • Column(s) to use in the aggregation. Accepts expression input. Strings are parsed as column names, other non-expression inputs are parsed as literals.

Returns:

.n_unique(*columns) ⇒ Expr Originally defined in module Functions

Count unique values.

This function is syntactic sugar for col(columns).n_unique.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 1],
    "b" => [4, 5, 2],
    "c" => ["foo", "bar", "foo"]
  }
)
df.select(Polars.n_unique("a"))
# =>
# shape: (1, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ u32 │
# ╞═════╡
# │ 2   │
# └─────┘
df.select(Polars.n_unique("b", "c"))
# =>
# shape: (1, 2)
# ┌─────┬─────┐
# │ b   ┆ c   │
# │ --- ┆ --- │
# │ u32 ┆ u32 │
# ╞═════╪═════╡
# │ 3   ┆ 2   │
# └─────┴─────┘

Parameters:

  • One or more column names.

Returns:

.nth(*indices, strict: true) ⇒ Expr Originally defined in module Functions

Get the nth column(s) of the context.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, 2],
    "c" => ["foo", "bar", "baz"]
  }
)
df.select(Polars.nth(1))
# =>
# shape: (3, 1)
# ┌─────┐
# │ b   │
# │ --- │
# │ i64 │
# ╞═════╡
# │ 4   │
# │ 5   │
# │ 2   │
# └─────┘
df.select(Polars.nth(2, 0))
# =>
# shape: (3, 2)
# ┌─────┬─────┐
# │ c   ┆ a   │
# │ --- ┆ --- │
# │ str ┆ i64 │
# ╞═════╪═════╡
# │ foo ┆ 1   │
# │ bar ┆ 8   │
# │ baz ┆ 3   │
# └─────┴─────┘

Parameters:

  • One or more indices representing the columns to retrieve.

Returns:

.ones(n, dtype: Float64, eager: false) ⇒ Object Originally defined in module Functions

Construct a column of length n filled with ones.

This is syntactic sugar for the repeat function.

Examples:

Polars.ones(3, dtype: Polars::Int8, eager: true)
# =>
# shape: (3,)
# Series: 'ones' [i8]
# [
#         1
#         1
#         1
# ]

Parameters:

  • Length of the resulting column.

  • (defaults to: Float64)

    Data type of the resulting column. Defaults to Float64.

  • (defaults to: false)

    Evaluate immediately and return a Series. If set to false, return an expression instead.

Returns:

.quantile(column, quantile, interpolation: "nearest") ⇒ Expr Originally defined in module Functions

Syntactic sugar for Polars.col("foo").quantile(...).

Parameters:

  • Column name.

  • Quantile between 0.0 and 1.0.

  • (defaults to: "nearest")

    Interpolation method.

Returns:

.read_avro(source, columns: nil, n_rows: nil) ⇒ DataFrame Originally defined in module IO

Read into a DataFrame from Apache Avro format.

Parameters:

  • Path to a file or a file-like object.

  • (defaults to: nil)

    Columns to select. Accepts a list of column indices (starting at zero) or a list of column names.

  • (defaults to: nil)

    Stop reading from Apache Avro file after reading n_rows.

Returns:

.read_csv(source, has_header: true, columns: nil, new_columns: nil, separator: ",", comment_prefix: nil, quote_char: '"', skip_rows: 0, skip_lines: 0, schema: nil, schema_overrides: nil, null_values: nil, missing_utf8_is_empty_string: false, ignore_errors: false, try_parse_dates: false, n_threads: nil, infer_schema: true, infer_schema_length: N_INFER_DEFAULT, batch_size: 8192, n_rows: nil, encoding: "utf8", low_memory: false, rechunk: false, storage_options: nil, skip_rows_after_header: 0, row_index_name: nil, row_index_offset: 0, eol_char: "\n", raise_if_empty: true, truncate_ragged_lines: false, decimal_comma: false, glob: true) ⇒ DataFrame Originally defined in module IO

Note:

When rechunk: true is set, this operation performs a rechunk at the end, meaning that all data will be stored contiguously in memory. A rechunk is an expensive operation, so keep rechunk: false (the default) if you are benchmarking the csv-reader.

Read a CSV file into a DataFrame.

Parameters:

  • Path to a file or a file-like object.

  • (defaults to: true)

    Indicate if the first row of dataset is a header or not. If set to false, column names will be autogenerated in the following format: column_x, with x being an enumeration over every column in the dataset starting at 1.

  • (defaults to: nil)

    Columns to select. Accepts a list of column indices (starting at zero) or a list of column names.

  • (defaults to: nil)

    Rename columns right after parsing the CSV file. If the given list is shorter than the width of the DataFrame the remaining columns will have their original name.

  • (defaults to: ",")

    Single byte character to use as separator in the file.

  • (defaults to: nil)

    A string used to indicate the start of a comment line. Comment lines are skipped during parsing. Common examples of comment prefixes are # and //.

  • (defaults to: '"')

    Single byte character used for csv quoting. Set to nil to turn off special handling and escaping of quotes.

  • (defaults to: 0)

    Start reading after skip_rows lines.

  • (defaults to: 0)

    Start reading after skip_lines lines. The header will be parsed at this offset. Note that CSV escaping will not be respected when skipping lines. If you want to skip valid CSV rows, use skip_rows.

  • (defaults to: nil)

    Provide the schema. This means that polars doesn't do schema inference. This argument expects the complete schema, whereas schema_overrides can be used to partially overwrite a schema. Note that the order of the columns in the provided schema must match the order of the columns in the CSV being read.

  • (defaults to: nil)

    Overwrite dtypes for specific or all columns during schema inference.

  • (defaults to: nil)

    Values to interpret as null values. You can provide a:

    • String: All values equal to this string will be null.
    • Array: All values equal to any string in this array will be null.
    • Hash: A hash that maps column name to a null value string.
  • (defaults to: false)

    By default a missing value is considered to be null; if you would prefer missing utf8 values to be treated as the empty string you can set this param true.

  • (defaults to: false)

    Try to keep reading lines if some lines yield errors. First try infer_schema_length: 0 to read all columns as :str to check which values might cause an issue.

  • (defaults to: false)

    Try to automatically parse dates. If this does not succeed, the column remains of data type :str.

  • (defaults to: nil)

    Number of threads to use in csv parsing. Defaults to the number of physical CPUs of your system.

  • (defaults to: true)

    When true, the schema is inferred from the data using the first infer_schema_length rows. When false, the schema is not inferred and will be Polars::String if not specified in schema or schema_overrides.

  • (defaults to: N_INFER_DEFAULT)

    The maximum number of rows to scan for schema inference. If set to nil, the full data may be scanned (this is slow). Set infer_schema: false to read all columns as Polars::String.

  • (defaults to: 8192)

    Number of lines to read into the buffer at once. Modify this to change performance.

  • (defaults to: nil)

    Stop reading from CSV file after reading n_rows. During multi-threaded parsing, an upper bound of n_rows rows cannot be guaranteed.

  • (defaults to: "utf8")

    Lossy means that invalid utf8 values are replaced with characters. When using other encodings than utf8 or utf8-lossy, the input is first decoded im memory with Ruby.

  • (defaults to: false)

    Reduce memory usage at expense of performance.

  • (defaults to: false)

    Make sure that all columns are contiguous in memory by aggregating the chunks into a single array.

  • (defaults to: nil)

    Extra options that make sense for a particular storage connection.

  • (defaults to: 0)

    Skip this number of rows when the header is parsed.

  • (defaults to: nil)

    If not nil, this will insert a row count column with the given name into the DataFrame.

  • (defaults to: 0)

    Offset to start the row_count column (only used if the name is set).

  • (defaults to: "\n")

    Single byte end of line character.

  • (defaults to: true)

    When there is no data in the source, NoDataError is raised. If this parameter is set to false, an empty DataFrame (with no columns) is returned instead.

  • (defaults to: false)

    Truncate lines that are longer than the schema.

  • (defaults to: false)

    Parse floats using a comma as the decimal separator instead of a period.

  • (defaults to: true)

    Expand path given via globbing rules.

Returns:

.read_csv_batched(source, has_header: true, columns: nil, new_columns: nil, separator: ",", comment_prefix: nil, quote_char: '"', skip_rows: 0, skip_lines: 0, schema_overrides: nil, null_values: nil, missing_utf8_is_empty_string: false, ignore_errors: false, try_parse_dates: false, n_threads: nil, infer_schema_length: N_INFER_DEFAULT, batch_size: 50_000, n_rows: nil, encoding: "utf8", low_memory: false, rechunk: false, skip_rows_after_header: 0, row_index_name: nil, row_index_offset: 0, eol_char: "\n", raise_if_empty: true, truncate_ragged_lines: false, decimal_comma: false) ⇒ BatchedCsvReader Originally defined in module IO

Deprecated.

Use scan_csv().collect_batches instead.

Read a CSV file in batches.

Upon creation of the BatchedCsvReader, polars will gather statistics and determine the file chunks. After that work will only be done if next_batches is called.

Examples:

reader = Polars.read_csv_batched(
  "./tpch/tables_scale_100/lineitem.tbl", separator: "|", try_parse_dates: true
)
reader.next_batches(5)

Parameters:

  • Path to a file or a file-like object.

  • (defaults to: true)

    Indicate if the first row of dataset is a header or not. If set to false, column names will be autogenerated in the following format: column_x, with x being an enumeration over every column in the dataset starting at 1.

  • (defaults to: nil)

    Columns to select. Accepts a list of column indices (starting at zero) or a list of column names.

  • (defaults to: nil)

    Rename columns right after parsing the CSV file. If the given list is shorter than the width of the DataFrame the remaining columns will have their original name.

  • (defaults to: ",")

    Single byte character to use as separator in the file.

  • (defaults to: nil)

    A string used to indicate the start of a comment line. Comment lines are skipped during parsing. Common examples of comment prefixes are # and //.

  • (defaults to: '"')

    Single byte character used for csv quoting (default: "). Set to nil to turn off special handling and escaping of quotes.

  • (defaults to: 0)

    Start reading after skip_rows lines.

  • (defaults to: 0)

    Start reading after skip_lines lines. The header will be parsed at this offset. Note that CSV escaping will not be respected when skipping lines. If you want to skip valid CSV rows, use skip_rows.

  • (defaults to: nil)

    Overwrite dtypes during inference.

  • (defaults to: nil)

    Values to interpret as null values. You can provide a:

    • String: All values equal to this string will be null.
    • Array: All values equal to any string in this array will be null.
    • Hash: A hash that maps column name to a null value string.
  • (defaults to: false)

    By default a missing value is considered to be null; if you would prefer missing utf8 values to be treated as the empty string you can set this param true.

  • (defaults to: false)

    Try to keep reading lines if some lines yield errors. First try infer_schema_length: 0 to read all columns as :str to check which values might cause an issue.

  • (defaults to: false)

    Try to automatically parse dates. If this does not succeed, the column remains of data type :str.

  • (defaults to: nil)

    Number of threads to use in csv parsing. Defaults to the number of physical CPUs of your system.

  • (defaults to: N_INFER_DEFAULT)

    Maximum number of lines to read to infer schema. If set to 0, all columns will be read as :str. If set to nil, a full table scan will be done (slow).

  • (defaults to: 50_000)

    Number of lines to read into the buffer at once. Modify this to change performance.

  • (defaults to: nil)

    Stop reading from CSV file after reading n_rows. During multi-threaded parsing, an upper bound of n_rows rows cannot be guaranteed.

  • (defaults to: "utf8")

    Lossy means that invalid utf8 values are replaced with characters. When using other encodings than utf8 or utf8-lossy, the input is first decoded im memory with Ruby. Defaults to utf8.

  • (defaults to: false)

    Reduce memory usage at expense of performance.

  • (defaults to: false)

    Make sure that all columns are contiguous in memory by aggregating the chunks into a single array.

  • (defaults to: 0)

    Skip this number of rows when the header is parsed.

  • (defaults to: nil)

    If not nil, this will insert a row count column with the given name into the DataFrame.

  • (defaults to: 0)

    Offset to start the row_count column (only used if the name is set).

  • (defaults to: "\n")

    Single byte end of line character.

  • (defaults to: true)

    When there is no data in the source, NoDataError is raised. If this parameter is set to false, nil will be returned from next_batches(n) instead.

  • (defaults to: false)

    Truncate lines that are longer than the schema.

  • (defaults to: false)

    Parse floats using a comma as the decimal separator instead of a period.

Returns:

.read_database(query, schema_overrides: nil) ⇒ DataFrame Originally defined in module IO

Read a SQL query into a DataFrame.

Parameters:

  • ActiveRecord::Relation or ActiveRecord::Result.

  • (defaults to: nil)

    A hash mapping column names to dtypes, used to override the schema inferred from the query.

Returns:

.read_delta(source, version: nil, columns: nil, rechunk: nil, storage_options: nil, delta_table_options: nil) ⇒ DataFrame Originally defined in module IO

Reads into a DataFrame from a Delta lake table.

Parameters:

  • DeltaTable or a Path or URI to the root of the Delta lake table.

  • (defaults to: nil)

    Numerical version or timestamp version of the Delta lake table.

  • (defaults to: nil)

    Columns to select. Accepts a list of column names.

  • (defaults to: nil)

    Make sure that all columns are contiguous in memory by aggregating the chunks into a single array.

  • (defaults to: nil)

    Extra options for the storage backends supported by deltalake-rb.

  • (defaults to: nil)

    Additional keyword arguments while reading a Delta lake Table.

Returns:

.read_ipc(source, columns: nil, n_rows: nil, memory_map: true, storage_options: nil, row_index_name: nil, row_index_offset: 0, rechunk: true) ⇒ DataFrame Originally defined in module IO

Read into a DataFrame from Arrow IPC (Feather v2) file.

Parameters:

  • Path to a file or a file-like object.

  • (defaults to: nil)

    Columns to select. Accepts a list of column indices (starting at zero) or a list of column names.

  • (defaults to: nil)

    Stop reading from IPC file after reading n_rows.

  • (defaults to: true)

    Try to memory map the file. This can greatly improve performance on repeated queries as the OS may cache pages. Only uncompressed IPC files can be memory mapped.

  • (defaults to: nil)

    Extra options that make sense for a particular storage connection.

  • (defaults to: nil)

    If not nil, this will insert a row count column with the given name into the DataFrame.

  • (defaults to: 0)

    Offset to start the row_count column (only used if the name is set).

  • (defaults to: true)

    Make sure that all data is contiguous.

Returns:

.read_ipc_schema(source) ⇒ Hash Originally defined in module IO

Get a schema of the IPC file without reading data.

Parameters:

  • Path to a file or a file-like object.

Returns:

.read_ipc_stream(source, columns: nil, n_rows: nil, storage_options: nil, row_index_name: nil, row_index_offset: 0, rechunk: true) ⇒ DataFrame Originally defined in module IO

Read into a DataFrame from Arrow IPC record batch stream.

See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html.

Parameters:

  • Path to a file or a file-like object.

  • (defaults to: nil)

    Columns to select. Accepts a list of column indices (starting at zero) or a list of column names.

  • (defaults to: nil)

    Stop reading from IPC stream after reading n_rows.

  • (defaults to: nil)

    Extra options that make sense for a particular storage connection.

  • (defaults to: nil)

    Insert a row index column with the given name into the DataFrame as the first column. If set to nil (default), no row index column is created.

  • (defaults to: 0)

    Start the row index at this offset. Cannot be negative. Only used if row_index_name is set.

  • (defaults to: true)

    Make sure that all data is contiguous.

Returns:

.read_json(source, schema: nil, schema_overrides: nil, infer_schema_length: N_INFER_DEFAULT) ⇒ DataFrame Originally defined in module IO

Read into a DataFrame from a JSON file.

Parameters:

  • Path to a file or a file-like object.

  • (defaults to: nil)

    The DataFrame schema may be declared in several ways:

    • As a hash of {name:type} pairs; if type is nil, it will be auto-inferred.
    • As an array of column names; in this case types are automatically inferred.
    • As an array of [name,type] pairs; this is equivalent to the hash form.

    If you supply an array of column names that does not match the names in the underlying data, the names given here will overwrite them. The number of names given in the schema should match the underlying data dimensions.

  • (defaults to: nil)

    Support type specification or override of one or more columns; note that any dtypes inferred from the schema param will be overridden.

  • (defaults to: N_INFER_DEFAULT)

    The maximum number of rows to scan for schema inference. If set to nil, the full data may be scanned (this is slow).

Returns:

.read_ndjson(source, schema: nil, schema_overrides: nil, infer_schema_length: N_INFER_DEFAULT, batch_size: 1024, n_rows: nil, low_memory: false, rechunk: false, row_index_name: nil, row_index_offset: 0, ignore_errors: false, storage_options: nil, credential_provider: "auto", retries: nil, file_cache_ttl: nil, include_file_paths: nil) ⇒ DataFrame Originally defined in module IO

Read into a DataFrame from a newline delimited JSON file.

Parameters:

  • Path to a file.

  • (defaults to: nil)

    The DataFrame schema may be declared in several ways:

    • As a hash of {name:type} pairs; if type is nil, it will be auto-inferred.
    • As an array of column names; in this case types are automatically inferred.
    • As an array of [name,type] pairs; this is equivalent to the hash form.

    If you supply an array of column names that does not match the names in the underlying data, the names given here will overwrite them. The number of names given in the schema should match the underlying data dimensions.

  • (defaults to: nil)

    Support type specification or override of one or more columns; note that any dtypes inferred from the schema param will be overridden.

  • (defaults to: N_INFER_DEFAULT)

    Infer the schema from the first infer_schema_length rows.

  • (defaults to: 1024)

    Number of rows to read in each batch.

  • (defaults to: nil)

    Stop reading from JSON file after reading n_rows.

  • (defaults to: false)

    Reduce memory pressure at the expense of performance.

  • (defaults to: false)

    Reallocate to contiguous memory when all chunks/files are parsed.

  • (defaults to: nil)

    If not nil, this will insert a row count column with the given name into the DataFrame.

  • (defaults to: 0)

    Offset to start the row_count column (only used if the name is set).

  • (defaults to: false)

    Return Null if parsing fails because of schema mismatches.

  • (defaults to: nil)

    Options that indicate how to connect to a cloud provider.

    The cloud providers currently supported are AWS, GCP, and Azure. See supported keys here:

    • aws
    • gcp
    • azure
    • Hugging Face (hf://): Accepts an API key under the token parameter: {'token': '...'}, or by setting the HF_TOKEN environment variable.

    If storage_options is not provided, Polars will try to infer the information from environment variables.

  • (defaults to: "auto")

    Provide a function that can be called to provide cloud storage credentials. The function is expected to return a hash of credential keys along with an optional credential expiry time.

  • (defaults to: nil)

    Number of retries if accessing a cloud instance fails.

  • (defaults to: nil)

    Amount of time to keep downloaded cloud files since their last access time, in seconds. Uses the POLARS_FILE_CACHE_TTL environment variable (which defaults to 1 hour) if not given.

  • (defaults to: nil)

    Include the path of the source file(s) as a column with this name.

Returns:

.read_parquet(source, columns: nil, n_rows: nil, row_index_name: nil, row_index_offset: 0, parallel: "auto", use_statistics: true, hive_partitioning: nil, glob: true, schema: nil, hive_schema: nil, try_parse_hive_dates: true, rechunk: false, low_memory: false, storage_options: nil, credential_provider: "auto", retries: nil, include_file_paths: nil, missing_columns: "raise", allow_missing_columns: nil) ⇒ DataFrame Originally defined in module IO

Read into a DataFrame from a parquet file.

Parameters:

  • Path to a file or a file-like object.

  • (defaults to: nil)

    Columns to select. Accepts a list of column indices (starting at zero) or a list of column names.

  • (defaults to: nil)

    Stop reading from parquet file after reading n_rows.

  • (defaults to: nil)

    If not nil, this will insert a row count column with the given name into the DataFrame.

  • (defaults to: 0)

    Offset to start the row_count column (only used if the name is set).

  • (defaults to: "auto")

    This determines the direction of parallelism. 'auto' will try to determine the optimal direction.

  • (defaults to: true)

    Use statistics in the parquet to determine if pages can be skipped from reading.

  • (defaults to: nil)

    Infer statistics and schema from hive partitioned URL and use them to prune reads.

  • (defaults to: true)

    Expand path given via globbing rules.

  • (defaults to: nil)

    Specify the datatypes of the columns. The datatypes must match the datatypes in the file(s). If there are extra columns that are not in the file(s), consider also enabling allow_missing_columns.

  • (defaults to: nil)

    The column names and data types of the columns by which the data is partitioned. If set to nil (default), the schema of the Hive partitions is inferred.

  • (defaults to: true)

    Whether to try parsing hive values as date/datetime types.

  • (defaults to: false)

    In case of reading multiple files via a glob pattern rechunk the final DataFrame into contiguous memory chunks.

  • (defaults to: false)

    Reduce memory pressure at the expense of performance.

  • (defaults to: nil)

    Extra options that make sense for a particular storage connection.

  • (defaults to: "auto")

    Provide a function that can be called to provide cloud storage credentials. The function is expected to return a hash of credential keys along with an optional credential expiry time.

  • (defaults to: nil)

    Number of retries if accessing a cloud instance fails.

  • (defaults to: nil)

    Include the path of the source file(s) as a column with this name.

  • (defaults to: "raise")

    Configuration for behavior when columns defined in the schema are missing from the data:

    • insert: Inserts the missing columns using NULLs as the row values.
    • raise: Raises an error.
  • (defaults to: nil)

    When reading a list of parquet files, if a column existing in the first file cannot be found in subsequent files, the default behavior is to raise an error. However, if allow_missing_columns is set to true, a full-NULL column is returned instead of erroring for the files that do not contain the column.

Returns:

.read_parquet_metadata(source, storage_options: nil, credential_provider: "auto", retries: nil) ⇒ Hash Originally defined in module IO

Note:

This functionality is considered experimental. It may be removed or changed at any point without it being considered a breaking change.

Get file-level custom metadata of a Parquet file without reading data.

Parameters:

  • Path to a file or a file-like object.

  • (defaults to: nil)

    Extra options that make sense for a particular storage connection.

  • (defaults to: "auto")

    Provide a function that can be called to provide cloud storage credentials. The function is expected to return a hash of credential keys along with an optional credential expiry time.

  • (defaults to: nil)

    Number of retries if accessing a cloud instance fails.

Returns:

.read_parquet_schema(source) ⇒ Schema Originally defined in module IO

Get a schema of the Parquet file without reading data.

Parameters:

  • Path to a file or a file-like object.

Returns:

.reduce(exprs, returns_scalar: false, return_dtype: nil, &function) ⇒ Expr Originally defined in module Functions

Accumulate over multiple columns horizontally/row-wise with a left fold.

Examples:

Horizontally sum over all columns.

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 3],
    "b" => [0, 1, 2]
  }
)
df.select(
  Polars.reduce(Polars.col("*")) { |acc, x| acc + x }.alias("sum")
)
# =>
# shape: (3, 1)
# ┌─────┐
# │ sum │
# │ --- │
# │ i64 │
# ╞═════╡
# │ 1   │
# │ 3   │
# │ 5   │
# └─────┘

Parameters:

  • Expressions to aggregate over. May also be a wildcard expression.

  • (defaults to: false)

    Whether or not function applied returns a scalar. This must be set correctly by the user.

  • (defaults to: nil)

    Output datatype. If not set, the dtype will be inferred based on the dtype of the input expressions.

Returns:

.repeat(value, n, dtype: nil, eager: false) ⇒ Object Originally defined in module Functions

Repeat a single value n times.

Examples:

Construct a column with a repeated value in a lazy context.

Polars.select(Polars.repeat("z", 3)).to_series
# =>
# shape: (3,)
# Series: 'repeat' [str]
# [
#         "z"
#         "z"
#         "z"
# ]

Generate a Series directly by setting eager: true.

Polars.repeat(3, 3, dtype: Polars::Int8, eager: true)
# =>
# shape: (3,)
# Series: 'repeat' [i8]
# [
#         3
#         3
#         3
# ]

Parameters:

  • Value to repeat.

  • Repeat n times.

  • (defaults to: nil)

    Data type of the resulting column. If set to nil (default), data type is inferred from the given value. Defaults to Int32 for integer values, unless Int64 is required to fit the given value. Defaults to Float64 for float values.

  • (defaults to: false)

    Run eagerly and collect into a Series.

Returns:

.rolling_corr(a, b, window_size:, min_samples: nil, ddof: 1) ⇒ Expr Originally defined in module Functions

Compute the rolling correlation between two columns/expressions.

The window at a given row includes the row itself and the window_size - 1 elements before it.

Parameters:

  • Column name or Expression.

  • Column name or Expression.

  • The length of the window.

  • (defaults to: nil)

    The number of values in the window that should be non-null before computing a result. If nil, it will be set equal to window size.

  • (defaults to: 1)

    Delta degrees of freedom. The divisor used in calculations is N - ddof, where N represents the number of elements.

Returns:

.rolling_cov(a, b, window_size:, min_samples: nil, ddof: 1) ⇒ Expr Originally defined in module Functions

Compute the rolling covariance between two columns/expressions.

The window at a given row includes the row itself and the window_size - 1 elements before it.

Parameters:

  • Column name or Expression.

  • Column name or Expression.

  • The length of the window.

  • (defaults to: nil)

    The number of values in the window that should be non-null before computing a result. If nil, it will be set equal to window size.

  • (defaults to: 1)

    Delta degrees of freedom. The divisor used in calculations is N - ddof, where N represents the number of elements.

Returns:

.scan_csv(source, has_header: true, separator: ",", comment_prefix: nil, quote_char: '"', skip_rows: 0, skip_lines: 0, schema: nil, schema_overrides: nil, null_values: nil, missing_utf8_is_empty_string: false, ignore_errors: false, cache: true, with_column_names: nil, infer_schema: true, infer_schema_length: N_INFER_DEFAULT, n_rows: nil, encoding: "utf8", low_memory: false, rechunk: false, skip_rows_after_header: 0, row_index_name: nil, row_index_offset: 0, try_parse_dates: false, eol_char: "\n", new_columns: nil, raise_if_empty: true, truncate_ragged_lines: false, decimal_comma: false, glob: true, storage_options: nil, credential_provider: "auto", retries: nil, file_cache_ttl: nil, include_file_paths: nil) ⇒ LazyFrame Originally defined in module IO

Lazily read from a CSV file or multiple files via glob patterns.

This allows the query optimizer to push down predicates and projections to the scan level, thereby potentially reducing memory overhead.

Parameters:

  • Path to a file.

  • (defaults to: true)

    Indicate if the first row of dataset is a header or not. If set to false, column names will be autogenerated in the following format: column_x, with x being an enumeration over every column in the dataset starting at 1.

  • (defaults to: ",")

    Single byte character to use as separator in the file.

  • (defaults to: nil)

    A string used to indicate the start of a comment line. Comment lines are skipped during parsing. Common examples of comment prefixes are # and //.

  • (defaults to: '"')

    Single byte character used for csv quoting. Set to nil to turn off special handling and escaping of quotes.

  • (defaults to: 0)

    Start reading after skip_rows lines. The header will be parsed at this offset.

  • (defaults to: 0)

    Start reading after skip_lines lines. The header will be parsed at this offset. Note that CSV escaping will not be respected when skipping lines. If you want to skip valid CSV rows, use skip_rows.

  • (defaults to: nil)

    Provide the schema. This means that polars doesn't do schema inference. This argument expects the complete schema, whereas schema_overrides can be used to partially overwrite a schema. Note that the order of the columns in the provided schema must match the order of the columns in the CSV being read.

  • (defaults to: nil)

    Overwrite dtypes for specific or all columns during schema inference.

  • (defaults to: nil)

    Values to interpret as null values. You can provide a:

    • String: All values equal to this string will be null.
    • Array: All values equal to any string in this array will be null.
    • Hash: A hash that maps column name to a null value string.
  • (defaults to: false)

    By default a missing value is considered to be null; if you would prefer missing utf8 values to be treated as the empty string you can set this param true.

  • (defaults to: false)

    Try to keep reading lines if some lines yield errors. First try infer_schema_length: 0 to read all columns as :str to check which values might cause an issue.

  • (defaults to: true)

    Cache the result after reading.

  • (defaults to: nil)

    Apply a function over the column names. This can be used to update a schema just in time, thus before scanning.

  • (defaults to: true)

    When true, the schema is inferred from the data using the first infer_schema_length rows. When false, the schema is not inferred and will be Polars::String if not specified in schema or schema_overrides.

  • (defaults to: N_INFER_DEFAULT)

    Maximum number of lines to read to infer schema. If set to 0, all columns will be read as :str. If set to nil, a full table scan will be done (slow).

  • (defaults to: nil)

    Stop reading from CSV file after reading n_rows.

  • (defaults to: "utf8")

    Lossy means that invalid utf8 values are replaced with characters.

  • (defaults to: false)

    Reduce memory usage at the expense of performance.

  • (defaults to: false)

    Reallocate to contiguous memory when all chunks/files are parsed.

  • (defaults to: 0)

    Skip this number of rows after the header is parsed.

  • (defaults to: nil)

    If not nil, this will insert a row count column with the given name into the DataFrame.

  • (defaults to: 0)

    Offset to start the row_count column (only used if the name is set).

  • (defaults to: false)

    Try to automatically parse dates. If this does not succeed, the column remains of data type :str.

  • (defaults to: "\n")

    Single byte end of line character.

  • (defaults to: nil)

    Provide an explicit list of string column names to use (for example, when scanning a headerless CSV file). If the given list is shorter than the width of the DataFrame the remaining columns will have their original name.

  • (defaults to: true)

    When there is no data in the source, NoDataError is raised. If this parameter is set to false, an empty LazyFrame (with no columns) is returned instead.

  • (defaults to: false)

    Truncate lines that are longer than the schema.

  • (defaults to: false)

    Parse floats using a comma as the decimal separator instead of a period.

  • (defaults to: true)

    Expand path given via globbing rules.

  • (defaults to: nil)

    Options that indicate how to connect to a cloud provider.

    The cloud providers currently supported are AWS, GCP, and Azure. See supported keys here:

    • aws
    • gcp
    • azure
    • Hugging Face (hf://): Accepts an API key under the token parameter: {'token': '...'}, or by setting the HF_TOKEN environment variable.

    If storage_options is not provided, Polars will try to infer the information from environment variables.

  • (defaults to: "auto")

    Provide a function that can be called to provide cloud storage credentials. The function is expected to return a hash of credential keys along with an optional credential expiry time.

  • (defaults to: nil)

    Number of retries if accessing a cloud instance fails.

  • (defaults to: nil)

    Amount of time to keep downloaded cloud files since their last access time, in seconds. Uses the POLARS_FILE_CACHE_TTL environment variable (which defaults to 1 hour) if not given.

  • (defaults to: nil)

    Include the path of the source file(s) as a column with this name.

Returns:

.scan_delta(source, version: nil, storage_options: nil, delta_table_options: nil, rechunk: nil) ⇒ LazyFrame Originally defined in module IO

Lazily read from a Delta lake table.

Parameters:

  • DeltaTable or a Path or URI to the root of the Delta lake table.

  • (defaults to: nil)

    Numerical version or timestamp version of the Delta lake table.

  • (defaults to: nil)

    Extra options for the storage backends supported by deltalake-rb.

  • (defaults to: nil)

    Additional keyword arguments while reading a Delta lake Table.

  • (defaults to: nil)

    Make sure that all columns are contiguous in memory by aggregating the chunks into a single array.

Returns:

.scan_iceberg(source, snapshot_id: nil, storage_options: nil) ⇒ LazyFrame Originally defined in module IO

Lazily read from an Apache Iceberg table.

Parameters:

  • An Iceberg Ruby table, or a direct path to the metadata.

  • (defaults to: nil)

    The snapshot ID to scan from.

  • (defaults to: nil)

    Extra options for the storage backends.

Returns:

.scan_ipc(source, n_rows: nil, cache: true, rechunk: false, row_index_name: nil, row_index_offset: 0, glob: true, storage_options: nil, credential_provider: "auto", retries: nil, file_cache_ttl: nil, hive_partitioning: nil, hive_schema: nil, try_parse_hive_dates: true, include_file_paths: nil, _record_batch_statistics: false) ⇒ LazyFrame Originally defined in module IO

Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.

This allows the query optimizer to push down predicates and projections to the scan level, thereby potentially reducing memory overhead.

Parameters:

  • Path to an IPC file.

  • (defaults to: nil)

    Stop reading from IPC file after reading n_rows.

  • (defaults to: true)

    Cache the result after reading.

  • (defaults to: false)

    Reallocate to contiguous memory when all chunks/files are parsed.

  • (defaults to: nil)

    If not nil, this will insert a row count column with the given name into the DataFrame.

  • (defaults to: 0)

    Offset to start the row_count column (only used if the name is set).

  • (defaults to: true)

    Expand path given via globbing rules.

  • (defaults to: nil)

    Extra options that make sense for a particular storage connection.

  • (defaults to: "auto")

    Provide a function that can be called to provide cloud storage credentials. The function is expected to return a hash of credential keys along with an optional credential expiry time.

  • (defaults to: nil)

    Number of retries if accessing a cloud instance fails.

  • (defaults to: nil)

    Amount of time to keep downloaded cloud files since their last access time, in seconds. Uses the POLARS_FILE_CACHE_TTL environment variable (which defaults to 1 hour) if not given.

  • (defaults to: nil)

    Infer statistics and schema from Hive partitioned URL and use them to prune reads. This is unset by default (i.e. nil), meaning it is automatically enabled when a single directory is passed, and otherwise disabled.

  • (defaults to: nil)

    The column names and data types of the columns by which the data is partitioned. If set to nil (default), the schema of the Hive partitions is inferred.

  • (defaults to: true)

    Whether to try parsing hive values as date/datetime types.

  • (defaults to: nil)

    Include the path of the source file(s) as a column with this name.

Returns:

.scan_ndjson(source, schema: nil, schema_overrides: nil, infer_schema_length: N_INFER_DEFAULT, batch_size: 1024, n_rows: nil, low_memory: false, rechunk: false, row_index_name: nil, row_index_offset: 0, ignore_errors: false, storage_options: nil, credential_provider: "auto", retries: nil, file_cache_ttl: nil, include_file_paths: nil) ⇒ LazyFrame Originally defined in module IO

Lazily read from a newline delimited JSON file.

This allows the query optimizer to push down predicates and projections to the scan level, thereby potentially reducing memory overhead.

Parameters:

  • Path to a file.

  • (defaults to: nil)

    The DataFrame schema may be declared in several ways:

    • As a hash of {name:type} pairs; if type is nil, it will be auto-inferred.
    • As an array of column names; in this case types are automatically inferred.
    • As an array of [name,type] pairs; this is equivalent to the hash form.

    If you supply an array of column names that does not match the names in the underlying data, the names given here will overwrite them. The number of names given in the schema should match the underlying data dimensions.

  • (defaults to: nil)

    Support type specification or override of one or more columns; note that any dtypes inferred from the schema param will be overridden.

  • (defaults to: N_INFER_DEFAULT)

    Infer the schema from the first infer_schema_length rows.

  • (defaults to: 1024)

    Number of rows to read in each batch.

  • (defaults to: nil)

    Stop reading from JSON file after reading n_rows.

  • (defaults to: false)

    Reduce memory pressure at the expense of performance.

  • (defaults to: false)

    Reallocate to contiguous memory when all chunks/files are parsed.

  • (defaults to: nil)

    If not nil, this will insert a row count column with the given name into the DataFrame.

  • (defaults to: 0)

    Offset to start the row_count column (only used if the name is set).

  • (defaults to: false)

    Return Null if parsing fails because of schema mismatches.

  • (defaults to: nil)

    Options that indicate how to connect to a cloud provider.

    The cloud providers currently supported are AWS, GCP, and Azure. See supported keys here:

    • aws
    • gcp
    • azure
    • Hugging Face (hf://): Accepts an API key under the token parameter: {'token': '...'}, or by setting the HF_TOKEN environment variable.

    If storage_options is not provided, Polars will try to infer the information from environment variables.

  • (defaults to: "auto")

    Provide a function that can be called to provide cloud storage credentials. The function is expected to return a hash of credential keys along with an optional credential expiry time.

  • (defaults to: nil)

    Number of retries if accessing a cloud instance fails.

  • (defaults to: nil)

    Amount of time to keep downloaded cloud files since their last access time, in seconds. Uses the POLARS_FILE_CACHE_TTL environment variable (which defaults to 1 hour) if not given.

  • (defaults to: nil)

    Include the path of the source file(s) as a column with this name.

Returns:

.scan_parquet(source, n_rows: nil, row_index_name: nil, row_index_offset: 0, parallel: "auto", use_statistics: true, hive_partitioning: nil, glob: true, hidden_file_prefix: nil, schema: nil, hive_schema: nil, try_parse_hive_dates: true, rechunk: false, low_memory: false, cache: true, storage_options: nil, credential_provider: "auto", retries: nil, include_file_paths: nil, missing_columns: "raise", allow_missing_columns: nil, extra_columns: "raise", cast_options: nil, _column_mapping: nil, _default_values: nil, _deletion_files: nil, _table_statistics: nil, _row_count: nil) ⇒ LazyFrame Originally defined in module IO

Lazily read from a parquet file or multiple files via glob patterns.

This allows the query optimizer to push down predicates and projections to the scan level, thereby potentially reducing memory overhead.

Parameters:

  • Path to a file or a file-like object.

  • (defaults to: nil)

    Stop reading from parquet file after reading n_rows.

  • (defaults to: nil)

    If not nil, this will insert a row count column with the given name into the DataFrame.

  • (defaults to: 0)

    Offset to start the row_count column (only used if the name is set).

  • (defaults to: "auto")

    This determines the direction of parallelism. 'auto' will try to determine the optimal direction.

  • (defaults to: true)

    Use statistics in the parquet to determine if pages can be skipped from reading.

  • (defaults to: nil)

    Infer statistics and schema from hive partitioned URL and use them to prune reads.

  • (defaults to: true)

    Expand path given via globbing rules.

  • (defaults to: nil)

    Skip reading files whose names begin with the specified prefixes.

  • (defaults to: nil)

    Specify the datatypes of the columns. The datatypes must match the datatypes in the file(s). If there are extra columns that are not in the file(s), consider also enabling allow_missing_columns.

  • (defaults to: nil)

    The column names and data types of the columns by which the data is partitioned. If set to nil (default), the schema of the Hive partitions is inferred.

  • (defaults to: true)

    Whether to try parsing hive values as date/datetime types.

  • (defaults to: false)

    In case of reading multiple files via a glob pattern rechunk the final DataFrame into contiguous memory chunks.

  • (defaults to: false)

    Reduce memory pressure at the expense of performance.

  • (defaults to: true)

    Cache the result after reading.

  • (defaults to: nil)

    Extra options that make sense for a particular storage connection.

  • (defaults to: "auto")

    Provide a function that can be called to provide cloud storage credentials. The function is expected to return a hash of credential keys along with an optional credential expiry time.

  • (defaults to: nil)

    Number of retries if accessing a cloud instance fails.

  • (defaults to: nil)

    Include the path of the source file(s) as a column with this name.

  • (defaults to: "raise")

    Configuration for behavior when columns defined in the schema are missing from the data:

    • insert: Inserts the missing columns using NULLs as the row values.
    • raise: Raises an error.
  • (defaults to: nil)

    When reading a list of parquet files, if a column existing in the first file cannot be found in subsequent files, the default behavior is to raise an error. However, if allow_missing_columns is set to true, a full-NULL column is returned instead of erroring for the files that do not contain the column.

  • (defaults to: "raise")

    Configuration for behavior when extra columns outside of the defined schema are encountered in the data:

    • ignore: Silently ignores.
    • raise: Raises an error.
  • (defaults to: nil)

    Configuration for column type-casting during scans. Useful for datasets containing files that have differing schemas.

Returns:

.select(*exprs, eager: true, **named_exprs) ⇒ DataFrame Originally defined in module Functions

Run polars expressions without a context.

This is syntactic sugar for running df.select on an empty DataFrame.

Examples:

foo = Polars::Series.new("foo", [1, 2, 3])
bar = Polars::Series.new("bar", [3, 2, 1])
Polars.select(min: Polars.min_horizontal(foo, bar))
# =>
# shape: (3, 1)
# ┌─────┐
# │ min │
# │ --- │
# │ i64 │
# ╞═════╡
# │ 1   │
# │ 2   │
# │ 1   │
# └─────┘

Parameters:

  • Column(s) to select, specified as positional arguments. Accepts expression input. Strings are parsed as column names, other non-expression inputs are parsed as literals.

  • (defaults to: true)

    Evaluate immediately and return a DataFrame (default); if set to false, return a LazyFrame instead.

  • Additional columns to select, specified as keyword arguments. The columns will be renamed to the keyword used.

Returns:

.self_dtype ⇒ DataTypeExpr Originally defined in module Functions

Note:

This functionality is considered unstable. It may be changed at any point without it being considered a breaking change.

Get the dtype of self in map_elements and map_batches.

Returns:

.set_random_seed(seed) ⇒ nil Originally defined in module Functions

Set the global random seed for Polars.

This random seed is used to determine things such as shuffle ordering.

Parameters:

  • A non-negative integer < 2**64 used to seed the internal global random number generator.

Returns:

.show_versions ⇒ nil

Print out the version of Polars along with the index type, platform, and Ruby version.

Returns:



166
167
168
169
170
171
172
173
# File 'lib/polars.rb', line 166

# Prints a short version report (Polars version, index type, platform, and
# Ruby version) to standard output, one entry per line.
#
# @return [nil]
def self.show_versions
  report = [
    "--------Version info---------",
    "Polars: #{VERSION}",
    "Index type: #{get_index_type}",
    "Platform: #{RUBY_PLATFORM}",
    "Ruby: #{RUBY_VERSION}"
  ]
  # Kernel#puts writes each array element on its own line.
  puts(report)
  nil
end

.sql_expr(sql) ⇒ Expr Originally defined in module Functions

Parse one or more SQL expressions to polars expression(s).

Examples:

Parse a single SQL expression:

df = Polars::DataFrame.new({"a" => [2, 1]})
expr = Polars.sql_expr("MAX(a)")
df.select(expr)
# =>
# shape: (1, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ i64 │
# ╞═════╡
# │ 2   │
# └─────┘

Parse multiple SQL expressions:

df.with_columns(
  *Polars.sql_expr(["POWER(a,a) AS a_a", "CAST(a AS TEXT) AS a_txt"])
)
# =>
# shape: (2, 3)
# ┌─────┬─────┬───────┐
# │ a   ┆ a_a ┆ a_txt │
# │ --- ┆ --- ┆ ---   │
# │ i64 ┆ i64 ┆ str   │
# ╞═════╪═════╪═══════╡
# │ 2   ┆ 4   ┆ 2     │
# │ 1   ┆ 1   ┆ 1     │
# └─────┴─────┴───────┘

Parameters:

  • sql — One or more SQL expressions.

Returns:

.std(column, ddof: 1) ⇒ Expr Originally defined in module Functions

Get the standard deviation.

This function is syntactic sugar for col(column).std(ddof: ddof).

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, 2],
    "c" => ["foo", "bar", "foo"]
  }
)
df.select(Polars.std("a"))
# =>
# shape: (1, 1)
# ┌──────────┐
# │ a        │
# │ ---      │
# │ f64      │
# ╞══════════╡
# │ 3.605551 │
# └──────────┘
df["a"].std
# => 3.605551275463989

Parameters:

  • column — Column name.

  • ddof (defaults to: 1)

    “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1.

Returns:

.string_cache ⇒ Object



31
32
33
# File 'lib/polars/string_cache.rb', line 31

# Builds a new StringCache, forwarding every argument (positional, keyword,
# and block) unchanged to StringCache.new, and returns whatever that call
# returns.
def self.string_cache(...)
  StringCache.new(...)
end

.struct(*exprs, schema: nil, eager: false, **named_exprs) ⇒ Object Originally defined in module Functions

Collect several columns into a Series of dtype Struct.

Examples:

df = Polars::DataFrame.new(
  {
    "int" => [1, 2],
    "str" => ["a", "b"],
    "bool" => [true, nil],
    "list" => [[1, 2], [3]],
  }
)
df.select([Polars.struct(Polars.all).alias("my_struct")])
# =>
# shape: (2, 1)
# ┌─────────────────────┐
# │ my_struct           │
# │ ---                 │
# │ struct[4]           │
# ╞═════════════════════╡
# │ {1,"a",true,[1, 2]} │
# │ {2,"b",null,[3]}    │
# └─────────────────────┘

Collect selected columns into a struct by either passing a list of columns, or by specifying each column as a positional argument.

df.select(Polars.struct("int", false).alias("my_struct"))
# =>
# shape: (2, 1)
# ┌───────────┐
# │ my_struct │
# │ ---       │
# │ struct[2] │
# ╞═══════════╡
# │ {1,false} │
# │ {2,false} │
# └───────────┘

Use keyword arguments to easily name each struct field.

df.select(Polars.struct(p: "int", q: "bool").alias("my_struct")).schema
# => Polars::Schema({"my_struct"=>Polars::Struct({"p"=>Polars::Int64, "q"=>Polars::Boolean})})

Parameters:

  • exprs — Column(s) to collect into a struct column, specified as positional arguments. Accepts expression input. Strings are parsed as column names, other non-expression inputs are parsed as literals.

  • schema (defaults to: nil)

    Optional schema that explicitly defines the struct field dtypes. If no columns or expressions are provided, schema keys are used to define columns.

  • eager (defaults to: false)

    Evaluate immediately and return a Series. If set to false (default), return an expression instead.

  • named_exprs — Additional columns to collect into the struct column, specified as keyword arguments. The columns will be renamed to the keyword used.

Returns:

.sum(*names) ⇒ Expr Originally defined in module Functions

Sum all values.

Syntactic sugar for col(name).sum.

Examples:

Sum a column.

df = Polars::DataFrame.new(
  {
    "a" => [1, 2],
    "b" => [3, 4],
    "c" => [5, 6]
  }
)
df.select(Polars.sum("a"))
# =>
# shape: (1, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ i64 │
# ╞═════╡
# │ 3   │
# └─────┘

Sum multiple columns.

df.select(Polars.sum("a", "c"))
# =>
# shape: (1, 2)
# ┌─────┬─────┐
# │ a   ┆ c   │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 3   ┆ 11  │
# └─────┴─────┘
df.select(Polars.sum("^.*[bc]$"))
# =>
# shape: (1, 2)
# ┌─────┬─────┐
# │ b   ┆ c   │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 7   ┆ 11  │
# └─────┴─────┘

Parameters:

  • names — Name(s) of the columns to use in the aggregation.

Returns:

.sum_horizontal(*exprs, ignore_nulls: true) ⇒ Expr Originally defined in module Functions

Sum all values horizontally across columns.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, nil],
    "c" => ["x", "y", "z"]
  }
)
df.with_columns(sum: Polars.sum_horizontal("a", "b"))
# =>
# shape: (3, 4)
# ┌─────┬──────┬─────┬─────┐
# │ a   ┆ b    ┆ c   ┆ sum │
# │ --- ┆ ---  ┆ --- ┆ --- │
# │ i64 ┆ i64  ┆ str ┆ i64 │
# ╞═════╪══════╪═════╪═════╡
# │ 1   ┆ 4    ┆ x   ┆ 5   │
# │ 8   ┆ 5    ┆ y   ┆ 13  │
# │ 3   ┆ null ┆ z   ┆ 3   │
# └─────┴──────┴─────┴─────┘

Parameters:

  • exprs — Column(s) to use in the aggregation. Accepts expression input. Strings are parsed as column names, other non-expression inputs are parsed as literals.

  • ignore_nulls (defaults to: true)

    Ignore null values (default). If set to false, any null value in the input will lead to a null output.

Returns:

.tail(column, n = 10) ⇒ Expr Originally defined in module Functions

Get the last n rows.

This function is syntactic sugar for col(column).tail(n).

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, 2],
    "c" => ["foo", "bar", "foo"]
  }
)
df.select(Polars.tail("a"))
# =>
# shape: (3, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ i64 │
# ╞═════╡
# │ 1   │
# │ 8   │
# │ 3   │
# └─────┘
df.select(Polars.tail("a", 2))
# =>
# shape: (2, 1)
# ┌─────┐
# │ a   │
# │ --- │
# │ i64 │
# ╞═════╡
# │ 8   │
# │ 3   │
# └─────┘

Parameters:

  • column — Column name.

  • n (defaults to: 10)

    Number of rows to return.

Returns:

.thread_pool_size ⇒ Integer

Return the number of threads in the Polars thread pool.

Returns:



138
139
140
# File 'lib/polars.rb', line 138

# Returns the number of threads in the Polars thread pool.
#
# Thin wrapper: the value comes straight from Plr.thread_pool_size.
#
# @return [Integer]
def self.thread_pool_size
  Plr.thread_pool_size
end

.time(hour = nil, minute = nil, second = nil, microsecond = nil) ⇒ Expr Originally defined in module Functions

Create a Polars literal expression of type Time.

Examples:

df = Polars::DataFrame.new(
  {
    "hour" => [12, 13, 14],
    "minute" => [15, 30, 45]
  }
)
df.with_columns(Polars.time(Polars.col("hour"), Polars.col("minute")))
# =>
# shape: (3, 3)
# ┌──────┬────────┬──────────┐
# │ hour ┆ minute ┆ time     │
# │ ---  ┆ ---    ┆ ---      │
# │ i64  ┆ i64    ┆ time     │
# ╞══════╪════════╪══════════╡
# │ 12   ┆ 15     ┆ 12:15:00 │
# │ 13   ┆ 30     ┆ 13:30:00 │
# │ 14   ┆ 45     ┆ 14:45:00 │
# └──────┴────────┴──────────┘

Parameters:

  • hour (defaults to: nil)

    column or literal, ranging from 0-23.

  • minute (defaults to: nil)

    column or literal, ranging from 0-59.

  • second (defaults to: nil)

    column or literal, ranging from 0-59.

  • microsecond (defaults to: nil)

    column or literal, ranging from 0-999999.

Returns:

.time_range(start = nil, stop = nil, interval = "1h", closed: "both", eager: false) ⇒ Object Originally defined in module Functions

Generate a time range.

Examples:

Polars.time_range(
  Time.utc(2000, 1, 1, 14, 0),
  nil,
  "3h15m",
  eager: true
).alias("time")
# =>
# shape: (4,)
# Series: 'time' [time]
# [
#         14:00:00
#         17:15:00
#         20:30:00
#         23:45:00
# ]

Parameters:

  • start (defaults to: nil)

    Lower bound of the time range.

  • stop (defaults to: nil)

    Upper bound of the time range.

  • interval (defaults to: "1h")

    Interval of the range periods, specified using the Polars duration string language.

  • closed (defaults to: "both")

    Define which sides of the range are closed (inclusive).

  • eager (defaults to: false)

    Evaluate immediately and return a Series. If set to false (default), return an expression instead.

Returns:

.time_ranges(start = nil, stop = nil, interval = "1h", closed: "both", eager: false) ⇒ Object Originally defined in module Functions

Create a column of time ranges.

Examples:

df = Polars::DataFrame.new(
  {
    "start" => [Time.utc(2000, 1, 1, 9, 0), Time.utc(2000, 1, 1, 10, 0)],
    "end" => Time.utc(2000, 1, 1, 11, 0)
  }
)
df.select(time_range: Polars.time_ranges("start", "end"))
# =>
# shape: (2, 1)
# ┌────────────────────────────────┐
# │ time_range                     │
# │ ---                            │
# │ list[time]                     │
# ╞════════════════════════════════╡
# │ [09:00:00, 10:00:00, 11:00:00] │
# │ [10:00:00, 11:00:00]           │
# └────────────────────────────────┘

Parameters:

  • start (defaults to: nil)

    Lower bound of the time range.

  • stop (defaults to: nil)

    Upper bound of the time range.

  • interval (defaults to: "1h")

    Interval of the range periods, specified using the Polars duration string language.

  • closed (defaults to: "both")

    Define which sides of the range are closed (inclusive).

  • eager (defaults to: false)

    Evaluate immediately and return a Series. If set to false (default), return an expression instead.

Returns:

.union(items, how: "vertical", strict: false) ⇒ Object Originally defined in module Functions

Note:

This function does not guarantee any specific ordering of rows in the result. If you need predictable row ordering, use Polars.concat instead.

Combine multiple DataFrames, LazyFrames, or Series into a single object.

Examples:

df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
df2 = Polars::DataFrame.new({"a" => [2], "b" => [4]})
Polars.union([df1, df2])
# =>
# shape: (2, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 1   ┆ 3   │
# │ 2   ┆ 4   │
# └─────┴─────┘
df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
df2 = Polars::DataFrame.new({"a" => [2.5], "b" => [4]})
Polars.union([df1, df2], how: "vertical_relaxed")
# =>
# shape: (2, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ f64 ┆ i64 │
# ╞═════╪═════╡
# │ 1.0 ┆ 3   │
# │ 2.5 ┆ 4   │
# └─────┴─────┘
df_h1 = Polars::DataFrame.new({"l1" => [1, 2], "l2" => [3, 4]})
df_h2 = Polars::DataFrame.new({"r1" => [5, 6], "r2" => [7, 8], "r3" => [9, 10]})
Polars.union([df_h1, df_h2], how: "horizontal")
# =>
# shape: (2, 5)
# ┌─────┬─────┬─────┬─────┬─────┐
# │ l1  ┆ l2  ┆ r1  ┆ r2  ┆ r3  │
# │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
# ╞═════╪═════╪═════╪═════╪═════╡
# │ 1   ┆ 3   ┆ 5   ┆ 7   ┆ 9   │
# │ 2   ┆ 4   ┆ 6   ┆ 8   ┆ 10  │
# └─────┴─────┴─────┴─────┴─────┘

The "diagonal" strategy allows for some frames to have missing columns, the values for which are filled with null:

df_d1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
df_d2 = Polars::DataFrame.new({"a" => [2], "c" => [4]})
Polars.union([df_d1, df_d2], how: "diagonal")
# =>
# shape: (2, 3)
# ┌─────┬──────┬──────┐
# │ a   ┆ b    ┆ c    │
# │ --- ┆ ---  ┆ ---  │
# │ i64 ┆ i64  ┆ i64  │
# ╞═════╪══════╪══════╡
# │ 1   ┆ 3    ┆ null │
# │ 2   ┆ null ┆ 4    │
# └─────┴──────┴──────┘

Parameters:

  • items — DataFrames, LazyFrames, or Series to concatenate.

  • how (defaults to: "vertical")

    Note that Series only support the vertical strategy.

    • vertical: Applies multiple vstack operations.
    • vertical_relaxed: Same as vertical, but additionally coerces columns to their common supertype if they are mismatched (e.g. Int32 → Int64).
    • diagonal: Finds a union between the column schemas and fills missing column values with null.
    • diagonal_relaxed: Same as diagonal, but additionally coerces columns to their common supertype if they are mismatched (e.g. Int32 → Int64).
    • horizontal: Stacks Series from DataFrames horizontally and fills with null if the lengths don't match.
    • align, align_full, align_left, align_right: Combines frames horizontally, auto-determining the common key columns and aligning rows using the same logic as align_frames (note that "align" is an alias for "align_full"). The "align" strategy determines the type of join used to align the frames, equivalent to the "how" parameter on align_frames. Note that the common join columns are automatically coalesced, but other column collisions will raise an error (if you need more control over this you should use a suitable join method directly).
  • strict (defaults to: false)

    When how=horizontal, require all DataFrames to be the same height, raising an error if not.

Returns:

.using_string_cache ⇒ Boolean Originally defined in module Functions

Check whether the global string cache is enabled.

Returns:

.var(column, ddof: 1) ⇒ Expr Originally defined in module Functions

Get the variance.

This function is syntactic sugar for col(column).var(ddof: ddof).

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 8, 3],
    "b" => [4, 5, 2],
    "c" => ["foo", "bar", "foo"]
  }
)
df.select(Polars.var("a"))
# =>
# shape: (1, 1)
# ┌──────┐
# │ a    │
# │ ---  │
# │ f64  │
# ╞══════╡
# │ 13.0 │
# └──────┘
df["a"].var
# => 13.0

Parameters:

  • column — Column name.

  • ddof (defaults to: 1)

    “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1.

Returns:

.when(*predicates, **constraints) ⇒ When Originally defined in module Functions

Start a "when, then, otherwise" expression.

Examples:

Below we add a column with the value 1 where column "foo" > 2, and the value -1 where it isn't.

df = Polars::DataFrame.new({"foo" => [1, 3, 4], "bar" => [3, 4, 0]})
df.with_columns(Polars.when(Polars.col("foo") > 2).then(Polars.lit(1)).otherwise(Polars.lit(-1)))
# =>
# shape: (3, 3)
# ┌─────┬─────┬─────────┐
# │ foo ┆ bar ┆ literal │
# │ --- ┆ --- ┆ ---     │
# │ i64 ┆ i64 ┆ i32     │
# ╞═════╪═════╪═════════╡
# │ 1   ┆ 3   ┆ -1      │
# │ 3   ┆ 4   ┆ 1       │
# │ 4   ┆ 0   ┆ 1       │
# └─────┴─────┴─────────┘

Or with multiple when-then operations chained:

df.with_columns(
  Polars.when(Polars.col("foo") > 2)
  .then(1)
  .when(Polars.col("bar") > 2)
  .then(4)
  .otherwise(-1)
  .alias("val")
)
# =>
# shape: (3, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ val │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ i32 │
# ╞═════╪═════╪═════╡
# │ 1   ┆ 3   ┆ 4   │
# │ 3   ┆ 4   ┆ 1   │
# │ 4   ┆ 0   ┆ 1   │
# └─────┴─────┴─────┘

The otherwise at the end is optional. If left out, any rows where none of the when expressions evaluate to true are set to null:

df.with_columns(Polars.when(Polars.col("foo") > 2).then(1).alias("val"))
# =>
# shape: (3, 3)
# ┌─────┬─────┬──────┐
# │ foo ┆ bar ┆ val  │
# │ --- ┆ --- ┆ ---  │
# │ i64 ┆ i64 ┆ i32  │
# ╞═════╪═════╪══════╡
# │ 1   ┆ 3   ┆ null │
# │ 3   ┆ 4   ┆ 1    │
# │ 4   ┆ 0   ┆ 1    │
# └─────┴─────┴──────┘

Pass multiple predicates, each of which must be met:

df.with_columns(
  val: Polars.when(
    Polars.col("bar") > 0,
    Polars.col("foo") % 2 != 0
  )
  .then(99)
  .otherwise(-1)
)
# =>
# shape: (3, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ val │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ i32 │
# ╞═════╪═════╪═════╡
# │ 1   ┆ 3   ┆ 99  │
# │ 3   ┆ 4   ┆ 99  │
# │ 4   ┆ 0   ┆ -1  │
# └─────┴─────┴─────┘

Pass conditions as keyword arguments:

df.with_columns(val: Polars.when(foo: 4, bar: 0).then(99).otherwise(-1))
# =>
# shape: (3, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ val │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ i32 │
# ╞═════╪═════╪═════╡
# │ 1   ┆ 3   ┆ -1  │
# │ 3   ┆ 4   ┆ -1  │
# │ 4   ┆ 0   ┆ 99  │
# └─────┴─────┴─────┘

Returns:

.zeros(n, dtype: Float64, eager: false) ⇒ Object Originally defined in module Functions

Construct a column of length n filled with zeros.

This is syntactic sugar for the repeat function.

Examples:

Polars.zeros(3, dtype: Polars::Int8, eager: true)
# =>
# shape: (3,)
# Series: 'zeros' [i8]
# [
#         0
#         0
#         0
# ]

Parameters:

  • n — Length of the resulting column.

  • dtype (defaults to: Float64)

    Data type of the resulting column. Defaults to Float64.

  • eager (defaults to: false)

    Evaluate immediately and return a Series. If set to false, return an expression instead.

Returns: