Module: Polars
- Defined in:
- lib/polars.rb,
lib/polars/expr.rb,
lib/polars/plot.rb,
lib/polars/slice.rb,
lib/polars/utils.rb,
lib/polars/config.rb,
lib/polars/io/csv.rb,
lib/polars/io/ipc.rb,
lib/polars/series.rb,
lib/polars/convert.rb,
lib/polars/io/avro.rb,
lib/polars/io/json.rb,
lib/polars/testing.rb,
lib/polars/version.rb,
lib/polars/cat_expr.rb,
lib/polars/group_by.rb,
lib/polars/whenthen.rb,
lib/polars/functions.rb,
lib/polars/io/ndjson.rb,
lib/polars/list_expr.rb,
lib/polars/meta_expr.rb,
lib/polars/name_expr.rb,
lib/polars/array_expr.rb,
lib/polars/data_frame.rb,
lib/polars/data_types.rb,
lib/polars/exceptions.rb,
lib/polars/io/parquet.rb,
lib/polars/lazy_frame.rb,
lib/polars/utils/wrap.rb,
lib/polars/binary_expr.rb,
lib/polars/io/database.rb,
lib/polars/sql_context.rb,
lib/polars/string_expr.rb,
lib/polars/struct_expr.rb,
lib/polars/utils/parse.rb,
lib/polars/string_cache.rb,
lib/polars/expr_dispatch.rb,
lib/polars/functions/col.rb,
lib/polars/functions/len.rb,
lib/polars/functions/lit.rb,
lib/polars/lazy_group_by.rb,
lib/polars/utils/convert.rb,
lib/polars/utils/various.rb,
lib/polars/cat_name_space.rb,
lib/polars/date_time_expr.rb,
lib/polars/functions/lazy.rb,
lib/polars/functions/eager.rb,
lib/polars/list_name_space.rb,
lib/polars/utils/constants.rb,
lib/polars/array_name_space.rb,
lib/polars/dynamic_group_by.rb,
lib/polars/functions/random.rb,
lib/polars/functions/repeat.rb,
lib/polars/rolling_group_by.rb,
lib/polars/binary_name_space.rb,
lib/polars/string_name_space.rb,
lib/polars/struct_name_space.rb,
lib/polars/batched_csv_reader.rb,
lib/polars/functions/whenthen.rb,
lib/polars/date_time_name_space.rb,
lib/polars/functions/as_datatype.rb,
lib/polars/functions/range/int_range.rb,
lib/polars/functions/range/date_range.rb,
lib/polars/functions/range/time_range.rb,
lib/polars/functions/aggregation/vertical.rb,
lib/polars/functions/range/datetime_range.rb,
lib/polars/functions/aggregation/horizontal.rb
Defined Under Namespace
Modules: Convert, Functions, IO, Plot, Testing Classes: Array, ArrayExpr, ArrayNameSpace, Binary, BinaryExpr, BinaryNameSpace, Boolean, CatExpr, CatNameSpace, Categorical, Config, DataFrame, DataType, Date, DateTimeExpr, DateTimeNameSpace, Datetime, Decimal, Duration, DynamicGroupBy, Enum, Expr, Field, Float32, Float64, FloatType, GroupBy, Int16, Int32, Int64, Int8, IntegerType, LazyFrame, LazyGroupBy, List, ListExpr, ListNameSpace, MetaExpr, NameExpr, NestedType, Null, NumericType, Object, RollingGroupBy, SQLContext, Series, SignedIntegerType, String, StringCache, StringExpr, StringNameSpace, Struct, StructExpr, StructNameSpace, TemporalType, Time, UInt16, UInt32, UInt64, UInt8, Unknown, UnsignedIntegerType
Class Method Summary collapse
-
.align_frames(*frames, on:, select: nil, reverse: false) ⇒ Object
extended
from Functions
Align a sequence of frames using the unique values from one or more columns as a key.
-
.all(*names, ignore_nulls: true) ⇒ Expr
extended
from Functions
Either return an expression representing all columns, or evaluate a bitwise AND operation.
-
.all_horizontal(*exprs) ⇒ Expr
extended
from Functions
Compute the bitwise AND horizontally across columns.
-
.any(*names, ignore_nulls: true) ⇒ Expr
extended
from Functions
Evaluate a bitwise OR operation.
-
.any_horizontal(*exprs) ⇒ Expr
extended
from Functions
Compute the bitwise OR horizontally across columns.
-
.approx_n_unique(*columns) ⇒ Expr
extended
from Functions
Approximate count of unique values.
-
.arctan2(y, x) ⇒ Expr
extended
from Functions
Compute two argument arctan in radians.
-
.arctan2d(y, x) ⇒ Expr
extended
from Functions
Compute two argument arctan in degrees.
-
.arg_sort_by(exprs, reverse: false) ⇒ Expr
(also: #argsort_by)
extended
from Functions
Find the indexes that would sort the columns.
-
.arg_where(condition, eager: false) ⇒ Expr, Series
extended
from Functions
Return indices where condition evaluates to true.
-
.coalesce(exprs, *more_exprs) ⇒ Expr
extended
from Functions
Folds the columns from left to right, keeping the first non-null value.
-
.col(name, *more_names) ⇒ Expr
extended
from Functions
Return an expression representing a column in a DataFrame.
-
.collect_all(lazy_frames, type_coercion: true, predicate_pushdown: true, projection_pushdown: true, simplify_expression: true, string_cache: false, no_optimization: false, slice_pushdown: true, common_subplan_elimination: true, allow_streaming: false) ⇒ Array
extended
from Functions
Collect multiple LazyFrames at the same time.
-
.concat(items, rechunk: true, how: "vertical", parallel: true) ⇒ Object
extended
from Functions
Aggregate multiple DataFrames/Series to a single DataFrame/Series.
-
.concat_list(exprs) ⇒ Expr
extended
from Functions
Concat the arrays in a Series dtype List in linear time.
-
.concat_str(exprs, sep: "", ignore_nulls: false) ⇒ Expr
extended
from Functions
Horizontally concat Utf8 Series in linear time.
-
.corr(a, b, method: "pearson", ddof: 1, propagate_nans: false) ⇒ Expr
extended
from Functions
Compute the Pearson's or Spearman rank correlation between two columns.
-
.count(*columns) ⇒ Expr
extended
from Functions
Return the number of non-null values in the column.
-
.cov(a, b, ddof: 1) ⇒ Expr
extended
from Functions
Compute the covariance between two columns/expressions.
-
.cum_count(*columns, reverse: false) ⇒ Expr
extended
from Functions
Return the cumulative count of the non-null values in the column.
-
.cum_fold(acc, f, exprs, include_init: false) ⇒ Object
(also: #cumfold)
extended
from Functions
Cumulatively accumulate over multiple columns horizontally/row wise with a left fold.
-
.cum_sum(*names) ⇒ Expr
(also: #cumsum)
extended
from Functions
Cumulatively sum all values.
-
.cum_sum_horizontal(*exprs) ⇒ Expr
(also: #cumsum_horizontal)
extended
from Functions
Cumulatively sum all values horizontally across columns.
-
.date_range(start, stop, interval = "1d", closed: "both", eager: false) ⇒ Object
extended
from Functions
Create a range of type Datetime (or Date).
-
.date_ranges(start, stop, interval = "1d", closed: "both", eager: false) ⇒ Object
extended
from Functions
Create a column of date ranges.
-
.datetime_range(start, stop, interval = "1d", closed: "both", time_unit: nil, time_zone: nil, eager: false) ⇒ Object
extended
from Functions
Generate a datetime range.
-
.datetime_ranges(start, stop, interval: "1d", closed: "both", time_unit: nil, time_zone: nil, eager: false) ⇒ Object
extended
from Functions
Create a column of datetime ranges.
-
.disable_string_cache ⇒ nil
extended
from Functions
Disable and clear the global string cache.
-
.duration(weeks: nil, days: nil, hours: nil, minutes: nil, seconds: nil, milliseconds: nil, microseconds: nil, nanoseconds: nil, time_unit: "us") ⇒ Expr
extended
from Functions
Create a polars Duration from distinct time components.
-
.element ⇒ Expr
extended
from Functions
Alias for an element being evaluated in an eval expression.
-
.enable_string_cache ⇒ nil
extended
from Functions
Enable the global string cache.
-
.exclude(columns) ⇒ Object
extended
from Functions
Exclude certain columns from a wildcard/regex selection.
-
.first(*columns) ⇒ Expr
extended
from Functions
Get the first value.
-
.fold(acc, f, exprs) ⇒ Expr
extended
from Functions
Accumulate over multiple columns horizontally/row wise with a left fold.
-
.format(f_string, *args) ⇒ Expr
extended
from Functions
Format expressions as a string.
-
.from_epoch(column, unit: "s", eager: false) ⇒ Object
extended
from Functions
Utility function that parses an epoch timestamp (or Unix time) to Polars Date(time).
-
.from_hash(data, schema: nil, columns: nil) ⇒ DataFrame
extended
from Convert
Construct a DataFrame from a dictionary of sequences.
-
.get_dummies(df, columns: nil) ⇒ DataFrame
extended
from Functions
Convert categorical variables into dummy/indicator variables.
-
.groups(column) ⇒ Object
extended
from Functions
Syntactic sugar for
Polars.col("foo").agg_groups
. -
.head(column, n = 10) ⇒ Expr
extended
from Functions
Get the first
n
rows. -
.implode(*columns) ⇒ Expr
extended
from Functions
Aggregate all column values into a list.
-
.int_range(start, stop = nil, step: 1, eager: false, dtype: nil) ⇒ Expr, Series
(also: #arange)
extended
from Functions
Create a range expression (or Series).
-
.last(*columns) ⇒ Expr
extended
from Functions
Get the last value.
-
.len ⇒ Expr
(also: #length)
extended
from Functions
Return the number of rows in the context.
-
.lit(value, dtype: nil, allow_object: nil) ⇒ Expr
extended
from Functions
Return an expression representing a literal value.
-
.max(*names) ⇒ Expr
extended
from Functions
Get the maximum value.
-
.max_horizontal(*exprs) ⇒ Expr
extended
from Functions
Get the maximum value horizontally across columns.
-
.mean(*columns) ⇒ Expr
(also: #avg)
extended
from Functions
Get the mean value.
-
.mean_horizontal(*exprs) ⇒ Expr
extended
from Functions
Compute the mean of all values horizontally across columns.
-
.median(*columns) ⇒ Expr
extended
from Functions
Get the median value.
-
.min(*names) ⇒ Expr
extended
from Functions
Get the minimum value.
-
.min_horizontal(*exprs) ⇒ Expr
extended
from Functions
Get the minimum value horizontally across columns.
-
.n_unique(*columns) ⇒ Expr
extended
from Functions
Count unique values.
-
.nth(*indices) ⇒ Expr
extended
from Functions
Get the nth column(s) of the context.
-
.ones(n, dtype: nil, eager: true) ⇒ Object
extended
from Functions
Construct a column of length
n
filled with ones. -
.pearson_corr(a, b, ddof: 1) ⇒ Expr
extended
from Functions
Compute the pearson's correlation between two columns.
-
.quantile(column, quantile, interpolation: "nearest") ⇒ Expr
extended
from Functions
Syntactic sugar for
Polars.col("foo").quantile(...)
. -
.read_avro(source, columns: nil, n_rows: nil) ⇒ DataFrame
extended
from IO
Read into a DataFrame from Apache Avro format.
-
.read_csv(source, has_header: true, columns: nil, new_columns: nil, sep: ",", comment_char: nil, quote_char: '"', skip_rows: 0, dtypes: nil, null_values: nil, ignore_errors: false, parse_dates: false, n_threads: nil, infer_schema_length: N_INFER_DEFAULT, batch_size: 8192, n_rows: nil, encoding: "utf8", low_memory: false, rechunk: true, storage_options: nil, skip_rows_after_header: 0, row_count_name: nil, row_count_offset: 0, sample_size: 1024, eol_char: "\n", truncate_ragged_lines: false) ⇒ DataFrame
extended
from IO
Read a CSV file into a DataFrame.
-
.read_csv_batched(source, has_header: true, columns: nil, new_columns: nil, sep: ",", comment_char: nil, quote_char: '"', skip_rows: 0, dtypes: nil, null_values: nil, missing_utf8_is_empty_string: false, ignore_errors: false, parse_dates: false, n_threads: nil, infer_schema_length: N_INFER_DEFAULT, batch_size: 50_000, n_rows: nil, encoding: "utf8", low_memory: false, rechunk: true, skip_rows_after_header: 0, row_count_name: nil, row_count_offset: 0, sample_size: 1024, eol_char: "\n", raise_if_empty: true, truncate_ragged_lines: false, decimal_comma: false) ⇒ BatchedCsvReader
extended
from IO
Read a CSV file in batches.
-
.read_database(query, schema_overrides: nil) ⇒ DataFrame
(also: #read_sql)
extended
from IO
Read a SQL query into a DataFrame.
-
.read_ipc(source, columns: nil, n_rows: nil, memory_map: true, storage_options: nil, row_count_name: nil, row_count_offset: 0, rechunk: true) ⇒ DataFrame
extended
from IO
Read into a DataFrame from Arrow IPC (Feather v2) file.
-
.read_ipc_schema(source) ⇒ Hash
extended
from IO
Get a schema of the IPC file without reading data.
-
.read_ipc_stream(source, columns: nil, n_rows: nil, storage_options: nil, row_index_name: nil, row_index_offset: 0, rechunk: true) ⇒ DataFrame
extended
from IO
Read into a DataFrame from Arrow IPC record batch stream.
-
.read_json(source, schema: nil, schema_overrides: nil, infer_schema_length: N_INFER_DEFAULT) ⇒ DataFrame
extended
from IO
Read into a DataFrame from a JSON file.
-
.read_ndjson(source, schema: nil, schema_overrides: nil, ignore_errors: false) ⇒ DataFrame
extended
from IO
Read into a DataFrame from a newline delimited JSON file.
-
.read_parquet(source, columns: nil, n_rows: nil, storage_options: nil, parallel: "auto", row_count_name: nil, row_count_offset: 0, low_memory: false, use_statistics: true, rechunk: true) ⇒ DataFrame
extended
from IO
Read into a DataFrame from a parquet file.
-
.read_parquet_schema(source) ⇒ Hash
extended
from IO
Get a schema of the Parquet file without reading data.
-
.repeat(value, n, dtype: nil, eager: false, name: nil) ⇒ Object
extended
from Functions
Repeat a single value n times.
-
.scan_csv(source, has_header: true, sep: ",", comment_char: nil, quote_char: '"', skip_rows: 0, dtypes: nil, null_values: nil, missing_utf8_is_empty_string: false, ignore_errors: false, cache: true, with_column_names: nil, infer_schema_length: N_INFER_DEFAULT, n_rows: nil, encoding: "utf8", low_memory: false, rechunk: true, skip_rows_after_header: 0, row_count_name: nil, row_count_offset: 0, parse_dates: false, eol_char: "\n", raise_if_empty: true, truncate_ragged_lines: false, decimal_comma: false, glob: true) ⇒ LazyFrame
extended
from IO
Lazily read from a CSV file or multiple files via glob patterns.
-
.scan_ipc(source, n_rows: nil, cache: true, rechunk: true, row_count_name: nil, row_count_offset: 0, storage_options: nil, hive_partitioning: nil, hive_schema: nil, try_parse_hive_dates: true, include_file_paths: nil) ⇒ LazyFrame
extended
from IO
Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
-
.scan_ndjson(source, infer_schema_length: N_INFER_DEFAULT, batch_size: 1024, n_rows: nil, low_memory: false, rechunk: true, row_count_name: nil, row_count_offset: 0) ⇒ LazyFrame
extended
from IO
Lazily read from a newline delimited JSON file.
-
.scan_parquet(source, n_rows: nil, cache: true, parallel: "auto", glob: true, rechunk: true, row_count_name: nil, row_count_offset: 0, storage_options: nil, low_memory: false, include_file_paths: nil) ⇒ LazyFrame
extended
from IO
Lazily read from a parquet file or multiple files via glob patterns.
-
.select(*exprs, **named_exprs) ⇒ DataFrame
extended
from Functions
Run polars expressions without a context.
-
.set_random_seed(seed) ⇒ nil
extended
from Functions
Set the global random seed for Polars.
-
.spearman_rank_corr(a, b, ddof: 1, propagate_nans: false) ⇒ Expr
extended
from Functions
Compute the spearman rank correlation between two columns.
-
.sql_expr(sql) ⇒ Expr
extended
from Functions
Parse one or more SQL expressions to polars expression(s).
-
.std(column, ddof: 1) ⇒ Expr
extended
from Functions
Get the standard deviation.
-
.struct(*exprs, schema: nil, eager: false, **named_exprs) ⇒ Object
extended
from Functions
Collect several columns into a Series of dtype Struct.
-
.sum(*names) ⇒ Expr
extended
from Functions
Sum all values.
-
.sum_horizontal(*exprs) ⇒ Expr
extended
from Functions
Sum all values horizontally across columns.
-
.tail(column, n = 10) ⇒ Expr
extended
from Functions
Get the last
n
rows. -
.time_range(start = nil, stop = nil, interval = "1h", closed: "both", eager: false) ⇒ Object
extended
from Functions
Generate a time range.
-
.time_ranges(start = nil, stop = nil, interval = "1h", closed: "both", eager: false) ⇒ Object
extended
from Functions
Create a column of time ranges.
-
.to_list(name) ⇒ Expr
extended
from Functions
Aggregate to list.
-
.using_string_cache ⇒ Boolean
extended
from Functions
Check whether the global string cache is enabled.
-
.var(column, ddof: 1) ⇒ Expr
extended
from Functions
Get the variance.
-
.when(*predicates, **constraints) ⇒ When
extended
from Functions
Start a "when, then, otherwise" expression.
-
.zeros(n, dtype: nil, eager: true) ⇒ Object
extended
from Functions
Construct a column of length
n
filled with zeros.
Class Method Details
.align_frames(*frames, on:, select: nil, reverse: false) ⇒ Object Originally defined in module Functions
Align a sequence of frames using the unique values from one or more columns as a key.
Frames that do not contain the given key values have rows injected (with nulls filling the non-key columns), and each resulting frame is sorted by the key.
The original column order of input frames is not changed unless select
is
specified (in which case the final column order is determined from that).
Note that this does not result in a joined frame - you receive the same number of frames back that you passed in, but each is now aligned by key and has the same number of rows.
.all(*names, ignore_nulls: true) ⇒ Expr Originally defined in module Functions
Either return an expression representing all columns, or evaluate a bitwise AND operation.
If no arguments are passed, this function is syntactic sugar for col("*")
.
Otherwise, this function is syntactic sugar for col(names).all
.
.all_horizontal(*exprs) ⇒ Expr Originally defined in module Functions
Compute the bitwise AND horizontally across columns.
.any(*names, ignore_nulls: true) ⇒ Expr Originally defined in module Functions
Evaluate a bitwise OR operation.
Syntactic sugar for col(names).any
.
.any_horizontal(*exprs) ⇒ Expr Originally defined in module Functions
Compute the bitwise OR horizontally across columns.
.approx_n_unique(*columns) ⇒ Expr Originally defined in module Functions
Approximate count of unique values.
This function is syntactic sugar for col(columns).approx_n_unique
, and
uses the HyperLogLog++ algorithm for cardinality estimation.
.arctan2(y, x) ⇒ Expr Originally defined in module Functions
Compute two argument arctan in radians.
Returns the angle (in radians) in the plane between the positive x-axis and the ray from the origin to (x,y).
.arctan2d(y, x) ⇒ Expr Originally defined in module Functions
Compute two argument arctan in degrees.
Returns the angle (in degrees) in the plane between the positive x-axis and the ray from the origin to (x,y).
.arg_sort_by(exprs, reverse: false) ⇒ Expr Also known as: argsort_by Originally defined in module Functions
Find the indexes that would sort the columns.
Argsort by multiple columns. The first column will be used for the ordering. If there are duplicates in the first column, the second column will be used to determine the ordering and so on.
.arg_where(condition, eager: false) ⇒ Expr, Series Originally defined in module Functions
Return indices where condition evaluates to true.
.coalesce(exprs, *more_exprs) ⇒ Expr Originally defined in module Functions
Folds the columns from left to right, keeping the first non-null value.
.col(name, *more_names) ⇒ Expr Originally defined in module Functions
Return an expression representing a column in a DataFrame.
.collect_all(lazy_frames, type_coercion: true, predicate_pushdown: true, projection_pushdown: true, simplify_expression: true, string_cache: false, no_optimization: false, slice_pushdown: true, common_subplan_elimination: true, allow_streaming: false) ⇒ Array Originally defined in module Functions
Collect multiple LazyFrames at the same time.
This runs all the computation graphs in parallel on the Polars thread pool.
.concat(items, rechunk: true, how: "vertical", parallel: true) ⇒ Object Originally defined in module Functions
Aggregate multiple DataFrames/Series to a single DataFrame/Series.
.concat_list(exprs) ⇒ Expr Originally defined in module Functions
Concat the arrays in a Series dtype List in linear time.
.concat_str(exprs, sep: "", ignore_nulls: false) ⇒ Expr Originally defined in module Functions
Horizontally concat Utf8 Series in linear time. Non-Utf8 columns are cast to Utf8.
.corr(a, b, method: "pearson", ddof: 1, propagate_nans: false) ⇒ Expr Originally defined in module Functions
Compute the Pearson's or Spearman rank correlation between two columns.
.count(*columns) ⇒ Expr Originally defined in module Functions
Return the number of non-null values in the column.
This function is syntactic sugar for col(columns).count
.
Calling this function without any arguments returns the number of rows in the
context. This way of using the function is deprecated. Please use len
instead.
.cov(a, b, ddof: 1) ⇒ Expr Originally defined in module Functions
Compute the covariance between two columns/expressions.
.cum_count(*columns, reverse: false) ⇒ Expr Originally defined in module Functions
Return the cumulative count of the non-null values in the column.
This function is syntactic sugar for col(columns).cum_count
.
If no arguments are passed, returns the cumulative count of a context. Rows containing null values count towards the result.
.cum_fold(acc, f, exprs, include_init: false) ⇒ Object Also known as: cumfold Originally defined in module Functions
If you simply want the first encountered expression as accumulator,
consider using cumreduce
.
Cumulatively accumulate over multiple columns horizontally/row wise with a left fold.
Every cumulative result is added as a separate field in a Struct column.
.cum_sum(*names) ⇒ Expr Also known as: cumsum Originally defined in module Functions
Cumulatively sum all values.
Syntactic sugar for col(names).cum_sum
.
.cum_sum_horizontal(*exprs) ⇒ Expr Also known as: cumsum_horizontal Originally defined in module Functions
Cumulatively sum all values horizontally across columns.
.date_range(start, stop, interval = "1d", closed: "both", eager: false) ⇒ Object Originally defined in module Functions
If both start and stop are passed as date types (not datetime), and the interval granularity is no finer than 1d, the returned range is also of type date. All other permutations return a datetime Series.
Create a range of type Datetime (or Date).
.date_ranges(start, stop, interval = "1d", closed: "both", eager: false) ⇒ Object Originally defined in module Functions
interval
is created according to the following string language:
- 1ns (1 nanosecond)
- 1us (1 microsecond)
- 1ms (1 millisecond)
- 1s (1 second)
- 1m (1 minute)
- 1h (1 hour)
- 1d (1 calendar day)
- 1w (1 calendar week)
- 1mo (1 calendar month)
- 1q (1 calendar quarter)
- 1y (1 calendar year)
Or combine them: "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year".
Create a column of date ranges.
.datetime_range(start, stop, interval = "1d", closed: "both", time_unit: nil, time_zone: nil, eager: false) ⇒ Object Originally defined in module Functions
Generate a datetime range.
.datetime_ranges(start, stop, interval: "1d", closed: "both", time_unit: nil, time_zone: nil, eager: false) ⇒ Object Originally defined in module Functions
Create a column of datetime ranges.
.disable_string_cache ⇒ nil Originally defined in module Functions
Disable and clear the global string cache.
.duration(weeks: nil, days: nil, hours: nil, minutes: nil, seconds: nil, milliseconds: nil, microseconds: nil, nanoseconds: nil, time_unit: "us") ⇒ Expr Originally defined in module Functions
Create a polars Duration from distinct time components.
.element ⇒ Expr Originally defined in module Functions
Alias for an element being evaluated in an eval expression.
.enable_string_cache ⇒ nil Originally defined in module Functions
Enable the global string cache.
Categorical
columns created under the same global string cache have
the same underlying physical value when string values are equal. This allows the
columns to be concatenated or used in a join operation, for example.
.exclude(columns) ⇒ Object Originally defined in module Functions
Exclude certain columns from a wildcard/regex selection.
.fold(acc, f, exprs) ⇒ Expr Originally defined in module Functions
Accumulate over multiple columns horizontally/row wise with a left fold.
.format(f_string, *args) ⇒ Expr Originally defined in module Functions
Format expressions as a string.
.from_epoch(column, unit: "s", eager: false) ⇒ Object Originally defined in module Functions
Utility function that parses an epoch timestamp (or Unix time) to Polars Date(time).
Depending on the unit
provided, this function will return a different dtype:
- unit: "d" returns pl.Date
- unit: "s" returns pl.Datetime["us"]
- unit: "ms" returns pl.Datetime["ms"]
- unit: "us" returns pl.Datetime["us"]
- unit: "ns" returns pl.Datetime["ns"]
.from_hash(data, schema: nil, columns: nil) ⇒ DataFrame Originally defined in module Convert
Construct a DataFrame from a dictionary of sequences.
This operation clones data, unless you pass in a Hash<String, Series>
.
.get_dummies(df, columns: nil) ⇒ DataFrame Originally defined in module Functions
Convert categorical variables into dummy/indicator variables.
.groups(column) ⇒ Object Originally defined in module Functions
Syntactic sugar for Polars.col("foo").agg_groups
.
.head(column, n = 10) ⇒ Expr Originally defined in module Functions
Get the first n
rows.
This function is syntactic sugar for col(column).head(n)
.
.implode(*columns) ⇒ Expr Originally defined in module Functions
Aggregate all column values into a list.
This function is syntactic sugar for col(name).implode
.
.int_range(start, stop = nil, step: 1, eager: false, dtype: nil) ⇒ Expr, Series Also known as: arange Originally defined in module Functions
Create a range expression (or Series).
This can be used in a select
, with_column
, etc. Be sure that the resulting
range size is equal to the length of the DataFrame you are collecting.
.len ⇒ Expr Also known as: length Originally defined in module Functions
Return the number of rows in the context.
This is similar to COUNT(*)
in SQL.
.lit(value, dtype: nil, allow_object: nil) ⇒ Expr Originally defined in module Functions
Return an expression representing a literal value.
.max(*names) ⇒ Expr Originally defined in module Functions
Get the maximum value.
Syntactic sugar for col(names).max
.
.max_horizontal(*exprs) ⇒ Expr Originally defined in module Functions
Get the maximum value horizontally across columns.
.mean(*columns) ⇒ Expr Also known as: avg Originally defined in module Functions
Get the mean value.
This function is syntactic sugar for col(columns).mean
.
.mean_horizontal(*exprs) ⇒ Expr Originally defined in module Functions
Compute the mean of all values horizontally across columns.
.median(*columns) ⇒ Expr Originally defined in module Functions
Get the median value.
This function is syntactic sugar for col(columns).median.
.min(*names) ⇒ Expr Originally defined in module Functions
Get the minimum value.
Syntactic sugar for col(names).min
.
.min_horizontal(*exprs) ⇒ Expr Originally defined in module Functions
Get the minimum value horizontally across columns.
.n_unique(*columns) ⇒ Expr Originally defined in module Functions
Count unique values.
This function is syntactic sugar for col(columns).n_unique
.
.ones(n, dtype: nil, eager: true) ⇒ Object Originally defined in module Functions
Construct a column of length n
filled with ones.
This is syntactic sugar for the repeat
function.
.pearson_corr(a, b, ddof: 1) ⇒ Expr Originally defined in module Functions
Compute the pearson's correlation between two columns.
.quantile(column, quantile, interpolation: "nearest") ⇒ Expr Originally defined in module Functions
Syntactic sugar for Polars.col("foo").quantile(...)
.
.read_avro(source, columns: nil, n_rows: nil) ⇒ DataFrame Originally defined in module IO
Read into a DataFrame from Apache Avro format.
.read_csv(source, has_header: true, columns: nil, new_columns: nil, sep: ",", comment_char: nil, quote_char: '"', skip_rows: 0, dtypes: nil, null_values: nil, ignore_errors: false, parse_dates: false, n_threads: nil, infer_schema_length: N_INFER_DEFAULT, batch_size: 8192, n_rows: nil, encoding: "utf8", low_memory: false, rechunk: true, storage_options: nil, skip_rows_after_header: 0, row_count_name: nil, row_count_offset: 0, sample_size: 1024, eol_char: "\n", truncate_ragged_lines: false) ⇒ DataFrame Originally defined in module IO
This operation defaults to a rechunk operation at the end, meaning that all data will be stored contiguously in memory.
Set rechunk: false if you are benchmarking the csv-reader. A rechunk is an expensive operation.
Read a CSV file into a DataFrame.
.read_csv_batched(source, has_header: true, columns: nil, new_columns: nil, sep: ",", comment_char: nil, quote_char: '"', skip_rows: 0, dtypes: nil, null_values: nil, missing_utf8_is_empty_string: false, ignore_errors: false, parse_dates: false, n_threads: nil, infer_schema_length: N_INFER_DEFAULT, batch_size: 50_000, n_rows: nil, encoding: "utf8", low_memory: false, rechunk: true, skip_rows_after_header: 0, row_count_name: nil, row_count_offset: 0, sample_size: 1024, eol_char: "\n", raise_if_empty: true, truncate_ragged_lines: false, decimal_comma: false) ⇒ BatchedCsvReader Originally defined in module IO
Read a CSV file in batches.
Upon creation of the BatchedCsvReader
,
polars will gather statistics and determine the
file chunks. After that work will only be done
if next_batches
is called.
.read_database(query, schema_overrides: nil) ⇒ DataFrame Also known as: read_sql Originally defined in module IO
Read a SQL query into a DataFrame.
.read_ipc(source, columns: nil, n_rows: nil, memory_map: true, storage_options: nil, row_count_name: nil, row_count_offset: 0, rechunk: true) ⇒ DataFrame Originally defined in module IO
Read into a DataFrame from Arrow IPC (Feather v2) file.
.read_ipc_schema(source) ⇒ Hash Originally defined in module IO
Get a schema of the IPC file without reading data.
.read_ipc_stream(source, columns: nil, n_rows: nil, storage_options: nil, row_index_name: nil, row_index_offset: 0, rechunk: true) ⇒ DataFrame Originally defined in module IO
Read into a DataFrame from Arrow IPC record batch stream.
See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html.
.read_json(source, schema: nil, schema_overrides: nil, infer_schema_length: N_INFER_DEFAULT) ⇒ DataFrame Originally defined in module IO
Read into a DataFrame from a JSON file.
.read_ndjson(source, schema: nil, schema_overrides: nil, ignore_errors: false) ⇒ DataFrame Originally defined in module IO
Read into a DataFrame from a newline delimited JSON file.
.read_parquet(source, columns: nil, n_rows: nil, storage_options: nil, parallel: "auto", row_count_name: nil, row_count_offset: 0, low_memory: false, use_statistics: true, rechunk: true) ⇒ DataFrame Originally defined in module IO
This operation defaults to a rechunk operation at the end, meaning that all data will be stored contiguously in memory.
Set rechunk: false if you are benchmarking the parquet-reader. A rechunk is an expensive operation.
Read into a DataFrame from a parquet file.
.read_parquet_schema(source) ⇒ Hash Originally defined in module IO
Get a schema of the Parquet file without reading data.
.repeat(value, n, dtype: nil, eager: false, name: nil) ⇒ Object Originally defined in module Functions
Repeat a single value n times.
.scan_csv(source, has_header: true, sep: ",", comment_char: nil, quote_char: '"', skip_rows: 0, dtypes: nil, null_values: nil, missing_utf8_is_empty_string: false, ignore_errors: false, cache: true, with_column_names: nil, infer_schema_length: N_INFER_DEFAULT, n_rows: nil, encoding: "utf8", low_memory: false, rechunk: true, skip_rows_after_header: 0, row_count_name: nil, row_count_offset: 0, parse_dates: false, eol_char: "\n", raise_if_empty: true, truncate_ragged_lines: false, decimal_comma: false, glob: true) ⇒ LazyFrame Originally defined in module IO
Lazily read from a CSV file or multiple files via glob patterns.
This allows the query optimizer to push down predicates and projections to the scan level, thereby potentially reducing memory overhead.
.scan_ipc(source, n_rows: nil, cache: true, rechunk: true, row_count_name: nil, row_count_offset: 0, storage_options: nil, hive_partitioning: nil, hive_schema: nil, try_parse_hive_dates: true, include_file_paths: nil) ⇒ LazyFrame Originally defined in module IO
Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
This allows the query optimizer to push down predicates and projections to the scan level, thereby potentially reducing memory overhead.
.scan_ndjson(source, infer_schema_length: N_INFER_DEFAULT, batch_size: 1024, n_rows: nil, low_memory: false, rechunk: true, row_count_name: nil, row_count_offset: 0) ⇒ LazyFrame Originally defined in module IO
Lazily read from a newline delimited JSON file.
This allows the query optimizer to push down predicates and projections to the scan level, thereby potentially reducing memory overhead.
.scan_parquet(source, n_rows: nil, cache: true, parallel: "auto", glob: true, rechunk: true, row_count_name: nil, row_count_offset: 0, storage_options: nil, low_memory: false, include_file_paths: nil) ⇒ LazyFrame Originally defined in module IO
Lazily read from a parquet file or multiple files via glob patterns.
This allows the query optimizer to push down predicates and projections to the scan level, thereby potentially reducing memory overhead.
.select(*exprs, **named_exprs) ⇒ DataFrame Originally defined in module Functions
Run polars expressions without a context.
This is syntactic sugar for running df.select
on an empty DataFrame.
.set_random_seed(seed) ⇒ nil Originally defined in module Functions
Set the global random seed for Polars.
This random seed is used to determine things such as shuffle ordering.
.spearman_rank_corr(a, b, ddof: 1, propagate_nans: false) ⇒ Expr Originally defined in module Functions
Compute the spearman rank correlation between two columns.
Missing data will be excluded from the computation.
.sql_expr(sql) ⇒ Expr Originally defined in module Functions
Parse one or more SQL expressions to polars expression(s).
.std(column, ddof: 1) ⇒ Expr Originally defined in module Functions
Get the standard deviation.
This function is syntactic sugar for col(column).std(ddof: ddof)
.
.struct(*exprs, schema: nil, eager: false, **named_exprs) ⇒ Object Originally defined in module Functions
Collect several columns into a Series of dtype Struct.
.sum(*names) ⇒ Expr Originally defined in module Functions
Sum all values.
Syntactic sugar for col(name).sum
.
.sum_horizontal(*exprs) ⇒ Expr Originally defined in module Functions
Sum all values horizontally across columns.
.tail(column, n = 10) ⇒ Expr Originally defined in module Functions
Get the last n
rows.
This function is syntactic sugar for col(column).tail(n)
.
.time_range(start = nil, stop = nil, interval = "1h", closed: "both", eager: false) ⇒ Object Originally defined in module Functions
Generate a time range.
.time_ranges(start = nil, stop = nil, interval = "1h", closed: "both", eager: false) ⇒ Object Originally defined in module Functions
Create a column of time ranges.
.using_string_cache ⇒ Boolean Originally defined in module Functions
Check whether the global string cache is enabled.
.var(column, ddof: 1) ⇒ Expr Originally defined in module Functions
Get the variance.
This function is syntactic sugar for col(column).var(ddof: ddof)
.
.when(*predicates, **constraints) ⇒ When Originally defined in module Functions
Start a "when, then, otherwise" expression.