Method: Polars::IO#read_csv

Defined in:
lib/polars/io/csv.rb

#read_csv(source, has_header: true, columns: nil, new_columns: nil, separator: ",", comment_prefix: nil, quote_char: '"', skip_rows: 0, skip_lines: 0, schema: nil, schema_overrides: nil, null_values: nil, missing_utf8_is_empty_string: false, ignore_errors: false, try_parse_dates: false, n_threads: nil, infer_schema: true, infer_schema_length: N_INFER_DEFAULT, batch_size: 8192, n_rows: nil, encoding: "utf8", low_memory: false, rechunk: false, storage_options: nil, skip_rows_after_header: 0, row_index_name: nil, row_index_offset: 0, eol_char: "\n", raise_if_empty: true, truncate_ragged_lines: false, decimal_comma: false, glob: true) ⇒ DataFrame

Note:

Setting rechunk: true stores all data contiguously in memory at the end of the read. A rechunk is an expensive operation, so it is disabled by default; leave rechunk: false if you are benchmarking the CSV reader.

Read a CSV file into a DataFrame.

Parameters:

  • source (Object)

    Path to a file or a file-like object.

  • has_header (Boolean) (defaults to: true)

    Indicate whether the first row of the dataset is a header. If set to false, column names are autogenerated in the format column_x, where x is an enumeration over every column in the dataset, starting at 1.

  • columns (Object) (defaults to: nil)

    Columns to select. Accepts a list of column indices (starting at zero) or a list of column names.

  • new_columns (Object) (defaults to: nil)

    Rename columns right after parsing the CSV file. If the given list is shorter than the width of the DataFrame, the remaining columns keep their original names.

  • separator (String) (defaults to: ",")

    Single byte character to use as separator in the file.

  • comment_prefix (String) (defaults to: nil)

    A string used to indicate the start of a comment line. Comment lines are skipped during parsing. Common examples of comment prefixes are # and //.

  • quote_char (String) (defaults to: '"')

    Single byte character used for CSV quoting. Set to nil to turn off special handling and escaping of quotes.

  • skip_rows (Integer) (defaults to: 0)

    Start reading after skip_rows lines.

  • skip_lines (Integer) (defaults to: 0)

    Start reading after skip_lines lines. The header will be parsed at this offset. Note that CSV escaping will not be respected when skipping lines. If you want to skip valid CSV rows, use skip_rows.

  • schema (Object) (defaults to: nil)

    Provide the schema. Polars then skips schema inference. This argument expects the complete schema, whereas schema_overrides can be used to partially overwrite it. Note that the order of the columns in the provided schema must match the order of the columns in the CSV being read.

  • schema_overrides (Object) (defaults to: nil)

    Overwrite dtypes for specific or all columns during schema inference.

  • null_values (Object) (defaults to: nil)

    Values to interpret as null values. You can provide a:

    • String: All values equal to this string will be null.
    • Array: All values equal to any string in this array will be null.
    • Hash: A hash that maps column name to a null value string.
  • missing_utf8_is_empty_string (Boolean) (defaults to: false)

    By default a missing value is considered null; set this parameter to true if you prefer missing utf8 values to be read as empty strings.

  • ignore_errors (Boolean) (defaults to: false)

    Try to keep reading lines if some lines yield errors. First try reading with infer_schema_length: 0 so that all columns are read as :str, to check which values might cause an issue.

  • try_parse_dates (Boolean) (defaults to: false)

    Try to automatically parse dates. If this does not succeed, the column remains of data type :str.

  • n_threads (Integer) (defaults to: nil)

    Number of threads to use in CSV parsing. Defaults to the number of physical CPUs of your system.

  • infer_schema (Boolean) (defaults to: true)

    When true, the schema is inferred from the data using the first infer_schema_length rows. When false, the schema is not inferred and will be Polars::String if not specified in schema or schema_overrides.

  • infer_schema_length (Integer) (defaults to: N_INFER_DEFAULT)

    The maximum number of rows to scan for schema inference. If set to nil, the full data may be scanned (this is slow). Set infer_schema: false to read all columns as Polars::String.

  • batch_size (Integer) (defaults to: 8192)

    Number of lines to read into the buffer at once. Modify this to change performance.

  • n_rows (Integer) (defaults to: nil)

    Stop reading from CSV file after reading n_rows. During multi-threaded parsing, an upper bound of n_rows rows cannot be guaranteed.

  • encoding ("utf8", "utf8-lossy") (defaults to: "utf8")

    Lossy means that invalid utf8 values are replaced with replacement characters. When using encodings other than utf8 or utf8-lossy, the input is first decoded in memory with Ruby.

  • low_memory (Boolean) (defaults to: false)

    Reduce memory usage at expense of performance.

  • rechunk (Boolean) (defaults to: false)

    Make sure that all columns are contiguous in memory by aggregating the chunks into a single array.

  • storage_options (Hash) (defaults to: nil)

    Extra options that make sense for a particular storage connection.

  • skip_rows_after_header (Integer) (defaults to: 0)

    Skip this number of rows when the header is parsed.

  • row_index_name (String) (defaults to: nil)

    If not nil, this will insert a row count column with the given name into the DataFrame.

  • row_index_offset (Integer) (defaults to: 0)

    Offset to start the row count column at (only used if row_index_name is set).

  • eol_char (String) (defaults to: "\n")

    Single byte end of line character.

  • raise_if_empty (Boolean) (defaults to: true)

    When there is no data in the source, NoDataError is raised. If this parameter is set to false, an empty DataFrame (with no columns) is returned instead.

  • truncate_ragged_lines (Boolean) (defaults to: false)

    Truncate lines that are longer than the schema.

  • decimal_comma (Boolean) (defaults to: false)

    Parse floats using a comma as the decimal separator instead of a period.

  • glob (Boolean) (defaults to: true)

    Expand the path given via globbing rules.

Returns:

  • (DataFrame)

# File 'lib/polars/io/csv.rb', line 114

def read_csv(
  source,
  has_header: true,
  columns: nil,
  new_columns: nil,
  separator: ",",
  comment_prefix: nil,
  quote_char: '"',
  skip_rows: 0,
  skip_lines: 0,
  schema: nil,
  schema_overrides: nil,
  null_values: nil,
  missing_utf8_is_empty_string: false,
  ignore_errors: false,
  try_parse_dates: false,
  n_threads: nil,
  infer_schema: true,
  infer_schema_length: N_INFER_DEFAULT,
  batch_size: 8192,
  n_rows: nil,
  encoding: "utf8",
  low_memory: false,
  rechunk: false,
  storage_options: nil,
  skip_rows_after_header: 0,
  row_index_name: nil,
  row_index_offset: 0,
  eol_char: "\n",
  raise_if_empty: true,
  truncate_ragged_lines: false,
  decimal_comma: false,
  glob: true
)
  Utils._check_arg_is_1byte("separator", separator, false)
  Utils._check_arg_is_1byte("quote_char", quote_char, true)
  Utils._check_arg_is_1byte("eol_char", eol_char, false)

  projection, columns = Utils.handle_projection_columns(columns)

  storage_options ||= {}

  if columns && !has_header
    columns.each do |column|
      if !column.start_with?("column_")
        raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
      end
    end
  end

  if !infer_schema
    infer_schema_length = 0
  end

  df = nil
  _prepare_file_arg(source) do |data|
    df = _read_csv_impl(
      data,
      has_header: has_header,
      columns: columns || projection,
      separator: separator,
      comment_prefix: comment_prefix,
      quote_char: quote_char,
      skip_rows: skip_rows,
      skip_lines: skip_lines,
      schema_overrides: schema_overrides,
      schema: schema,
      null_values: null_values,
      missing_utf8_is_empty_string: missing_utf8_is_empty_string,
      ignore_errors: ignore_errors,
      try_parse_dates: try_parse_dates,
      n_threads: n_threads,
      infer_schema_length: infer_schema_length,
      batch_size: batch_size,
      n_rows: n_rows,
      encoding: encoding == "utf8-lossy" ? encoding : "utf8",
      low_memory: low_memory,
      rechunk: rechunk,
      skip_rows_after_header: skip_rows_after_header,
      row_index_name: row_index_name,
      row_index_offset: row_index_offset,
      eol_char: eol_char,
      raise_if_empty: raise_if_empty,
      truncate_ragged_lines: truncate_ragged_lines,
      decimal_comma: decimal_comma,
      glob: glob
    )
  end

  if new_columns
    Utils._update_columns(df, new_columns)
  else
    df
  end
end