from __future__ import annotations

import platform
import sys
from collections.abc import Iterable, Mapping, Sequence
from functools import partial
from typing import TYPE_CHECKING, Any

from narwhals._expression_parsing import (
    ExprKind,
    ExprMetadata,
    apply_n_ary_operation,
    combine_metadata,
    extract_compliant,
    is_scalar_like,
)
from narwhals._utils import (
    Implementation,
    Version,
    deprecate_native_namespace,
    flatten,
    is_compliant_expr,
    is_eager_allowed,
    is_sequence_but_not_str,
    supports_arrow_c_stream,
    validate_laziness,
)
from narwhals.dependencies import (
    is_narwhals_series,
    is_numpy_array,
    is_numpy_array_2d,
    is_pyarrow_table,
)
from narwhals.exceptions import InvalidOperationError
from narwhals.expr import Expr
from narwhals.series import Series
from narwhals.translate import from_native, to_native

if TYPE_CHECKING:
    from types import ModuleType

    from typing_extensions import TypeAlias, TypeIs

    from narwhals._compliant import CompliantExpr, CompliantNamespace
    from narwhals._translate import IntoArrowTable
    from narwhals.dataframe import DataFrame, LazyFrame
    from narwhals.dtypes import DType
    from narwhals.schema import Schema
    from narwhals.typing import (
        ConcatMethod,
        FrameT,
        IntoDType,
        IntoExpr,
        NativeFrame,
        NativeLazyFrame,
        NativeSeries,
        NonNestedLiteral,
        _1DArray,
        _2DArray,
    )

_IntoSchema: TypeAlias = "Mapping[str, DType] | Schema | Sequence[str] | None"
|
|
|
|
|
|
def concat(items: Iterable[FrameT], *, how: ConcatMethod = "vertical") -> FrameT:
|
|
"""Concatenate multiple DataFrames, LazyFrames into a single entity.
|
|
|
|
Arguments:
|
|
items: DataFrames, LazyFrames to concatenate.
|
|
how: concatenation strategy.
|
|
|
|
- vertical: Concatenate vertically. Column names must match.
|
|
- horizontal: Concatenate horizontally. If lengths don't match, then
|
|
missing rows are filled with null values. This is only supported
|
|
when all inputs are (eager) DataFrames.
|
|
- diagonal: Finds a union between the column schemas and fills missing column
|
|
values with null.
|
|
|
|
Returns:
|
|
A new DataFrame or LazyFrame resulting from the concatenation.
|
|
|
|
Raises:
|
|
TypeError: If the items to concatenate are a mix of eager and lazy frames.
|
|
|
|
Examples:
|
|
Let's take an example of vertical concatenation:
|
|
|
|
>>> import pandas as pd
|
|
>>> import polars as pl
|
|
>>> import pyarrow as pa
|
|
>>> import narwhals as nw
|
|
|
|
Let's look at a case of vertical concatenation (pandas-backed):
|
|
|
|
>>> df_pd_1 = nw.from_native(pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}))
|
|
>>> df_pd_2 = nw.from_native(pd.DataFrame({"a": [5, 2], "b": [1, 4]}))
|
|
>>> nw.concat([df_pd_1, df_pd_2], how="vertical")
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| a b |
|
|
| 0 1 4 |
|
|
| 1 2 5 |
|
|
| 2 3 6 |
|
|
| 0 5 1 |
|
|
| 1 2 4 |
|
|
└──────────────────┘
|
|
|
|
Let's look at a case of horizontal concatenation (polars-backed):
|
|
|
|
>>> df_pl_1 = nw.from_native(pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}))
|
|
>>> df_pl_2 = nw.from_native(pl.DataFrame({"c": [5, 2], "d": [1, 4]}))
|
|
>>> nw.concat([df_pl_1, df_pl_2], how="horizontal")
|
|
┌───────────────────────────┐
|
|
| Narwhals DataFrame |
|
|
|---------------------------|
|
|
|shape: (3, 4) |
|
|
|┌─────┬─────┬──────┬──────┐|
|
|
|│ a ┆ b ┆ c ┆ d │|
|
|
|│ --- ┆ --- ┆ --- ┆ --- │|
|
|
|│ i64 ┆ i64 ┆ i64 ┆ i64 │|
|
|
|╞═════╪═════╪══════╪══════╡|
|
|
|│ 1 ┆ 4 ┆ 5 ┆ 1 │|
|
|
|│ 2 ┆ 5 ┆ 2 ┆ 4 │|
|
|
|│ 3 ┆ 6 ┆ null ┆ null │|
|
|
|└─────┴─────┴──────┴──────┘|
|
|
└───────────────────────────┘
|
|
|
|
Let's look at a case of diagonal concatenation (pyarrow-backed):
|
|
|
|
>>> df_pa_1 = nw.from_native(pa.table({"a": [1, 2], "b": [3.5, 4.5]}))
|
|
>>> df_pa_2 = nw.from_native(pa.table({"a": [3, 4], "z": ["x", "y"]}))
|
|
>>> nw.concat([df_pa_1, df_pa_2], how="diagonal")
|
|
┌──────────────────────────┐
|
|
| Narwhals DataFrame |
|
|
|--------------------------|
|
|
|pyarrow.Table |
|
|
|a: int64 |
|
|
|b: double |
|
|
|z: string |
|
|
|---- |
|
|
|a: [[1,2],[3,4]] |
|
|
|b: [[3.5,4.5],[null,null]]|
|
|
|z: [[null,null],["x","y"]]|
|
|
└──────────────────────────┘
|
|
"""
|
|
from narwhals.dependencies import is_narwhals_lazyframe
|
|
|
|
if not items:
|
|
msg = "No items to concatenate."
|
|
raise ValueError(msg)
|
|
items = list(items)
|
|
validate_laziness(items)
|
|
if how not in {"horizontal", "vertical", "diagonal"}: # pragma: no cover
|
|
msg = "Only vertical, horizontal and diagonal concatenations are supported."
|
|
raise NotImplementedError(msg)
|
|
first_item = items[0]
|
|
if is_narwhals_lazyframe(first_item) and how == "horizontal":
|
|
msg = (
|
|
"Horizontal concatenation is not supported for LazyFrames.\n\n"
|
|
"Hint: you may want to use `join` instead."
|
|
)
|
|
raise InvalidOperationError(msg)
|
|
plx = first_item.__narwhals_namespace__()
|
|
return first_item._with_compliant(
|
|
plx.concat([df._compliant_frame for df in items], how=how)
|
|
)
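# Illustrative sketch: `concat` requires all inputs to share laziness, so mixing
# an eager DataFrame with a LazyFrame raises TypeError (see `Raises` above).
# Assumes polars is installed; `_example_concat_mixed_laziness` is a hypothetical
# helper that the library never calls.
def _example_concat_mixed_laziness() -> None:
    import polars as pl  # illustrative only

    df = from_native(pl.DataFrame({"a": [1, 2]}))
    lf = from_native(pl.LazyFrame({"a": [3, 4]}))
    try:
        concat([df, lf])
    except TypeError:
        # Expected: items must be either all eager or all lazy.
        pass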
|
|
|
|
|
|
def new_series(
|
|
name: str,
|
|
values: Any,
|
|
dtype: IntoDType | None = None,
|
|
*,
|
|
backend: ModuleType | Implementation | str,
|
|
) -> Series[Any]:
|
|
"""Instantiate Narwhals Series from iterable (e.g. list or array).
|
|
|
|
Arguments:
|
|
name: Name of resulting Series.
|
|
values: Values to make the Series from.
|
|
dtype: (Narwhals) dtype. If not provided, the native library
|
|
may auto-infer it from `values`.
|
|
backend: specifies which eager backend to instantiate.
|
|
|
|
`backend` can be specified in various ways
|
|
|
|
- As `Implementation.<BACKEND>` with `BACKEND` being `PANDAS`, `PYARROW`,
|
|
`POLARS`, `MODIN` or `CUDF`.
|
|
- As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
|
|
- Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.
|
|
|
|
Returns:
|
|
A new Series
|
|
|
|
Examples:
|
|
>>> import pandas as pd
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> values = [4, 1, 2, 3]
|
|
>>> nw.new_series(name="a", values=values, dtype=nw.Int32, backend=pd)
|
|
┌─────────────────────┐
|
|
| Narwhals Series |
|
|
|---------------------|
|
|
|0 4 |
|
|
|1 1 |
|
|
|2 2 |
|
|
|3 3 |
|
|
|Name: a, dtype: int32|
|
|
└─────────────────────┘
|
|
"""
|
|
return _new_series_impl(name, values, dtype, backend=backend)
|
|
|
|
|
|
def _new_series_impl(
|
|
name: str,
|
|
values: Any,
|
|
dtype: IntoDType | None = None,
|
|
*,
|
|
backend: ModuleType | Implementation | str,
|
|
) -> Series[Any]:
|
|
implementation = Implementation.from_backend(backend)
|
|
if is_eager_allowed(implementation):
|
|
ns = Version.MAIN.namespace.from_backend(implementation).compliant
|
|
series = ns._series.from_iterable(values, name=name, context=ns, dtype=dtype)
|
|
return series.to_narwhals()
|
|
elif implementation is Implementation.UNKNOWN: # pragma: no cover
|
|
_native_namespace = implementation.to_native_namespace()
|
|
try:
|
|
native_series: NativeSeries = _native_namespace.new_series(
|
|
name, values, dtype
|
|
)
|
|
return from_native(native_series, series_only=True).alias(name)
|
|
except AttributeError as e:
|
|
msg = "Unknown namespace is expected to implement `new_series` constructor."
|
|
raise AttributeError(msg) from e
|
|
msg = (
|
|
f"{implementation} support in Narwhals is lazy-only, but `new_series` is an eager-only function.\n\n"
|
|
"Hint: you may want to use an eager backend and then call `.lazy`, e.g.:\n\n"
|
|
f" nw.new_series('a', [1,2,3], backend='pyarrow').to_frame().lazy('{implementation}')"
|
|
)
|
|
raise ValueError(msg)
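# Illustrative sketch of the workaround hinted at in the error message above:
# build the series with an eager backend, then convert to a lazy one. Assumes
# pyarrow and duckdb are installed; `_example_new_series_then_lazy` is a
# hypothetical helper, never called by the library.
def _example_new_series_then_lazy() -> LazyFrame[Any]:
    return new_series("a", [1, 2, 3], backend="pyarrow").to_frame().lazy("duckdb")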
|
|
|
|
|
|
@deprecate_native_namespace(warn_version="1.26.0")
|
|
def from_dict(
|
|
data: Mapping[str, Any],
|
|
schema: Mapping[str, DType] | Schema | None = None,
|
|
*,
|
|
backend: ModuleType | Implementation | str | None = None,
|
|
native_namespace: ModuleType | None = None, # noqa: ARG001
|
|
) -> DataFrame[Any]:
|
|
"""Instantiate DataFrame from dictionary.
|
|
|
|
Indexes (if present, for pandas-like backends) are aligned following
|
|
the [left-hand-rule](../concepts/pandas_index.md/).
|
|
|
|
Notes:
|
|
For pandas-like dataframes, conversion to schema is applied after dataframe
|
|
creation.
|
|
|
|
Arguments:
|
|
data: Dictionary to create DataFrame from.
|
|
schema: The DataFrame schema as Schema or dict of {name: type}. If not
|
|
specified, the schema will be inferred by the native library.
|
|
backend: specifies which eager backend to instantiate. Only
|
|
necessary if inputs are not Narwhals Series.
|
|
|
|
`backend` can be specified in various ways
|
|
|
|
- As `Implementation.<BACKEND>` with `BACKEND` being `PANDAS`, `PYARROW`,
|
|
`POLARS`, `MODIN` or `CUDF`.
|
|
- As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
|
|
- Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.
|
|
native_namespace: deprecated, same as `backend`.
|
|
|
|
Returns:
|
|
A new DataFrame.
|
|
|
|
Examples:
|
|
>>> import pandas as pd
|
|
>>> import narwhals as nw
|
|
>>> data = {"c": [5, 2], "d": [1, 4]}
|
|
>>> nw.from_dict(data, backend="pandas")
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| c d |
|
|
| 0 5 1 |
|
|
| 1 2 4 |
|
|
└──────────────────┘
|
|
"""
|
|
if backend is None:
|
|
data, backend = _from_dict_no_backend(data)
|
|
implementation = Implementation.from_backend(backend)
|
|
if is_eager_allowed(implementation):
|
|
ns = Version.MAIN.namespace.from_backend(implementation).compliant
|
|
return ns._dataframe.from_dict(data, schema=schema, context=ns).to_narwhals()
|
|
elif implementation is Implementation.UNKNOWN: # pragma: no cover
|
|
_native_namespace = implementation.to_native_namespace()
|
|
try:
|
|
# implementation is UNKNOWN, Narwhals extension using this feature should
|
|
# implement `from_dict` function in the top-level namespace.
|
|
native_frame: NativeFrame = _native_namespace.from_dict(data, schema=schema)
|
|
except AttributeError as e:
|
|
msg = "Unknown namespace is expected to implement `from_dict` function."
|
|
raise AttributeError(msg) from e
|
|
return from_native(native_frame, eager_only=True)
|
|
msg = (
|
|
f"{implementation} support in Narwhals is lazy-only, but `from_dict` is an eager-only function.\n\n"
|
|
"Hint: you may want to use an eager backend and then call `.lazy`, e.g.:\n\n"
|
|
f" nw.from_dict({{'a': [1, 2]}}, backend='pyarrow').lazy('{implementation}')"
|
|
)
|
|
raise ValueError(msg)
|
|
|
|
|
|
def _from_dict_no_backend(
|
|
data: Mapping[str, Series[Any] | Any], /
|
|
) -> tuple[dict[str, Series[Any] | Any], ModuleType]:
|
|
for val in data.values():
|
|
if is_narwhals_series(val):
|
|
native_namespace = val.__native_namespace__()
|
|
break
|
|
else:
|
|
msg = "Calling `from_dict` without `backend` is only supported if all input values are already Narwhals Series"
|
|
raise TypeError(msg)
|
|
data = {key: to_native(value, pass_through=True) for key, value in data.items()}
|
|
return data, native_namespace
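# Illustrative sketch: when every value is already a Narwhals Series, `from_dict`
# infers the backend from the first Series, so `backend` may be omitted. Assumes
# pandas is installed; `_example_from_dict_inferred_backend` is a hypothetical
# helper, never called by the library.
def _example_from_dict_inferred_backend() -> DataFrame[Any]:
    a = new_series("a", [1, 2, 3], backend="pandas")
    b = new_series("b", [4.0, 5.0, 6.0], backend="pandas")
    return from_dict({"a": a, "b": b})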
|
|
|
|
|
|
def from_numpy(
|
|
data: _2DArray,
|
|
schema: Mapping[str, DType] | Schema | Sequence[str] | None = None,
|
|
*,
|
|
backend: ModuleType | Implementation | str,
|
|
) -> DataFrame[Any]:
|
|
"""Construct a DataFrame from a NumPy ndarray.
|
|
|
|
Notes:
|
|
Only row orientation is currently supported.
|
|
|
|
For pandas-like dataframes, conversion to schema is applied after dataframe
|
|
creation.
|
|
|
|
Arguments:
|
|
data: Two-dimensional data represented as a NumPy ndarray.
|
|
schema: The DataFrame schema as Schema, dict of {name: type}, or a sequence of str.
|
|
backend: specifies which eager backend to instantiate.
|
|
|
|
`backend` can be specified in various ways
|
|
|
|
- As `Implementation.<BACKEND>` with `BACKEND` being `PANDAS`, `PYARROW`,
|
|
`POLARS`, `MODIN` or `CUDF`.
|
|
- As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
|
|
- Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.
|
|
|
|
Returns:
|
|
A new DataFrame.
|
|
|
|
Examples:
|
|
>>> import numpy as np
|
|
>>> import pyarrow as pa
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> arr = np.array([[5, 2, 1], [1, 4, 3]])
|
|
>>> schema = {"c": nw.Int16(), "d": nw.Float32(), "e": nw.Int8()}
|
|
>>> nw.from_numpy(arr, schema=schema, backend="pyarrow")
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| pyarrow.Table |
|
|
| c: int16 |
|
|
| d: float |
|
|
| e: int8 |
|
|
| ---- |
|
|
| c: [[5,1]] |
|
|
| d: [[2,4]] |
|
|
| e: [[1,3]] |
|
|
└──────────────────┘
|
|
"""
|
|
if not is_numpy_array_2d(data):
|
|
msg = "`from_numpy` only accepts 2D numpy arrays"
|
|
raise ValueError(msg)
|
|
if not _is_into_schema(schema):
|
|
msg = (
|
|
"`schema` is expected to be one of the following types: "
|
|
"Mapping[str, DType] | Schema | Sequence[str]. "
|
|
f"Got {type(schema)}."
|
|
)
|
|
raise TypeError(msg)
|
|
implementation = Implementation.from_backend(backend)
|
|
if is_eager_allowed(implementation):
|
|
ns = Version.MAIN.namespace.from_backend(implementation).compliant
|
|
return ns.from_numpy(data, schema).to_narwhals()
|
|
elif implementation is Implementation.UNKNOWN: # pragma: no cover
|
|
_native_namespace = implementation.to_native_namespace()
|
|
try:
|
|
# implementation is UNKNOWN, Narwhals extension using this feature should
|
|
# implement `from_numpy` function in the top-level namespace.
|
|
native_frame: NativeFrame = _native_namespace.from_numpy(data, schema=schema)
|
|
except AttributeError as e:
|
|
msg = "Unknown namespace is expected to implement `from_numpy` function."
|
|
raise AttributeError(msg) from e
|
|
return from_native(native_frame, eager_only=True)
|
|
msg = (
|
|
f"{implementation} support in Narwhals is lazy-only, but `from_numpy` is an eager-only function.\n\n"
|
|
"Hint: you may want to use an eager backend and then call `.lazy`, e.g.:\n\n"
|
|
f" nw.from_numpy(arr, backend='pyarrow').lazy('{implementation}')"
|
|
)
|
|
raise ValueError(msg)
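# Illustrative sketch: `schema` may also be a plain sequence of column names, in
# which case dtypes are inferred by the backend. Assumes numpy and pyarrow are
# installed; `_example_from_numpy_named_columns` is a hypothetical helper, never
# called by the library.
def _example_from_numpy_named_columns() -> DataFrame[Any]:
    import numpy as np  # illustrative only

    return from_numpy(np.array([[1, 2], [3, 4]]), schema=["a", "b"], backend="pyarrow")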
|
|
|
|
|
|
def _is_into_schema(obj: Any) -> TypeIs[_IntoSchema]:
|
|
from narwhals.schema import Schema
|
|
|
|
return (
|
|
obj is None or isinstance(obj, (Mapping, Schema)) or is_sequence_but_not_str(obj)
|
|
)
|
|
|
|
|
|
def from_arrow(
|
|
native_frame: IntoArrowTable, *, backend: ModuleType | Implementation | str
|
|
) -> DataFrame[Any]: # pragma: no cover
|
|
"""Construct a DataFrame from an object which supports the PyCapsule Interface.
|
|
|
|
Arguments:
|
|
native_frame: Object which implements `__arrow_c_stream__`.
|
|
backend: specifies which eager backend to instantiate.
|
|
|
|
`backend` can be specified in various ways
|
|
|
|
- As `Implementation.<BACKEND>` with `BACKEND` being `PANDAS`, `PYARROW`,
|
|
`POLARS`, `MODIN` or `CUDF`.
|
|
- As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
|
|
- Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.
|
|
|
|
Returns:
|
|
A new DataFrame.
|
|
|
|
Examples:
|
|
>>> import pandas as pd
|
|
>>> import polars as pl
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> df_native = pd.DataFrame({"a": [1, 2], "b": [4.2, 5.1]})
|
|
>>> nw.from_arrow(df_native, backend="polars")
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| shape: (2, 2) |
|
|
| ┌─────┬─────┐ |
|
|
| │ a ┆ b │ |
|
|
| │ --- ┆ --- │ |
|
|
| │ i64 ┆ f64 │ |
|
|
| ╞═════╪═════╡ |
|
|
| │ 1 ┆ 4.2 │ |
|
|
| │ 2 ┆ 5.1 │ |
|
|
| └─────┴─────┘ |
|
|
└──────────────────┘
|
|
"""
|
|
if not (supports_arrow_c_stream(native_frame) or is_pyarrow_table(native_frame)):
|
|
msg = f"Given object of type {type(native_frame)} does not support PyCapsule interface"
|
|
raise TypeError(msg)
|
|
implementation = Implementation.from_backend(backend)
|
|
if is_eager_allowed(implementation):
|
|
ns = Version.MAIN.namespace.from_backend(implementation).compliant
|
|
return ns._dataframe.from_arrow(native_frame, context=ns).to_narwhals()
|
|
elif implementation is Implementation.UNKNOWN: # pragma: no cover
|
|
_native_namespace = implementation.to_native_namespace()
|
|
try:
|
|
# implementation is UNKNOWN, Narwhals extension using this feature should
|
|
# implement PyCapsule support
|
|
native: NativeFrame = _native_namespace.DataFrame(native_frame)
|
|
except AttributeError as e:
|
|
msg = "Unknown namespace is expected to implement `DataFrame` class which accepts object which supports PyCapsule Interface."
|
|
raise AttributeError(msg) from e
|
|
return from_native(native, eager_only=True)
|
|
msg = (
|
|
f"{implementation} support in Narwhals is lazy-only, but `from_arrow` is an eager-only function.\n\n"
|
|
"Hint: you may want to use an eager backend and then call `.lazy`, e.g.:\n\n"
|
|
f" nw.from_arrow(df, backend='pyarrow').lazy('{implementation}')"
|
|
)
|
|
raise ValueError(msg)
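# Illustrative sketch: any object exposing `__arrow_c_stream__` works, e.g. a
# pyarrow Table can be loaded into a polars-backed DataFrame. Assumes pyarrow and
# polars are installed; `_example_from_arrow_pycapsule` is a hypothetical helper,
# never called by the library.
def _example_from_arrow_pycapsule() -> DataFrame[Any]:
    import pyarrow as pa  # illustrative only

    return from_arrow(pa.table({"a": [1, 2]}), backend="polars")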
|
|
|
|
|
|
def _get_sys_info() -> dict[str, str]:
|
|
"""System information.
|
|
|
|
Returns system and Python version information
|
|
|
|
Copied from sklearn
|
|
|
|
Returns:
|
|
Dictionary with system info.
|
|
"""
|
|
python = sys.version.replace("\n", " ")
|
|
|
|
blob = (
|
|
("python", python),
|
|
("executable", sys.executable),
|
|
("machine", platform.platform()),
|
|
)
|
|
|
|
return dict(blob)
|
|
|
|
|
|
def _get_deps_info() -> dict[str, str]:
|
|
"""Overview of the installed version of main dependencies.
|
|
|
|
This function does not import the modules to collect the version numbers
|
|
but instead relies on standard Python package metadata.
|
|
|
|
Returns version information on relevant Python libraries
|
|
|
|
This function and show_versions were copied from sklearn and adapted
|
|
|
|
Returns:
|
|
Mapping from dependency to version.
|
|
"""
|
|
from importlib.metadata import distributions
|
|
|
|
extra_names = ("narwhals", "numpy")
|
|
member_names = Implementation._member_names_
|
|
exclude = {"PYSPARK_CONNECT", "UNKNOWN"}
|
|
target_names = tuple(
|
|
name.lower() for name in (*extra_names, *member_names) if name not in exclude
|
|
)
|
|
result = dict.fromkeys(target_names, "") # Initialize with empty strings
|
|
|
|
for dist in distributions():
|
|
dist_name, dist_version = dist.name.lower(), dist.version
|
|
|
|
if dist_name in result: # exact match
|
|
result[dist_name] = dist_version
|
|
else: # prefix match
|
|
for target in target_names:
|
|
if not result[target] and dist_name.startswith(target):
|
|
result[target] = dist_version
|
|
break
|
|
|
|
return result
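# Illustrative sketch: the mapping always contains one (lower-cased) key per
# known backend plus "narwhals" and "numpy"; packages that are not installed map
# to an empty string. `_example_deps_info_keys` is a hypothetical helper, never
# called by the library.
def _example_deps_info_keys() -> None:
    info = _get_deps_info()
    assert "narwhals" in info and "pandas" in info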
|
|
|
|
|
|
def show_versions() -> None:
|
|
"""Print useful debugging information.
|
|
|
|
Examples:
|
|
>>> from narwhals import show_versions
|
|
>>> show_versions() # doctest: +SKIP
|
|
"""
|
|
sys_info = _get_sys_info()
|
|
deps_info = _get_deps_info()
|
|
|
|
print("\nSystem:") # noqa: T201
|
|
for k, stat in sys_info.items():
|
|
print(f"{k:>10}: {stat}") # noqa: T201
|
|
|
|
print("\nPython dependencies:") # noqa: T201
|
|
for k, stat in deps_info.items():
|
|
print(f"{k:>13}: {stat}") # noqa: T201
|
|
|
|
|
|
def read_csv(
|
|
source: str, *, backend: ModuleType | Implementation | str, **kwargs: Any
|
|
) -> DataFrame[Any]:
|
|
"""Read a CSV file into a DataFrame.
|
|
|
|
Arguments:
|
|
source: Path to a file.
|
|
backend: The eager backend for DataFrame creation.
|
|
`backend` can be specified in various ways
|
|
|
|
- As `Implementation.<BACKEND>` with `BACKEND` being `PANDAS`, `PYARROW`,
|
|
`POLARS`, `MODIN` or `CUDF`.
|
|
- As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
|
|
- Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.
|
|
kwargs: Extra keyword arguments which are passed to the native CSV reader.
|
|
For example, you could use
|
|
`nw.read_csv('file.csv', backend='pandas', engine='pyarrow')`.
|
|
|
|
Returns:
|
|
DataFrame.
|
|
|
|
Examples:
|
|
>>> import narwhals as nw
|
|
>>> nw.read_csv("file.csv", backend="pandas") # doctest:+SKIP
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| a b |
|
|
| 0 1 4 |
|
|
| 1 2 5 |
|
|
└──────────────────┘
|
|
"""
|
|
eager_backend = Implementation.from_backend(backend)
|
|
native_namespace = eager_backend.to_native_namespace()
|
|
native_frame: NativeFrame
|
|
if eager_backend in {
|
|
Implementation.POLARS,
|
|
Implementation.PANDAS,
|
|
Implementation.MODIN,
|
|
Implementation.CUDF,
|
|
}:
|
|
native_frame = native_namespace.read_csv(source, **kwargs)
|
|
elif eager_backend is Implementation.PYARROW:
|
|
from pyarrow import csv # ignore-banned-import
|
|
|
|
native_frame = csv.read_csv(source, **kwargs)
|
|
else: # pragma: no cover
|
|
try:
|
|
# implementation is UNKNOWN, Narwhals extension using this feature should
|
|
# implement `read_csv` function in the top-level namespace.
|
|
native_frame = native_namespace.read_csv(source=source, **kwargs)
|
|
except AttributeError as e:
|
|
msg = "Unknown namespace is expected to implement `read_csv` function."
|
|
raise AttributeError(msg) from e
|
|
return from_native(native_frame, eager_only=True)
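# Illustrative sketch: extra keyword arguments are forwarded verbatim to the
# native reader, so with a pandas backend they reach `pandas.read_csv`. Assumes
# pandas is installed; the file path is a placeholder and
# `_example_read_csv_kwargs` is a hypothetical helper, never called by the library.
def _example_read_csv_kwargs() -> DataFrame[Any]:
    return read_csv("data.csv", backend="pandas", usecols=["a", "b"])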
|
|
|
|
|
|
def scan_csv(
|
|
source: str, *, backend: ModuleType | Implementation | str, **kwargs: Any
|
|
) -> LazyFrame[Any]:
|
|
"""Lazily read from a CSV file.
|
|
|
|
For the libraries that do not support lazy dataframes, the function reads
|
|
a csv file eagerly and then converts the resulting dataframe to a lazyframe.
|
|
|
|
Arguments:
|
|
source: Path to a file.
|
|
backend: The backend to use for LazyFrame creation.
|
|
`backend` can be specified in various ways
|
|
|
|
- As `Implementation.<BACKEND>` with `BACKEND` being `PANDAS`, `PYARROW`,
|
|
`POLARS`, `MODIN` or `CUDF`.
|
|
- As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
|
|
- Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.
|
|
kwargs: Extra keyword arguments which are passed to the native CSV reader.
|
|
For example, you could use
|
|
`nw.scan_csv('file.csv', backend=pd, engine='pyarrow')`.
|
|
|
|
Returns:
|
|
LazyFrame.
|
|
|
|
Examples:
|
|
>>> import duckdb
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> nw.scan_csv("file.csv", backend="duckdb").to_native() # doctest:+SKIP
|
|
┌─────────┬───────┐
|
|
│ a │ b │
|
|
│ varchar │ int32 │
|
|
├─────────┼───────┤
|
|
│ x │ 1 │
|
|
│ y │ 2 │
|
|
│ z │ 3 │
|
|
└─────────┴───────┘
|
|
"""
|
|
implementation = Implementation.from_backend(backend)
|
|
native_namespace = implementation.to_native_namespace()
|
|
native_frame: NativeFrame | NativeLazyFrame
|
|
if implementation is Implementation.POLARS:
|
|
native_frame = native_namespace.scan_csv(source, **kwargs)
|
|
elif implementation in {
|
|
Implementation.PANDAS,
|
|
Implementation.MODIN,
|
|
Implementation.CUDF,
|
|
Implementation.DASK,
|
|
Implementation.DUCKDB,
|
|
Implementation.IBIS,
|
|
}:
|
|
native_frame = native_namespace.read_csv(source, **kwargs)
|
|
elif implementation is Implementation.PYARROW:
|
|
from pyarrow import csv # ignore-banned-import
|
|
|
|
native_frame = csv.read_csv(source, **kwargs)
|
|
elif implementation.is_spark_like():
|
|
if (session := kwargs.pop("session", None)) is None:
|
|
msg = "Spark like backends require a session object to be passed in `kwargs`."
|
|
raise ValueError(msg)
|
|
|
|
csv_reader = session.read.format("csv")
|
|
native_frame = (
|
|
csv_reader.load(source)
|
|
if (
|
|
implementation is Implementation.SQLFRAME
|
|
and implementation._backend_version() < (3, 27, 0)
|
|
)
|
|
else csv_reader.options(**kwargs).load(source)
|
|
)
|
|
else: # pragma: no cover
|
|
try:
|
|
# implementation is UNKNOWN, Narwhals extension using this feature should
|
|
# implement `scan_csv` function in the top-level namespace.
|
|
native_frame = native_namespace.scan_csv(source=source, **kwargs)
|
|
except AttributeError as e:
|
|
msg = "Unknown namespace is expected to implement `scan_csv` function."
|
|
raise AttributeError(msg) from e
|
|
return from_native(native_frame).lazy()
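# Illustrative sketch: like `scan_parquet`, spark-like backends need a `session`
# keyword argument. Assumes sqlframe is installed; the file path is a placeholder
# and `_example_scan_csv_sqlframe` is a hypothetical helper, never called by the
# library.
def _example_scan_csv_sqlframe() -> LazyFrame[Any]:
    from sqlframe.duckdb import DuckDBSession  # illustrative only

    return scan_csv("data.csv", backend="sqlframe", session=DuckDBSession())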
|
|
|
|
|
|
def read_parquet(
|
|
source: str, *, backend: ModuleType | Implementation | str, **kwargs: Any
|
|
) -> DataFrame[Any]:
|
|
"""Read into a DataFrame from a parquet file.
|
|
|
|
Arguments:
|
|
source: Path to a file.
|
|
backend: The eager backend for DataFrame creation.
|
|
`backend` can be specified in various ways
|
|
|
|
- As `Implementation.<BACKEND>` with `BACKEND` being `PANDAS`, `PYARROW`,
|
|
`POLARS`, `MODIN` or `CUDF`.
|
|
- As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
|
|
- Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.
|
|
kwargs: Extra keyword arguments which are passed to the native parquet reader.
|
|
For example, you could use
|
|
`nw.read_parquet('file.parquet', backend=pd, engine='pyarrow')`.
|
|
|
|
Returns:
|
|
DataFrame.
|
|
|
|
Examples:
|
|
>>> import pyarrow as pa
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> nw.read_parquet("file.parquet", backend="pyarrow") # doctest:+SKIP
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
|pyarrow.Table |
|
|
|a: int64 |
|
|
|c: double |
|
|
|---- |
|
|
|a: [[1,2]] |
|
|
|c: [[0.2,0.1]] |
|
|
└──────────────────┘
|
|
"""
|
|
implementation = Implementation.from_backend(backend)
|
|
native_namespace = implementation.to_native_namespace()
|
|
native_frame: NativeFrame
|
|
if implementation in {
|
|
Implementation.POLARS,
|
|
Implementation.PANDAS,
|
|
Implementation.MODIN,
|
|
Implementation.CUDF,
|
|
Implementation.DUCKDB,
|
|
Implementation.IBIS,
|
|
}:
|
|
native_frame = native_namespace.read_parquet(source, **kwargs)
|
|
elif implementation is Implementation.PYARROW:
|
|
import pyarrow.parquet as pq # ignore-banned-import
|
|
|
|
native_frame = pq.read_table(source, **kwargs)
|
|
else: # pragma: no cover
|
|
try:
|
|
# implementation is UNKNOWN, Narwhals extension using this feature should
|
|
# implement `read_parquet` function in the top-level namespace.
|
|
native_frame = native_namespace.read_parquet(source=source, **kwargs)
|
|
except AttributeError as e:
|
|
msg = "Unknown namespace is expected to implement `read_parquet` function."
|
|
raise AttributeError(msg) from e
|
|
return from_native(native_frame, eager_only=True)
|
|
|
|
|
|
def scan_parquet(
|
|
source: str, *, backend: ModuleType | Implementation | str, **kwargs: Any
|
|
) -> LazyFrame[Any]:
|
|
"""Lazily read from a parquet file.
|
|
|
|
For the libraries that do not support lazy dataframes, the function reads
|
|
a parquet file eagerly and then converts the resulting dataframe to a lazyframe.
|
|
|
|
Note:
|
|
Spark like backends require a session object to be passed in `kwargs`.
|
|
|
|
For instance:
|
|
|
|
```py
|
|
import narwhals as nw
|
|
from sqlframe.duckdb import DuckDBSession
|
|
|
|
nw.scan_parquet(source, backend="sqlframe", session=DuckDBSession())
|
|
```
|
|
|
|
Arguments:
|
|
source: Path to a file.
|
|
backend: The backend to use for LazyFrame creation.
|
|
`backend` can be specified in various ways
|
|
|
|
- As `Implementation.<BACKEND>` with `BACKEND` being `PANDAS`, `PYARROW`,
|
|
`POLARS`, `MODIN`, `CUDF`, `PYSPARK` or `SQLFRAME`.
|
|
- As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"`, `"cudf"`,
|
|
`"pyspark"` or `"sqlframe"`.
|
|
- Directly as a module `pandas`, `pyarrow`, `polars`, `modin`, `cudf`,
|
|
`pyspark.sql` or `sqlframe`.
|
|
kwargs: Extra keyword arguments which are passed to the native parquet reader.
|
|
For example, you could use
|
|
`nw.scan_parquet('file.parquet', backend=pd, engine='pyarrow')`.
|
|
|
|
Returns:
|
|
LazyFrame.
|
|
|
|
Examples:
|
|
>>> import dask.dataframe as dd
|
|
>>> from sqlframe.duckdb import DuckDBSession
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> nw.scan_parquet("file.parquet", backend="dask").collect() # doctest:+SKIP
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| a b |
|
|
| 0 1 4 |
|
|
| 1 2 5 |
|
|
└──────────────────┘
|
|
>>> nw.scan_parquet(
|
|
... "file.parquet", backend="sqlframe", session=DuckDBSession()
|
|
... ).collect() # doctest:+SKIP
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| pyarrow.Table |
|
|
| a: int64 |
|
|
| b: int64 |
|
|
| ---- |
|
|
| a: [[1,2]] |
|
|
| b: [[4,5]] |
|
|
└──────────────────┘
|
|
"""
|
|
implementation = Implementation.from_backend(backend)
|
|
native_namespace = implementation.to_native_namespace()
|
|
native_frame: NativeFrame | NativeLazyFrame
|
|
if implementation is Implementation.POLARS:
|
|
native_frame = native_namespace.scan_parquet(source, **kwargs)
|
|
elif implementation in {
|
|
Implementation.PANDAS,
|
|
Implementation.MODIN,
|
|
Implementation.CUDF,
|
|
Implementation.DASK,
|
|
Implementation.DUCKDB,
|
|
Implementation.IBIS,
|
|
}:
|
|
native_frame = native_namespace.read_parquet(source, **kwargs)
|
|
elif implementation is Implementation.PYARROW:
|
|
import pyarrow.parquet as pq # ignore-banned-import
|
|
|
|
native_frame = pq.read_table(source, **kwargs)
|
|
elif implementation.is_spark_like():
|
|
if (session := kwargs.pop("session", None)) is None:
|
|
msg = "Spark like backends require a session object to be passed in `kwargs`."
|
|
raise ValueError(msg)
|
|
|
|
pq_reader = session.read.format("parquet")
|
|
native_frame = (
|
|
pq_reader.load(source)
|
|
if (
|
|
implementation is Implementation.SQLFRAME
|
|
and implementation._backend_version() < (3, 27, 0)
|
|
)
|
|
else pq_reader.options(**kwargs).load(source)
|
|
)
|
|
|
|
else: # pragma: no cover
|
|
try:
|
|
# implementation is UNKNOWN, Narwhals extension using this feature should
|
|
# implement `scan_parquet` function in the top-level namespace.
|
|
native_frame = native_namespace.scan_parquet(source=source, **kwargs)
|
|
except AttributeError as e:
|
|
msg = "Unknown namespace is expected to implement `scan_parquet` function."
|
|
raise AttributeError(msg) from e
|
|
return from_native(native_frame).lazy()
|
|
|
|
|
|
def col(*names: str | Iterable[str]) -> Expr:
|
|
"""Creates an expression that references one or more columns by their name(s).
|
|
|
|
Arguments:
|
|
names: Name(s) of the columns to use.
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import polars as pl
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> df_native = pl.DataFrame({"a": [1, 2], "b": [3, 4], "c": ["x", "z"]})
|
|
>>> nw.from_native(df_native).select(nw.col("a", "b") * nw.col("b"))
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| shape: (2, 2) |
|
|
| ┌─────┬─────┐ |
|
|
| │ a ┆ b │ |
|
|
| │ --- ┆ --- │ |
|
|
| │ i64 ┆ i64 │ |
|
|
| ╞═════╪═════╡ |
|
|
| │ 3 ┆ 9 │ |
|
|
| │ 8 ┆ 16 │ |
|
|
| └─────┴─────┘ |
|
|
└──────────────────┘
|
|
"""
|
|
flat_names = flatten(names)
|
|
|
|
def func(plx: Any) -> Any:
|
|
return plx.col(*flat_names)
|
|
|
|
return Expr(
|
|
func,
|
|
ExprMetadata.selector_single()
|
|
if len(flat_names) == 1
|
|
else ExprMetadata.selector_multi_named(),
|
|
)
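# Illustrative sketch: `col` also accepts an iterable of names, which is
# flattened, so the two calls below build equivalent selectors.
# `_example_col_iterable` is a hypothetical helper, never called by the library.
def _example_col_iterable() -> tuple[Expr, Expr]:
    return col("a", "b"), col(["a", "b"])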
|
|
|
|
|
|
def exclude(*names: str | Iterable[str]) -> Expr:
|
|
"""Creates an expression that excludes columns by their name(s).
|
|
|
|
Arguments:
|
|
names: Name(s) of the columns to exclude.
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import polars as pl
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> df_native = pl.DataFrame({"a": [1, 2], "b": [3, 4], "c": ["x", "z"]})
|
|
>>> nw.from_native(df_native).select(nw.exclude("c", "a"))
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| shape: (2, 1) |
|
|
| ┌─────┐ |
|
|
| │ b │ |
|
|
| │ --- │ |
|
|
| │ i64 │ |
|
|
| ╞═════╡ |
|
|
| │ 3 │ |
|
|
| │ 4 │ |
|
|
| └─────┘ |
|
|
└──────────────────┘
|
|
"""
|
|
exclude_names = frozenset(flatten(names))
|
|
|
|
def func(plx: Any) -> Any:
|
|
return plx.exclude(exclude_names)
|
|
|
|
return Expr(func, ExprMetadata.selector_multi_unnamed())
|
|
|
|
|
|
def nth(*indices: int | Sequence[int]) -> Expr:
|
|
"""Creates an expression that references one or more columns by their index(es).
|
|
|
|
Notes:
|
|
`nth` is not supported for Polars version<1.0.0. Please use
|
|
[`narwhals.col`][] instead.
|
|
|
|
Arguments:
|
|
indices: One or more indices representing the columns to retrieve.
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import pyarrow as pa
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> df_native = pa.table({"a": [1, 2], "b": [3, 4], "c": [0.123, 3.14]})
|
|
>>> nw.from_native(df_native).select(nw.nth(0, 2) * 2)
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
|pyarrow.Table |
|
|
|a: int64 |
|
|
|c: double |
|
|
|---- |
|
|
|a: [[2,4]] |
|
|
|c: [[0.246,6.28]] |
|
|
└──────────────────┘
|
|
"""
|
|
flat_indices = flatten(indices)
|
|
|
|
def func(plx: Any) -> Any:
|
|
return plx.nth(*flat_indices)
|
|
|
|
return Expr(
|
|
func,
|
|
ExprMetadata.selector_single()
|
|
if len(flat_indices) == 1
|
|
else ExprMetadata.selector_multi_unnamed(),
|
|
)
|
|
|
|
|
|
# Add underscore so it doesn't conflict with builtin `all`
|
|
def all_() -> Expr:
|
|
"""Instantiate an expression representing all columns.
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import pandas as pd
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> df_native = pd.DataFrame({"a": [1, 2], "b": [3.14, 0.123]})
|
|
>>> nw.from_native(df_native).select(nw.all() * 2)
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| a b |
|
|
| 0 2 6.280 |
|
|
| 1 4 0.246 |
|
|
└──────────────────┘
|
|
"""
|
|
return Expr(lambda plx: plx.all(), ExprMetadata.selector_multi_unnamed())
|
|
|
|
|
|
# Add underscore so it doesn't conflict with builtin `len`
|
|
def len_() -> Expr:
|
|
"""Return the number of rows.
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import polars as pl
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> df_native = pl.DataFrame({"a": [1, 2], "b": [5, None]})
|
|
>>> nw.from_native(df_native).select(nw.len())
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| shape: (1, 1) |
|
|
| ┌─────┐ |
|
|
| │ len │ |
|
|
| │ --- │ |
|
|
| │ u32 │ |
|
|
| ╞═════╡ |
|
|
| │ 2 │ |
|
|
| └─────┘ |
|
|
└──────────────────┘
|
|
"""
|
|
|
|
def func(plx: Any) -> Any:
|
|
return plx.len()
|
|
|
|
return Expr(func, ExprMetadata.aggregation())
|
|
|
|
|
|
def sum(*columns: str) -> Expr:
|
|
"""Sum all values.
|
|
|
|
Note:
|
|
Syntactic sugar for ``nw.col(columns).sum()``
|
|
|
|
Arguments:
|
|
columns: Name(s) of the columns to use in the aggregation function
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import pandas as pd
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> df_native = pd.DataFrame({"a": [1, 2], "b": [-1.4, 6.2]})
|
|
>>> nw.from_native(df_native).select(nw.sum("a", "b"))
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| a b |
|
|
| 0 3 4.8 |
|
|
└──────────────────┘
|
|
"""
|
|
return col(*columns).sum()
|
|
|
|
|
|
def mean(*columns: str) -> Expr:
|
|
"""Get the mean value.
|
|
|
|
Note:
|
|
Syntactic sugar for ``nw.col(columns).mean()``
|
|
|
|
Arguments:
|
|
columns: Name(s) of the columns to use in the aggregation function
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import pyarrow as pa
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> df_native = pa.table({"a": [1, 8, 3], "b": [3.14, 6.28, 42.1]})
|
|
>>> nw.from_native(df_native).select(nw.mean("a", "b"))
|
|
┌─────────────────────────┐
|
|
| Narwhals DataFrame |
|
|
|-------------------------|
|
|
|pyarrow.Table |
|
|
|a: double |
|
|
|b: double |
|
|
|---- |
|
|
|a: [[4]] |
|
|
|b: [[17.173333333333336]]|
|
|
└─────────────────────────┘
|
|
"""
|
|
return col(*columns).mean()
|
|
|
|
|
|
def median(*columns: str) -> Expr:
|
|
"""Get the median value.
|
|
|
|
Notes:
|
|
- Syntactic sugar for ``nw.col(columns).median()``
|
|
- Results might slightly differ across backends due to differences in the
|
|
underlying algorithms used to compute the median.
|
|
|
|
Arguments:
|
|
columns: Name(s) of the columns to use in the aggregation function
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import polars as pl
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> df_native = pl.DataFrame({"a": [4, 5, 2]})
|
|
>>> nw.from_native(df_native).select(nw.median("a"))
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| shape: (1, 1) |
|
|
| ┌─────┐ |
|
|
| │ a │ |
|
|
| │ --- │ |
|
|
| │ f64 │ |
|
|
| ╞═════╡ |
|
|
| │ 4.0 │ |
|
|
| └─────┘ |
|
|
└──────────────────┘
|
|
"""
|
|
return col(*columns).median()
|
|
|
|
|
|
def min(*columns: str) -> Expr:
|
|
"""Return the minimum value.
|
|
|
|
Note:
|
|
Syntactic sugar for ``nw.col(columns).min()``.
|
|
|
|
Arguments:
|
|
columns: Name(s) of the columns to use in the aggregation function.
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import pyarrow as pa
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> df_native = pa.table({"a": [1, 2], "b": [5, 10]})
|
|
>>> nw.from_native(df_native).select(nw.min("a", "b"))
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| pyarrow.Table |
|
|
| a: int64 |
|
|
| b: int64 |
|
|
| ---- |
|
|
| a: [[1]] |
|
|
| b: [[5]] |
|
|
└──────────────────┘
|
|
"""
|
|
return col(*columns).min()
|
|
|
|
|
|
def max(*columns: str) -> Expr:
|
|
"""Return the maximum value.
|
|
|
|
Note:
|
|
Syntactic sugar for ``nw.col(columns).max()``.
|
|
|
|
Arguments:
|
|
columns: Name(s) of the columns to use in the aggregation function.
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import pandas as pd
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> df_native = pd.DataFrame({"a": [1, 2], "b": [5, 10]})
|
|
>>> nw.from_native(df_native).select(nw.max("a", "b"))
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| a b |
|
|
| 0 2 10 |
|
|
└──────────────────┘
|
|
"""
|
|
return col(*columns).max()
|
|
|
|
|
|
def sum_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr:
|
|
"""Sum all values horizontally across columns.
|
|
|
|
Warning:
|
|
Unlike Polars, we support horizontal sum over numeric columns only.
|
|
|
|
Arguments:
|
|
exprs: Name(s) of the columns to use in the aggregation function. Accepts
|
|
expression input.
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import polars as pl
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> df_native = pl.DataFrame({"a": [1, 2, 3], "b": [5, 10, None]})
|
|
>>> nw.from_native(df_native).with_columns(sum=nw.sum_horizontal("a", "b"))
|
|
┌────────────────────┐
|
|
| Narwhals DataFrame |
|
|
|--------------------|
|
|
|shape: (3, 3) |
|
|
|┌─────┬──────┬─────┐|
|
|
|│ a ┆ b ┆ sum │|
|
|
|│ --- ┆ --- ┆ --- │|
|
|
|│ i64 ┆ i64 ┆ i64 │|
|
|
|╞═════╪══════╪═════╡|
|
|
|│ 1 ┆ 5 ┆ 6 │|
|
|
|│ 2 ┆ 10 ┆ 12 │|
|
|
|│ 3 ┆ null ┆ 3 │|
|
|
|└─────┴──────┴─────┘|
|
|
└────────────────────┘
|
|
"""
|
|
if not exprs:
|
|
msg = "At least one expression must be passed to `sum_horizontal`"
|
|
raise ValueError(msg)
|
|
flat_exprs = flatten(exprs)
|
|
return Expr(
|
|
lambda plx: apply_n_ary_operation(
|
|
plx, plx.sum_horizontal, *flat_exprs, str_as_lit=False
|
|
),
|
|
ExprMetadata.from_horizontal_op(*flat_exprs),
|
|
)
|
|
|
|
|
|
def min_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr:
|
|
"""Get the minimum value horizontally across columns.
|
|
|
|
Notes:
|
|
We support `min_horizontal` over numeric columns only.
|
|
|
|
Arguments:
|
|
exprs: Name(s) of the columns to use in the aggregation function. Accepts
|
|
expression input.
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import pyarrow as pa
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> df_native = pa.table({"a": [1, 8, 3], "b": [4, 5, None]})
|
|
>>> nw.from_native(df_native).with_columns(h_min=nw.min_horizontal("a", "b"))
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| pyarrow.Table |
|
|
| a: int64 |
|
|
| b: int64 |
|
|
| h_min: int64 |
|
|
| ---- |
|
|
| a: [[1,8,3]] |
|
|
| b: [[4,5,null]] |
|
|
| h_min: [[1,5,3]] |
|
|
└──────────────────┘
|
|
"""
|
|
if not exprs:
|
|
msg = "At least one expression must be passed to `min_horizontal`"
|
|
raise ValueError(msg)
|
|
flat_exprs = flatten(exprs)
|
|
return Expr(
|
|
lambda plx: apply_n_ary_operation(
|
|
plx, plx.min_horizontal, *flat_exprs, str_as_lit=False
|
|
),
|
|
ExprMetadata.from_horizontal_op(*flat_exprs),
|
|
)
|
|
|
|
|
|
def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr:
|
|
"""Get the maximum value horizontally across columns.
|
|
|
|
Notes:
|
|
We support `max_horizontal` over numeric columns only.
|
|
|
|
Arguments:
|
|
exprs: Name(s) of the columns to use in the aggregation function. Accepts
|
|
expression input.
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import polars as pl
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> df_native = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, None]})
|
|
>>> nw.from_native(df_native).with_columns(h_max=nw.max_horizontal("a", "b"))
|
|
┌──────────────────────┐
|
|
| Narwhals DataFrame |
|
|
|----------------------|
|
|
|shape: (3, 3) |
|
|
|┌─────┬──────┬───────┐|
|
|
|│ a ┆ b ┆ h_max │|
|
|
|│ --- ┆ --- ┆ --- │|
|
|
|│ i64 ┆ i64 ┆ i64 │|
|
|
|╞═════╪══════╪═══════╡|
|
|
|│ 1 ┆ 4 ┆ 4 │|
|
|
|│ 8 ┆ 5 ┆ 8 │|
|
|
|│ 3 ┆ null ┆ 3 │|
|
|
|└─────┴──────┴───────┘|
|
|
└──────────────────────┘
|
|
"""
|
|
if not exprs:
|
|
msg = "At least one expression must be passed to `max_horizontal`"
|
|
raise ValueError(msg)
|
|
flat_exprs = flatten(exprs)
|
|
return Expr(
|
|
lambda plx: apply_n_ary_operation(
|
|
plx, plx.max_horizontal, *flat_exprs, str_as_lit=False
|
|
),
|
|
ExprMetadata.from_horizontal_op(*flat_exprs),
|
|
)
|
|
|
|
|
|
class When:
|
|
def __init__(self, *predicates: IntoExpr | Iterable[IntoExpr]) -> None:
|
|
self._predicate = all_horizontal(*flatten(predicates), ignore_nulls=False)
|
|
|
|
def then(self, value: IntoExpr | NonNestedLiteral | _1DArray) -> Then:
|
|
kind = ExprKind.from_into_expr(value, str_as_lit=False)
|
|
if self._predicate._metadata.is_scalar_like and not kind.is_scalar_like:
|
|
msg = (
|
|
"If you pass a scalar-like predicate to `nw.when`, then "
|
|
"the `then` value must also be scalar-like."
|
|
)
|
|
raise InvalidOperationError(msg)
|
|
|
|
return Then(
|
|
lambda plx: apply_n_ary_operation(
|
|
plx,
|
|
lambda *args: plx.when(args[0]).then(args[1]),
|
|
self._predicate,
|
|
value,
|
|
str_as_lit=False,
|
|
),
|
|
combine_metadata(
|
|
self._predicate,
|
|
value,
|
|
str_as_lit=False,
|
|
allow_multi_output=False,
|
|
to_single_output=False,
|
|
),
|
|
)
|
|
|
|
|
|
class Then(Expr):
|
|
def otherwise(self, value: IntoExpr | NonNestedLiteral | _1DArray) -> Expr:
|
|
kind = ExprKind.from_into_expr(value, str_as_lit=False)
|
|
if self._metadata.is_scalar_like and not is_scalar_like(kind):
|
|
msg = (
|
|
"If you pass a scalar-like predicate to `nw.when`, then "
|
|
"the `otherwise` value must also be scalar-like."
|
|
)
|
|
raise InvalidOperationError(msg)
|
|
|
|
def func(plx: CompliantNamespace[Any, Any]) -> CompliantExpr[Any, Any]:
|
|
compliant_expr = self._to_compliant_expr(plx)
|
|
compliant_value = extract_compliant(plx, value, str_as_lit=False)
|
|
if (
|
|
not self._metadata.is_scalar_like
|
|
and is_scalar_like(kind)
|
|
and is_compliant_expr(compliant_value)
|
|
):
|
|
compliant_value = compliant_value.broadcast(kind)
|
|
return compliant_expr.otherwise(compliant_value) # type: ignore[attr-defined, no-any-return]
|
|
|
|
return Expr(
|
|
func,
|
|
combine_metadata(
|
|
self,
|
|
value,
|
|
str_as_lit=False,
|
|
allow_multi_output=False,
|
|
to_single_output=False,
|
|
),
|
|
)
|
|
|
|
|
|
def when(*predicates: IntoExpr | Iterable[IntoExpr]) -> When:
|
|
"""Start a `when-then-otherwise` expression.
|
|
|
|
Expression similar to an `if-else` statement in Python. Always initiated by
`nw.when(<condition>).then(<value if condition>)`, which can optionally be
followed by `.otherwise(<value if condition is false>)`. If `.otherwise` is not
appended and the condition is not `True`, `None` will be returned.
|
|
|
|
Info:
|
|
Chaining multiple `.when(<condition>).then(<value>)` statements is currently
|
|
not supported.
|
|
See [Narwhals#668](https://github.com/narwhals-dev/narwhals/issues/668).
|
|
|
|
Arguments:
|
|
predicates: Condition(s) that must be met in order to apply the subsequent
|
|
statement. Accepts one or more boolean expressions, which are implicitly
|
|
combined with `&`. String input is parsed as a column name.
|
|
|
|
Returns:
|
|
A "when" object, which `.then` can be called on.
|
|
|
|
Examples:
|
|
>>> import pandas as pd
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> data = {"a": [1, 2, 3], "b": [5, 10, 15]}
|
|
>>> df_native = pd.DataFrame(data)
|
|
>>> nw.from_native(df_native).with_columns(
|
|
... nw.when(nw.col("a") < 3).then(5).otherwise(6).alias("a_when")
|
|
... )
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| a b a_when |
|
|
| 0 1 5 5 |
|
|
| 1 2 10 5 |
|
|
| 2 3 15 6 |
|
|
└──────────────────┘
|
|
"""
|
|
return When(*predicates)
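# Illustrative sketch: without `.otherwise`, rows where the condition is not
# `True` become null (NaN for NumPy-backed pandas). Assumes pandas is installed;
# `_example_when_without_otherwise` is a hypothetical helper, never called by
# the library.
def _example_when_without_otherwise() -> DataFrame[Any]:
    import pandas as pd  # illustrative only

    df = from_native(pd.DataFrame({"a": [1, 2, 3]}))
    return df.with_columns(high=when(col("a") > 1).then(10))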
|
|
|
|
|
|
def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr], ignore_nulls: bool) -> Expr:
|
|
r"""Compute the bitwise AND horizontally across columns.
|
|
|
|
Arguments:
|
|
exprs: Name(s) of the columns to use in the aggregation function. Accepts
|
|
expression input.
|
|
ignore_nulls: Whether to ignore nulls:
|
|
|
|
- If `True`, null values are ignored. If there are no elements, the result
|
|
is `True`.
|
|
- If `False`, Kleene logic is followed. Note that this is not allowed for
|
|
pandas with classical NumPy dtypes when null values are present.
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import pyarrow as pa
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> data = {
|
|
... "a": [False, False, True, True, False, None],
|
|
... "b": [False, True, True, None, None, None],
|
|
... }
|
|
>>> df_native = pa.table(data)
|
|
>>> nw.from_native(df_native).select(
|
|
... "a", "b", all=nw.all_horizontal("a", "b", ignore_nulls=False)
|
|
... )
|
|
┌─────────────────────────────────────────┐
|
|
| Narwhals DataFrame |
|
|
|-----------------------------------------|
|
|
|pyarrow.Table |
|
|
|a: bool |
|
|
|b: bool |
|
|
|all: bool |
|
|
|---- |
|
|
|a: [[false,false,true,true,false,null]] |
|
|
|b: [[false,true,true,null,null,null]] |
|
|
|all: [[false,false,true,null,false,null]]|
|
|
└─────────────────────────────────────────┘
|
|
|
|
"""
|
|
if not exprs:
|
|
msg = "At least one expression must be passed to `all_horizontal`"
|
|
raise ValueError(msg)
|
|
flat_exprs = flatten(exprs)
|
|
return Expr(
|
|
lambda plx: apply_n_ary_operation(
|
|
plx,
|
|
partial(plx.all_horizontal, ignore_nulls=ignore_nulls),
|
|
*flat_exprs,
|
|
str_as_lit=False,
|
|
),
|
|
ExprMetadata.from_horizontal_op(*flat_exprs),
|
|
)
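# Illustrative sketch: with `ignore_nulls=True` nulls are dropped before the
# AND, so a row like (True, null) yields True instead of the Kleene-logic null
# shown in the example above. `_example_all_horizontal_ignore_nulls` is a
# hypothetical helper, never called by the library.
def _example_all_horizontal_ignore_nulls() -> Expr:
    return all_horizontal("a", "b", ignore_nulls=True)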
|
|
|
|
|
|
def lit(value: NonNestedLiteral, dtype: IntoDType | None = None) -> Expr:
|
|
"""Return an expression representing a literal value.
|
|
|
|
Arguments:
|
|
value: The value to use as literal.
|
|
dtype: The data type of the literal value. If not provided, the data type will
|
|
be inferred by the native library.
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import pandas as pd
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> df_native = pd.DataFrame({"a": [1, 2]})
|
|
>>> nw.from_native(df_native).with_columns(nw.lit(3))
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| a literal |
|
|
| 0 1 3 |
|
|
| 1 2 3 |
|
|
└──────────────────┘
|
|
"""
|
|
if is_numpy_array(value):
|
|
msg = (
|
|
"numpy arrays are not supported as literal values. "
|
|
"Consider using `with_columns` to create a new column from the array."
|
|
)
|
|
raise ValueError(msg)
|
|
|
|
if isinstance(value, (list, tuple)):
|
|
msg = f"Nested datatypes are not supported yet. Got {value}"
|
|
raise NotImplementedError(msg)
|
|
|
|
return Expr(lambda plx: plx.lit(value, dtype), ExprMetadata.literal())
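# Illustrative sketch: passing `dtype` pins the literal's data type instead of
# letting the backend infer it. `_example_lit_with_dtype` is a hypothetical
# helper, never called by the library.
def _example_lit_with_dtype() -> Expr:
    import narwhals as nw  # illustrative only, for access to narwhals dtypes

    return lit(1, dtype=nw.Int16)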
|
|
|
|
|
|
def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr], ignore_nulls: bool) -> Expr:
|
|
r"""Compute the bitwise OR horizontally across columns.
|
|
|
|
Arguments:
|
|
exprs: Name(s) of the columns to use in the aggregation function. Accepts
|
|
expression input.
|
|
ignore_nulls: Whether to ignore nulls:
|
|
|
|
- If `True`, null values are ignored. If there are no elements, the result
|
|
is `False`.
|
|
- If `False`, Kleene logic is followed. Note that this is not allowed for
|
|
pandas with classical NumPy dtypes when null values are present.
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import polars as pl
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> data = {
|
|
... "a": [False, False, True, True, False, None],
|
|
... "b": [False, True, True, None, None, None],
|
|
... }
|
|
>>> df_native = pl.DataFrame(data)
|
|
>>> nw.from_native(df_native).select(
|
|
... "a", "b", any=nw.any_horizontal("a", "b", ignore_nulls=False)
|
|
... )
|
|
┌─────────────────────────┐
|
|
| Narwhals DataFrame |
|
|
|-------------------------|
|
|
|shape: (6, 3) |
|
|
|┌───────┬───────┬───────┐|
|
|
|│ a ┆ b ┆ any │|
|
|
|│ --- ┆ --- ┆ --- │|
|
|
|│ bool ┆ bool ┆ bool │|
|
|
|╞═══════╪═══════╪═══════╡|
|
|
|│ false ┆ false ┆ false │|
|
|
|│ false ┆ true ┆ true │|
|
|
|│ true ┆ true ┆ true │|
|
|
|│ true ┆ null ┆ true │|
|
|
|│ false ┆ null ┆ null │|
|
|
|│ null ┆ null ┆ null │|
|
|
|└───────┴───────┴───────┘|
|
|
└─────────────────────────┘
|
|
"""
|
|
if not exprs:
|
|
msg = "At least one expression must be passed to `any_horizontal`"
|
|
raise ValueError(msg)
|
|
flat_exprs = flatten(exprs)
|
|
return Expr(
|
|
lambda plx: apply_n_ary_operation(
|
|
plx,
|
|
partial(plx.any_horizontal, ignore_nulls=ignore_nulls),
|
|
*flat_exprs,
|
|
str_as_lit=False,
|
|
),
|
|
ExprMetadata.from_horizontal_op(*flat_exprs),
|
|
)
|
|
|
|
|
|
def mean_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr:
|
|
"""Compute the mean of all values horizontally across columns.
|
|
|
|
Arguments:
|
|
exprs: Name(s) of the columns to use in the aggregation function. Accepts
|
|
expression input.
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import pyarrow as pa
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> data = {"a": [1, 8, 3], "b": [4, 5, None], "c": ["x", "y", "z"]}
|
|
>>> df_native = pa.table(data)
|
|
|
|
We compute the horizontal mean of the "a" and "b" columns:
|
|
|
|
>>> nw.from_native(df_native).select(nw.mean_horizontal("a", "b"))
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| pyarrow.Table |
|
|
| a: double |
|
|
| ---- |
|
|
| a: [[2.5,6.5,3]] |
|
|
└──────────────────┘
|
|
"""
|
|
if not exprs:
|
|
msg = "At least one expression must be passed to `mean_horizontal`"
|
|
raise ValueError(msg)
|
|
flat_exprs = flatten(exprs)
|
|
return Expr(
|
|
lambda plx: apply_n_ary_operation(
|
|
plx, plx.mean_horizontal, *flat_exprs, str_as_lit=False
|
|
),
|
|
ExprMetadata.from_horizontal_op(*flat_exprs),
|
|
)
|
|
|
|
|
|
def concat_str(
|
|
exprs: IntoExpr | Iterable[IntoExpr],
|
|
*more_exprs: IntoExpr,
|
|
separator: str = "",
|
|
ignore_nulls: bool = False,
|
|
) -> Expr:
|
|
r"""Horizontally concatenate columns into a single string column.
|
|
|
|
Arguments:
|
|
exprs: Columns to concatenate into a single string column. Accepts expression
|
|
input. Strings are parsed as column names, other non-expression inputs are
|
|
parsed as literals. Non-`String` columns are cast to `String`.
|
|
*more_exprs: Additional columns to concatenate into a single string column,
|
|
specified as positional arguments.
|
|
separator: String that will be used to separate the values of each column.
|
|
ignore_nulls: Ignore null values (default is `False`).
|
|
If set to `False`, null values are propagated: if the row contains any
null value, the output is null.
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import pandas as pd
|
|
>>> import narwhals as nw
|
|
>>>
|
|
>>> data = {
|
|
... "a": [1, 2, 3],
|
|
... "b": ["dogs", "cats", None],
|
|
... "c": ["play", "swim", "walk"],
|
|
... }
|
|
>>> df_native = pd.DataFrame(data)
|
|
>>> (
|
|
... nw.from_native(df_native).select(
|
|
... nw.concat_str(
|
|
... [nw.col("a") * 2, nw.col("b"), nw.col("c")], separator=" "
|
|
... ).alias("full_sentence")
|
|
... )
|
|
... )
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| full_sentence |
|
|
| 0 2 dogs play |
|
|
| 1 4 cats swim |
|
|
| 2 None |
|
|
└──────────────────┘
|
|
"""
|
|
flat_exprs = flatten([*flatten([exprs]), *more_exprs])
|
|
return Expr(
|
|
lambda plx: apply_n_ary_operation(
|
|
plx,
|
|
lambda *args: plx.concat_str(
|
|
*args, separator=separator, ignore_nulls=ignore_nulls
|
|
),
|
|
*flat_exprs,
|
|
str_as_lit=False,
|
|
),
|
|
combine_metadata(
|
|
*flat_exprs, str_as_lit=False, allow_multi_output=True, to_single_output=True
|
|
),
|
|
)
|
|
|
|
|
|
def coalesce(
|
|
exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr | NonNestedLiteral
|
|
) -> Expr:
|
|
"""Folds the columns from left to right, keeping the first non-null value.
|
|
|
|
Arguments:
|
|
exprs: Columns to coalesce; must be a str, nw.Expr, or nw.Series.
Strings are parsed as column names, while nw.Expr and nw.Series
are passed through as-is. Scalar values must be wrapped in `nw.lit`.
|
|
|
|
*more_exprs: Additional columns to coalesce, specified as positional arguments.
|
|
|
|
Raises:
|
|
TypeError: If any of the inputs are not a str, nw.Expr, or nw.Series.
|
|
|
|
Returns:
|
|
A new expression.
|
|
|
|
Examples:
|
|
>>> import polars as pl
|
|
>>> import narwhals as nw
|
|
>>> data = [
|
|
... (1, 5, None),
|
|
... (None, 6, None),
|
|
... (None, None, 9),
|
|
... (4, 8, 10),
|
|
... (None, None, None),
|
|
... ]
|
|
>>> df = pl.DataFrame(data, schema=["a", "b", "c"], orient="row")
|
|
>>> nw.from_native(df).select(nw.coalesce("a", "b", "c", nw.lit(-1)))
|
|
┌──────────────────┐
|
|
|Narwhals DataFrame|
|
|
|------------------|
|
|
| shape: (5, 1) |
|
|
| ┌─────┐ |
|
|
| │ a │ |
|
|
| │ --- │ |
|
|
| │ i64 │ |
|
|
| ╞═════╡ |
|
|
| │ 1 │ |
|
|
| │ 6 │ |
|
|
| │ 9 │ |
|
|
| │ 4 │ |
|
|
| │ -1 │ |
|
|
| └─────┘ |
|
|
└──────────────────┘
|
|
"""
|
|
flat_exprs = flatten([*flatten([exprs]), *more_exprs])
|
|
|
|
non_exprs = [expr for expr in flat_exprs if not isinstance(expr, (str, Expr, Series))]
|
|
if non_exprs:
|
|
msg = (
|
|
f"All arguments to `coalesce` must be of type {str!r}, {Expr!r}, or {Series!r}."
|
|
"\nGot the following invalid arguments (type, value):"
|
|
f"\n {', '.join(repr((type(e), e)) for e in non_exprs)}"
|
|
)
|
|
raise TypeError(msg)
|
|
|
|
return Expr(
|
|
lambda plx: apply_n_ary_operation(
|
|
plx, lambda *args: plx.coalesce(*args), *flat_exprs, str_as_lit=False
|
|
),
|
|
ExprMetadata.from_horizontal_op(*flat_exprs),
|
|
)
|