from __future__ import annotations from abc import abstractmethod from itertools import chain from typing import ( TYPE_CHECKING, Any, Callable, ClassVar, Generic, Literal, NoReturn, TypeVar, overload, ) from narwhals._exceptions import issue_warning from narwhals._expression_parsing import ( ExprKind, all_exprs_are_scalar_like, check_expressions_preserve_length, is_scalar_like, ) from narwhals._utils import ( Implementation, Version, flatten, generate_repr, is_compliant_dataframe, is_compliant_lazyframe, is_eager_allowed, is_index_selector, is_list_of, is_sequence_like, is_slice_none, supports_arrow_c_stream, ) from narwhals.dependencies import ( get_polars, is_numpy_array, is_numpy_array_2d, is_pyarrow_table, ) from narwhals.exceptions import ( InvalidIntoExprError, InvalidOperationError, PerformanceWarning, ) from narwhals.functions import _from_dict_no_backend, _is_into_schema from narwhals.schema import Schema from narwhals.series import Series from narwhals.translate import to_native if TYPE_CHECKING: from collections.abc import Iterable, Iterator, Mapping, Sequence from io import BytesIO from pathlib import Path from types import ModuleType import pandas as pd import polars as pl import pyarrow as pa from typing_extensions import Concatenate, ParamSpec, Self, TypeAlias from narwhals._compliant import CompliantDataFrame, CompliantLazyFrame from narwhals._compliant.typing import CompliantExprAny, EagerNamespaceAny from narwhals._translate import IntoArrowTable from narwhals.dtypes import DType from narwhals.group_by import GroupBy, LazyGroupBy from narwhals.typing import ( AsofJoinStrategy, IntoDataFrame, IntoExpr, IntoFrame, JoinStrategy, LazyUniqueKeepStrategy, MultiColSelector as _MultiColSelector, MultiIndexSelector as _MultiIndexSelector, PivotAgg, SingleColSelector, SingleIndexSelector, SizeUnit, UniqueKeepStrategy, _2DArray, ) PS = ParamSpec("PS") _FrameT = TypeVar("_FrameT", bound="IntoFrame") FrameT = TypeVar("FrameT", bound="IntoFrame") DataFrameT = TypeVar("DataFrameT", bound="IntoDataFrame") R = TypeVar("R") MultiColSelector: TypeAlias = "_MultiColSelector[Series[Any]]" MultiIndexSelector: TypeAlias = "_MultiIndexSelector[Series[Any]]" class BaseFrame(Generic[_FrameT]): _compliant_frame: Any _level: Literal["full", "lazy", "interchange"] def __native_namespace__(self) -> ModuleType: return self._compliant_frame.__native_namespace__() # type: ignore[no-any-return] def __narwhals_namespace__(self) -> Any: return self._compliant_frame.__narwhals_namespace__() def _with_compliant(self, df: Any) -> Self: # construct, preserving properties return self.__class__(df, level=self._level) # type: ignore[call-arg] def _flatten_and_extract( self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr ) -> tuple[list[CompliantExprAny], list[ExprKind]]: """Process `args` and `kwargs`, extracting underlying objects as we go, interpreting strings as column names.""" out_exprs = [] out_kinds = [] for expr in flatten(exprs): compliant_expr = self._extract_compliant(expr) out_exprs.append(compliant_expr) out_kinds.append(ExprKind.from_into_expr(expr, str_as_lit=False)) for alias, expr in named_exprs.items(): compliant_expr = self._extract_compliant(expr).alias(alias) out_exprs.append(compliant_expr) out_kinds.append(ExprKind.from_into_expr(expr, str_as_lit=False)) return out_exprs, out_kinds @abstractmethod def _extract_compliant(self, arg: Any) -> Any: raise NotImplementedError @property def schema(self) -> Schema: return Schema(self._compliant_frame.schema.items()) def collect_schema(self) -> Schema: native_schema = dict(self._compliant_frame.collect_schema()) return Schema(native_schema) def pipe( self, function: Callable[Concatenate[Self, PS], R], *args: PS.args, **kwargs: PS.kwargs, ) -> R: return function(self, *args, **kwargs) def drop_nulls(self, subset: str | list[str] | None) -> Self: subset = [subset] if isinstance(subset, str) else subset return self._with_compliant(self._compliant_frame.drop_nulls(subset=subset)) @property def columns(self) -> list[str]: return self._compliant_frame.columns # type: ignore[no-any-return] def with_columns( self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr ) -> Self: compliant_exprs, kinds = self._flatten_and_extract(*exprs, **named_exprs) compliant_exprs = [ compliant_expr.broadcast(kind) if is_scalar_like(kind) else compliant_expr for compliant_expr, kind in zip(compliant_exprs, kinds) ] return self._with_compliant(self._compliant_frame.with_columns(*compliant_exprs)) def select( self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr ) -> Self: flat_exprs = tuple(flatten(exprs)) if flat_exprs and all(isinstance(x, str) for x in flat_exprs) and not named_exprs: # fast path! try: return self._with_compliant( self._compliant_frame.simple_select(*flat_exprs) ) except Exception as e: # Column not found is the only thing that can realistically be raised here. if error := self._compliant_frame._check_columns_exist(flat_exprs): raise error from e raise compliant_exprs, kinds = self._flatten_and_extract(*flat_exprs, **named_exprs) if compliant_exprs and all_exprs_are_scalar_like(*flat_exprs, **named_exprs): return self._with_compliant(self._compliant_frame.aggregate(*compliant_exprs)) compliant_exprs = [ compliant_expr.broadcast(kind) if is_scalar_like(kind) else compliant_expr for compliant_expr, kind in zip(compliant_exprs, kinds) ] return self._with_compliant(self._compliant_frame.select(*compliant_exprs)) def rename(self, mapping: dict[str, str]) -> Self: return self._with_compliant(self._compliant_frame.rename(mapping)) def head(self, n: int) -> Self: return self._with_compliant(self._compliant_frame.head(n)) def tail(self, n: int) -> Self: return self._with_compliant(self._compliant_frame.tail(n)) def drop(self, *columns: Iterable[str], strict: bool) -> Self: return self._with_compliant(self._compliant_frame.drop(columns, strict=strict)) def filter( self, *predicates: IntoExpr | Iterable[IntoExpr] | list[bool], **constraints: Any ) -> Self: if len(predicates) == 1 and is_list_of(predicates[0], bool): predicate = predicates[0] else: from narwhals.functions import col flat_predicates = flatten(predicates) check_expressions_preserve_length(*flat_predicates, function_name="filter") plx = self.__narwhals_namespace__() compliant_predicates, _kinds = self._flatten_and_extract(*flat_predicates) compliant_constraints = ( (col(name) == v)._to_compliant_expr(plx) for name, v in constraints.items() ) predicate = plx.all_horizontal( *chain(compliant_predicates, compliant_constraints), ignore_nulls=False ) return self._with_compliant(self._compliant_frame.filter(predicate)) def sort( self, by: str | Iterable[str], *more_by: str, descending: bool | Sequence[bool] = False, nulls_last: bool = False, ) -> Self: by = flatten([*flatten([by]), *more_by]) return self._with_compliant( self._compliant_frame.sort(*by, descending=descending, nulls_last=nulls_last) ) def join( self, other: Self, on: str | list[str] | None, how: JoinStrategy, *, left_on: str | list[str] | None, right_on: str | list[str] | None, suffix: str, ) -> Self: _supported_joins = ("inner", "left", "full", "cross", "anti", "semi") on = [on] if isinstance(on, str) else on left_on = [left_on] if isinstance(left_on, str) else left_on right_on = [right_on] if isinstance(right_on, str) else right_on compliant = self._compliant_frame other = self._extract_compliant(other) if how not in _supported_joins: msg = f"Only the following join strategies are supported: {_supported_joins}; found '{how}'." raise NotImplementedError(msg) if how == "cross": if left_on is not None or right_on is not None or on is not None: msg = "Can not pass `left_on`, `right_on` or `on` keys for cross join" raise ValueError(msg) result = compliant.join( other, how=how, left_on=None, right_on=None, suffix=suffix ) elif on is None: if left_on is None or right_on is None: msg = f"Either (`left_on` and `right_on`) or `on` keys should be specified for {how}." raise ValueError(msg) if len(left_on) != len(right_on): msg = "`left_on` and `right_on` must have the same length." raise ValueError(msg) result = compliant.join( other, how=how, left_on=left_on, right_on=right_on, suffix=suffix ) else: if left_on is not None or right_on is not None: msg = f"If `on` is specified, `left_on` and `right_on` should be None for {how}." raise ValueError(msg) result = compliant.join( other, how=how, left_on=on, right_on=on, suffix=suffix ) return self._with_compliant(result) def gather_every(self, n: int, offset: int = 0) -> Self: return self._with_compliant( self._compliant_frame.gather_every(n=n, offset=offset) ) def join_asof( self, other: Self, *, left_on: str | None, right_on: str | None, on: str | None, by_left: str | list[str] | None, by_right: str | list[str] | None, by: str | list[str] | None, strategy: AsofJoinStrategy, suffix: str, ) -> Self: _supported_strategies = ("backward", "forward", "nearest") if strategy not in _supported_strategies: msg = f"Only the following strategies are supported: {_supported_strategies}; found '{strategy}'." raise NotImplementedError(msg) if (on is None) and (left_on is None or right_on is None): msg = "Either (`left_on` and `right_on`) or `on` keys should be specified." raise ValueError(msg) if (on is not None) and (left_on is not None or right_on is not None): msg = "If `on` is specified, `left_on` and `right_on` should be None." raise ValueError(msg) if (by is None) and ( (by_left is None and by_right is not None) or (by_left is not None and by_right is None) ): msg = ( "Can not specify only `by_left` or `by_right`, you need to specify both." ) raise ValueError(msg) if (by is not None) and (by_left is not None or by_right is not None): msg = "If `by` is specified, `by_left` and `by_right` should be None." raise ValueError(msg) if on is not None: left_on = right_on = on if by is not None: by_left = by_right = by by_left = [by_left] if isinstance(by_left, str) else by_left by_right = [by_right] if isinstance(by_right, str) else by_right if (isinstance(by_left, list) and isinstance(by_right, list)) and ( len(by_left) != len(by_right) ): msg = "`by_left` and `by_right` must have the same length." raise ValueError(msg) return self._with_compliant( self._compliant_frame.join_asof( self._extract_compliant(other), left_on=left_on, right_on=right_on, by_left=by_left, by_right=by_right, strategy=strategy, suffix=suffix, ) ) def unpivot( self, on: str | list[str] | None, *, index: str | list[str] | None, variable_name: str, value_name: str, ) -> Self: on = [on] if isinstance(on, str) else on index = [index] if isinstance(index, str) else index return self._with_compliant( self._compliant_frame.unpivot( on=on, index=index, variable_name=variable_name, value_name=value_name ) ) def __neq__(self, other: object) -> NoReturn: msg = ( "DataFrame.__neq__ and LazyFrame.__neq__ are not implemented, please " "use expressions instead.\n\n" "Hint: instead of\n" " df != 0\n" "you may want to use\n" " df.select(nw.all() != 0)" ) raise NotImplementedError(msg) def __eq__(self, other: object) -> NoReturn: msg = ( "DataFrame.__eq__ and LazyFrame.__eq__ are not implemented, please " "use expressions instead.\n\n" "Hint: instead of\n" " df == 0\n" "you may want to use\n" " df.select(nw.all() == 0)" ) raise NotImplementedError(msg) def explode(self, columns: str | Sequence[str], *more_columns: str) -> Self: to_explode = ( [columns, *more_columns] if isinstance(columns, str) else [*columns, *more_columns] ) return self._with_compliant(self._compliant_frame.explode(columns=to_explode)) class DataFrame(BaseFrame[DataFrameT]): """Narwhals DataFrame, backed by a native eager dataframe. Warning: This class is not meant to be instantiated directly - instead: - If the native object is a eager dataframe from one of the supported backend (e.g. pandas.DataFrame, polars.DataFrame, pyarrow.Table), you can use [`narwhals.from_native`][]: ```py narwhals.from_native(native_dataframe) narwhals.from_native(native_dataframe, eager_only=True) ``` - If the object is a dictionary of column names and generic sequences mapping (e.g. `dict[str, list]`), you can create a DataFrame via [`narwhals.from_dict`][]: ```py narwhals.from_dict( data={"a": [1, 2, 3]}, backend=narwhals.get_native_namespace(another_object), ) ``` """ _version: ClassVar[Version] = Version.MAIN def _extract_compliant(self, arg: Any) -> Any: from narwhals.expr import Expr from narwhals.series import Series plx: EagerNamespaceAny = self.__narwhals_namespace__() if isinstance(arg, BaseFrame): return arg._compliant_frame if isinstance(arg, Series): return arg._compliant_series._to_expr() if isinstance(arg, Expr): return arg._to_compliant_expr(self.__narwhals_namespace__()) if isinstance(arg, str): return plx.col(arg) if get_polars() is not None and "polars" in str(type(arg)): # pragma: no cover msg = ( f"Expected Narwhals object, got: {type(arg)}.\n\n" "Perhaps you:\n" "- Forgot a `nw.from_native` somewhere?\n" "- Used `pl.col` instead of `nw.col`?" ) raise TypeError(msg) if is_numpy_array(arg): return plx._series.from_numpy(arg, context=plx)._to_expr() raise InvalidIntoExprError.from_invalid_type(type(arg)) @property def _series(self) -> type[Series[Any]]: return Series @property def _lazyframe(self) -> type[LazyFrame[Any]]: return LazyFrame def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) -> None: self._level: Literal["full", "lazy", "interchange"] = level # NOTE: Interchange support (`DataFrameLike`) is the source of the error self._compliant_frame: CompliantDataFrame[Any, Any, DataFrameT, Self] # type: ignore[type-var] if is_compliant_dataframe(df): self._compliant_frame = df.__narwhals_dataframe__() else: # pragma: no cover msg = f"Expected an object which implements `__narwhals_dataframe__`, got: {type(df)}" raise AssertionError(msg) @classmethod def from_arrow( cls, native_frame: IntoArrowTable, *, backend: ModuleType | Implementation | str ) -> DataFrame[Any]: """Construct a DataFrame from an object which supports the PyCapsule Interface. Arguments: native_frame: Object which implements `__arrow_c_stream__`. backend: specifies which eager backend instantiate to. `backend` can be specified in various ways - As `Implementation.` with `BACKEND` being `PANDAS`, `PYARROW`, `POLARS`, `MODIN` or `CUDF`. - As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`. - Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`. Returns: A new DataFrame. Examples: >>> import pandas as pd >>> import polars as pl >>> import narwhals as nw >>> >>> df_native = pd.DataFrame({"a": [1, 2], "b": [4.2, 5.1]}) >>> nw.DataFrame.from_arrow(df_native, backend="polars") ┌──────────────────┐ |Narwhals DataFrame| |------------------| | shape: (2, 2) | | ┌─────┬─────┐ | | │ a ┆ b │ | | │ --- ┆ --- │ | | │ i64 ┆ f64 │ | | ╞═════╪═════╡ | | │ 1 ┆ 4.2 │ | | │ 2 ┆ 5.1 │ | | └─────┴─────┘ | └──────────────────┘ """ if not (supports_arrow_c_stream(native_frame) or is_pyarrow_table(native_frame)): msg = f"Given object of type {type(native_frame)} does not support PyCapsule interface" raise TypeError(msg) implementation = Implementation.from_backend(backend) if is_eager_allowed(implementation): ns = cls._version.namespace.from_backend(implementation).compliant compliant = ns._dataframe.from_arrow(native_frame, context=ns) return cls(compliant, level="full") msg = ( f"{implementation} support in Narwhals is lazy-only, but `DataFrame.from_arrow` is an eager-only function.\n\n" "Hint: you may want to use an eager backend and then call `.lazy`, e.g.:\n\n" f" nw.DataFrame.from_arrow(df, backend='pyarrow').lazy('{implementation}')" ) raise ValueError(msg) @classmethod def from_dict( cls, data: Mapping[str, Any], schema: Mapping[str, DType] | Schema | None = None, *, backend: ModuleType | Implementation | str | None = None, ) -> DataFrame[Any]: """Instantiate DataFrame from dictionary. Indexes (if present, for pandas-like backends) are aligned following the [left-hand-rule](../concepts/pandas_index.md/). Notes: For pandas-like dataframes, conversion to schema is applied after dataframe creation. Arguments: data: Dictionary to create DataFrame from. schema: The DataFrame schema as Schema or dict of {name: type}. If not specified, the schema will be inferred by the native library. backend: specifies which eager backend instantiate to. Only necessary if inputs are not Narwhals Series. `backend` can be specified in various ways - As `Implementation.` with `BACKEND` being `PANDAS`, `PYARROW`, `POLARS`, `MODIN` or `CUDF`. - As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`. - Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`. Returns: A new DataFrame. Examples: >>> import pandas as pd >>> import narwhals as nw >>> data = {"c": [5, 2], "d": [1, 4]} >>> nw.DataFrame.from_dict(data, backend="pandas") ┌──────────────────┐ |Narwhals DataFrame| |------------------| | c d | | 0 5 1 | | 1 2 4 | └──────────────────┘ """ if backend is None: data, backend = _from_dict_no_backend(data) implementation = Implementation.from_backend(backend) if is_eager_allowed(implementation): ns = cls._version.namespace.from_backend(implementation).compliant compliant = ns._dataframe.from_dict(data, schema=schema, context=ns) return cls(compliant, level="full") # NOTE: (#2786) needs resolving for extensions msg = ( f"{implementation} support in Narwhals is lazy-only, but `DataFrame.from_dict` is an eager-only function.\n\n" "Hint: you may want to use an eager backend and then call `.lazy`, e.g.:\n\n" f" nw.DataFrame.from_dict({{'a': [1, 2]}}, backend='pyarrow').lazy('{implementation}')" ) raise ValueError(msg) @classmethod def from_numpy( cls, data: _2DArray, schema: Mapping[str, DType] | Schema | Sequence[str] | None = None, *, backend: ModuleType | Implementation | str, ) -> DataFrame[Any]: """Construct a DataFrame from a NumPy ndarray. Notes: Only row orientation is currently supported. For pandas-like dataframes, conversion to schema is applied after dataframe creation. Arguments: data: Two-dimensional data represented as a NumPy ndarray. schema: The DataFrame schema as Schema, dict of {name: type}, or a sequence of str. backend: specifies which eager backend instantiate to. `backend` can be specified in various ways - As `Implementation.` with `BACKEND` being `PANDAS`, `PYARROW`, `POLARS`, `MODIN` or `CUDF`. - As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`. - Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`. Returns: A new DataFrame. Examples: >>> import numpy as np >>> import polars as pl >>> import narwhals as nw >>> >>> arr = np.array([[5, 2, 1], [1, 4, 3]]) >>> schema = {"c": nw.Int16(), "d": nw.Float32(), "e": nw.Int8()} >>> nw.DataFrame.from_numpy(arr, schema=schema, backend="polars") ┌───────────────────┐ |Narwhals DataFrame | |-------------------| |shape: (2, 3) | |┌─────┬─────┬─────┐| |│ c ┆ d ┆ e │| |│ --- ┆ --- ┆ --- │| |│ i16 ┆ f32 ┆ i8 │| |╞═════╪═════╪═════╡| |│ 5 ┆ 2.0 ┆ 1 │| |│ 1 ┆ 4.0 ┆ 3 │| |└─────┴─────┴─────┘| └───────────────────┘ """ if not is_numpy_array_2d(data): msg = "`from_numpy` only accepts 2D numpy arrays" raise ValueError(msg) if not _is_into_schema(schema): msg = ( "`schema` is expected to be one of the following types: " "Mapping[str, DType] | Schema | Sequence[str]. " f"Got {type(schema)}." ) raise TypeError(msg) implementation = Implementation.from_backend(backend) if is_eager_allowed(implementation): ns = cls._version.namespace.from_backend(implementation).compliant return cls(ns.from_numpy(data, schema), level="full") msg = ( f"{implementation} support in Narwhals is lazy-only, but `DataFrame.from_numpy` is an eager-only function.\n\n" "Hint: you may want to use an eager backend and then call `.lazy`, e.g.:\n\n" f" nw.DataFrame.from_numpy(arr, backend='pyarrow').lazy('{implementation}')" ) raise ValueError(msg) @property def implementation(self) -> Implementation: """Return implementation of native frame. This can be useful when you need to use special-casing for features outside of Narwhals' scope - for example, when dealing with pandas' Period Dtype. Returns: Implementation. Examples: >>> import narwhals as nw >>> import pandas as pd >>> df_native = pd.DataFrame({"a": [1, 2, 3]}) >>> df = nw.from_native(df_native) >>> df.implementation >>> df.implementation.is_pandas() True >>> df.implementation.is_pandas_like() True >>> df.implementation.is_polars() False """ return self._compliant_frame._implementation def __len__(self) -> int: return self._compliant_frame.__len__() def __array__(self, dtype: Any = None, copy: bool | None = None) -> _2DArray: # noqa: FBT001 return self._compliant_frame.__array__(dtype, copy=copy) def __repr__(self) -> str: # pragma: no cover return generate_repr("Narwhals DataFrame", self.to_native().__repr__()) def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: """Export a DataFrame via the Arrow PyCapsule Interface. - if the underlying dataframe implements the interface, it'll return that - else, it'll call `to_arrow` and then defer to PyArrow's implementation See [PyCapsule Interface](https://arrow.apache.org/docs/dev/format/CDataInterface/PyCapsuleInterface.html) for more. """ native_frame = self._compliant_frame._native_frame if supports_arrow_c_stream(native_frame): return native_frame.__arrow_c_stream__(requested_schema=requested_schema) try: pa_version = Implementation.PYARROW._backend_version() except ModuleNotFoundError as exc: # pragma: no cover msg = f"'pyarrow>=14.0.0' is required for `DataFrame.__arrow_c_stream__` for object of type {type(native_frame)}" raise ModuleNotFoundError(msg) from exc if pa_version < (14, 0): # pragma: no cover msg = f"'pyarrow>=14.0.0' is required for `DataFrame.__arrow_c_stream__` for object of type {type(native_frame)}" raise ModuleNotFoundError(msg) from None pa_table = self.to_arrow() return pa_table.__arrow_c_stream__(requested_schema=requested_schema) # type: ignore[no-untyped-call] def lazy( self, backend: ModuleType | Implementation | str | None = None ) -> LazyFrame[Any]: """Restrict available API methods to lazy-only ones. If `backend` is specified, then a conversion between different backends might be triggered. If a library does not support lazy execution and `backend` is not specified, then this is will only restrict the API to lazy-only operations. This is useful if you want to ensure that you write dataframe-agnostic code which all has the possibility of running entirely lazily. Arguments: backend: Which lazy backend collect to. This will be the underlying backend for the resulting Narwhals LazyFrame. If not specified, and the given library does not support lazy execution, then this will restrict the API to lazy-only operations. `backend` can be specified in various ways - As `Implementation.` with `BACKEND` being `DASK`, `DUCKDB`, `IBIS` or `POLARS`. - As a string: `"dask"`, `"duckdb"`, `"ibis"` or `"polars"` - Directly as a module `dask.dataframe`, `duckdb`, `ibis` or `polars`. Returns: A new LazyFrame. Examples: >>> import polars as pl >>> import pyarrow as pa >>> import narwhals as nw >>> df_native = pl.DataFrame({"a": [1, 2], "b": [4, 6]}) >>> df = nw.from_native(df_native) If we call `df.lazy`, we get a `narwhals.LazyFrame` backed by a Polars LazyFrame. >>> df.lazy() # doctest: +SKIP ┌─────────────────────────────┐ | Narwhals LazyFrame | |-----------------------------| || └─────────────────────────────┘ We can also pass DuckDB as the backend, and then we'll get a `narwhals.LazyFrame` backed by a `duckdb.DuckDBPyRelation`. >>> df.lazy(backend=nw.Implementation.DUCKDB) ┌──────────────────┐ |Narwhals LazyFrame| |------------------| |┌───────┬───────┐ | |│ a │ b │ | |│ int64 │ int64 │ | |├───────┼───────┤ | |│ 1 │ 4 │ | |│ 2 │ 6 │ | |└───────┴───────┘ | └──────────────────┘ """ lazy_backend = None if backend is None else Implementation.from_backend(backend) supported_lazy_backends = ( Implementation.DASK, Implementation.DUCKDB, Implementation.POLARS, Implementation.IBIS, ) if lazy_backend is not None and lazy_backend not in supported_lazy_backends: msg = ( "Not-supported backend." f"\n\nExpected one of {supported_lazy_backends} or `None`, got {lazy_backend}" ) raise ValueError(msg) return self._lazyframe( self._compliant_frame.lazy(backend=lazy_backend), level="lazy" ) def to_native(self) -> DataFrameT: """Convert Narwhals DataFrame to native one. Returns: Object of class that user started with. Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame( ... {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} ... ) Calling `to_native` on a Narwhals DataFrame returns the native object: >>> nw.from_native(df_native).to_native() foo bar ham 0 1 6.0 a 1 2 7.0 b 2 3 8.0 c """ return self._compliant_frame._native_frame def to_pandas(self) -> pd.DataFrame: """Convert this DataFrame to a pandas DataFrame. Returns: A pandas DataFrame. Examples: >>> import polars as pl >>> import narwhals as nw >>> df_native = pl.DataFrame( ... {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} ... ) >>> df = nw.from_native(df_native) >>> df.to_pandas() foo bar ham 0 1 6.0 a 1 2 7.0 b 2 3 8.0 c """ return self._compliant_frame.to_pandas() def to_polars(self) -> pl.DataFrame: """Convert this DataFrame to a polars DataFrame. Returns: A polars DataFrame. Examples: >>> import pyarrow as pa >>> import narwhals as nw >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]}) >>> df = nw.from_native(df_native) >>> df.to_polars() shape: (2, 2) ┌─────┬─────┐ │ foo ┆ bar │ │ --- ┆ --- │ │ i64 ┆ f64 │ ╞═════╪═════╡ │ 1 ┆ 6.0 │ │ 2 ┆ 7.0 │ └─────┴─────┘ """ return self._compliant_frame.to_polars() @overload def write_csv(self, file: None = None) -> str: ... @overload def write_csv(self, file: str | Path | BytesIO) -> None: ... def write_csv(self, file: str | Path | BytesIO | None = None) -> str | None: r"""Write dataframe to comma-separated values (CSV) file. Arguments: file: String, path object or file-like object to which the dataframe will be written. If None, the resulting csv format is returned as a string. Returns: String or None. Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame( ... {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} ... ) >>> df = nw.from_native(df_native) >>> df.write_csv() 'foo,bar,ham\n1,6.0,a\n2,7.0,b\n3,8.0,c\n' If we had passed a file name to `write_csv`, it would have been written to that file. """ return self._compliant_frame.write_csv(file) def write_parquet(self, file: str | Path | BytesIO) -> None: """Write dataframe to parquet file. Arguments: file: String, path object or file-like object to which the dataframe will be written. Returns: None. Examples: >>> import pyarrow as pa >>> import narwhals as nw >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]}) >>> df = nw.from_native(df_native) >>> df.write_parquet("out.parquet") # doctest:+SKIP """ self._compliant_frame.write_parquet(file) def to_numpy(self) -> _2DArray: """Convert this DataFrame to a NumPy ndarray. Returns: A NumPy ndarray array. Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame({"foo": [1, 2], "bar": [6.5, 7.0]}) >>> df = nw.from_native(df_native) >>> df.to_numpy() array([[1. , 6.5], [2. , 7. ]]) """ return self._compliant_frame.to_numpy(None, copy=None) @property def shape(self) -> tuple[int, int]: """Get the shape of the DataFrame. Returns: The shape of the dataframe as a tuple. Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame({"foo": [1, 2]}) >>> df = nw.from_native(df_native) >>> df.shape (2, 1) """ return self._compliant_frame.shape def get_column(self, name: str) -> Series[Any]: """Get a single column by name. Arguments: name: The column name as a string. Returns: A Narwhals Series, backed by a native series. Notes: Although `name` is typed as `str`, pandas does allow non-string column names, and they will work when passed to this function if the `narwhals.DataFrame` is backed by a pandas dataframe with non-string columns. This function can only be used to extract a column by name, so there is no risk of ambiguity. Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame({"a": [1, 2]}) >>> df = nw.from_native(df_native) >>> df.get_column("a").to_native() 0 1 1 2 Name: a, dtype: int64 """ return self._series(self._compliant_frame.get_column(name), level=self._level) def estimated_size(self, unit: SizeUnit = "b") -> int | float: """Return an estimation of the total (heap) allocated size of the `DataFrame`. Estimated size is given in the specified unit (bytes by default). Arguments: unit: 'b', 'kb', 'mb', 'gb', 'tb', 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'. Returns: Integer or Float. Examples: >>> import pyarrow as pa >>> import narwhals as nw >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]}) >>> df = nw.from_native(df_native) >>> df.estimated_size() 32 """ return self._compliant_frame.estimated_size(unit=unit) # `str` overlaps with `Sequence[str]` # We can ignore this but we must keep this overload ordering @overload def __getitem__(self, item: tuple[SingleIndexSelector, SingleColSelector]) -> Any: ... @overload def __getitem__( # type: ignore[overload-overlap] self, item: str | tuple[MultiIndexSelector, SingleColSelector] ) -> Series[Any]: ... @overload def __getitem__( self, item: ( SingleIndexSelector | MultiIndexSelector | MultiColSelector | tuple[SingleIndexSelector, MultiColSelector] | tuple[MultiIndexSelector, MultiColSelector] ), ) -> Self: ... def __getitem__( # noqa: C901, PLR0912 self, item: ( SingleIndexSelector | SingleColSelector | MultiColSelector | MultiIndexSelector | tuple[SingleIndexSelector, SingleColSelector] | tuple[SingleIndexSelector, MultiColSelector] | tuple[MultiIndexSelector, SingleColSelector] | tuple[MultiIndexSelector, MultiColSelector] ), ) -> Series[Any] | Self | Any: """Extract column or slice of DataFrame. Arguments: item: How to slice dataframe. What happens depends on what is passed. It's easiest to explain by example. Suppose we have a Dataframe `df` - `df['a']` extracts column `'a'` and returns a `Series`. - `df[0:2]` extracts the first two rows and returns a `DataFrame`. - `df[0:2, 'a']` extracts the first two rows from column `'a'` and returns a `Series`. - `df[0:2, 0]` extracts the first two rows from the first column and returns a `Series`. - `df[[0, 1], [0, 1, 2]]` extracts the first two rows and the first three columns and returns a `DataFrame` - `df[:, [0, 1, 2]]` extracts all rows from the first three columns and returns a `DataFrame`. - `df[:, ['a', 'c']]` extracts all rows and columns `'a'` and `'c'` and returns a `DataFrame`. - `df[['a', 'c']]` extracts all rows and columns `'a'` and `'c'` and returns a `DataFrame`. - `df[0: 2, ['a', 'c']]` extracts the first two rows and columns `'a'` and `'c'` and returns a `DataFrame` - `df[:, 0: 2]` extracts all rows from the first two columns and returns a `DataFrame` - `df[:, 'a': 'c']` extracts all rows and all columns positioned between `'a'` and `'c'` _inclusive_ and returns a `DataFrame`. For example, if the columns are `'a', 'd', 'c', 'b'`, then that would extract columns `'a'`, `'d'`, and `'c'`. Returns: A Narwhals Series, backed by a native series. Notes: - Integers are always interpreted as positions - Strings are always interpreted as column names. In contrast with Polars, pandas allows non-string column names. If you don't know whether the column name you're trying to extract is definitely a string (e.g. `df[df.columns[0]]`) then you should use `DataFrame.get_column` instead. Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame({"a": [1, 2]}) >>> df = nw.from_native(df_native) >>> df["a"].to_native() 0 1 1 2 Name: a, dtype: int64 """ from narwhals.series import Series msg = ( f"Unexpected type for `DataFrame.__getitem__`, got: {type(item)}.\n\n" "Hints:\n" "- use `df.item` to select a single item.\n" "- Use `df[indices, :]` to select rows positionally.\n" "- Use `df.filter(mask)` to filter rows based on a boolean mask." ) if isinstance(item, tuple): if len(item) > 2: tuple_msg = ( "Tuples cannot be passed to DataFrame.__getitem__ directly.\n\n" "Hint: instead of `df[indices]`, did you mean `df[indices, :]`?" ) raise TypeError(tuple_msg) rows = None if not item or is_slice_none(item[0]) else item[0] columns = None if len(item) < 2 or is_slice_none(item[1]) else item[1] if rows is None and columns is None: return self elif is_index_selector(item): rows = item columns = None elif is_sequence_like(item) or isinstance(item, (slice, str)): rows = None columns = item else: raise TypeError(msg) if isinstance(rows, str): raise TypeError(msg) compliant = self._compliant_frame if isinstance(columns, (int, str)): if isinstance(rows, int): return self.item(rows, columns) col_name = columns if isinstance(columns, str) else self.columns[columns] series = self.get_column(col_name) return series[rows] if rows is not None else series if isinstance(rows, Series): rows = rows._compliant_series if isinstance(columns, Series): columns = columns._compliant_series if rows is None: return self._with_compliant(compliant[:, columns]) if columns is None: return self._with_compliant(compliant[rows, :]) return self._with_compliant(compliant[rows, columns]) def __contains__(self, key: str) -> bool: return key in self.columns @overload def to_dict(self, *, as_series: Literal[True] = ...) -> dict[str, Series[Any]]: ... @overload def to_dict(self, *, as_series: Literal[False]) -> dict[str, list[Any]]: ... @overload def to_dict( self, *, as_series: bool ) -> dict[str, Series[Any]] | dict[str, list[Any]]: ... def to_dict( self, *, as_series: bool = True ) -> dict[str, Series[Any]] | dict[str, list[Any]]: """Convert DataFrame to a dictionary mapping column name to values. Arguments: as_series: If set to true ``True``, then the values are Narwhals Series, otherwise the values are Any. Returns: A mapping from column name to values / Series. Examples: >>> import pyarrow as pa >>> import narwhals as nw >>> df_native = pa.table({"A": [1, 2], "fruits": ["banana", "apple"]}) >>> df = nw.from_native(df_native) >>> df.to_dict(as_series=False) {'A': [1, 2], 'fruits': ['banana', 'apple']} """ if as_series: return { key: self._series(value, level=self._level) for key, value in self._compliant_frame.to_dict( as_series=as_series ).items() } return self._compliant_frame.to_dict(as_series=as_series) def row(self, index: int) -> tuple[Any, ...]: """Get values at given row. Warning: You should NEVER use this method to iterate over a DataFrame; if you require row-iteration you should strongly prefer use of iter_rows() instead. Arguments: index: Row number. Returns: A tuple of the values in the selected row. Notes: cuDF doesn't support this method. Examples: >>> import pyarrow as pa >>> import narwhals as nw >>> df_native = pa.table({"a": [1, 2], "b": [4, 5]}) >>> nw.from_native(df_native).row(1) (, ) """ return self._compliant_frame.row(index) # inherited def pipe( self, function: Callable[Concatenate[Self, PS], R], *args: PS.args, **kwargs: PS.kwargs, ) -> R: """Pipe function call. Arguments: function: Function to apply. args: Positional arguments to pass to function. kwargs: Keyword arguments to pass to function. Returns: The original object with the function applied. Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame({"a": [1, 2], "ba": [4, 5]}) >>> nw.from_native(df_native).pipe( ... lambda _df: _df.select( ... [x for x in _df.columns if len(x) == 1] ... ).to_native() ... ) a 0 1 1 2 """ return super().pipe(function, *args, **kwargs) def drop_nulls(self, subset: str | list[str] | None = None) -> Self: """Drop rows that contain null values. Arguments: subset: Column name(s) for which null values are considered. If set to None (default), use all columns. Returns: The original object with the rows removed that contained the null values. Notes: pandas handles null values differently from Polars and PyArrow. See [null_handling](../concepts/null_handling.md) for reference. Examples: >>> import pyarrow as pa >>> import narwhals as nw >>> df_native = pa.table({"a": [1.0, None], "ba": [1.0, 2.0]}) >>> nw.from_native(df_native).drop_nulls().to_native() pyarrow.Table a: double ba: double ---- a: [[1]] ba: [[1]] """ return super().drop_nulls(subset=subset) def with_row_index( self, name: str = "index", *, order_by: str | Sequence[str] | None = None ) -> Self: """Insert column which enumerates rows. Arguments: name: The name of the column as a string. The default is "index". order_by: Column(s) to order by when computing the row index. Returns: The original object with the column added. Examples: >>> import pyarrow as pa >>> import narwhals as nw >>> df_native = pa.table({"a": [1, 2], "b": [4, 5]}) >>> nw.from_native(df_native).with_row_index().to_native() pyarrow.Table index: int64 a: int64 b: int64 ---- index: [[0,1]] a: [[1,2]] b: [[4,5]] """ order_by_ = [order_by] if isinstance(order_by, str) else order_by return self._with_compliant( self._compliant_frame.with_row_index(name, order_by=order_by_) ) @property def schema(self) -> Schema: r"""Get an ordered mapping of column names to their data type. Returns: A Narwhals Schema object that displays the mapping of column names. Examples: >>> import pyarrow as pa >>> import narwhals as nw >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]}) >>> nw.from_native(df_native).schema Schema({'foo': Int64, 'bar': Float64}) """ return super().schema def collect_schema(self) -> Schema: r"""Get an ordered mapping of column names to their data type. Returns: A Narwhals Schema object that displays the mapping of column names. Examples: >>> import pyarrow as pa >>> import narwhals as nw >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]}) >>> nw.from_native(df_native).collect_schema() Schema({'foo': Int64, 'bar': Float64}) """ return super().collect_schema() @property def columns(self) -> list[str]: """Get column names. Returns: The column names stored in a list. Examples: >>> import pyarrow as pa >>> import narwhals as nw >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]}) >>> nw.from_native(df_native).columns ['foo', 'bar'] """ return super().columns @overload def rows(self, *, named: Literal[False] = False) -> list[tuple[Any, ...]]: ... @overload def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... @overload def rows(self, *, named: bool) -> list[tuple[Any, ...]] | list[dict[str, Any]]: ... def rows( self, *, named: bool = False ) -> list[tuple[Any, ...]] | list[dict[str, Any]]: """Returns all data in the DataFrame as a list of rows of python-native values. Arguments: named: By default, each row is returned as a tuple of values given in the same order as the frame columns. Setting named=True will return rows of dictionaries instead. Returns: The data as a list of rows. Examples: >>> import pyarrow as pa >>> import narwhals as nw >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]}) >>> nw.from_native(df_native).rows() [(1, 6.0), (2, 7.0)] """ return self._compliant_frame.rows(named=named) # type: ignore[return-value] def iter_columns(self) -> Iterator[Series[Any]]: """Returns an iterator over the columns of this DataFrame. Yields: A Narwhals Series, backed by a native series. Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame({"foo": [1, 2], "bar": [6.0, 7.0]}) >>> iter_columns = nw.from_native(df_native).iter_columns() >>> next(iter_columns) ┌───────────────────────┐ | Narwhals Series | |-----------------------| |0 1 | |1 2 | |Name: foo, dtype: int64| └───────────────────────┘ >>> next(iter_columns) ┌─────────────────────────┐ | Narwhals Series | |-------------------------| |0 6.0 | |1 7.0 | |Name: bar, dtype: float64| └─────────────────────────┘ """ for series in self._compliant_frame.iter_columns(): yield self._series(series, level=self._level) @overload def iter_rows( self, *, named: Literal[False] = ..., buffer_size: int = ... ) -> Iterator[tuple[Any, ...]]: ... @overload def iter_rows( self, *, named: Literal[True], buffer_size: int = ... ) -> Iterator[dict[str, Any]]: ... @overload def iter_rows( self, *, named: bool, buffer_size: int = ... ) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: ... def iter_rows( self, *, named: bool = False, buffer_size: int = 512 ) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: """Returns an iterator over the DataFrame of rows of python-native values. Arguments: named: By default, each row is returned as a tuple of values given in the same order as the frame columns. Setting named=True will return rows of dictionaries instead. buffer_size: Determines the number of rows that are buffered internally while iterating over the data. See https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.iter_rows.html Returns: An iterator over the DataFrame of rows. Notes: cuDF doesn't support this method. Examples: >>> import pyarrow as pa >>> import narwhals as nw >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]}) >>> iter_rows = nw.from_native(df_native).iter_rows() >>> next(iter_rows) (1, 6.0) >>> next(iter_rows) (2, 7.0) """ return self._compliant_frame.iter_rows(named=named, buffer_size=buffer_size) # type: ignore[return-value] def with_columns( self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr ) -> Self: r"""Add columns to this DataFrame. Added columns will replace existing columns with the same name. Arguments: *exprs: Column(s) to add, specified as positional arguments. Accepts expression input. Strings are parsed as column names, other non-expression inputs are parsed as literals. **named_exprs: Additional columns to add, specified as keyword arguments. The columns will be renamed to the keyword used. Returns: DataFrame: A new DataFrame with the columns added. Note: Creating a new DataFrame using this method does not create a new copy of existing data. Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame({"a": [1, 2], "b": [0.5, 4.0]}) >>> ( ... nw.from_native(df_native) ... .with_columns((nw.col("a") * 2).alias("a*2")) ... .to_native() ... ) a b a*2 0 1 0.5 2 1 2 4.0 4 """ return super().with_columns(*exprs, **named_exprs) def select( self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr ) -> Self: r"""Select columns from this DataFrame. Arguments: *exprs: Column(s) to select, specified as positional arguments. Accepts expression input. Strings are parsed as column names, other non-expression inputs are parsed as literals. **named_exprs: Additional columns to select, specified as keyword arguments. The columns will be renamed to the keyword used. Returns: The dataframe containing only the selected columns. Examples: >>> import pyarrow as pa >>> import narwhals as nw >>> df_native = pa.table({"a": [1, 2], "b": [3, 4]}) >>> nw.from_native(df_native).select("a", a_plus_1=nw.col("a") + 1) ┌──────────────────┐ |Narwhals DataFrame| |------------------| |pyarrow.Table | |a: int64 | |a_plus_1: int64 | |---- | |a: [[1,2]] | |a_plus_1: [[2,3]] | └──────────────────┘ """ return super().select(*exprs, **named_exprs) def rename(self, mapping: dict[str, str]) -> Self: """Rename column names. Arguments: mapping: Key value pairs that map from old name to new name. Returns: The dataframe with the specified columns renamed. Examples: >>> import pyarrow as pa >>> import narwhals as nw >>> df_native = pa.table({"foo": [1, 2], "bar": [6, 7]}) >>> nw.from_native(df_native).rename({"foo": "apple"}).to_native() pyarrow.Table apple: int64 bar: int64 ---- apple: [[1,2]] bar: [[6,7]] """ return super().rename(mapping) def head(self, n: int = 5) -> Self: """Get the first `n` rows. Arguments: n: Number of rows to return. If a negative value is passed, return all rows except the last `abs(n)`. Returns: A subset of the dataframe of shape (n, n_columns). Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame({"a": [1, 2], "b": [0.5, 4.0]}) >>> nw.from_native(df_native).head(1).to_native() a b 0 1 0.5 """ return super().head(n) def tail(self, n: int = 5) -> Self: """Get the last `n` rows. Arguments: n: Number of rows to return. If a negative value is passed, return all rows except the first `abs(n)`. Returns: A subset of the dataframe of shape (n, n_columns). Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame({"a": [1, 2], "b": [0.5, 4.0]}) >>> nw.from_native(df_native).tail(1) ┌──────────────────┐ |Narwhals DataFrame| |------------------| | a b | | 1 2 4.0 | └──────────────────┘ """ return super().tail(n) def drop(self, *columns: str | Iterable[str], strict: bool = True) -> Self: """Remove columns from the dataframe. Returns: The dataframe with the specified columns removed. Arguments: *columns: Names of the columns that should be removed from the dataframe. strict: Validate that all column names exist in the schema and throw an exception if a column name does not exist in the schema. Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame( ... {"foo": [1, 2], "bar": [6.0, 7.0], "ham": ["a", "b"]} ... ) >>> nw.from_native(df_native).drop("ham").to_native() foo bar 0 1 6.0 1 2 7.0 """ return super().drop(*flatten(columns), strict=strict) def unique( self, subset: str | list[str] | None = None, *, keep: UniqueKeepStrategy = "any", maintain_order: bool = False, ) -> Self: """Drop duplicate rows from this dataframe. Arguments: subset: Column name(s) to consider when identifying duplicate rows. keep: {'first', 'last', 'any', 'none'} Which of the duplicate rows to keep. * 'any': Does not give any guarantee of which row is kept. This allows more optimizations. * 'none': Don't keep duplicate rows. * 'first': Keep first unique row. * 'last': Keep last unique row. maintain_order: Keep the same order as the original DataFrame. This may be more expensive to compute. Returns: The dataframe with the duplicate rows removed. Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame( ... {"foo": [1, 2], "bar": ["a", "a"], "ham": ["b", "b"]} ... ) >>> nw.from_native(df_native).unique(["bar", "ham"]).to_native() foo bar ham 0 1 a b """ if keep not in {"any", "none", "first", "last"}: msg = f"Expected {'any', 'none', 'first', 'last'}, got: {keep}" raise ValueError(msg) if isinstance(subset, str): subset = [subset] return self._with_compliant( self._compliant_frame.unique(subset, keep=keep, maintain_order=maintain_order) ) def filter( self, *predicates: IntoExpr | Iterable[IntoExpr] | list[bool], **constraints: Any ) -> Self: r"""Filter the rows in the DataFrame based on one or more predicate expressions. The original order of the remaining rows is preserved. Arguments: *predicates: Expression(s) that evaluates to a boolean Series. Can also be a (single!) boolean list. **constraints: Column filters; use `name = value` to filter columns by the supplied value. Each constraint will behave the same as `nw.col(name).eq(value)`, and will be implicitly joined with the other filter conditions using &. Returns: The filtered dataframe. Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame( ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} ... ) Filter on one condition >>> nw.from_native(df_native).filter(nw.col("foo") > 1).to_native() foo bar ham 1 2 7 b 2 3 8 c Filter on multiple conditions with implicit `&` >>> nw.from_native(df_native).filter( ... nw.col("foo") < 3, nw.col("ham") == "a" ... ).to_native() foo bar ham 0 1 6 a Filter on multiple conditions with `|` >>> nw.from_native(df_native).filter( ... (nw.col("foo") == 1) | (nw.col("ham") == "c") ... ).to_native() foo bar ham 0 1 6 a 2 3 8 c Filter using `**kwargs` syntax >>> nw.from_native(df_native).filter(foo=2, ham="b").to_native() foo bar ham 1 2 7 b """ return super().filter(*predicates, **constraints) @overload def group_by( self, *keys: IntoExpr | Iterable[IntoExpr], drop_null_keys: Literal[False] = ... ) -> GroupBy[Self]: ... @overload def group_by( self, *keys: str | Iterable[str], drop_null_keys: Literal[True] ) -> GroupBy[Self]: ... def group_by( self, *keys: IntoExpr | Iterable[IntoExpr], drop_null_keys: bool = False ) -> GroupBy[Self]: r"""Start a group by operation. Arguments: *keys: Column(s) to group by. Accepts expression input. Strings are parsed as column names. drop_null_keys: if True, then groups where any key is null won't be included in the result. Returns: GroupBy: Object which can be used to perform aggregations. Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame( ... { ... "a": ["a", "b", "a", "b", "c"], ... "b": [1, 2, 1, 3, 3], ... "c": [5, 4, 3, 2, 1], ... } ... ) Group by one column and compute the sum of another column >>> nw.from_native(df_native, eager_only=True).group_by("a").agg( ... nw.col("b").sum() ... ).sort("a").to_native() a b 0 a 2 1 b 5 2 c 3 Group by multiple columns and compute the max of another column >>> ( ... nw.from_native(df_native, eager_only=True) ... .group_by(["a", "b"]) ... .agg(nw.max("c")) ... .sort("a", "b") ... .to_native() ... ) a b c 0 a 1 5 1 b 2 4 2 b 3 2 3 c 3 1 Expressions are also accepted. >>> nw.from_native(df_native, eager_only=True).group_by( ... "a", nw.col("b") // 2 ... ).agg(nw.col("c").mean()).to_native() a b c 0 a 0 4.0 1 b 1 3.0 2 c 1 1.0 """ from narwhals.group_by import GroupBy flat_keys = flatten(keys) if all(isinstance(key, str) for key in flat_keys): return GroupBy(self, flat_keys, drop_null_keys=drop_null_keys) from narwhals import col from narwhals.expr import Expr from narwhals.series import Series key_is_expr_or_series = tuple(isinstance(k, (Expr, Series)) for k in flat_keys) if drop_null_keys and any(key_is_expr_or_series): msg = "drop_null_keys cannot be True when keys contains Expr or Series" raise NotImplementedError(msg) _keys = [ k if is_expr else col(k) for k, is_expr in zip(flat_keys, key_is_expr_or_series) ] expr_flat_keys, kinds = self._flatten_and_extract(*_keys) if not all(kind is ExprKind.ELEMENTWISE for kind in kinds): from narwhals.exceptions import ComputeError msg = ( "Group by is not supported with keys that are not elementwise expressions" ) raise ComputeError(msg) return GroupBy(self, expr_flat_keys, drop_null_keys=drop_null_keys) def sort( self, by: str | Iterable[str], *more_by: str, descending: bool | Sequence[bool] = False, nulls_last: bool = False, ) -> Self: r"""Sort the dataframe by the given columns. Arguments: by: Column(s) names to sort by. *more_by: Additional columns to sort by, specified as positional arguments. descending: Sort in descending order. When sorting by multiple columns, can be specified per column by passing a sequence of booleans. nulls_last: Place null values last. Returns: The sorted dataframe. Note: Unlike Polars, it is not possible to specify a sequence of booleans for `nulls_last` in order to control per-column behaviour. Instead a single boolean is applied for all `by` columns. Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame( ... {"foo": [2, 1], "bar": [6.0, 7.0], "ham": ["a", "b"]} ... ) >>> nw.from_native(df_native).sort("foo") ┌──────────────────┐ |Narwhals DataFrame| |------------------| | foo bar ham | | 1 1 7.0 b | | 0 2 6.0 a | └──────────────────┘ """ return super().sort(by, *more_by, descending=descending, nulls_last=nulls_last) def join( self, other: Self, on: str | list[str] | None = None, how: JoinStrategy = "inner", *, left_on: str | list[str] | None = None, right_on: str | list[str] | None = None, suffix: str = "_right", ) -> Self: r"""Join in SQL-like fashion. Arguments: other: DataFrame to join with. on: Name(s) of the join columns in both DataFrames. If set, `left_on` and `right_on` should be None. how: Join strategy. * *inner*: Returns rows that have matching values in both tables. * *left*: Returns all rows from the left table, and the matched rows from the right table. * *full*: Returns all rows in both dataframes, with the suffix appended to the right join keys. * *cross*: Returns the Cartesian product of rows from both tables. * *semi*: Filter rows that have a match in the right table. * *anti*: Filter rows that do not have a match in the right table. left_on: Join column of the left DataFrame. right_on: Join column of the right DataFrame. suffix: Suffix to append to columns with a duplicate name. Returns: A new joined DataFrame Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_1_native = pd.DataFrame({"id": ["a", "b"], "price": [6.0, 7.0]}) >>> df_2_native = pd.DataFrame({"id": ["a", "b", "c"], "qty": [1, 2, 3]}) >>> nw.from_native(df_1_native).join(nw.from_native(df_2_native), on="id") ┌──────────────────┐ |Narwhals DataFrame| |------------------| | id price qty | | 0 a 6.0 1 | | 1 b 7.0 2 | └──────────────────┘ """ return super().join( other, how=how, left_on=left_on, right_on=right_on, on=on, suffix=suffix ) def join_asof( self, other: Self, *, left_on: str | None = None, right_on: str | None = None, on: str | None = None, by_left: str | list[str] | None = None, by_right: str | list[str] | None = None, by: str | list[str] | None = None, strategy: AsofJoinStrategy = "backward", suffix: str = "_right", ) -> Self: """Perform an asof join. This is similar to a left-join except that we match on nearest key rather than equal keys. For Polars, both DataFrames must be sorted by the `on` key (within each `by` group if specified). Arguments: other: DataFrame to join with. left_on: Name(s) of the left join column(s). right_on: Name(s) of the right join column(s). on: Join column of both DataFrames. If set, left_on and right_on should be None. by_left: join on these columns before doing asof join. by_right: join on these columns before doing asof join. by: join on these columns before doing asof join. strategy: Join strategy. The default is "backward". suffix: Suffix to append to columns with a duplicate name. * *backward*: selects the last row in the right DataFrame whose "on" key is less than or equal to the left's key. * *forward*: selects the first row in the right DataFrame whose "on" key is greater than or equal to the left's key. * *nearest*: search selects the last row in the right DataFrame whose value is nearest to the left's key. Returns: A new joined DataFrame Examples: >>> from datetime import datetime >>> import pandas as pd >>> import narwhals as nw >>> data_gdp = { ... "datetime": [ ... datetime(2016, 1, 1), ... datetime(2017, 1, 1), ... datetime(2018, 1, 1), ... datetime(2019, 1, 1), ... datetime(2020, 1, 1), ... ], ... "gdp": [4164, 4411, 4566, 4696, 4827], ... } >>> data_population = { ... "datetime": [ ... datetime(2016, 3, 1), ... datetime(2018, 8, 1), ... datetime(2019, 1, 1), ... ], ... "population": [82.19, 82.66, 83.12], ... } >>> gdp_native = pd.DataFrame(data_gdp) >>> population_native = pd.DataFrame(data_population) >>> gdp = nw.from_native(gdp_native) >>> population = nw.from_native(population_native) >>> population.join_asof(gdp, on="datetime", strategy="backward") ┌──────────────────────────────┐ | Narwhals DataFrame | |------------------------------| | datetime population gdp| |0 2016-03-01 82.19 4164| |1 2018-08-01 82.66 4566| |2 2019-01-01 83.12 4696| └──────────────────────────────┘ """ return super().join_asof( other, left_on=left_on, right_on=right_on, on=on, by_left=by_left, by_right=by_right, by=by, strategy=strategy, suffix=suffix, ) # --- descriptive --- def is_duplicated(self) -> Series[Any]: r"""Get a mask of all duplicated rows in this DataFrame. Returns: A new Series. Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame({"foo": [2, 2, 2], "bar": [6.0, 6.0, 7.0]}) >>> nw.from_native(df_native).is_duplicated() ┌───────────────┐ |Narwhals Series| |---------------| | 0 True | | 1 True | | 2 False | | dtype: bool | └───────────────┘ """ return ~self.is_unique() def is_empty(self) -> bool: r"""Check if the dataframe is empty. Returns: A boolean indicating whether the dataframe is empty (True) or not (False). Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame({"foo": [2, 2, 2], "bar": [6.0, 6.0, 7.0]}) >>> nw.from_native(df_native).is_empty() False """ return len(self) == 0 def is_unique(self) -> Series[Any]: r"""Get a mask of all unique rows in this DataFrame. Returns: A new Series. Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame({"foo": [2, 2, 2], "bar": [6.0, 6.0, 7.0]}) >>> nw.from_native(df_native).is_unique() ┌───────────────┐ |Narwhals Series| |---------------| | 0 False | | 1 False | | 2 True | | dtype: bool | └───────────────┘ """ return self._series(self._compliant_frame.is_unique(), level=self._level) def null_count(self) -> Self: r"""Create a new DataFrame that shows the null counts per column. Returns: A dataframe of shape (1, n_columns). Notes: pandas handles null values differently from Polars and PyArrow. See [null_handling](../concepts/null_handling.md/) for reference. Examples: >>> import pyarrow as pa >>> import narwhals as nw >>> df_native = pa.table({"foo": [1, None], "bar": [2, 3]}) >>> nw.from_native(df_native).null_count() ┌──────────────────┐ |Narwhals DataFrame| |------------------| | pyarrow.Table | | foo: int64 | | bar: int64 | | ---- | | foo: [[1]] | | bar: [[0]] | └──────────────────┘ """ plx = self._compliant_frame.__narwhals_namespace__() result = self._compliant_frame.select(plx.all().null_count()) return self._with_compliant(result) def item(self, row: int | None = None, column: int | str | None = None) -> Any: r"""Return the DataFrame as a scalar, or return the element at the given row/column. Arguments: row: The *n*-th row. column: The column selected via an integer or a string (column name). Returns: A scalar or the specified element in the dataframe. Notes: If row/col not provided, this is equivalent to df[0,0], with a check that the shape is (1,1). With row/col, this is equivalent to df[row,col]. Examples: >>> import pyarrow as pa >>> import narwhals as nw >>> df_native = pa.table({"foo": [1, None], "bar": [2, 3]}) >>> nw.from_native(df_native).item(0, 1) 2 """ return self._compliant_frame.item(row=row, column=column) def clone(self) -> Self: r"""Create a copy of this DataFrame. Returns: An identical copy of the original dataframe. """ return self._with_compliant(self._compliant_frame.clone()) def gather_every(self, n: int, offset: int = 0) -> Self: r"""Take every nth row in the DataFrame and return as a new DataFrame. Arguments: n: Gather every *n*-th row. offset: Starting index. Returns: The dataframe containing only the selected rows. Examples: >>> import pyarrow as pa >>> import narwhals as nw >>> df_native = pa.table({"foo": [1, None, 2, 3]}) >>> nw.from_native(df_native).gather_every(2) ┌──────────────────┐ |Narwhals DataFrame| |------------------| | pyarrow.Table | | foo: int64 | | ---- | | foo: [[1,2]] | └──────────────────┘ """ return super().gather_every(n=n, offset=offset) def pivot( self, on: str | list[str], *, index: str | list[str] | None = None, values: str | list[str] | None = None, aggregate_function: PivotAgg | None = None, maintain_order: bool | None = None, sort_columns: bool = False, separator: str = "_", ) -> Self: r"""Create a spreadsheet-style pivot table as a DataFrame. Arguments: on: Name of the column(s) whose values will be used as the header of the output DataFrame. index: One or multiple keys to group by. If None, all remaining columns not specified on `on` and `values` will be used. At least one of `index` and `values` must be specified. values: One or multiple keys to group by. If None, all remaining columns not specified on `on` and `index` will be used. At least one of `index` and `values` must be specified. aggregate_function: Choose from - None: no aggregation takes place, will raise error if multiple values are in group. - A predefined aggregate function string, one of {'min', 'max', 'first', 'last', 'sum', 'mean', 'median', 'len'} maintain_order: Has no effect and is kept around only for backwards-compatibility. sort_columns: Sort the transposed columns by name. Default is by order of discovery. separator: Used as separator/delimiter in generated column names in case of multiple `values` columns. Returns: A new dataframe. Examples: >>> import pandas as pd >>> import narwhals as nw >>> data = { ... "ix": [1, 1, 2, 2, 1, 2], ... "col": ["a", "a", "a", "a", "b", "b"], ... "foo": [0, 1, 2, 2, 7, 1], ... "bar": [0, 2, 0, 0, 9, 4], ... } >>> df_native = pd.DataFrame(data) >>> nw.from_native(df_native).pivot( ... "col", index="ix", aggregate_function="sum" ... ) ┌─────────────────────────────────┐ | Narwhals DataFrame | |---------------------------------| | ix foo_a foo_b bar_a bar_b| |0 1 1 7 2 9| |1 2 4 1 0 4| └─────────────────────────────────┘ """ if values is None and index is None: msg = "At least one of `values` and `index` must be passed" raise ValueError(msg) if maintain_order is not None: msg = ( "`maintain_order` has no effect and is only kept around for backwards-compatibility. " "You can safely remove this argument." ) issue_warning(msg, UserWarning) on = [on] if isinstance(on, str) else on values = [values] if isinstance(values, str) else values index = [index] if isinstance(index, str) else index return self._with_compliant( self._compliant_frame.pivot( on=on, index=index, values=values, aggregate_function=aggregate_function, sort_columns=sort_columns, separator=separator, ) ) def to_arrow(self) -> pa.Table: r"""Convert to arrow table. Returns: A new PyArrow table. Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame({"foo": [1, None], "bar": [2, 3]}) >>> nw.from_native(df_native).to_arrow() pyarrow.Table foo: double bar: int64 ---- foo: [[1,null]] bar: [[2,3]] """ return self._compliant_frame.to_arrow() def sample( self, n: int | None = None, *, fraction: float | None = None, with_replacement: bool = False, seed: int | None = None, ) -> Self: r"""Sample from this DataFrame. Arguments: n: Number of items to return. Cannot be used with fraction. fraction: Fraction of items to return. Cannot be used with n. with_replacement: Allow values to be sampled more than once. seed: Seed for the random number generator. If set to None (default), a random seed is generated for each sample operation. Returns: A new dataframe. Notes: The results may not be consistent across libraries. Examples: >>> import pandas as pd >>> import narwhals as nw >>> df_native = pd.DataFrame({"foo": [1, 2, 3], "bar": [19, 32, 4]}) >>> nw.from_native(df_native).sample(n=2) # doctest:+SKIP ┌──────────────────┐ |Narwhals DataFrame| |------------------| | foo bar | | 2 3 4 | | 1 2 32 | └──────────────────┘ """ return self._with_compliant( self._compliant_frame.sample( n=n, fraction=fraction, with_replacement=with_replacement, seed=seed ) ) def unpivot( self, on: str | list[str] | None = None, *, index: str | list[str] | None = None, variable_name: str = "variable", value_name: str = "value", ) -> Self: r"""Unpivot a DataFrame from wide to long format. Optionally leaves identifiers set. This function is useful to massage a DataFrame into a format where one or more columns are identifier variables (index) while all other columns, considered measured variables (on), are "unpivoted" to the row axis leaving just two non-identifier columns, 'variable' and 'value'. Arguments: on: Column(s) to use as values variables; if `on` is empty all columns that are not in `index` will be used. index: Column(s) to use as identifier variables. variable_name: Name to give to the `variable` column. Defaults to "variable". value_name: Name to give to the `value` column. Defaults to "value". Returns: The unpivoted dataframe. Notes: If you're coming from pandas, this is similar to `pandas.DataFrame.melt`, but with `index` replacing `id_vars` and `on` replacing `value_vars`. In other frameworks, you might know this operation as `pivot_longer`. Examples: >>> import pandas as pd >>> import narwhals as nw >>> data = {"a": ["x", "y", "z"], "b": [1, 3, 5], "c": [2, 4, 6]} >>> df_native = pd.DataFrame(data) >>> nw.from_native(df_native).unpivot(["b", "c"], index="a") ┌────────────────────┐ | Narwhals DataFrame | |--------------------| | a variable value| |0 x b 1| |1 y b 3| |2 z b 5| |3 x c 2| |4 y c 4| |5 z c 6| └────────────────────┘ """ return super().unpivot( on=on, index=index, variable_name=variable_name, value_name=value_name ) def explode(self, columns: str | Sequence[str], *more_columns: str) -> Self: """Explode the dataframe to long format by exploding the given columns. Notes: It is possible to explode multiple columns only if these columns must have matching element counts. Arguments: columns: Column names. The underlying columns being exploded must be of the `List` data type. *more_columns: Additional names of columns to explode, specified as positional arguments. Returns: New DataFrame Examples: >>> import polars as pl >>> import narwhals as nw >>> data = {"a": ["x", "y"], "b": [[1, 2], [3]]} >>> df_native = pl.DataFrame(data) >>> nw.from_native(df_native).explode("b").to_native() shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ str ┆ i64 │ ╞═════╪═════╡ │ x ┆ 1 │ │ x ┆ 2 │ │ y ┆ 3 │ └─────┴─────┘ """ return super().explode(columns, *more_columns) class LazyFrame(BaseFrame[FrameT]): """Narwhals LazyFrame, backed by a native lazyframe. Warning: This class is not meant to be instantiated directly - instead use [`narwhals.from_native`][] with a native object that is a lazy dataframe from one of the supported backend (e.g. polars.LazyFrame, dask_expr._collection.DataFrame): ```py narwhals.from_native(native_lazyframe) ``` """ def _extract_compliant(self, arg: Any) -> Any: from narwhals.expr import Expr from narwhals.series import Series if isinstance(arg, BaseFrame): return arg._compliant_frame if isinstance(arg, Series): # pragma: no cover msg = "Binary operations between Series and LazyFrame are not supported." raise TypeError(msg) if isinstance(arg, str): # pragma: no cover plx = self.__narwhals_namespace__() return plx.col(arg) if isinstance(arg, Expr): if arg._metadata.n_orderable_ops: msg = ( "Order-dependent expressions are not supported for use in LazyFrame.\n\n" "Hint: To make the expression valid, use `.over` with `order_by` specified.\n\n" "For example, if you wrote `nw.col('price').cum_sum()` and you have a column\n" "`'date'` which orders your data, then replace:\n\n" " nw.col('price').cum_sum()\n\n" " with:\n\n" " nw.col('price').cum_sum().over(order_by='date')\n" " ^^^^^^^^^^^^^^^^^^^^^^\n\n" "See https://narwhals-dev.github.io/narwhals/concepts/order_dependence/." ) raise InvalidOperationError(msg) if arg._metadata.is_filtration: msg = ( "Length-changing expressions are not supported for use in LazyFrame, unless\n" "followed by an aggregation.\n\n" "Hints:\n" "- Instead of `lf.select(nw.col('a').head())`, use `lf.select('a').head()\n" "- Instead of `lf.select(nw.col('a').drop_nulls()).select(nw.sum('a'))`,\n" " use `lf.select(nw.col('a').drop_nulls().sum())\n" ) raise InvalidOperationError(msg) return arg._to_compliant_expr(self.__narwhals_namespace__()) if get_polars() is not None and "polars" in str(type(arg)): # pragma: no cover msg = ( f"Expected Narwhals object, got: {type(arg)}.\n\n" "Perhaps you:\n" "- Forgot a `nw.from_native` somewhere?\n" "- Used `pl.col` instead of `nw.col`?" ) raise TypeError(msg) raise InvalidIntoExprError.from_invalid_type(type(arg)) # pragma: no cover @property def _dataframe(self) -> type[DataFrame[Any]]: return DataFrame def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) -> None: self._level = level self._compliant_frame: CompliantLazyFrame[Any, FrameT, Self] # type: ignore[type-var] if is_compliant_lazyframe(df): self._compliant_frame = df.__narwhals_lazyframe__() else: # pragma: no cover msg = f"Expected Polars LazyFrame or an object that implements `__narwhals_lazyframe__`, got: {type(df)}" raise AssertionError(msg) def __repr__(self) -> str: # pragma: no cover return generate_repr("Narwhals LazyFrame", self.to_native().__repr__()) @property def implementation(self) -> Implementation: """Return implementation of native frame. This can be useful when you need to use special-casing for features outside of Narwhals' scope - for example, when dealing with pandas' Period Dtype. Returns: Implementation. Examples: >>> import narwhals as nw >>> import dask.dataframe as dd >>> lf_native = dd.from_dict({"a": [1, 2]}, npartitions=1) >>> nw.from_native(lf_native).implementation """ return self._compliant_frame._implementation def __getitem__(self, item: str | slice) -> NoReturn: msg = "Slicing is not supported on LazyFrame" raise TypeError(msg) def collect( self, backend: ModuleType | Implementation | str | None = None, **kwargs: Any ) -> DataFrame[Any]: r"""Materialize this LazyFrame into a DataFrame. As each underlying lazyframe has different arguments to set when materializing the lazyframe into a dataframe, we allow to pass them as kwargs (see examples below for how to generalize the specification). Arguments: backend: specifies which eager backend collect to. This will be the underlying backend for the resulting Narwhals DataFrame. If None, then the following default conversions will be applied - `polars.LazyFrame` -> `polars.DataFrame` - `dask.DataFrame` -> `pandas.DataFrame` - `duckdb.PyRelation` -> `pyarrow.Table` - `pyspark.DataFrame` -> `pyarrow.Table` `backend` can be specified in various ways - As `Implementation.` with `BACKEND` being `PANDAS`, `PYARROW` or `POLARS`. - As a string: `"pandas"`, `"pyarrow"` or `"polars"` - Directly as a module `pandas`, `pyarrow` or `polars`. kwargs: backend specific kwargs to pass along. To know more please check the backend specific documentation - [polars.LazyFrame.collect](https://docs.pola.rs/api/python/dev/reference/lazyframe/api/polars.LazyFrame.collect.html) - [dask.dataframe.DataFrame.compute](https://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.compute.html) Returns: DataFrame Examples: >>> import duckdb >>> import narwhals as nw >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 2), (3, 4) df(a, b)") >>> lf = nw.from_native(lf_native) >>> lf ┌──────────────────┐ |Narwhals LazyFrame| |------------------| |┌───────┬───────┐ | |│ a │ b │ | |│ int32 │ int32 │ | |├───────┼───────┤ | |│ 1 │ 2 │ | |│ 3 │ 4 │ | |└───────┴───────┘ | └──────────────────┘ >>> lf.collect() ┌──────────────────┐ |Narwhals DataFrame| |------------------| | pyarrow.Table | | a: int32 | | b: int32 | | ---- | | a: [[1,3]] | | b: [[2,4]] | └──────────────────┘ """ eager_backend = None if backend is None else Implementation.from_backend(backend) supported_eager_backends = ( Implementation.POLARS, Implementation.PANDAS, Implementation.PYARROW, ) if eager_backend is not None and eager_backend not in supported_eager_backends: msg = f"Unsupported `backend` value.\nExpected one of {supported_eager_backends} or None, got: {eager_backend}." raise ValueError(msg) return self._dataframe( self._compliant_frame.collect(backend=eager_backend, **kwargs), level="full" ) def to_native(self) -> FrameT: """Convert Narwhals LazyFrame to native one. Returns: Object of class that user started with. Examples: >>> import duckdb >>> import narwhals as nw >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 2), (3, 4) df(a, b)") >>> nw.from_native(lf_native).to_native() ┌───────┬───────┐ │ a │ b │ │ int32 │ int32 │ ├───────┼───────┤ │ 1 │ 2 │ │ 3 │ 4 │ └───────┴───────┘ """ return to_native(narwhals_object=self, pass_through=False) # inherited def pipe( self, function: Callable[Concatenate[Self, PS], R], *args: PS.args, **kwargs: PS.kwargs, ) -> R: """Pipe function call. Arguments: function: Function to apply. args: Positional arguments to pass to function. kwargs: Keyword arguments to pass to function. Returns: The original object with the function applied. Examples: >>> import duckdb >>> import narwhals as nw >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 2), (3, 4) df(a, b)") >>> nw.from_native(lf_native).pipe(lambda x: x.select("a")).to_native() ┌───────┐ │ a │ │ int32 │ ├───────┤ │ 1 │ │ 3 │ └───────┘ """ return super().pipe(function, *args, **kwargs) def drop_nulls(self, subset: str | list[str] | None = None) -> Self: """Drop rows that contain null values. Arguments: subset: Column name(s) for which null values are considered. If set to None (default), use all columns. Returns: The original object with the rows removed that contained the null values. Notes: pandas handles null values differently from Polars and PyArrow. See [null_handling](../concepts/null_handling.md/) for reference. Examples: >>> import duckdb >>> import narwhals as nw >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, NULL), (3, 4) df(a, b)") >>> nw.from_native(lf_native).drop_nulls() ┌──────────────────┐ |Narwhals LazyFrame| |------------------| |┌───────┬───────┐ | |│ a │ b │ | |│ int32 │ int32 │ | |├───────┼───────┤ | |│ 3 │ 4 │ | |└───────┴───────┘ | └──────────────────┘ """ return super().drop_nulls(subset=subset) def with_row_index( self, name: str = "index", *, order_by: str | Sequence[str] ) -> Self: """Insert column which enumerates rows. Arguments: name: The name of the column as a string. The default is "index". order_by: Column(s) to order by when computing the row index. Returns: The original object with the column added. Examples: >>> import duckdb >>> import narwhals as nw >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 5), (2, 4) df(a, b)") >>> nw.from_native(lf_native).with_row_index(order_by="a").sort("a").collect() ┌──────────────────┐ |Narwhals DataFrame| |------------------| | pyarrow.Table | | index: int64 | | a: int32 | | b: int32 | | ---- | | index: [[0,1]] | | a: [[1,2]] | | b: [[5,4]] | └──────────────────┘ >>> nw.from_native(lf_native).with_row_index(order_by="b").sort("a").collect() ┌──────────────────┐ |Narwhals DataFrame| |------------------| | pyarrow.Table | | index: int64 | | a: int32 | | b: int32 | | ---- | | index: [[1,0]] | | a: [[1,2]] | | b: [[5,4]] | └──────────────────┘ """ order_by_ = [order_by] if isinstance(order_by, str) else order_by return self._with_compliant( self._compliant_frame.with_row_index(name, order_by=order_by_) ) @property def schema(self) -> Schema: r"""Get an ordered mapping of column names to their data type. Returns: A Narwhals Schema object that displays the mapping of column names. Examples: >>> import duckdb >>> import narwhals as nw >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 4.5), (3, 2.) df(a, b)") >>> nw.from_native(lf_native).schema # doctest:+SKIP Schema({'a': Int32, 'b': Decimal}) """ if self._compliant_frame._version is not Version.V1: msg = ( "Resolving the schema of a LazyFrame is a potentially expensive operation. " "Use `LazyFrame.collect_schema()` to get the schema without this warning." ) issue_warning(msg, PerformanceWarning) return super().schema def collect_schema(self) -> Schema: r"""Get an ordered mapping of column names to their data type. Returns: A Narwhals Schema object that displays the mapping of column names. Examples: >>> import duckdb >>> import narwhals as nw >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 4.5), (3, 2.) df(a, b)") >>> nw.from_native(lf_native).collect_schema() Schema({'a': Int32, 'b': Decimal}) """ return super().collect_schema() @property def columns(self) -> list[str]: r"""Get column names. Returns: The column names stored in a list. Examples: >>> import duckdb >>> import narwhals as nw >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 4.5), (3, 2.) df(a, b)") >>> nw.from_native(lf_native).columns ['a', 'b'] """ return super().columns def with_columns( self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr ) -> Self: r"""Add columns to this LazyFrame. Added columns will replace existing columns with the same name. Arguments: *exprs: Column(s) to add, specified as positional arguments. Accepts expression input. Strings are parsed as column names, other non-expression inputs are parsed as literals. **named_exprs: Additional columns to add, specified as keyword arguments. The columns will be renamed to the keyword used. Returns: LazyFrame: A new LazyFrame with the columns added. Note: Creating a new LazyFrame using this method does not create a new copy of existing data. Examples: >>> import duckdb >>> import narwhals as nw >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 4.5), (3, 2.) df(a, b)") >>> nw.from_native(lf_native).with_columns(c=nw.col("a") + 1) ┌────────────────────────────────┐ | Narwhals LazyFrame | |--------------------------------| |┌───────┬──────────────┬───────┐| |│ a │ b │ c │| |│ int32 │ decimal(2,1) │ int32 │| |├───────┼──────────────┼───────┤| |│ 1 │ 4.5 │ 2 │| |│ 3 │ 2.0 │ 4 │| |└───────┴──────────────┴───────┘| └────────────────────────────────┘ """ if not exprs and not named_exprs: msg = "At least one expression must be passed to LazyFrame.with_columns" raise ValueError(msg) return super().with_columns(*exprs, **named_exprs) def select( self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr ) -> Self: r"""Select columns from this LazyFrame. Arguments: *exprs: Column(s) to select, specified as positional arguments. Accepts expression input. Strings are parsed as column names. **named_exprs: Additional columns to select, specified as keyword arguments. The columns will be renamed to the keyword used. Returns: The LazyFrame containing only the selected columns. Notes: If you'd like to select a column whose name isn't a string (for example, if you're working with pandas) then you should explicitly use `nw.col` instead of just passing the column name. For example, to select a column named `0` use `df.select(nw.col(0))`, not `df.select(0)`. Examples: >>> import duckdb >>> import narwhals as nw >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 4.5), (3, 2.) df(a, b)") >>> nw.from_native(lf_native).select("a", a_plus_1=nw.col("a") + 1) ┌────────────────────┐ | Narwhals LazyFrame | |--------------------| |┌───────┬──────────┐| |│ a │ a_plus_1 │| |│ int32 │ int32 │| |├───────┼──────────┤| |│ 1 │ 2 │| |│ 3 │ 4 │| |└───────┴──────────┘| └────────────────────┘ """ if not exprs and not named_exprs: msg = "At least one expression must be passed to LazyFrame.select" raise ValueError(msg) return super().select(*exprs, **named_exprs) def rename(self, mapping: dict[str, str]) -> Self: r"""Rename column names. Arguments: mapping: Key value pairs that map from old name to new name, or a function that takes the old name as input and returns the new name. Returns: The LazyFrame with the specified columns renamed. Examples: >>> import duckdb >>> import narwhals as nw >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 4.5), (3, 2.) df(a, b)") >>> nw.from_native(lf_native).rename({"a": "c"}) ┌────────────────────────┐ | Narwhals LazyFrame | |------------------------| |┌───────┬──────────────┐| |│ c │ b │| |│ int32 │ decimal(2,1) │| |├───────┼──────────────┤| |│ 1 │ 4.5 │| |│ 3 │ 2.0 │| |└───────┴──────────────┘| └────────────────────────┘ """ return super().rename(mapping) def head(self, n: int = 5) -> Self: r"""Get `n` rows. Arguments: n: Number of rows to return. Returns: A subset of the LazyFrame of shape (n, n_columns). Examples: >>> import dask.dataframe as dd >>> import narwhals as nw >>> lf_native = dd.from_dict({"a": [1, 2, 3], "b": [4, 5, 6]}, npartitions=1) >>> nw.from_native(lf_native).head(2).collect() ┌──────────────────┐ |Narwhals DataFrame| |------------------| | a b | | 0 1 4 | | 1 2 5 | └──────────────────┘ """ return super().head(n) def drop(self, *columns: str | Iterable[str], strict: bool = True) -> Self: r"""Remove columns from the LazyFrame. Arguments: *columns: Names of the columns that should be removed from the dataframe. strict: Validate that all column names exist in the schema and throw an exception if a column name does not exist in the schema. Returns: The LazyFrame with the specified columns removed. Warning: `strict` argument is ignored for `polars<1.0.0`. Please consider upgrading to a newer version or pass to eager mode. Examples: >>> import duckdb >>> import narwhals as nw >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 2), (3, 4) df(a, b)") >>> nw.from_native(lf_native).drop("a").to_native() ┌───────┐ │ b │ │ int32 │ ├───────┤ │ 2 │ │ 4 │ └───────┘ """ return super().drop(*flatten(columns), strict=strict) def unique( self, subset: str | list[str] | None = None, *, keep: LazyUniqueKeepStrategy = "any", ) -> Self: """Drop duplicate rows from this LazyFrame. Arguments: subset: Column name(s) to consider when identifying duplicate rows. If set to `None`, use all columns. keep: {'any', 'none'} Which of the duplicate rows to keep. * 'any': Does not give any guarantee of which row is kept. * 'none': Don't keep duplicate rows. Returns: The LazyFrame with unique rows. Examples: >>> import duckdb >>> import narwhals as nw >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 1), (3, 4) df(a, b)") >>> nw.from_native(lf_native).unique("a").sort("a", descending=True) ┌──────────────────┐ |Narwhals LazyFrame| |------------------| |┌───────┬───────┐ | |│ a │ b │ | |│ int32 │ int32 │ | |├───────┼───────┤ | |│ 3 │ 4 │ | |│ 1 │ 1 │ | |└───────┴───────┘ | └──────────────────┘ """ if keep not in {"any", "none"}: msg = ( "narwhals.LazyFrame makes no assumptions about row order, so only " f"'any' and 'none' are supported for `keep` in `unique`. Got: {keep}." ) raise ValueError(msg) if isinstance(subset, str): subset = [subset] return self._with_compliant( self._compliant_frame.unique(subset=subset, keep=keep) ) def filter( self, *predicates: IntoExpr | Iterable[IntoExpr] | list[bool], **constraints: Any ) -> Self: r"""Filter the rows in the LazyFrame based on a predicate expression. The original order of the remaining rows is preserved. Arguments: *predicates: Expression that evaluates to a boolean Series. Can also be a (single!) boolean list. **constraints: Column filters; use `name = value` to filter columns by the supplied value. Each constraint will behave the same as `nw.col(name).eq(value)`, and will be implicitly joined with the other filter conditions using &. Returns: The filtered LazyFrame. Examples: >>> import duckdb >>> import narwhals as nw >>> df_native = duckdb.sql(''' ... SELECT * FROM VALUES ... (1, 6, 'a'), ... (2, 7, 'b'), ... (3, 8, 'c') ... df(foo, bar, ham) ... ''') Filter on one condition >>> nw.from_native(df_native).filter(nw.col("foo") > 1).to_native() ┌───────┬───────┬─────────┐ │ foo │ bar │ ham │ │ int32 │ int32 │ varchar │ ├───────┼───────┼─────────┤ │ 2 │ 7 │ b │ │ 3 │ 8 │ c │ └───────┴───────┴─────────┘ Filter on multiple conditions with implicit `&` >>> nw.from_native(df_native).filter( ... nw.col("foo") < 3, nw.col("ham") == "a" ... ).to_native() ┌───────┬───────┬─────────┐ │ foo │ bar │ ham │ │ int32 │ int32 │ varchar │ ├───────┼───────┼─────────┤ │ 1 │ 6 │ a │ └───────┴───────┴─────────┘ Filter on multiple conditions with `|` >>> nw.from_native(df_native).filter( ... (nw.col("foo") == 1) | (nw.col("ham") == "c") ... ).to_native() ┌───────┬───────┬─────────┐ │ foo │ bar │ ham │ │ int32 │ int32 │ varchar │ ├───────┼───────┼─────────┤ │ 1 │ 6 │ a │ │ 3 │ 8 │ c │ └───────┴───────┴─────────┘ Filter using `**kwargs` syntax >>> nw.from_native(df_native).filter(foo=2, ham="b").to_native() ┌───────┬───────┬─────────┐ │ foo │ bar │ ham │ │ int32 │ int32 │ varchar │ ├───────┼───────┼─────────┤ │ 2 │ 7 │ b │ └───────┴───────┴─────────┘ """ if ( len(predicates) == 1 and is_list_of(predicates[0], bool) and not constraints ): # pragma: no cover msg = "`LazyFrame.filter` is not supported with Python boolean masks - use expressions instead." raise TypeError(msg) return super().filter(*predicates, **constraints) def sink_parquet(self, file: str | Path | BytesIO) -> None: """Write LazyFrame to Parquet file. This may allow larger-than-RAM datasets to be written to disk. Arguments: file: String, path object or file-like object to which the dataframe will be written. Returns: None. Examples: >>> import polars as pl >>> import narwhals as nw >>> df_native = pl.LazyFrame({"foo": [1, 2], "bar": [6.0, 7.0]}) >>> df = nw.from_native(df_native) >>> df.sink_parquet("out.parquet") # doctest:+SKIP """ self._compliant_frame.sink_parquet(file) @overload def group_by( self, *keys: IntoExpr | Iterable[IntoExpr], drop_null_keys: Literal[False] = ... ) -> LazyGroupBy[Self]: ... @overload def group_by( self, *keys: str | Iterable[str], drop_null_keys: Literal[True] ) -> LazyGroupBy[Self]: ... def group_by( self, *keys: IntoExpr | Iterable[IntoExpr], drop_null_keys: bool = False ) -> LazyGroupBy[Self]: r"""Start a group by operation. Arguments: *keys: Column(s) to group by. Accepts expression input. Strings are parsed as column names. drop_null_keys: if True, then groups where any key is null won't be included in the result. Returns: Object which can be used to perform aggregations. Examples: >>> import duckdb >>> import narwhals as nw >>> df_native = duckdb.sql( ... "SELECT * FROM VALUES (1, 'a'), (2, 'b'), (3, 'a') df(a, b)" ... ) >>> df = nw.from_native(df_native) >>> df.group_by("b").agg(nw.col("a").sum()).sort("b").to_native() ┌─────────┬────────┐ │ b │ a │ │ varchar │ int128 │ ├─────────┼────────┤ │ a │ 4 │ │ b │ 2 │ └─────────┴────────┘ Expressions are also accepted. >>> df.group_by(nw.col("b").str.len_chars()).agg( ... nw.col("a").sum() ... ).to_native() ┌───────┬────────┐ │ b │ a │ │ int64 │ int128 │ ├───────┼────────┤ │ 1 │ 6 │ └───────┴────────┘ """ from narwhals.group_by import LazyGroupBy flat_keys = flatten(keys) if all(isinstance(key, str) for key in flat_keys): return LazyGroupBy(self, flat_keys, drop_null_keys=drop_null_keys) from narwhals import col from narwhals.expr import Expr key_is_expr = tuple(isinstance(k, Expr) for k in flat_keys) if drop_null_keys and any(key_is_expr): msg = "drop_null_keys cannot be True when keys contains Expr" raise NotImplementedError(msg) _keys = [k if is_expr else col(k) for k, is_expr in zip(flat_keys, key_is_expr)] expr_flat_keys, kinds = self._flatten_and_extract(*_keys) if not all(kind is ExprKind.ELEMENTWISE for kind in kinds): from narwhals.exceptions import ComputeError msg = ( "Group by is not supported with keys that are not elementwise expressions" ) raise ComputeError(msg) return LazyGroupBy(self, expr_flat_keys, drop_null_keys=drop_null_keys) def sort( self, by: str | Iterable[str], *more_by: str, descending: bool | Sequence[bool] = False, nulls_last: bool = False, ) -> Self: r"""Sort the LazyFrame by the given columns. Arguments: by: Column(s) names to sort by. *more_by: Additional columns to sort by, specified as positional arguments. descending: Sort in descending order. When sorting by multiple columns, can be specified per column by passing a sequence of booleans. nulls_last: Place null values last; can specify a single boolean applying to all columns or a sequence of booleans for per-column control. Returns: The sorted LazyFrame. Warning: Unlike Polars, it is not possible to specify a sequence of booleans for `nulls_last` in order to control per-column behaviour. Instead a single boolean is applied for all `by` columns. Examples: >>> import duckdb >>> import narwhals as nw >>> df_native = duckdb.sql( ... "SELECT * FROM VALUES (1, 6.0, 'a'), (2, 5.0, 'c'), (NULL, 4.0, 'b') df(a, b, c)" ... ) >>> df = nw.from_native(df_native) >>> df.sort("a") ┌──────────────────────────────────┐ | Narwhals LazyFrame | |----------------------------------| |┌───────┬──────────────┬─────────┐| |│ a │ b │ c │| |│ int32 │ decimal(2,1) │ varchar │| |├───────┼──────────────┼─────────┤| |│ NULL │ 4.0 │ b │| |│ 1 │ 6.0 │ a │| |│ 2 │ 5.0 │ c │| |└───────┴──────────────┴─────────┘| └──────────────────────────────────┘ """ return super().sort(by, *more_by, descending=descending, nulls_last=nulls_last) def join( self, other: Self, on: str | list[str] | None = None, how: JoinStrategy = "inner", *, left_on: str | list[str] | None = None, right_on: str | list[str] | None = None, suffix: str = "_right", ) -> Self: r"""Add a join operation to the Logical Plan. Arguments: other: Lazy DataFrame to join with. on: Name(s) of the join columns in both DataFrames. If set, `left_on` and `right_on` should be None. how: Join strategy. * *inner*: Returns rows that have matching values in both tables. * *left*: Returns all rows from the left table, and the matched rows from the right table. * *full*: Returns all rows in both dataframes, with the suffix appended to the right join keys. * *cross*: Returns the Cartesian product of rows from both tables. * *semi*: Filter rows that have a match in the right table. * *anti*: Filter rows that do not have a match in the right table. left_on: Join column of the left DataFrame. right_on: Join column of the right DataFrame. suffix: Suffix to append to columns with a duplicate name. Returns: A new joined LazyFrame. Examples: >>> import duckdb >>> import narwhals as nw >>> df_native1 = duckdb.sql( ... "SELECT * FROM VALUES (1, 'a'), (2, 'b') df(a, b)" ... ) >>> df_native2 = duckdb.sql( ... "SELECT * FROM VALUES (1, 'x'), (3, 'y') df(a, c)" ... ) >>> df1 = nw.from_native(df_native1) >>> df2 = nw.from_native(df_native2) >>> df1.join(df2, on="a") ┌─────────────────────────────┐ | Narwhals LazyFrame | |-----------------------------| |┌───────┬─────────┬─────────┐| |│ a │ b │ c │| |│ int32 │ varchar │ varchar │| |├───────┼─────────┼─────────┤| |│ 1 │ a │ x │| |└───────┴─────────┴─────────┘| └─────────────────────────────┘ """ return super().join( other, how=how, left_on=left_on, right_on=right_on, on=on, suffix=suffix ) def join_asof( self, other: Self, *, left_on: str | None = None, right_on: str | None = None, on: str | None = None, by_left: str | list[str] | None = None, by_right: str | list[str] | None = None, by: str | list[str] | None = None, strategy: AsofJoinStrategy = "backward", suffix: str = "_right", ) -> Self: """Perform an asof join. This is similar to a left-join except that we match on nearest key rather than equal keys. For Polars, both DataFrames must be sorted by the `on` key (within each `by` group if specified). Arguments: other: DataFrame to join with. left_on: Name(s) of the left join column(s). right_on: Name(s) of the right join column(s). on: Join column of both DataFrames. If set, left_on and right_on should be None. by_left: join on these columns before doing asof join by_right: join on these columns before doing asof join by: join on these columns before doing asof join strategy: Join strategy. The default is "backward". * *backward*: selects the last row in the right DataFrame whose "on" key is less than or equal to the left's key. * *forward*: selects the first row in the right DataFrame whose "on" key is greater than or equal to the left's key. * *nearest*: search selects the last row in the right DataFrame whose value is nearest to the left's key. suffix: Suffix to append to columns with a duplicate name. Returns: A new joined LazyFrame. Examples: >>> from datetime import datetime >>> import polars as pl >>> import narwhals as nw >>> data_gdp = { ... "datetime": [ ... datetime(2016, 1, 1), ... datetime(2017, 1, 1), ... datetime(2018, 1, 1), ... datetime(2019, 1, 1), ... datetime(2020, 1, 1), ... ], ... "gdp": [4164, 4411, 4566, 4696, 4827], ... } >>> data_population = { ... "datetime": [ ... datetime(2016, 3, 1), ... datetime(2018, 8, 1), ... datetime(2019, 1, 1), ... ], ... "population": [82.19, 82.66, 83.12], ... } >>> gdp_native = pl.DataFrame(data_gdp) >>> population_native = pl.DataFrame(data_population) >>> gdp = nw.from_native(gdp_native) >>> population = nw.from_native(population_native) >>> population.join_asof(gdp, on="datetime", strategy="backward").to_native() shape: (3, 3) ┌─────────────────────┬────────────┬──────┐ │ datetime ┆ population ┆ gdp │ │ --- ┆ --- ┆ --- │ │ datetime[μs] ┆ f64 ┆ i64 │ ╞═════════════════════╪════════════╪══════╡ │ 2016-03-01 00:00:00 ┆ 82.19 ┆ 4164 │ │ 2018-08-01 00:00:00 ┆ 82.66 ┆ 4566 │ │ 2019-01-01 00:00:00 ┆ 83.12 ┆ 4696 │ └─────────────────────┴────────────┴──────┘ """ return super().join_asof( other, left_on=left_on, right_on=right_on, on=on, by_left=by_left, by_right=by_right, by=by, strategy=strategy, suffix=suffix, ) def lazy(self) -> Self: """Restrict available API methods to lazy-only ones. This is a no-op, and exists only for compatibility with `DataFrame.lazy`. Returns: A LazyFrame. """ return self def unpivot( self, on: str | list[str] | None = None, *, index: str | list[str] | None = None, variable_name: str = "variable", value_name: str = "value", ) -> Self: r"""Unpivot a DataFrame from wide to long format. Optionally leaves identifiers set. This function is useful to massage a DataFrame into a format where one or more columns are identifier variables (index) while all other columns, considered measured variables (on), are "unpivoted" to the row axis leaving just two non-identifier columns, 'variable' and 'value'. Arguments: on: Column(s) to use as values variables; if `on` is empty all columns that are not in `index` will be used. index: Column(s) to use as identifier variables. variable_name: Name to give to the `variable` column. Defaults to "variable". value_name: Name to give to the `value` column. Defaults to "value". Returns: The unpivoted LazyFrame. Notes: If you're coming from pandas, this is similar to `pandas.DataFrame.melt`, but with `index` replacing `id_vars` and `on` replacing `value_vars`. In other frameworks, you might know this operation as `pivot_longer`. Examples: >>> import duckdb >>> import narwhals as nw >>> df_native = duckdb.sql( ... "SELECT * FROM VALUES ('x', 1, 2), ('y', 3, 4), ('z', 5, 6) df(a, b, c)" ... ) >>> df = nw.from_native(df_native) >>> df.unpivot(on=["b", "c"], index="a").sort("a", "variable").to_native() ┌─────────┬──────────┬───────┐ │ a │ variable │ value │ │ varchar │ varchar │ int32 │ ├─────────┼──────────┼───────┤ │ x │ b │ 1 │ │ x │ c │ 2 │ │ y │ b │ 3 │ │ y │ c │ 4 │ │ z │ b │ 5 │ │ z │ c │ 6 │ └─────────┴──────────┴───────┘ """ return super().unpivot( on=on, index=index, variable_name=variable_name, value_name=value_name ) def explode(self, columns: str | Sequence[str], *more_columns: str) -> Self: """Explode the dataframe to long format by exploding the given columns. Notes: It is possible to explode multiple columns only if these columns have matching element counts. Arguments: columns: Column names. The underlying columns being exploded must be of the `List` data type. *more_columns: Additional names of columns to explode, specified as positional arguments. Returns: New LazyFrame Examples: >>> import duckdb >>> import narwhals as nw >>> df_native = duckdb.sql( ... "SELECT * FROM VALUES ('x', [1, 2]), ('y', [3, 4]), ('z', [5, 6]) df(a, b)" ... ) >>> df = nw.from_native(df_native) >>> df.explode("b").to_native() ┌─────────┬───────┐ │ a │ b │ │ varchar │ int32 │ ├─────────┼───────┤ │ x │ 1 │ │ x │ 2 │ │ y │ 3 │ │ y │ 4 │ │ z │ 5 │ │ z │ 6 │ └─────────┴───────┘ """ return super().explode(columns, *more_columns)