from __future__ import annotations
import warnings
from typing import TYPE_CHECKING, Any, Literal, cast
import numpy as np
from narwhals._compliant import EagerSeries, EagerSeriesHist
from narwhals._pandas_like.series_cat import PandasLikeSeriesCatNamespace
from narwhals._pandas_like.series_dt import PandasLikeSeriesDateTimeNamespace
from narwhals._pandas_like.series_list import PandasLikeSeriesListNamespace
from narwhals._pandas_like.series_str import PandasLikeSeriesStringNamespace
from narwhals._pandas_like.series_struct import PandasLikeSeriesStructNamespace
from narwhals._pandas_like.utils import (
align_and_extract_native,
get_dtype_backend,
import_array_module,
narwhals_to_native_dtype,
native_to_narwhals_dtype,
object_native_to_narwhals_dtype,
rename,
select_columns_by_name,
set_index,
)
from narwhals._typing_compat import assert_never
from narwhals._utils import Implementation, is_list_of, parse_version
from narwhals.dependencies import is_numpy_array_1d, is_pandas_like_series
from narwhals.exceptions import InvalidOperationError
if TYPE_CHECKING:
from collections.abc import Hashable, Iterable, Iterator, Mapping, Sequence
from types import ModuleType
import pandas as pd
import polars as pl
import pyarrow as pa
from typing_extensions import Self, TypeAlias, TypeIs
from narwhals._arrow.typing import ChunkedArrayAny
from narwhals._compliant.series import HistData
from narwhals._pandas_like.dataframe import PandasLikeDataFrame
from narwhals._pandas_like.namespace import PandasLikeNamespace
from narwhals._utils import Version, _LimitedContext
from narwhals.dtypes import DType
from narwhals.typing import (
ClosedInterval,
FillNullStrategy,
Into1DArray,
IntoDType,
NonNestedLiteral,
NumericLiteral,
RankMethod,
RollingInterpolationMethod,
SizedMultiIndexSelector,
TemporalLiteral,
_1DArray,
_SliceIndex,
)
PandasHistData: TypeAlias = "HistData[pd.Series[Any], list[float]]"
PANDAS_TO_NUMPY_DTYPE_NO_MISSING = {
"Int64": "int64",
"int64[pyarrow]": "int64",
"Int32": "int32",
"int32[pyarrow]": "int32",
"Int16": "int16",
"int16[pyarrow]": "int16",
"Int8": "int8",
"int8[pyarrow]": "int8",
"UInt64": "uint64",
"uint64[pyarrow]": "uint64",
"UInt32": "uint32",
"uint32[pyarrow]": "uint32",
"UInt16": "uint16",
"uint16[pyarrow]": "uint16",
"UInt8": "uint8",
"uint8[pyarrow]": "uint8",
"Float64": "float64",
"float64[pyarrow]": "float64",
"Float32": "float32",
"float32[pyarrow]": "float32",
}
PANDAS_TO_NUMPY_DTYPE_MISSING = {
"Int64": "float64",
"int64[pyarrow]": "float64",
"Int32": "float64",
"int32[pyarrow]": "float64",
"Int16": "float64",
"int16[pyarrow]": "float64",
"Int8": "float64",
"int8[pyarrow]": "float64",
"UInt64": "float64",
"uint64[pyarrow]": "float64",
"UInt32": "float64",
"uint32[pyarrow]": "float64",
"UInt16": "float64",
"uint16[pyarrow]": "float64",
"UInt8": "float64",
"uint8[pyarrow]": "float64",
"Float64": "float64",
"float64[pyarrow]": "float64",
"Float32": "float32",
"float32[pyarrow]": "float32",
}
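# Illustrative note on how these tables are consumed by `to_numpy` below: a nullable
# "Int64" column with no missing values can be exported as numpy "int64", but once it
# contains missing values it falls back to "float64" so gaps can be represented as NaN,
# e.g. (sketch):
#   >>> pd.Series([1, None], dtype="Int64").to_numpy(dtype="float64", na_value=float("nan"))
#   array([ 1., nan])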
class PandasLikeSeries(EagerSeries[Any]):
def __init__(
self, native_series: Any, *, implementation: Implementation, version: Version
) -> None:
self._name = native_series.name
self._native_series = native_series
self._implementation = implementation
self._version = version
# Flag which indicates if, in the final step before applying an operation,
# the single value behind the PandasLikeSeries should be extracted and treated
# as a Scalar. For example, in `nw.col('a') - nw.lit(3)`, the latter would
# become a Series of length 1. Rather than doing a full broadcast so that it matches
# the length of the whole dataframe, we just extract the scalar.
self._broadcast = False
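# Illustrative sketch: `nw.lit(3)` evaluates to a length-1 series with
# `_broadcast = True`, so helpers such as `align_and_extract_native` can pull out
# the scalar `3` instead of reindexing it against the full-length operand.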
@property
def native(self) -> Any:
return self._native_series
def __native_namespace__(self) -> ModuleType:
if self._implementation.is_pandas_like():
return self._implementation.to_native_namespace()
msg = f"Expected pandas/modin/cudf, got: {type(self._implementation)}" # pragma: no cover
raise AssertionError(msg)
def __narwhals_namespace__(self) -> PandasLikeNamespace:
from narwhals._pandas_like.namespace import PandasLikeNamespace
return PandasLikeNamespace(self._implementation, self._version)
def _gather(self, rows: SizedMultiIndexSelector[pd.Series[Any]]) -> Self:
rows = list(rows) if isinstance(rows, tuple) else rows
return self._with_native(self.native.iloc[rows])
def _gather_slice(self, rows: _SliceIndex | range) -> Self:
return self._with_native(
self.native.iloc[slice(rows.start, rows.stop, rows.step)]
)
def _with_version(self, version: Version) -> Self:
return self.__class__(
self.native, implementation=self._implementation, version=version
)
def _with_native(self, series: Any, *, preserve_broadcast: bool = False) -> Self:
result = self.__class__(
series, implementation=self._implementation, version=self._version
)
if preserve_broadcast:
result._broadcast = self._broadcast
return result
@classmethod
def from_iterable(
cls,
data: Iterable[Any],
*,
context: _LimitedContext,
name: str = "",
dtype: IntoDType | None = None,
index: Any = None,
) -> Self:
implementation = context._implementation
version = context._version
ns = implementation.to_native_namespace()
kwds: dict[str, Any] = {}
if dtype:
kwds["dtype"] = narwhals_to_native_dtype(dtype, None, implementation, version)
else:
if implementation.is_pandas():
kwds["copy"] = False
if index is not None and len(index):
kwds["index"] = index
return cls.from_native(ns.Series(data, name=name, **kwds), context=context)
@staticmethod
def _is_native(obj: Any) -> TypeIs[Any]:
return is_pandas_like_series(obj) # pragma: no cover
@classmethod
def from_native(cls, data: Any, /, *, context: _LimitedContext) -> Self:
return cls(data, implementation=context._implementation, version=context._version)
@classmethod
def from_numpy(cls, data: Into1DArray, /, *, context: _LimitedContext) -> Self:
implementation = context._implementation
arr = data if is_numpy_array_1d(data) else [data]
native = implementation.to_native_namespace().Series(arr, name="")
return cls.from_native(native, context=context)
@classmethod
def _align_full_broadcast(cls, *series: Self) -> Sequence[Self]:
Series = series[0].__native_namespace__().Series # noqa: N806
lengths = [len(s) for s in series]
max_length = max(lengths)
idx = series[lengths.index(max_length)].native.index
reindexed = []
for s in series:
if s._broadcast:
native = Series(
s.native.iloc[0], index=idx, name=s.name, dtype=s.native.dtype
)
compliant = s._with_native(native)
elif s.native.index is not idx:
native = set_index(s.native, idx, implementation=s._implementation)
compliant = s._with_native(native)
else:
compliant = s
reindexed.append(compliant)
return reindexed
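# Illustrative sketch of `_align_full_broadcast` (names hypothetical): combining a
# broadcast length-1 series `lit_3` with a length-3 series `col_a` materialises the
# single value against `col_a`'s index, so every operand shares the same index and
# length before a native binary operation is applied.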
@property
def name(self) -> str:
return self._name
@property
def dtype(self) -> DType:
native_dtype = self.native.dtype
return (
native_to_narwhals_dtype(native_dtype, self._version, self._implementation)
if native_dtype != "object"
else object_native_to_narwhals_dtype(
self.native, self._version, self._implementation
)
)
@property
def _array_funcs(self): # type: ignore[no-untyped-def] # noqa: ANN202
if TYPE_CHECKING:
import numpy as np
return np
else:
return import_array_module(self._implementation)
def ewm_mean(
self,
*,
com: float | None,
span: float | None,
half_life: float | None,
alpha: float | None,
adjust: bool,
min_samples: int,
ignore_nulls: bool,
) -> Self:
ser = self.native
mask_na = ser.isna()
if self._implementation is Implementation.CUDF:
if (min_samples == 0 and not ignore_nulls) or (not mask_na.any()):
result = ser.ewm(
com=com, span=span, halflife=half_life, alpha=alpha, adjust=adjust
).mean()
else:
msg = (
"cuDF only supports `ewm_mean` when there are no missing values "
"or when both `min_period=0` and `ignore_nulls=False`"
)
raise NotImplementedError(msg)
else:
result = ser.ewm(
com, span, half_life, alpha, min_samples, adjust, ignore_na=ignore_nulls
).mean()
result[mask_na] = None
return self._with_native(result)
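# Illustrative note: the `result[mask_na] = None` step above restores nulls at the
# originally-missing positions, so e.g. `ewm_mean(com=1)` over `[1, None, 3]` keeps
# a null in the middle slot rather than a carried-forward estimate.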
def scatter(self, indices: int | Sequence[int], values: Any) -> Self:
if isinstance(values, self.__class__):
values = set_index(
values.native,
self.native.index[indices],
implementation=self._implementation,
)
s = self.native.copy(deep=True)
s.iloc[indices] = values
s.name = self.name
return self._with_native(s)
def _scatter_in_place(self, indices: Self, values: Self) -> None:
implementation = self._implementation
backend_version = self._backend_version
# Scatter, modifying original Series. Use with care!
values_native = set_index(
values.native,
self.native.index[indices.native],
implementation=implementation,
)
if implementation is Implementation.PANDAS and parse_version(np) < (2,):
values_native = values_native.copy() # pragma: no cover
min_pd_version = (1, 2)
if implementation is Implementation.PANDAS and backend_version < min_pd_version:
self.native.iloc[indices.native.values] = values_native # noqa: PD011
else:
self.native.iloc[indices.native] = values_native
def cast(self, dtype: IntoDType) -> Self:
pd_dtype = narwhals_to_native_dtype(
dtype,
dtype_backend=get_dtype_backend(self.native.dtype, self._implementation),
implementation=self._implementation,
version=self._version,
)
return self._with_native(self.native.astype(pd_dtype), preserve_broadcast=True)
def item(self, index: int | None) -> Any:
# cuDF doesn't have Series.item().
if index is None:
if len(self) != 1:
msg = (
"can only call '.item()' if the Series is of length 1,"
f" or an explicit index is provided (Series is of length {len(self)})"
)
raise ValueError(msg)
return self.native.iloc[0]
return self.native.iloc[index]
def to_frame(self) -> PandasLikeDataFrame:
from narwhals._pandas_like.dataframe import PandasLikeDataFrame
return PandasLikeDataFrame(
self.native.to_frame(),
implementation=self._implementation,
version=self._version,
validate_column_names=False,
)
def to_list(self) -> list[Any]:
is_cudf = self._implementation.is_cudf()
return self.native.to_arrow().to_pylist() if is_cudf else self.native.to_list()
def is_between(
self, lower_bound: Any, upper_bound: Any, closed: ClosedInterval
) -> Self:
ser = self.native
_, lower_bound = align_and_extract_native(self, lower_bound)
_, upper_bound = align_and_extract_native(self, upper_bound)
if closed == "left":
res = ser.ge(lower_bound) & ser.lt(upper_bound)
elif closed == "right":
res = ser.gt(lower_bound) & ser.le(upper_bound)
elif closed == "none":
res = ser.gt(lower_bound) & ser.lt(upper_bound)
elif closed == "both":
res = ser.ge(lower_bound) & ser.le(upper_bound)
else:
assert_never(closed)
return self._with_native(res).alias(ser.name)
def is_in(self, other: Any) -> Self:
return self._with_native(self.native.isin(other))
def arg_true(self) -> Self:
ser = self.native
size = len(ser)
data = self._array_funcs.arange(size)
result = ser.__class__(data, name=ser.name, index=ser.index).loc[ser]
return self._with_native(result)
def arg_min(self) -> int:
return self.native.argmin()
def arg_max(self) -> int:
return self.native.argmax()
# Binary comparisons
def filter(self, predicate: Any) -> Self:
if not is_list_of(predicate, bool):
_, other_native = align_and_extract_native(self, predicate)
else:
other_native = predicate
return self._with_native(self.native.loc[other_native]).alias(self.name)
def __eq__(self, other: object) -> Self: # type: ignore[override]
ser, other = align_and_extract_native(self, other)
return self._with_native(ser == other).alias(self.name)
def __ne__(self, other: object) -> Self: # type: ignore[override]
ser, other = align_and_extract_native(self, other)
return self._with_native(ser != other).alias(self.name)
def __ge__(self, other: Any) -> Self:
ser, other = align_and_extract_native(self, other)
return self._with_native(ser >= other).alias(self.name)
def __gt__(self, other: Any) -> Self:
ser, other = align_and_extract_native(self, other)
return self._with_native(ser > other).alias(self.name)
def __le__(self, other: Any) -> Self:
ser, other = align_and_extract_native(self, other)
return self._with_native(ser <= other).alias(self.name)
def __lt__(self, other: Any) -> Self:
ser, other = align_and_extract_native(self, other)
return self._with_native(ser < other).alias(self.name)
def __and__(self, other: Any) -> Self:
ser, other = align_and_extract_native(self, other)
return self._with_native(ser & other).alias(self.name)
def __rand__(self, other: Any) -> Self:
ser, other = align_and_extract_native(self, other)
ser = cast("pd.Series[Any]", ser)
return self._with_native(ser.__and__(other)).alias(self.name)
def __or__(self, other: Any) -> Self:
ser, other = align_and_extract_native(self, other)
return self._with_native(ser | other).alias(self.name)
def __ror__(self, other: Any) -> Self:
ser, other = align_and_extract_native(self, other)
ser = cast("pd.Series[Any]", ser)
return self._with_native(ser.__or__(other)).alias(self.name)
def __add__(self, other: Any) -> Self:
ser, other = align_and_extract_native(self, other)
return self._with_native(ser + other).alias(self.name)
def __radd__(self, other: Any) -> Self:
_, other_native = align_and_extract_native(self, other)
return self._with_native(self.native.__radd__(other_native)).alias(self.name)
def __sub__(self, other: Any) -> Self:
ser, other = align_and_extract_native(self, other)
return self._with_native(ser - other).alias(self.name)
def __rsub__(self, other: Any) -> Self:
_, other_native = align_and_extract_native(self, other)
return self._with_native(self.native.__rsub__(other_native)).alias(self.name)
def __mul__(self, other: Any) -> Self:
ser, other = align_and_extract_native(self, other)
return self._with_native(ser * other).alias(self.name)
def __rmul__(self, other: Any) -> Self:
_, other_native = align_and_extract_native(self, other)
return self._with_native(self.native.__rmul__(other_native)).alias(self.name)
def __truediv__(self, other: Any) -> Self:
ser, other = align_and_extract_native(self, other)
return self._with_native(ser / other).alias(self.name)
def __rtruediv__(self, other: Any) -> Self:
_, other_native = align_and_extract_native(self, other)
return self._with_native(self.native.__rtruediv__(other_native)).alias(self.name)
def __floordiv__(self, other: Any) -> Self:
ser, other = align_and_extract_native(self, other)
return self._with_native(ser // other).alias(self.name)
def __rfloordiv__(self, other: Any) -> Self:
_, other_native = align_and_extract_native(self, other)
return self._with_native(self.native.__rfloordiv__(other_native)).alias(self.name)
def __pow__(self, other: Any) -> Self:
ser, other = align_and_extract_native(self, other)
return self._with_native(ser**other).alias(self.name)
def __rpow__(self, other: Any) -> Self:
_, other_native = align_and_extract_native(self, other)
return self._with_native(self.native.__rpow__(other_native)).alias(self.name)
def __mod__(self, other: Any) -> Self:
ser, other = align_and_extract_native(self, other)
return self._with_native(ser % other).alias(self.name)
def __rmod__(self, other: Any) -> Self:
_, other_native = align_and_extract_native(self, other)
return self._with_native(self.native.__rmod__(other_native)).alias(self.name)
# Unary
def __invert__(self) -> Self:
return self._with_native(~self.native)
# Reductions
def any(self) -> bool:
return self.native.any()
def all(self) -> bool:
return self.native.all()
def min(self) -> Any:
return self.native.min()
def max(self) -> Any:
return self.native.max()
def sum(self) -> float:
return self.native.sum()
def count(self) -> int:
return self.native.count()
def mean(self) -> float:
return self.native.mean()
def median(self) -> float:
if not self.dtype.is_numeric():
msg = "`median` operation not supported for non-numeric input type."
raise InvalidOperationError(msg)
return self.native.median()
def std(self, *, ddof: int) -> float:
return self.native.std(ddof=ddof)
def var(self, *, ddof: int) -> float:
return self.native.var(ddof=ddof)
def skew(self) -> float | None:
ser_not_null = self.native.dropna()
if len(ser_not_null) == 0:
return None
elif len(ser_not_null) == 1:
return float("nan")
elif len(ser_not_null) == 2:
return 0.0
else:
m = ser_not_null - ser_not_null.mean()
m2 = (m**2).mean()
m3 = (m**3).mean()
return m3 / (m2**1.5) if m2 != 0 else float("nan")
def kurtosis(self) -> float | None:
ser_not_null = self.native.dropna()
if len(ser_not_null) == 0:
return None
elif len(ser_not_null) == 1:
return float("nan")
else:
m = ser_not_null - ser_not_null.mean()
m2 = (m**2).mean()
m4 = (m**4).mean()
return m4 / (m2**2) - 3.0 if m2 != 0 else float("nan")
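# Both reductions above are computed from the raw central moments
# m_k = mean((x - mean(x))**k) of the non-null values: skew is m3 / m2**1.5 and
# kurtosis is the excess kurtosis m4 / m2**2 - 3 (Fisher's definition, no bias
# correction).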
def len(self) -> int:
return len(self.native)
# Transformations
def is_null(self) -> Self:
return self._with_native(self.native.isna(), preserve_broadcast=True)
def is_nan(self) -> Self:
ser = self.native
if self.dtype.is_numeric():
return self._with_native(ser != ser, preserve_broadcast=True) # noqa: PLR0124
msg = f"`.is_nan` only supported for numeric dtype and not {self.dtype}, did you mean `.is_null`?"
raise InvalidOperationError(msg)
def fill_null(
self,
value: Self | NonNestedLiteral,
strategy: FillNullStrategy | None,
limit: int | None,
) -> Self:
ser = self.native
kwargs = (
{"downcast": False}
if self._implementation is Implementation.PANDAS
and self._backend_version < (3,)
else {}
)
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore", "The 'downcast' keyword .*is deprecated", category=FutureWarning
)
if value is not None:
_, native_value = align_and_extract_native(self, value)
res_ser = self._with_native(
ser.fillna(value=native_value, **kwargs), preserve_broadcast=True
)
else:
res_ser = self._with_native(
ser.ffill(limit=limit, **kwargs)
if strategy == "forward"
else ser.bfill(limit=limit, **kwargs),
preserve_broadcast=True,
)
return res_ser
def drop_nulls(self) -> Self:
return self._with_native(self.native.dropna())
def n_unique(self) -> int:
return self.native.nunique(dropna=False)
def sample(
self,
n: int | None,
*,
fraction: float | None,
with_replacement: bool,
seed: int | None,
) -> Self:
return self._with_native(
self.native.sample(
n=n, frac=fraction, replace=with_replacement, random_state=seed
)
)
def abs(self) -> Self:
return self._with_native(self.native.abs())
def cum_sum(self, *, reverse: bool) -> Self:
result = (
self.native.cumsum(skipna=True)
if not reverse
else self.native[::-1].cumsum(skipna=True)[::-1]
)
return self._with_native(result)
def unique(self, *, maintain_order: bool = True) -> Self:
"""Pandas always maintains order, as per its docstring.
> Uniques are returned in order of appearance.
"""
return self._with_native(type(self.native)(self.native.unique(), name=self.name))
def diff(self) -> Self:
return self._with_native(self.native.diff())
def shift(self, n: int) -> Self:
return self._with_native(self.native.shift(n))
def replace_strict(
self,
old: Sequence[Any] | Mapping[Any, Any],
new: Sequence[Any],
*,
return_dtype: IntoDType | None,
) -> PandasLikeSeries:
tmp_name = f"{self.name}_tmp"
dtype_backend = get_dtype_backend(self.native.dtype, self._implementation)
dtype = (
narwhals_to_native_dtype(
return_dtype, dtype_backend, self._implementation, self._version
)
if return_dtype
else None
)
namespace = self.__native_namespace__()
other = namespace.DataFrame(
{self.name: old, tmp_name: namespace.Series(new, dtype=dtype)}
)
result = self._with_native(
self.native.to_frame().merge(other, on=self.name, how="left")[tmp_name]
).alias(self.name)
if result.is_null().sum() != self.is_null().sum():
msg = (
"replace_strict did not replace all non-null values.\n\n"
f"The following did not get replaced: {self.filter(~self.is_null() & result.is_null()).unique(maintain_order=False).to_list()}"
)
raise ValueError(msg)
return result
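# Illustrative example (values hypothetical): `replace_strict([1, 2], ["a", "b"])`
# left-joins the series against a lookup frame {name: [1, 2], tmp_name: ["a", "b"]};
# any non-null value absent from `old` surfaces as a new null in the joined column,
# which is what the null-count comparison above detects before raising.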
def sort(self, *, descending: bool, nulls_last: bool) -> PandasLikeSeries:
na_position = "last" if nulls_last else "first"
return self._with_native(
self.native.sort_values(ascending=not descending, na_position=na_position)
).alias(self.name)
def alias(self, name: str | Hashable) -> Self:
if name != self.name:
return self._with_native(
rename(self.native, name, implementation=self._implementation),
preserve_broadcast=True,
)
return self
def __array__(self, dtype: Any, *, copy: bool | None) -> _1DArray:
# pandas used to always return object dtype for nullable dtypes.
# So, we intercept __array__ and pass to `to_numpy` ourselves to make
# sure an appropriate numpy dtype is returned.
return self.to_numpy(dtype=dtype, copy=copy)
def to_numpy(self, dtype: Any = None, *, copy: bool | None = None) -> _1DArray:
# The `copy` default is meant to be None (see the numpy `__array__` protocol below),
# but pandas doesn't allow it, hence the boolean fallback further down.
# https://numpy.org/doc/stable/reference/generated/numpy.ndarray.__array__.html
dtypes = self._version.dtypes
if isinstance(self.dtype, dtypes.Datetime) and self.dtype.time_zone is not None:
s = self.dt.convert_time_zone("UTC").dt.replace_time_zone(None).native
else:
s = self.native
has_missing = s.isna().any()
kwargs: dict[Any, Any] = {"copy": copy or self._implementation.is_cudf()}
if has_missing and str(s.dtype) in PANDAS_TO_NUMPY_DTYPE_MISSING:
kwargs.update({"na_value": float("nan")})
dtype = dtype or PANDAS_TO_NUMPY_DTYPE_MISSING[str(s.dtype)]
if not has_missing and str(s.dtype) in PANDAS_TO_NUMPY_DTYPE_NO_MISSING:
dtype = dtype or PANDAS_TO_NUMPY_DTYPE_NO_MISSING[str(s.dtype)]
return s.to_numpy(dtype=dtype, **kwargs)
def to_pandas(self) -> pd.Series[Any]:
if self._implementation is Implementation.PANDAS:
return self.native
elif self._implementation is Implementation.CUDF: # pragma: no cover
return self.native.to_pandas()
elif self._implementation is Implementation.MODIN:
return self.native._to_pandas()
msg = f"Unknown implementation: {self._implementation}" # pragma: no cover
raise AssertionError(msg)
def to_polars(self) -> pl.Series:
import polars as pl # ignore-banned-import
return pl.from_pandas(self.to_pandas())
# --- descriptive ---
def is_unique(self) -> Self:
return self._with_native(~self.native.duplicated(keep=False)).alias(self.name)
def null_count(self) -> int:
return self.native.isna().sum()
def is_first_distinct(self) -> Self:
return self._with_native(~self.native.duplicated(keep="first")).alias(self.name)
def is_last_distinct(self) -> Self:
return self._with_native(~self.native.duplicated(keep="last")).alias(self.name)
def is_sorted(self, *, descending: bool) -> bool:
if not isinstance(descending, bool):
msg = f"argument 'descending' should be boolean, found {type(descending)}"
raise TypeError(msg)
if descending:
return self.native.is_monotonic_decreasing
else:
return self.native.is_monotonic_increasing
def value_counts(
self, *, sort: bool, parallel: bool, name: str | None, normalize: bool
) -> PandasLikeDataFrame:
"""Parallel is unused, exists for compatibility."""
from narwhals._pandas_like.dataframe import PandasLikeDataFrame
index_name_ = "index" if self._name is None else self._name
value_name_ = name or ("proportion" if normalize else "count")
val_count = self.native.value_counts(
dropna=False, sort=False, normalize=normalize
).reset_index()
val_count.columns = [index_name_, value_name_]
if sort:
val_count = val_count.sort_values(value_name_, ascending=False)
return PandasLikeDataFrame.from_native(val_count, context=self)
def quantile(
self, quantile: float, interpolation: RollingInterpolationMethod
) -> float:
return self.native.quantile(q=quantile, interpolation=interpolation)
def zip_with(self, mask: Any, other: Any) -> Self:
ser = self.native
_, mask = align_and_extract_native(self, mask)
_, other = align_and_extract_native(self, other)
res = ser.where(mask, other)
return self._with_native(res)
def head(self, n: int) -> Self:
return self._with_native(self.native.head(n))
def tail(self, n: int) -> Self:
return self._with_native(self.native.tail(n))
def round(self, decimals: int) -> Self:
return self._with_native(self.native.round(decimals=decimals))
def to_dummies(self, *, separator: str, drop_first: bool) -> PandasLikeDataFrame:
from narwhals._pandas_like.dataframe import PandasLikeDataFrame
plx = self.__native_namespace__()
series = self.native
name = str(self._name) if self._name else ""
null_col_pl = f"{name}{separator}null"
has_nulls = series.isna().any()
result = plx.get_dummies(
series,
prefix=name,
prefix_sep=separator,
drop_first=drop_first,
# Adds a null-indicator column at the end if the series has any missing values.
dummy_na=has_nulls,
dtype="int8",
)
if has_nulls:
*cols, null_col_pd = list(result.columns)
output_order = [null_col_pd, *cols]
result = rename(
select_columns_by_name(result, output_order, self._implementation),
columns={null_col_pd: null_col_pl},
implementation=self._implementation,
)
return PandasLikeDataFrame.from_native(result, context=self)
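# Illustrative sketch (values hypothetical): for a series "a" containing ["x", None],
# `get_dummies(..., dummy_na=True)` appends a missing-value indicator as the last
# column; the block above moves it to the front and renames it from pandas' default
# label to f"{name}{separator}null".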
def gather_every(self, n: int, offset: int) -> Self:
return self._with_native(self.native.iloc[offset::n])
def clip(
self,
lower_bound: Self | NumericLiteral | TemporalLiteral | None,
upper_bound: Self | NumericLiteral | TemporalLiteral | None,
) -> Self:
_, lower = (
align_and_extract_native(self, lower_bound)
if lower_bound is not None
else (None, None)
)
_, upper = (
align_and_extract_native(self, upper_bound)
if upper_bound is not None
else (None, None)
)
kwargs = {"axis": 0} if self._implementation is Implementation.MODIN else {}
return self._with_native(self.native.clip(lower, upper, **kwargs))
def to_arrow(self) -> pa.Array[Any]:
if self._implementation is Implementation.CUDF:
return self.native.to_arrow()
import pyarrow as pa # ignore-banned-import()
return pa.Array.from_pandas(self.native)
def mode(self) -> Self:
result = self.native.mode()
result.name = self.name
return self._with_native(result)
def cum_count(self, *, reverse: bool) -> Self:
not_na_series = ~self.native.isna()
result = (
not_na_series.cumsum()
if not reverse
else len(self) - not_na_series.cumsum() + not_na_series - 1
)
return self._with_native(result)
def cum_min(self, *, reverse: bool) -> Self:
result = (
self.native.cummin(skipna=True)
if not reverse
else self.native[::-1].cummin(skipna=True)[::-1]
)
return self._with_native(result)
def cum_max(self, *, reverse: bool) -> Self:
result = (
self.native.cummax(skipna=True)
if not reverse
else self.native[::-1].cummax(skipna=True)[::-1]
)
return self._with_native(result)
def cum_prod(self, *, reverse: bool) -> Self:
result = (
self.native.cumprod(skipna=True)
if not reverse
else self.native[::-1].cumprod(skipna=True)[::-1]
)
return self._with_native(result)
def rolling_sum(self, window_size: int, *, min_samples: int, center: bool) -> Self:
result = self.native.rolling(
window=window_size, min_periods=min_samples, center=center
).sum()
return self._with_native(result)
def rolling_mean(self, window_size: int, *, min_samples: int, center: bool) -> Self:
result = self.native.rolling(
window=window_size, min_periods=min_samples, center=center
).mean()
return self._with_native(result)
def rolling_var(
self, window_size: int, *, min_samples: int, center: bool, ddof: int
) -> Self:
result = self.native.rolling(
window=window_size, min_periods=min_samples, center=center
).var(ddof=ddof)
return self._with_native(result)
def rolling_std(
self, window_size: int, *, min_samples: int, center: bool, ddof: int
) -> Self:
result = self.native.rolling(
window=window_size, min_periods=min_samples, center=center
).std(ddof=ddof)
return self._with_native(result)
def __iter__(self) -> Iterator[Any]:
yield from self.native.__iter__()
def __contains__(self, other: Any) -> bool:
return self.native.isna().any() if other is None else (self.native == other).any()
def is_finite(self) -> Self:
s = self.native
return self._with_native((s > float("-inf")) & (s < float("inf")))
def rank(self, method: RankMethod, *, descending: bool) -> Self:
pd_method = "first" if method == "ordinal" else method
name = self.name
if (
self._implementation is Implementation.PANDAS
and self._backend_version < (3,)
and get_dtype_backend(self.native.dtype, self._implementation)
== "numpy_nullable"
and self.dtype.is_integer()
and (null_mask := self.is_null()).any()
):
# crazy workaround for the case of `na_option="keep"` and nullable
# integer dtypes. This should be supported in pandas > 3.0
# https://github.com/pandas-dev/pandas/issues/56976
mask_name = f"{name}_is_null"
plx = self.__narwhals_namespace__()
df = (
self.to_frame()
.with_columns(plx._expr._from_series(null_mask).alias(mask_name))
.native
)
return self._with_native(
df.groupby(mask_name)
.rank(
method=pd_method,
na_option="keep",
ascending=not descending,
pct=False,
)
.iloc[:, 0]
).alias(self.name)
return self._with_native(
self.native.rank(
method=pd_method, na_option="keep", ascending=not descending, pct=False
)
)
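# Sketch of the workaround above: grouping by the null mask before ranking keeps the
# null rows in their own group, so they still come out as missing under
# `na_option="keep"` while the non-null rows are ranked normally, sidestepping the
# nullable-integer bug linked in the comment.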
def hist_from_bins(
self, bins: list[float], *, include_breakpoint: bool
) -> PandasLikeDataFrame:
return (
_PandasHist.from_series(self, include_breakpoint=include_breakpoint)
.with_bins(bins)
.to_frame()
)
def hist_from_bin_count(
self, bin_count: int, *, include_breakpoint: bool
) -> PandasLikeDataFrame:
return (
_PandasHist.from_series(self, include_breakpoint=include_breakpoint)
.with_bin_count(bin_count)
.to_frame()
)
def log(self, base: float) -> Self:
native = self.native
native_cls = type(native)
implementation = self._implementation
if get_dtype_backend(native.dtype, implementation=implementation) == "pyarrow":
import pyarrow.compute as pc
from narwhals._arrow.utils import native_to_narwhals_dtype
ca = native.array._pa_array
result_arr = cast("ChunkedArrayAny", pc.logb(ca, base))
nw_dtype = native_to_narwhals_dtype(result_arr.type, self._version)
out_dtype = narwhals_to_native_dtype(
nw_dtype, "pyarrow", self._implementation, self._version
)
result_native = native_cls(
result_arr, dtype=out_dtype, index=native.index, name=native.name
)
else:
array_funcs = self._array_funcs
result_arr = array_funcs.log(native) / array_funcs.log(base)
result_native = (
native_cls(result_arr, index=native.index, name=native.name)
if implementation.is_cudf()
else result_arr
)
return self._with_native(result_native)
def exp(self) -> Self:
native = self.native
native_cls = type(native)
implementation = self._implementation
if get_dtype_backend(native.dtype, implementation=implementation) == "pyarrow":
import pyarrow.compute as pc
from narwhals._arrow.utils import native_to_narwhals_dtype
ca = native.array._pa_array
result_arr = cast("ChunkedArrayAny", pc.exp(ca))
nw_dtype = native_to_narwhals_dtype(result_arr.type, self._version)
out_dtype = narwhals_to_native_dtype(
nw_dtype, "pyarrow", self._implementation, self._version
)
result_native = native_cls(
result_arr, dtype=out_dtype, index=native.index, name=native.name
)
else:
result_arr = self._array_funcs.exp(native)
result_native = (
native_cls(result_arr, index=native.index, name=native.name)
if implementation.is_cudf()
else result_arr
)
return self._with_native(result_native)
def sqrt(self) -> Self:
return self._with_native(self.native.pow(0.5))
@property
def str(self) -> PandasLikeSeriesStringNamespace:
return PandasLikeSeriesStringNamespace(self)
@property
def dt(self) -> PandasLikeSeriesDateTimeNamespace:
return PandasLikeSeriesDateTimeNamespace(self)
@property
def cat(self) -> PandasLikeSeriesCatNamespace:
return PandasLikeSeriesCatNamespace(self)
@property
def list(self) -> PandasLikeSeriesListNamespace:
if not hasattr(self.native, "list"):
msg = "Series must be of PyArrow List type to support list namespace."
raise TypeError(msg)
return PandasLikeSeriesListNamespace(self)
@property
def struct(self) -> PandasLikeSeriesStructNamespace:
if not hasattr(self.native, "struct"):
msg = "Series must be of PyArrow Struct type to support struct namespace."
raise TypeError(msg)
return PandasLikeSeriesStructNamespace(self)
class _PandasHist(EagerSeriesHist["pd.Series[Any]", "list[float]"]):
_series: PandasLikeSeries
def to_frame(self) -> PandasLikeDataFrame:
from_native = self._series.__narwhals_namespace__()._dataframe.from_native
DataFrame = self._series.__native_namespace__().DataFrame # noqa: N806
return from_native(DataFrame(self._data), context=self._series)
# NOTE: *Could* be handled at narwhals-level
def is_empty_series(self) -> bool:
return self._series.count() < 1
# NOTE: *Could* be handled at narwhals-level, **iff** we add `nw.repeat`, `nw.linear_space`
# See https://github.com/narwhals-dev/narwhals/pull/2839#discussion_r2215630696
def series_empty(self, arg: int | list[float], /) -> PandasHistData:
count = self._zeros(arg)
if self._breakpoint:
return {"breakpoint": self._calculate_breakpoint(arg), "count": count}
return {"count": count}
def _zeros(self, arg: int | list[float], /) -> _1DArray:
zeros = self._series._array_funcs.zeros
return zeros(arg) if isinstance(arg, int) else zeros(len(arg) - 1)
# NOTE: Based on `pl.Expr.cut`
def _cut(
self,
breaks: list[float] | _1DArray,
*,
labels: Sequence[str] | None = None,
closed: Literal["left", "right"] = "right",
) -> pd.Series[Any]:
# NOTE: Polars 1.27.0 always includes the lowest bin
cut = self._series.__native_namespace__().cut
return cut(
self.native,
bins=breaks,
right=closed == "right",
labels=labels,
include_lowest=True,
)
def _linear_space(
self,
start: float,
end: float,
num_samples: int,
*,
closed: Literal["both", "none"] = "both",
) -> _1DArray:
return self._series._array_funcs.linspace(
start=start, stop=end, num=num_samples, endpoint=closed == "both"
)
def _calculate_bins(self, bin_count: int) -> _1DArray:
"""Prepare bins for histogram calculation from bin_count."""
lower, upper = self.native.min(), self.native.max()
if lower == upper:
lower -= 0.5
upper += 0.5
return self._linear_space(lower, upper, bin_count + 1)
def _calculate_hist(self, bins: list[float] | _1DArray) -> PandasHistData:
# pandas (2.2.*) .value_counts(bins=[...]) adjusts the lowest bin, which should not
# happen since the bins were explicitly passed in.
categories = self._cut(bins)
# modin (0.32.0) .value_counts(...) silently drops bins with empty observations,
# so .reindex is necessary to restore these bins.
count = categories.value_counts(dropna=True, sort=False).reindex(
categories.cat.categories, fill_value=0
)
count.reset_index(drop=True, inplace=True) # noqa: PD002
if self._breakpoint:
return {"breakpoint": bins[1:], "count": count}
return {"count": count}