94 lines
2.6 KiB
Python
94 lines
2.6 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import (
|
|
TYPE_CHECKING,
|
|
Literal,
|
|
)
|
|
|
|
import numpy as np
|
|
|
|
from pandas._config import using_string_dtype
|
|
|
|
from pandas._libs import lib
|
|
from pandas.compat import (
|
|
pa_version_under18p0,
|
|
pa_version_under19p0,
|
|
)
|
|
from pandas.compat._optional import import_optional_dependency
|
|
|
|
import pandas as pd
|
|
|
|
if TYPE_CHECKING:
|
|
from collections.abc import Callable
|
|
|
|
import pyarrow
|
|
|
|
from pandas._typing import DtypeBackend
|
|
|
|
|
|
def _arrow_dtype_mapping() -> dict:
|
|
pa = import_optional_dependency("pyarrow")
|
|
return {
|
|
pa.int8(): pd.Int8Dtype(),
|
|
pa.int16(): pd.Int16Dtype(),
|
|
pa.int32(): pd.Int32Dtype(),
|
|
pa.int64(): pd.Int64Dtype(),
|
|
pa.uint8(): pd.UInt8Dtype(),
|
|
pa.uint16(): pd.UInt16Dtype(),
|
|
pa.uint32(): pd.UInt32Dtype(),
|
|
pa.uint64(): pd.UInt64Dtype(),
|
|
pa.bool_(): pd.BooleanDtype(),
|
|
pa.string(): pd.StringDtype(),
|
|
pa.float32(): pd.Float32Dtype(),
|
|
pa.float64(): pd.Float64Dtype(),
|
|
pa.string(): pd.StringDtype(),
|
|
pa.large_string(): pd.StringDtype(),
|
|
}
|
|
|
|
|
|
def _arrow_string_types_mapper() -> Callable:
|
|
pa = import_optional_dependency("pyarrow")
|
|
|
|
mapping = {
|
|
pa.string(): pd.StringDtype(na_value=np.nan),
|
|
pa.large_string(): pd.StringDtype(na_value=np.nan),
|
|
}
|
|
if not pa_version_under18p0:
|
|
mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan)
|
|
|
|
return mapping.get
|
|
|
|
|
|
def arrow_table_to_pandas(
    table: pyarrow.Table,
    dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default,
    null_to_int64: bool = False,
    to_pandas_kwargs: dict | None = None,
) -> pd.DataFrame:
    """
    Convert a pyarrow Table to a DataFrame, choosing a ``types_mapper``.

    Parameters
    ----------
    table : pyarrow.Table
        The table to convert; its ``to_pandas`` method does the conversion.
    dtype_backend : {"numpy_nullable", "pyarrow", "numpy"} or lib.no_default
        Selects the ``types_mapper`` handed to ``table.to_pandas``:

        * ``"numpy_nullable"``: the ``get`` method of the dict returned by
          ``_arrow_dtype_mapping()``.
        * ``"pyarrow"``: ``pd.ArrowDtype``.
        * otherwise, when ``using_string_dtype()`` is true: the result of
          ``_arrow_string_types_mapper()`` if ``pa_version_under19p0`` is
          true, else ``None``.
        * ``lib.no_default`` or ``"numpy"``: ``None``.
    null_to_int64 : bool, default False
        Only consulted for ``dtype_backend="numpy_nullable"``; when true the
        pyarrow null type is additionally mapped to ``pd.Int64Dtype()``.
    to_pandas_kwargs : dict, optional
        Extra keyword arguments forwarded to ``table.to_pandas``. ``None``
        is treated as an empty dict.

    Returns
    -------
    pd.DataFrame
        Whatever ``table.to_pandas`` returns for the selected mapper.

    Raises
    ------
    NotImplementedError
        If ``dtype_backend`` matches none of the cases above and
        ``using_string_dtype()`` is false.
    """
    extra_kwargs = {} if to_pandas_kwargs is None else to_pandas_kwargs

    pa = import_optional_dependency("pyarrow")

    # None is the mapper for every branch that does not pick an explicit one.
    types_mapper: type[pd.ArrowDtype] | None | Callable = None
    if dtype_backend == "numpy_nullable":
        nullable_lookup = _arrow_dtype_mapping()
        if null_to_int64:
            # Extend the default table so null columns come back as Int64,
            # matching the other engines (only needed by the CSV parser).
            nullable_lookup[pa.null()] = pd.Int64Dtype()
        types_mapper = nullable_lookup.get
    elif dtype_backend == "pyarrow":
        types_mapper = pd.ArrowDtype
    elif using_string_dtype():
        if pa_version_under19p0:
            types_mapper = _arrow_string_types_mapper()
    elif not (dtype_backend is lib.no_default or dtype_backend == "numpy"):
        raise NotImplementedError

    return table.to_pandas(types_mapper=types_mapper, **extra_kwargs)
|