284 lines
8.6 KiB
Cython
284 lines
8.6 KiB
Cython
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||
|
# or more contributor license agreements. See the NOTICE file
|
||
|
# distributed with this work for additional information
|
||
|
# regarding copyright ownership. The ASF licenses this file
|
||
|
# to you under the Apache License, Version 2.0 (the
|
||
|
# "License"); you may not use this file except in compliance
|
||
|
# with the License. You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing,
|
||
|
# software distributed under the License is distributed on an
|
||
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||
|
# KIND, either express or implied. See the License for the
|
||
|
# specific language governing permissions and limitations
|
||
|
# under the License.
|
||
|
|
||
|
# pandas lazy-loading API shim that reduces API call and import overhead
|
||
|
|
||
|
import warnings
|
||
|
from threading import Lock
|
||
|
|
||
|
|
||
|
cdef class _PandasAPIShim(object):
    """
    Lazy pandas importer that isolates usages of pandas APIs and avoids
    importing pandas until it's actually needed.

    The first accessor that needs pandas triggers the import (guarded by a
    lock); the pandas objects used afterwards are cached on the instance.
    Accessors that call ``_check_import()`` with its default ``raise_=True``
    raise ``ImportError`` when pandas is missing or older than 1.0.0, while
    the ``is_*`` type checks built on ``_have_pandas_internal()`` return
    ``False`` instead.
    """
    cdef:
        # True once the first import attempt has finished (success or not).
        bint _tried_importing_pandas
        # True only after _import_pandas() ran to completion.
        bint _have_pandas

    cdef readonly:
        # Parsed and raw pandas version (see _import_pandas).
        object _loose_version, _version
        # Cached modules: pandas, pandas.api.types, pyarrow.pandas_compat.
        object _pd, _types_api, _compat_module
        # Cached pandas classes used for isinstance checks / construction.
        object _data_frame, _index, _series, _categorical_type
        object _datetimetz_type, _extension_array, _extension_dtype
        object _array_like_types, _is_extension_array_dtype, _lock
        # Set to False at the end of a successful _import_pandas().
        bint has_sparse
        # Declared but never assigned anywhere in this class.
        bint _pd024
        # Version flags computed in _import_pandas().
        bint _is_v1, _is_ge_v21, _is_ge_v23, _is_ge_v3, _is_ge_v3_strict

    def __init__(self):
        """Create the shim without importing pandas."""
        self._lock = Lock()
        self._tried_importing_pandas = False
        self._have_pandas = False

    cdef _import_pandas(self, bint raise_):
        # Import pandas and pyarrow.pandas_compat and cache everything the
        # shim hands out. When the import fails or pandas is older than
        # 1.0.0, `_have_pandas` is set to False and, depending on `raise_`,
        # either ImportError is raised or the method returns early (with a
        # warning in the too-old case).
        try:
            import pandas as pd
            import pyarrow.pandas_compat as pdcompat
        except ImportError:
            self._have_pandas = False
            if raise_:
                raise
            else:
                return

        from pyarrow.vendored.version import Version

        self._pd = pd
        self._version = pd.__version__
        self._loose_version = Version(pd.__version__)
        self._is_v1 = False

        if self._loose_version < Version('1.0.0'):
            self._have_pandas = False
            if raise_:
                raise ImportError(
                    f"pyarrow requires pandas 1.0.0 or above, pandas {self._version} is "
                    "installed"
                )
            else:
                warnings.warn(
                    f"pyarrow requires pandas 1.0.0 or above, pandas {self._version} is "
                    "installed. Therefore, pandas-specific integration is not "
                    "used.",
                    stacklevel=2
                )
                return

        self._is_v1 = self._loose_version < Version('2.0.0')
        self._is_ge_v21 = self._loose_version >= Version('2.1.0')
        # NOTE(review): the ".dev0" lower bounds presumably make development
        # and pre-release builds count as the upcoming release -- confirm.
        self._is_ge_v23 = self._loose_version >= Version('2.3.0.dev0')
        self._is_ge_v3 = self._loose_version >= Version('3.0.0.dev0')
        self._is_ge_v3_strict = self._loose_version >= Version('3.0.0')

        self._compat_module = pdcompat
        self._data_frame = pd.DataFrame
        self._index = pd.Index
        self._categorical_type = pd.Categorical
        self._series = pd.Series
        self._extension_array = pd.api.extensions.ExtensionArray
        self._array_like_types = (
            self._series, self._index, self._categorical_type,
            self._extension_array)
        self._extension_dtype = pd.api.extensions.ExtensionDtype
        self._is_extension_array_dtype = (
            pd.api.types.is_extension_array_dtype)
        self._types_api = pd.api.types
        self._datetimetz_type = pd.api.types.DatetimeTZDtype
        # Only flipped to True once every cached attribute above is set.
        self._have_pandas = True
        self.has_sparse = False

    cdef inline _check_import(self, bint raise_=True):
        # Make sure one import attempt has been made. The flag is tested
        # before and again after taking the lock, so only one caller runs
        # the first import.
        if not self._tried_importing_pandas:
            with self._lock:
                if not self._tried_importing_pandas:
                    try:
                        self._import_pandas(raise_)
                    finally:
                        # Record the attempt even when the import raised.
                        self._tried_importing_pandas = True
                    return

        # An earlier attempt failed and this caller wants an error: run the
        # import again so the ImportError is raised for this caller.
        if not self._have_pandas and raise_:
            self._import_pandas(raise_)

    def series(self, *args, **kwargs):
        """Construct a ``pandas.Series``, forwarding all arguments."""
        self._check_import()
        return self._series(*args, **kwargs)

    def data_frame(self, *args, **kwargs):
        """Construct a ``pandas.DataFrame``, forwarding all arguments."""
        self._check_import()
        return self._data_frame(*args, **kwargs)

    cdef inline bint _have_pandas_internal(self):
        # Non-raising availability check; attempts the import on first use.
        if not self._tried_importing_pandas:
            self._check_import(raise_=False)
        return self._have_pandas

    @property
    def have_pandas(self):
        """Whether a supported pandas could be imported (never raises
        ImportError for a missing pandas)."""
        return self._have_pandas_internal()

    @property
    def compat(self):
        """The ``pyarrow.pandas_compat`` module."""
        self._check_import()
        return self._compat_module

    @property
    def pd(self):
        """The ``pandas`` module."""
        self._check_import()
        return self._pd

    cpdef infer_dtype(self, obj):
        """
        Return ``pandas.api.types.infer_dtype(obj, skipna=False)``, falling
        back to ``pandas.lib.infer_dtype(obj)`` on ``AttributeError``.
        """
        self._check_import()
        try:
            return self._types_api.infer_dtype(obj, skipna=False)
        except AttributeError:
            return self._pd.lib.infer_dtype(obj)

    cpdef pandas_dtype(self, dtype):
        """
        Return ``pandas.api.types.pandas_dtype(dtype)``, or ``None`` when
        the lookup raises ``AttributeError``.
        """
        self._check_import()
        try:
            return self._types_api.pandas_dtype(dtype)
        except AttributeError:
            return None

    @property
    def loose_version(self):
        """The pandas version parsed into a ``Version`` object."""
        self._check_import()
        return self._loose_version

    @property
    def version(self):
        """The raw ``pandas.__version__`` value."""
        self._check_import()
        return self._version

    def is_v1(self):
        """True when the pandas version is below 2.0.0."""
        self._check_import()
        return self._is_v1

    def is_ge_v21(self):
        """True when the pandas version is >= 2.1.0."""
        self._check_import()
        return self._is_ge_v21

    def is_ge_v23(self):
        """True when the pandas version is >= 2.3.0.dev0."""
        self._check_import()
        return self._is_ge_v23

    def is_ge_v3(self):
        """True when the pandas version is >= 3.0.0.dev0."""
        self._check_import()
        return self._is_ge_v3

    def is_ge_v3_strict(self):
        """True when the pandas version is >= 3.0.0."""
        self._check_import()
        return self._is_ge_v3_strict

    def uses_string_dtype(self):
        """
        Return True when pandas is >= 3.0.0, or when it is >= 2.3.0.dev0 and
        the ``future.infer_string`` option is truthy; otherwise False.

        Raises ImportError (from the version check) when pandas is not
        available.
        """
        if self.is_ge_v3_strict():
            return True
        try:
            if self.is_ge_v23() and self.pd.options.future.infer_string:
                return True
        except Exception:
            # Best-effort probe: any failure reading the option is treated
            # as "not enabled". Catching Exception rather than using a bare
            # except lets KeyboardInterrupt/SystemExit propagate.
            pass
        return False

    @property
    def categorical_type(self):
        """The ``pandas.Categorical`` class."""
        self._check_import()
        return self._categorical_type

    @property
    def datetimetz_type(self):
        """The ``pandas.api.types.DatetimeTZDtype`` class."""
        self._check_import()
        return self._datetimetz_type

    @property
    def extension_dtype(self):
        """The ``pandas.api.extensions.ExtensionDtype`` class."""
        self._check_import()
        return self._extension_dtype

    cpdef is_array_like(self, obj):
        """
        Whether ``obj`` is a pandas Series, Index, Categorical or
        ExtensionArray. Raises ImportError when pandas is unavailable.
        """
        self._check_import()
        return isinstance(obj, self._array_like_types)

    cpdef is_categorical(self, obj):
        """Whether ``obj`` is a ``pandas.Categorical``; False without
        pandas."""
        if self._have_pandas_internal():
            return isinstance(obj, self._categorical_type)
        else:
            return False

    cpdef is_datetimetz(self, obj):
        """Whether ``obj`` is a ``DatetimeTZDtype`` instance; False without
        pandas."""
        if self._have_pandas_internal():
            return isinstance(obj, self._datetimetz_type)
        else:
            return False

    cpdef is_extension_array_dtype(self, obj):
        """
        Return ``pandas.api.types.is_extension_array_dtype(obj)``, or False
        when that function is not set. Raises ImportError when pandas is
        unavailable.
        """
        self._check_import()
        if self._is_extension_array_dtype:
            return self._is_extension_array_dtype(obj)
        else:
            return False

    cpdef is_sparse(self, obj):
        """
        Whether ``obj.dtype`` is a ``pandas.SparseDtype``; False without
        pandas. ``obj`` must expose a ``dtype`` attribute.
        """
        if self._have_pandas_internal():
            return isinstance(obj.dtype, self.pd.SparseDtype)
        else:
            return False

    cpdef is_data_frame(self, obj):
        """Whether ``obj`` is a ``pandas.DataFrame``; False without
        pandas."""
        if self._have_pandas_internal():
            return isinstance(obj, self._data_frame)
        else:
            return False

    cpdef is_series(self, obj):
        """Whether ``obj`` is a ``pandas.Series``; False without pandas."""
        if self._have_pandas_internal():
            return isinstance(obj, self._series)
        else:
            return False

    cpdef is_index(self, obj):
        """Whether ``obj`` is a ``pandas.Index``; False without pandas."""
        if self._have_pandas_internal():
            return isinstance(obj, self._index)
        else:
            return False

    cpdef get_values(self, obj):
        """
        Get the underlying array values of a pandas Series or Index in the
        format (np.ndarray or pandas ExtensionArray) as we need them.

        Assumes obj is a pandas Series or Index.
        """
        self._check_import()
        # Interval and Period dtypes use ``.array``; everything else uses
        # ``.values``.
        if isinstance(obj.dtype, (self.pd.api.types.IntervalDtype,
                                  self.pd.api.types.PeriodDtype)):
            return obj.array
        return obj.values

    def get_rangeindex_attribute(self, level, name):
        """
        Return attribute ``name`` of ``level``, falling back to the
        underscore-prefixed attribute when the public one does not exist.
        """
        # public start/stop/step attributes added in pandas 0.25.0
        self._check_import()
        if hasattr(level, name):
            return getattr(level, name)
        return getattr(level, '_' + name)
|
||
|
|
||
|
|
||
|
# Module-level shim instance, held in a typed C-level variable.
cdef _PandasAPIShim pandas_api
pandas_api = _PandasAPIShim()

# The same instance bound to an ordinary module-level name
# (presumably so Python code can reach the shim -- confirm against callers).
_pandas_api = pandas_api
|