# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import annotations

import enum
from typing import (
    Any,
    Dict,
    Iterable,
    Optional,
    Tuple,
)

import sys
if sys.version_info >= (3, 8):
    from typing import TypedDict
else:
    from typing_extensions import TypedDict

import pyarrow as pa
import pyarrow.compute as pc
from pyarrow.interchange.buffer import _PyArrowBuffer


class DtypeKind(enum.IntEnum):
    """
    Integer enum for data types.

    Attributes
    ----------
    INT : int
        Matches to signed integer data type.
    UINT : int
        Matches to unsigned integer data type.
    FLOAT : int
        Matches to floating point data type.
    BOOL : int
        Matches to boolean data type.
    STRING : int
        Matches to string data type (UTF-8 encoded).
    DATETIME : int
        Matches to datetime data type.
    CATEGORICAL : int
        Matches to categorical data type.
    """

    INT = 0
    UINT = 1
    FLOAT = 2
    BOOL = 20
    STRING = 21  # UTF-8
    DATETIME = 22
    CATEGORICAL = 23


Dtype = Tuple[DtypeKind, int, str, str]  # see Column.dtype
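
# For example (illustrative): an int64 column is described by the tuple
# (DtypeKind.INT, 64, "l", "=") and a utf-8 string column by
# (DtypeKind.STRING, 8, "u", "=").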


_PYARROW_KINDS = {
    pa.int8(): (DtypeKind.INT, "c"),
    pa.int16(): (DtypeKind.INT, "s"),
    pa.int32(): (DtypeKind.INT, "i"),
    pa.int64(): (DtypeKind.INT, "l"),
    pa.uint8(): (DtypeKind.UINT, "C"),
    pa.uint16(): (DtypeKind.UINT, "S"),
    pa.uint32(): (DtypeKind.UINT, "I"),
    pa.uint64(): (DtypeKind.UINT, "L"),
    pa.float16(): (DtypeKind.FLOAT, "e"),
    pa.float32(): (DtypeKind.FLOAT, "f"),
    pa.float64(): (DtypeKind.FLOAT, "g"),
    pa.bool_(): (DtypeKind.BOOL, "b"),
    pa.string(): (DtypeKind.STRING, "u"),
    pa.large_string(): (DtypeKind.STRING, "U"),
}
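
# The single-character codes above are format strings from the Apache Arrow
# C Data Interface (e.g. "l" is int64, "u" is a utf-8 string).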


class ColumnNullType(enum.IntEnum):
    """
    Integer enum for null type representation.

    Attributes
    ----------
    NON_NULLABLE : int
        Non-nullable column.
    USE_NAN : int
        Use explicit float NaN value.
    USE_SENTINEL : int
        Sentinel value besides NaN.
    USE_BITMASK : int
        The bit is set/unset representing a null on a certain position.
    USE_BYTEMASK : int
        The byte is set/unset representing a null on a certain position.
    """

    NON_NULLABLE = 0
    USE_NAN = 1
    USE_SENTINEL = 2
    USE_BITMASK = 3
    USE_BYTEMASK = 4
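
# Note: Arrow-backed columns represent missing values with a validity
# bitmask, so ``describe_null`` below reports (ColumnNullType.USE_BITMASK, 0)
# whenever nulls are present: a 0 bit marks a null position.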


class ColumnBuffers(TypedDict):
    # first element is a buffer containing the column data;
    # second element is the data buffer's associated dtype
    data: Tuple[_PyArrowBuffer, Dtype]

    # first element is a buffer containing mask values indicating missing data;
    # second element is the mask value buffer's associated dtype.
    # None if the null representation is not a bit or byte mask
    validity: Optional[Tuple[_PyArrowBuffer, Dtype]]

    # first element is a buffer containing the offset values for
    # variable-size binary data (e.g., variable-length strings);
    # second element is the offsets buffer's associated dtype.
    # None if the data buffer does not have an associated offsets buffer
    offsets: Optional[Tuple[_PyArrowBuffer, Dtype]]
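
    # Illustrative example: for a utf-8 string column with nulls,
    # ``get_buffers()`` below returns a dict shaped like
    #   {"data": (buf, (DtypeKind.STRING, 8, "u", "=")),
    #    "validity": (buf, (DtypeKind.BOOL, 1, "b", "=")),
    #    "offsets": (buf, (DtypeKind.INT, 32, "i", "="))}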


class CategoricalDescription(TypedDict):
    # whether the ordering of dictionary indices is semantically meaningful
    is_ordered: bool
    # whether a dictionary-style mapping of categorical values to other objects
    # exists
    is_dictionary: bool
    # Python-level only (e.g. ``{int: str}``).
    # None if not a dictionary-style categorical.
    categories: Optional[_PyArrowColumn]


class Endianness:
    """Enum indicating the byte-order of a data-type."""

    LITTLE = "<"
    BIG = ">"
    NATIVE = "="
    NA = "|"
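
    # Every dtype tuple produced in this module uses NATIVE ("=") byte order.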


class NoBufferPresent(Exception):
    """Exception to signal that there is no requested buffer."""


class _PyArrowColumn:
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.

    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).

    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
         Instead, it seems to use "children" for both columns with a bit mask,
         and for nested dtypes. Unclear whether this is elegant or confusing.
         This design requires checking the null representation explicitly.

         The Arrow design requires checking:
         1. the ARROW_FLAG_NULLABLE (for sentinel values)
         2. if a column has two children, combined with one of those children
            having a null dtype.

         Making the mask concept explicit seems useful. One null dtype would
         not be enough to cover both bit and byte masks, so that would mean
         even more checking if we did it the Arrow way.

    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
         multiple buffers per array (= column here). Semantically it may make
         sense to have both: chunks were meant for example for lazy evaluation
         of data which doesn't fit in memory, while multiple buffers per column
         could also come from doing a selection operation on a single
         contiguous buffer.

         Given these concepts, one would expect chunks to be all of the same
         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
         while multiple buffers could have data-dependent lengths. Not an issue
         in pandas if one column is backed by a single NumPy array, but in
         Arrow it seems possible.
         Are multiple chunks *and* multiple buffers per column necessary for
         the purposes of this interchange protocol, or must producers either
         reuse the chunk concept for this or copy the data?

    Note: this Column object can only be produced by ``__dataframe__``, so
          doesn't need its own version or ``__column__`` protocol.
    """

    def __init__(
        self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True
    ) -> None:
        """
        Handles PyArrow Arrays and ChunkedArrays.
        """
        # Store the column as a private attribute
        if isinstance(column, pa.ChunkedArray):
            if column.num_chunks == 1:
                column = column.chunk(0)
            else:
                if not allow_copy:
                    raise RuntimeError(
                        "Chunks will be combined and a copy is required which "
                        "is forbidden by allow_copy=False"
                    )
                column = column.combine_chunks()

        self._allow_copy = allow_copy

        if pa.types.is_boolean(column.type):
            if not allow_copy:
                raise RuntimeError(
                    "Boolean column will be cast to uint8 and a copy "
                    "is required which is forbidden by allow_copy=False"
                )
            self._dtype = self._dtype_from_arrowdtype(column.type, 8)
            self._col = pc.cast(column, pa.uint8())
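            # Note: self._dtype (set above from the original bool type) still
            # reports DtypeKind.BOOL with bit width 8, even though the backing
            # data is now uint8.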
        else:
            self._col = column
            dtype = self._col.type
            try:
                bit_width = dtype.bit_width
            except ValueError:
                # in case of variable-length strings, considered as an
                # array of bytes (8 bits)
                bit_width = 8
            self._dtype = self._dtype_from_arrowdtype(dtype, bit_width)

    def size(self) -> int:
        """
        Size of the column, in elements.

        Corresponds to DataFrame.num_rows() if column is a single chunk;
        equal to size of this current chunk otherwise.

        Is a method rather than a property because it may cause a (potentially
        expensive) computation for some dataframe implementations.
        """
        return len(self._col)

    @property
    def offset(self) -> int:
        """
        Offset of first element.

        May be > 0 if using chunks; for example for a column with N chunks of
        equal size M (only the last chunk may be shorter),
        ``offset = n * M``, ``n = 0 .. N-1``.
        """
        return self._col.offset

    @property
    def dtype(self) -> Tuple[DtypeKind, int, str, str]:
        """
        Dtype description as a tuple ``(kind, bit-width, format string,
        endianness)``.

        Bit-width : the number of bits as an integer
        Format string : data type description format string in Apache Arrow C
                        Data Interface format.
        Endianness : currently only native endianness (``=``) is supported

        Notes:
            - Kind specifiers are aligned with DLPack where possible (hence the
              jump to 20, leave enough room for future extension)
            - Masks must be specified as boolean with either bit width 1 (for
              bit masks) or 8 (for byte masks).
            - Dtype width in bits was preferred over bytes
            - Endianness isn't too useful, but included now in case in the
              future we need to support non-native endianness
            - Went with Apache Arrow format strings over NumPy format strings
              because they're more complete from a dataframe perspective
            - Format strings are mostly useful for datetime specification, and
              for categoricals.
            - For categoricals, the format string describes the type of the
              categorical in the data buffer. In case of a separate encoding of
              the categorical (e.g. an integer to string mapping), this can
              be derived from ``self.describe_categorical``.
            - Data types not included: complex, Arrow-style null, binary,
              decimal, and nested (list, struct, map, union) dtypes.
        """
        return self._dtype
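
    # For example (illustrative): _PyArrowColumn(pa.array([1, 2, None])).dtype
    # is (DtypeKind.INT, 64, "l", "=") since pyarrow infers int64 here.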

    def _dtype_from_arrowdtype(
        self, dtype: pa.DataType, bit_width: int
    ) -> Tuple[DtypeKind, int, str, str]:
        """
        See `self.dtype` for details.
        """
        # Note: 'c' (complex) not handled yet (not in array spec v1).
        #       'b', 'B' (bytes), 'S', 'a' (old-style string), 'V' (void)
        #       not handled. Datetime and timedelta both map to datetime
        #       (is timedelta handled?)

        if pa.types.is_timestamp(dtype):
            kind = DtypeKind.DATETIME
            ts = dtype.unit[0]
            tz = dtype.tz if dtype.tz else ""
            f_string = f"ts{ts}:{tz}"
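            # e.g. pa.timestamp("ms", tz="UTC") yields "tsm:UTC" (illustrative)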
            return kind, bit_width, f_string, Endianness.NATIVE
        elif pa.types.is_dictionary(dtype):
            kind = DtypeKind.CATEGORICAL
            arr = self._col
            indices_dtype = arr.indices.type
            _, f_string = _PYARROW_KINDS.get(indices_dtype)
            return kind, bit_width, f_string, Endianness.NATIVE
        else:
            kind, f_string = _PYARROW_KINDS.get(dtype, (None, None))
            if kind is None:
                raise ValueError(
                    f"Data type {dtype} not supported by interchange protocol")

            return kind, bit_width, f_string, Endianness.NATIVE

    @property
    def describe_categorical(self) -> CategoricalDescription:
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding categorical
          values.

        Raises TypeError if the dtype is not categorical

        Returns the dictionary with description on how to interpret the
        data buffer:
            - "is_ordered" : bool, whether the ordering of dictionary indices
                             is semantically meaningful.
            - "is_dictionary" : bool, whether a mapping of
                                categorical values to other objects exists
            - "categories" : Column representing the (implicit) mapping of
                             indices to category values (e.g. an array of
                             cat1, cat2, ...). None if not a dictionary-style
                             categorical.

        TBD: are there any other in-memory representations that are needed?
        """
        arr = self._col
        if not pa.types.is_dictionary(arr.type):
            raise TypeError(
                "describe_categorical only works on a column with "
                "categorical dtype!"
            )

        return {
            "is_ordered": self._col.type.ordered,
            "is_dictionary": True,
            "categories": _PyArrowColumn(arr.dictionary),
        }
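
    # For a dictionary-encoded array such as
    # pa.array(["x", "y", "x"]).dictionary_encode(), "categories" wraps the
    # dictionary values ["x", "y"] (illustrative).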

    @property
    def describe_null(self) -> Tuple[ColumnNullType, Any]:
        """
        Return the missing value (or "null") representation the column dtype
        uses, as a tuple ``(kind, value)``.

        Value : if kind is "sentinel value", the actual value. If kind is a bit
        mask or a byte mask, the value (0 or 1) indicating a missing value.
        None otherwise.
        """
        # In case of no missing values, we need to set ColumnNullType to
        # non-nullable, as in the current __dataframe__ protocol bit/byte masks
        # cannot be None
        if self.null_count == 0:
            return ColumnNullType.NON_NULLABLE, None
        else:
            return ColumnNullType.USE_BITMASK, 0

    @property
    def null_count(self) -> int:
        """
        Number of null elements, if known.

        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
        """
        arrow_null_count = self._col.null_count
        n = arrow_null_count if arrow_null_count != -1 else None
        return n

    @property
    def metadata(self) -> Dict[str, Any]:
        """
        The metadata for the column. See `DataFrame.metadata` for more details.
        """
        pass

    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """
        return 1

    def get_chunks(
        self, n_chunks: Optional[int] = None
    ) -> Iterable[_PyArrowColumn]:
        """
        Return an iterator yielding the chunks.

        See `DataFrame.get_chunks` for details on ``n_chunks``.
        """
        if n_chunks and n_chunks > 1:
            chunk_size = self.size() // n_chunks
            if self.size() % n_chunks != 0:
                chunk_size += 1

            array = self._col
            for start in range(0, chunk_size * n_chunks, chunk_size):
                yield _PyArrowColumn(
                    array.slice(start, chunk_size), self._allow_copy
                )
        else:
            yield self
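
    # For example (illustrative): a 5-element column split with n_chunks=2
    # yields chunks of sizes 3 and 2; chunk_size is rounded up to 3 and the
    # final slice is truncated by pyarrow.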

    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.

        The returned dictionary has the following contents:

            - "data": a two-element tuple whose first element is a buffer
                      containing the data and whose second element is the data
                      buffer's associated dtype.
            - "validity": a two-element tuple whose first element is a buffer
                          containing mask values indicating missing data and
                          whose second element is the mask value buffer's
                          associated dtype. None if the null representation is
                          not a bit or byte mask.
            - "offsets": a two-element tuple whose first element is a buffer
                         containing the offset values for variable-size binary
                         data (e.g., variable-length strings) and whose second
                         element is the offsets buffer's associated dtype. None
                         if the data buffer does not have an associated offsets
                         buffer.
        """
        buffers: ColumnBuffers = {
            "data": self._get_data_buffer(),
            "validity": None,
            "offsets": None,
        }

        try:
            buffers["validity"] = self._get_validity_buffer()
        except NoBufferPresent:
            pass

        try:
            buffers["offsets"] = self._get_offsets_buffer()
        except NoBufferPresent:
            pass

        return buffers

    def _get_data_buffer(
        self,
    ) -> Tuple[_PyArrowBuffer, Any]:  # Any is for self.dtype tuple
        """
        Return the buffer containing the data and the buffer's
        associated dtype.
        """
        array = self._col
        dtype = self.dtype

        # In case of dictionary arrays, use the indices to define the data
        # buffer; the categories are transferred through
        # describe_categorical()
        if pa.types.is_dictionary(array.type):
            array = array.indices
            dtype = _PyArrowColumn(array).dtype

        n = len(array.buffers())
        if n == 2:
            return _PyArrowBuffer(array.buffers()[1]), dtype
        elif n == 3:
            return _PyArrowBuffer(array.buffers()[2]), dtype
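
    # Arrow buffer layout, for reference: a fixed-width array carries
    # [validity, data] (two buffers), while a string array carries
    # [validity, offsets, data] (three buffers); hence the indexing above.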

    def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
        """
        Return the buffer containing the mask values indicating missing data
        and the buffer's associated dtype.
        Raises NoBufferPresent if null representation is not a bit or byte
        mask.
        """
        # Define the dtype of the returned buffer
        dtype = (DtypeKind.BOOL, 1, "b", Endianness.NATIVE)
        array = self._col
        buff = array.buffers()[0]
        if buff:
            return _PyArrowBuffer(buff), dtype
        else:
            raise NoBufferPresent(
                "There are no missing values so the column "
                "does not have a separate mask")

    def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
        """
        Return the buffer containing the offset values for variable-size binary
        data (e.g., variable-length strings) and the buffer's associated dtype.
        Raises NoBufferPresent if the data buffer does not have an associated
        offsets buffer.
        """
        array = self._col
        n = len(array.buffers())
        if n == 2:
            raise NoBufferPresent(
                "This column has a fixed-length dtype so "
                "it does not have an offsets buffer"
            )
        elif n == 3:
            # Define the dtype of the returned buffer
            dtype = self._col.type
            if pa.types.is_large_string(dtype):
                dtype = (DtypeKind.INT, 64, "l", Endianness.NATIVE)
            else:
                dtype = (DtypeKind.INT, 32, "i", Endianness.NATIVE)
            return _PyArrowBuffer(array.buffers()[1]), dtype
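

# Minimal usage sketch (illustrative; relies only on the definitions above):
# wrap a pyarrow array and inspect it through the interchange-protocol API.
if __name__ == "__main__":
    col = _PyArrowColumn(pa.array(["a", None, "ccc"]))
    print(col.size())                 # 3 elements
    print(col.dtype)                  # (STRING kind, 8, "u", "=")
    print(col.describe_null)          # (USE_BITMASK, 0) since a null exists
    print(sorted(col.get_buffers()))  # ['data', 'offsets', 'validity']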