# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import annotations

import re
from typing import (
    Any,
    Tuple,
)

import pyarrow as pa
import pyarrow.compute as pc
from pyarrow.interchange.column import (
    ColumnBuffers,
    ColumnNullType,
    Dtype,
    DtypeKind,
)


# A typing protocol could be added later to let Mypy validate code using
# `from_dataframe` better.
DataFrameObject = Any
ColumnObject = Any
BufferObject = Any


_PYARROW_DTYPES: dict[DtypeKind, dict[int, Any]] = {
    DtypeKind.INT: {8: pa.int8(),
                    16: pa.int16(),
                    32: pa.int32(),
                    64: pa.int64()},
    DtypeKind.UINT: {8: pa.uint8(),
                     16: pa.uint16(),
                     32: pa.uint32(),
                     64: pa.uint64()},
    DtypeKind.FLOAT: {16: pa.float16(),
                      32: pa.float32(),
                      64: pa.float64()},
    DtypeKind.BOOL: {1: pa.bool_(),
                     8: pa.uint8()},
    DtypeKind.STRING: {8: pa.string()},
}

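# A quick illustration of how the mapping above is meant to be read (a
# sketch, not executed as part of this module): a column advertising kind
# DtypeKind.INT with bit width 64 resolves to pa.int64(). Bit-packed
# booleans (width 1) map directly to pa.bool_(), while byte-packed booleans
# (width 8) map to pa.uint8() and are cast later in bool_column_to_array.
#
#   _PYARROW_DTYPES[DtypeKind.INT][64]   # -> pa.int64()
#   _PYARROW_DTYPES[DtypeKind.BOOL][1]   # -> pa.bool_()
#   _PYARROW_DTYPES[DtypeKind.BOOL][8]   # -> pa.uint8()

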
def from_dataframe(df: DataFrameObject, allow_copy=True) -> pa.Table:
    """
    Build a ``pa.Table`` from any DataFrame supporting the interchange protocol.

    Parameters
    ----------
    df : DataFrameObject
        Object supporting the interchange protocol, i.e. it has a
        `__dataframe__` method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pa.Table

    Examples
    --------
    >>> import pyarrow
    >>> from pyarrow.interchange import from_dataframe

    Convert a pandas dataframe to a pyarrow table:

    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     "n_attendees": [100, 10, 1],
    ...     "country": ["Italy", "Spain", "Slovenia"],
    ... })
    >>> df
       n_attendees   country
    0          100     Italy
    1           10     Spain
    2            1  Slovenia
    >>> from_dataframe(df)
    pyarrow.Table
    n_attendees: int64
    country: large_string
    ----
    n_attendees: [[100,10,1]]
    country: [["Italy","Spain","Slovenia"]]
    """
    if isinstance(df, pa.Table):
        return df
    elif isinstance(df, pa.RecordBatch):
        return pa.Table.from_batches([df])

    if not hasattr(df, "__dataframe__"):
        raise ValueError("`df` does not support __dataframe__")

    return _from_dataframe(df.__dataframe__(allow_copy=allow_copy),
                           allow_copy=allow_copy)

def _from_dataframe(df: DataFrameObject, allow_copy=True):
    """
    Build a ``pa.Table`` from the DataFrame interchange object.

    Parameters
    ----------
    df : DataFrameObject
        Object supporting the interchange protocol, i.e. it has a
        `__dataframe__` method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pa.Table
    """
    batches = []
    for chunk in df.get_chunks():
        batch = protocol_df_chunk_to_pyarrow(chunk, allow_copy)
        batches.append(batch)

    if not batches:
        batch = protocol_df_chunk_to_pyarrow(df, allow_copy)
        batches.append(batch)

    return pa.Table.from_batches(batches)

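# A minimal sketch of the assembly performed above, with a hand-built batch
# standing in for a converted interchange chunk (illustrative only):
#
#   batch = pa.RecordBatch.from_pydict({"x": pa.array([1, 2, 3])})
#   table = pa.Table.from_batches([batch])   # one record batch per chunk

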
def protocol_df_chunk_to_pyarrow(
    df: DataFrameObject,
    allow_copy: bool = True
) -> pa.RecordBatch:
    """
    Convert an interchange protocol chunk to a ``pa.RecordBatch``.

    Parameters
    ----------
    df : DataFrameObject
        Object supporting the interchange protocol, i.e. it has a
        `__dataframe__` method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pa.RecordBatch
    """
    # We need a dict of columns here, with each column being a pa.Array
    columns: dict[str, pa.Array] = {}
    for name in df.column_names():
        if not isinstance(name, str):
            raise ValueError(f"Column {name} is not a string")
        if name in columns:
            raise ValueError(f"Column {name} is not unique")
        col = df.get_column_by_name(name)
        dtype = col.dtype[0]
        if dtype in (
            DtypeKind.INT,
            DtypeKind.UINT,
            DtypeKind.FLOAT,
            DtypeKind.STRING,
            DtypeKind.DATETIME,
        ):
            columns[name] = column_to_array(col, allow_copy)
        elif dtype == DtypeKind.BOOL:
            columns[name] = bool_column_to_array(col, allow_copy)
        elif dtype == DtypeKind.CATEGORICAL:
            columns[name] = categorical_column_to_dictionary(col, allow_copy)
        else:
            raise NotImplementedError(f"Data type {dtype} not handled yet")

    return pa.RecordBatch.from_pydict(columns)

def column_to_array(
    col: ColumnObject,
    allow_copy: bool = True,
) -> pa.Array:
    """
    Convert a column holding one of the primitive dtypes to a PyArrow array.

    A primitive dtype here is one of: int, uint, float, string or datetime
    (booleans and categoricals are handled separately).

    Parameters
    ----------
    col : ColumnObject
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pa.Array
    """
    buffers = col.get_buffers()
    data_type = col.dtype
    data = buffers_to_array(buffers, data_type,
                            col.size(),
                            col.describe_null,
                            col.offset,
                            allow_copy)
    return data

def bool_column_to_array(
    col: ColumnObject,
    allow_copy: bool = True,
) -> pa.Array:
    """
    Convert a column holding a boolean dtype to a PyArrow array.

    Parameters
    ----------
    col : ColumnObject
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pa.Array
    """
    buffers = col.get_buffers()
    size = buffers["data"][1][1]

    # If booleans are byte-packed, a copy to bit-packed will be made
    if size == 8 and not allow_copy:
        raise RuntimeError(
            "Boolean column will be cast from uint8 and a copy "
            "is required which is forbidden by allow_copy=False"
        )

    data_type = col.dtype
    data = buffers_to_array(buffers, data_type,
                            col.size(),
                            col.describe_null,
                            col.offset,
                            allow_copy)
    if size == 8:
        data = pc.cast(data, pa.bool_())

    return data

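# Standalone sketch of the uint8 -> bool cast performed above for producers
# that byte-pack booleans (the values here are made up):
#
#   byte_packed = pa.array([1, 0, 1], type=pa.uint8())
#   pc.cast(byte_packed, pa.bool_())   # bit-packed [true, false, true]

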
def categorical_column_to_dictionary(
    col: ColumnObject,
    allow_copy: bool = True,
) -> pa.DictionaryArray:
    """
    Convert a column holding categorical data to a pa.DictionaryArray.

    Parameters
    ----------
    col : ColumnObject
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pa.DictionaryArray
    """
    if not allow_copy:
        raise RuntimeError(
            "Conversion of a categorical column requires a copy "
            "which is forbidden by allow_copy=False"
        )

    categorical = col.describe_categorical

    if not categorical["is_dictionary"]:
        raise NotImplementedError(
            "Non-dictionary categoricals not supported yet")

    # We need to first convert the dictionary column
    cat_column = categorical["categories"]
    dictionary = column_to_array(cat_column)
    # Then we need to convert the indices
    # Here we need to use the buffer data type!
    buffers = col.get_buffers()
    _, data_type = buffers["data"]
    indices = buffers_to_array(buffers, data_type,
                               col.size(),
                               col.describe_null,
                               col.offset)

    # Construct a pa.DictionaryArray from the indices and the dictionary
    dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)

    return dict_array

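# Standalone sketch of the final construction above, with made-up indices
# and categories (illustrative only):
#
#   indices = pa.array([0, 1, 0], type=pa.int8())
#   dictionary = pa.array(["low", "high"])
#   pa.DictionaryArray.from_arrays(indices, dictionary)
#   # -> a dictionary array decoding to ["low", "high", "low"]

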
def parse_datetime_format_str(format_str):
    """Parse a datetime ``format_str`` into a ``(unit, tz)`` tuple."""

    # timestamp 'ts{unit}:tz'
    timestamp_meta = re.match(r"ts([smun]):(.*)", format_str)
    if timestamp_meta:
        unit, tz = timestamp_meta.group(1), timestamp_meta.group(2)
        if unit != "s":
            # The format string describes only the first letter of the unit,
            # so add one extra letter to convert the unit to numpy-style:
            # 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns'
            unit += "s"

        return unit, tz

    raise NotImplementedError(f"DateTime kind is not supported: {format_str}")

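# Illustrative parses (a sketch, not executed here): the unit letter is
# expanded to the numpy-style abbreviation and the timezone may be empty.
#
#   parse_datetime_format_str("tss:UTC")   # -> ("s", "UTC")
#   parse_datetime_format_str("tsu:")      # -> ("us", "") i.e. tz-naive
#   parse_datetime_format_str("tsn:Europe/Ljubljana")
#   # -> ("ns", "Europe/Ljubljana")

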
def map_date_type(data_type):
    """Map an interchange protocol data type to a pyarrow data type."""
    kind, bit_width, f_string, _ = data_type

    if kind == DtypeKind.DATETIME:
        unit, tz = parse_datetime_format_str(f_string)
        return pa.timestamp(unit, tz=tz)
    else:
        pa_dtype = _PYARROW_DTYPES.get(kind, {}).get(bit_width, None)

        # Error if the dtype is not supported
        if pa_dtype:
            return pa_dtype
        else:
            raise NotImplementedError(
                f"Conversion for {data_type} is not yet supported.")

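# Illustrative mappings (a sketch with made-up dtype tuples; the last field
# is the byte-order flag, unused here):
#
#   map_date_type((DtypeKind.FLOAT, 64, "g", "="))
#   # -> pa.float64()
#   map_date_type((DtypeKind.DATETIME, 64, "tsu:UTC", "="))
#   # -> pa.timestamp("us", tz="UTC")

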
def buffers_to_array(
    buffers: ColumnBuffers,
    data_type: Tuple[DtypeKind, int, str, str],
    length: int,
    describe_null: ColumnNullType,
    offset: int = 0,
    allow_copy: bool = True,
) -> pa.Array:
    """
    Build a PyArrow array from the passed buffers.

    Parameters
    ----------
    buffers : ColumnBuffers
        Dictionary containing tuples of underlying buffers and
        their associated dtype.
    data_type : Tuple[DtypeKind, int, str, str]
        Dtype description of the column as a tuple ``(kind, bit-width,
        format string, endianness)``.
    length : int
        The number of values in the array.
    describe_null : ColumnNullType
        Null representation the column dtype uses,
        as a tuple ``(kind, value)``.
    offset : int, default: 0
        Number of elements to offset from the start of the buffer.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pa.Array

    Notes
    -----
    The returned array doesn't own the memory. The caller of this function
    is responsible for keeping the memory owner object alive as long as
    the returned PyArrow array is being used.
    """
    data_buff, _ = buffers["data"]
    try:
        validity_buff, validity_dtype = buffers["validity"]
    except TypeError:
        validity_buff = None
    try:
        offset_buff, offset_dtype = buffers["offsets"]
    except TypeError:
        offset_buff = None

    # Construct a pyarrow Buffer
    data_pa_buffer = pa.foreign_buffer(data_buff.ptr, data_buff.bufsize,
                                       base=data_buff)

    # Construct a validity pyarrow Buffer, if applicable
    if validity_buff:
        validity_pa_buff = validity_buffer_from_mask(validity_buff,
                                                     validity_dtype,
                                                     describe_null,
                                                     length,
                                                     offset,
                                                     allow_copy)
    else:
        validity_pa_buff = validity_buffer_nan_sentinel(data_pa_buffer,
                                                        data_type,
                                                        describe_null,
                                                        length,
                                                        offset,
                                                        allow_copy)

    # Construct a pyarrow Array from buffers
    data_dtype = map_date_type(data_type)

    if offset_buff:
        _, offset_bit_width, _, _ = offset_dtype
        # If an offset buffer exists, construct an offset pyarrow Buffer
        # and add it to the construction of an array
        offset_pa_buffer = pa.foreign_buffer(offset_buff.ptr,
                                             offset_buff.bufsize,
                                             base=offset_buff)

        if data_type[2] == 'U':
            string_type = pa.large_string()
        else:
            if offset_bit_width == 64:
                string_type = pa.large_string()
            else:
                string_type = pa.string()
        array = pa.Array.from_buffers(
            string_type,
            length,
            [validity_pa_buff, offset_pa_buffer, data_pa_buffer],
            offset=offset,
        )
    else:
        array = pa.Array.from_buffers(
            data_dtype,
            length,
            [validity_pa_buff, data_pa_buffer],
            offset=offset,
        )

    return array

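# Self-contained sketch of the from_buffers construction above, borrowing
# buffers from an existing array instead of wrapping foreign pointers
# (illustrative only):
#
#   src = pa.array(["a", None, "bc"])
#   validity, offsets, data = src.buffers()
#   pa.Array.from_buffers(src.type, len(src), [validity, offsets, data])

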
def validity_buffer_from_mask(
    validity_buff: BufferObject,
    validity_dtype: Dtype,
    describe_null: ColumnNullType,
    length: int,
    offset: int = 0,
    allow_copy: bool = True,
) -> pa.Buffer:
    """
    Build a PyArrow buffer from the passed mask buffer.

    Parameters
    ----------
    validity_buff : BufferObject
        The underlying validity buffer.
    validity_dtype : Dtype
        Dtype description as a tuple ``(kind, bit-width, format string,
        endianness)``.
    describe_null : ColumnNullType
        Null representation the column dtype uses,
        as a tuple ``(kind, value)``.
    length : int
        The number of values in the array.
    offset : int, default: 0
        Number of elements to offset from the start of the buffer.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pa.Buffer
    """
    null_kind, sentinel_val = describe_null
    validity_kind, _, _, _ = validity_dtype
    assert validity_kind == DtypeKind.BOOL

    if null_kind == ColumnNullType.NON_NULLABLE:
        # A sliced array can have a NON_NULLABLE ColumnNullType when there
        # are no missing values in that slice even though a bitmask exists;
        # the validity buffer must be ignored (set to None) in this case.
        return None

    elif null_kind == ColumnNullType.USE_BYTEMASK or (
        null_kind == ColumnNullType.USE_BITMASK and sentinel_val == 1
    ):
        buff = pa.foreign_buffer(validity_buff.ptr,
                                 validity_buff.bufsize,
                                 base=validity_buff)

        if null_kind == ColumnNullType.USE_BYTEMASK:
            if not allow_copy:
                raise RuntimeError(
                    "To create a bitmask a copy of the data is "
                    "required which is forbidden by allow_copy=False"
                )
            mask = pa.Array.from_buffers(pa.int8(), length,
                                         [None, buff],
                                         offset=offset)
            mask_bool = pc.cast(mask, pa.bool_())
        else:
            mask_bool = pa.Array.from_buffers(pa.bool_(), length,
                                              [None, buff],
                                              offset=offset)

        if sentinel_val == 1:
            mask_bool = pc.invert(mask_bool)

        return mask_bool.buffers()[1]

    elif null_kind == ColumnNullType.USE_BITMASK and sentinel_val == 0:
        return pa.foreign_buffer(validity_buff.ptr,
                                 validity_buff.bufsize,
                                 base=validity_buff)
    else:
        raise NotImplementedError(
            f"{describe_null} null representation is not yet supported.")

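# Standalone sketch of the sentinel_val == 1 case above: a mask where 1
# marks missing values is inverted so that 1 marks valid values, as Arrow
# expects (the values here are made up):
#
#   mask = pa.array([True, False, True])  # True == missing under sentinel 1
#   pc.invert(mask).buffers()[1]          # Arrow validity bitmap (1 == valid)

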
def validity_buffer_nan_sentinel(
    data_pa_buffer: BufferObject,
    data_type: Dtype,
    describe_null: ColumnNullType,
    length: int,
    offset: int = 0,
    allow_copy: bool = True,
) -> pa.Buffer:
    """
    Build a PyArrow buffer from NaN or sentinel values.

    Parameters
    ----------
    data_pa_buffer : pa.Buffer
        PyArrow buffer for the column data.
    data_type : Dtype
        Dtype description as a tuple ``(kind, bit-width, format string,
        endianness)``.
    describe_null : ColumnNullType
        Null representation the column dtype uses,
        as a tuple ``(kind, value)``.
    length : int
        The number of values in the array.
    offset : int, default: 0
        Number of elements to offset from the start of the buffer.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pa.Buffer
    """
    kind, bit_width, _, _ = data_type
    data_dtype = map_date_type(data_type)
    null_kind, sentinel_val = describe_null

    # Check for float NaN values
    if null_kind == ColumnNullType.USE_NAN:
        if not allow_copy:
            raise RuntimeError(
                "To create a bitmask a copy of the data is "
                "required which is forbidden by allow_copy=False"
            )

        if kind == DtypeKind.FLOAT and bit_width == 16:
            # The 'pyarrow.compute.is_nan' kernel is not yet implemented
            # for float16
            raise NotImplementedError(
                f"{data_type} with {null_kind} is not yet supported.")
        else:
            pyarrow_data = pa.Array.from_buffers(
                data_dtype,
                length,
                [None, data_pa_buffer],
                offset=offset,
            )
            mask = pc.is_nan(pyarrow_data)
            mask = pc.invert(mask)
            return mask.buffers()[1]

    # Check for sentinel values
    elif null_kind == ColumnNullType.USE_SENTINEL:
        if not allow_copy:
            raise RuntimeError(
                "To create a bitmask a copy of the data is "
                "required which is forbidden by allow_copy=False"
            )

        if kind == DtypeKind.DATETIME:
            sentinel_dtype = pa.int64()
        else:
            sentinel_dtype = data_dtype
        pyarrow_data = pa.Array.from_buffers(sentinel_dtype,
                                             length,
                                             [None, data_pa_buffer],
                                             offset=offset)
        sentinel_arr = pc.equal(pyarrow_data, sentinel_val)
        mask_bool = pc.invert(sentinel_arr)
        return mask_bool.buffers()[1]

    elif null_kind == ColumnNullType.NON_NULLABLE:
        pass
    else:
        raise NotImplementedError(
            f"{describe_null} null representation is not yet supported.")
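

# Standalone sketch of the USE_NAN branch above (values made up): NaNs are
# detected with pc.is_nan, and the inverted result's value bitmap serves as
# the Arrow validity bitmap.
#
#   data = pa.array([1.0, float("nan"), 2.0])
#   pc.invert(pc.is_nan(data)).buffers()[1]   # bitmap marking NaN as null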