# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: profile=False
# distutils: language = c++

from collections.abc import Sequence
from textwrap import indent
import warnings

from cython.operator cimport dereference as deref
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_python cimport *
from pyarrow.lib cimport (_Weakrefable, Buffer, Schema,
                          check_status,
                          MemoryPool, maybe_unbox_memory_pool,
                          Table, KeyValueMetadata,
                          pyarrow_wrap_chunked_array,
                          pyarrow_wrap_schema,
                          pyarrow_unwrap_data_type,
                          pyarrow_unwrap_metadata,
                          pyarrow_unwrap_schema,
                          pyarrow_wrap_table,
                          pyarrow_wrap_batch,
                          pyarrow_wrap_scalar,
                          NativeFile, get_reader, get_writer,
                          string_to_timeunit)

from pyarrow.lib import (ArrowException, NativeFile, BufferOutputStream,
                         ListType, LargeListType,
                         _stringify_path,
                         tobytes, frombytes, is_threading_enabled)

cimport cpython as cp

_DEFAULT_ROW_GROUP_SIZE = 1024*1024
_MAX_ROW_GROUP_SIZE = 64*1024*1024


cdef Type _unwrap_list_type(obj) except *:
    if obj is ListType:
        return _Type_LIST
    elif obj is LargeListType:
        return _Type_LARGE_LIST
    else:
        raise TypeError(f"Unexpected list_type: {obj!r}")


cdef class Statistics(_Weakrefable):
    """Statistics for a single column in a single row group."""

    def __init__(self):
        raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly")

    def __cinit__(self):
        pass

    def __repr__(self):
        return f"""{object.__repr__(self)}
  has_min_max: {self.has_min_max}
  min: {self.min}
  max: {self.max}
  null_count: {self.null_count}
  distinct_count: {self.distinct_count}
  num_values: {self.num_values}
  physical_type: {self.physical_type}
  logical_type: {self.logical_type}
  converted_type (legacy): {self.converted_type}"""

    def to_dict(self):
        """
        Get dictionary representation of statistics.

        Returns
        -------
        dict
            Dictionary with a key for each attribute of this class.
        """
        d = dict(
            has_min_max=self.has_min_max,
            min=self.min,
            max=self.max,
            null_count=self.null_count,
            distinct_count=self.distinct_count,
            num_values=self.num_values,
            physical_type=self.physical_type
        )
        return d

    def __eq__(self, other):
        try:
            return self.equals(other)
        except TypeError:
            return NotImplemented

    def equals(self, Statistics other):
        """
        Return whether the two column statistics objects are equal.

        Parameters
        ----------
        other : Statistics
            Statistics to compare against.

        Returns
        -------
        are_equal : bool
        """
        return self.statistics.get().Equals(deref(other.statistics.get()))

    @property
    def has_min_max(self):
        """Whether min and max are present (bool)."""
        return self.statistics.get().HasMinMax()

    @property
    def has_null_count(self):
        """Whether null count is present (bool)."""
        return self.statistics.get().HasNullCount()

    @property
    def has_distinct_count(self):
        """Whether distinct count is present (bool)."""
        return self.statistics.get().HasDistinctCount()

    @property
    def min_raw(self):
        """Min value as physical type (bool, int, float, or bytes)."""
        if self.has_min_max:
            return _cast_statistic_raw_min(self.statistics.get())
        else:
            return None

    @property
    def max_raw(self):
        """Max value as physical type (bool, int, float, or bytes)."""
        if self.has_min_max:
            return _cast_statistic_raw_max(self.statistics.get())
        else:
            return None

    @property
    def min(self):
        """
        Min value as logical type.

        Returned as the Python equivalent of logical type, such as datetime.date
        for dates and decimal.Decimal for decimals.
        """
        if self.has_min_max:
            min_scalar, _ = _cast_statistics(self.statistics.get())
            return min_scalar.as_py()
        else:
            return None

    @property
    def max(self):
        """
        Max value as logical type.

        Returned as the Python equivalent of logical type, such as datetime.date
        for dates and decimal.Decimal for decimals.
        """
        if self.has_min_max:
            _, max_scalar = _cast_statistics(self.statistics.get())
            return max_scalar.as_py()
        else:
            return None

    @property
    def null_count(self):
        """Number of null values in chunk (int)."""
        if self.has_null_count:
            return self.statistics.get().null_count()
        else:
            return None

    @property
    def distinct_count(self):
        """Distinct number of values in chunk (int)."""
        if self.has_distinct_count:
            return self.statistics.get().distinct_count()
        else:
            return None

    @property
    def num_values(self):
        """Number of non-null values (int)."""
        return self.statistics.get().num_values()

    @property
    def physical_type(self):
        """Physical type of column (str)."""
        raw_physical_type = self.statistics.get().physical_type()
        return physical_type_name_from_enum(raw_physical_type)

    @property
    def logical_type(self):
        """Logical type of column (:class:`ParquetLogicalType`)."""
        return wrap_logical_type(self.statistics.get().descr().logical_type())

    @property
    def converted_type(self):
        """Legacy converted type (str or None)."""
        raw_converted_type = self.statistics.get().descr().converted_type()
        return converted_type_name_from_enum(raw_converted_type)
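

# Illustrative usage (a sketch, not executed here; 'example.parquet' is a
# placeholder path). Statistics objects are obtained from column chunk
# metadata rather than constructed directly:
#
#   import pyarrow.parquet as pq
#   meta = pq.read_metadata('example.parquet')
#   stats = meta.row_group(0).column(0).statistics
#   if stats is not None and stats.has_min_max:
#       print(stats.min, stats.max)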


cdef class ParquetLogicalType(_Weakrefable):
    """Logical type of parquet type."""

    def __init__(self):
        raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly")

    cdef:
        shared_ptr[const CParquetLogicalType] type

    def __cinit__(self):
        pass

    cdef init(self, const shared_ptr[const CParquetLogicalType]& type):
        self.type = type

    def __repr__(self):
        return f"{object.__repr__(self)}\n {self}"

    def __str__(self):
        return frombytes(self.type.get().ToString(), safe=True)

    def to_json(self):
        """
        Get a JSON string containing type and type parameters.

        Returns
        -------
        json : str
            JSON representation of type, with at least a field called 'Type'
            which contains the type name. If the type is parameterized, such
            as a decimal with scale and precision, will contain those as fields
            as well.
        """
        return frombytes(self.type.get().ToJSON())

    @property
    def type(self):
        """Name of the logical type (str)."""
        return logical_type_name_from_enum(self.type.get().type())


cdef wrap_logical_type(const shared_ptr[const CParquetLogicalType]& type):
    cdef ParquetLogicalType out = ParquetLogicalType.__new__(ParquetLogicalType)
    out.init(type)
    return out


cdef _cast_statistic_raw_min(CStatistics* statistics):
    cdef ParquetType physical_type = statistics.physical_type()
    cdef uint32_t type_length = statistics.descr().type_length()
    if physical_type == ParquetType_BOOLEAN:
        return (<CBoolStatistics*> statistics).min()
    elif physical_type == ParquetType_INT32:
        return (<CInt32Statistics*> statistics).min()
    elif physical_type == ParquetType_INT64:
        return (<CInt64Statistics*> statistics).min()
    elif physical_type == ParquetType_FLOAT:
        return (<CFloatStatistics*> statistics).min()
    elif physical_type == ParquetType_DOUBLE:
        return (<CDoubleStatistics*> statistics).min()
    elif physical_type == ParquetType_BYTE_ARRAY:
        return _box_byte_array((<CByteArrayStatistics*> statistics).min())
    elif physical_type == ParquetType_FIXED_LEN_BYTE_ARRAY:
        return _box_flba((<CFLBAStatistics*> statistics).min(), type_length)


cdef _cast_statistic_raw_max(CStatistics* statistics):
    cdef ParquetType physical_type = statistics.physical_type()
    cdef uint32_t type_length = statistics.descr().type_length()
    if physical_type == ParquetType_BOOLEAN:
        return (<CBoolStatistics*> statistics).max()
    elif physical_type == ParquetType_INT32:
        return (<CInt32Statistics*> statistics).max()
    elif physical_type == ParquetType_INT64:
        return (<CInt64Statistics*> statistics).max()
    elif physical_type == ParquetType_FLOAT:
        return (<CFloatStatistics*> statistics).max()
    elif physical_type == ParquetType_DOUBLE:
        return (<CDoubleStatistics*> statistics).max()
    elif physical_type == ParquetType_BYTE_ARRAY:
        return _box_byte_array((<CByteArrayStatistics*> statistics).max())
    elif physical_type == ParquetType_FIXED_LEN_BYTE_ARRAY:
        return _box_flba((<CFLBAStatistics*> statistics).max(), type_length)


cdef _cast_statistics(CStatistics* statistics):
    cdef:
        shared_ptr[CScalar] c_min
        shared_ptr[CScalar] c_max
    check_status(StatisticsAsScalars(statistics[0], &c_min, &c_max))
    return (pyarrow_wrap_scalar(c_min), pyarrow_wrap_scalar(c_max))


cdef _box_byte_array(ParquetByteArray val):
    return cp.PyBytes_FromStringAndSize(<char*> val.ptr, <Py_ssize_t> val.len)


cdef _box_flba(ParquetFLBA val, uint32_t len):
    return cp.PyBytes_FromStringAndSize(<char*> val.ptr, <Py_ssize_t> len)
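

# Sketch of the difference between the raw and logical statistics accessors,
# for a DATE column stored as physical INT32 (values are illustrative):
#
#   stats.min_raw  ->  19000                      # physical days-since-epoch
#   stats.min      ->  datetime.date(2022, 1, 8)  # logical value via Arrow scalar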


cdef class GeoStatistics(_Weakrefable):
    """Statistics for columns with geospatial data types (experimental)

    These statistics provide a bounding box and list of geometry types for
    geospatial columns (GEOMETRY or GEOGRAPHY). All components may be None
    if a file does not provide information about a particular component.
    """

    def __init__(self):
        raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly")

    def __cinit__(self):
        pass

    def __repr__(self):
        return f"""{object.__repr__(self)}
  geospatial_types: {self.geospatial_types}
  xmin: {self.xmin}, xmax: {self.xmax}
  ymin: {self.ymin}, ymax: {self.ymax}
  zmin: {self.zmin}, zmax: {self.zmax}
  mmin: {self.mmin}, mmax: {self.mmax}"""

    def to_dict(self):
        """Dictionary summary of these statistics"""
        out = {
            "geospatial_types": self.geospatial_types,
            "xmin": self.xmin,
            "xmax": self.xmax,
            "ymin": self.ymin,
            "ymax": self.ymax,
            "zmin": self.zmin,
            "zmax": self.zmax,
            "mmin": self.mmin,
            "mmax": self.mmax
        }

        return out

    @property
    def geospatial_types(self):
        """Unique geometry types

        Contains an integer code for each geometry type code encountered in the
        geometries represented by these statistics. The geometry type codes are
        ISO WKB geometry type codes returned in sorted order without duplicates.

        This property may be None if geospatial types are not available.
        """
        cdef optional[vector[int32_t]] maybe_geometry_types = \
            self.statistics.get().geometry_types()
        if not maybe_geometry_types.has_value():
            return None

        return list(maybe_geometry_types.value())

    @property
    def xmin(self):
        """Minimum X value or None if not available

        Note that Parquet permits "wraparound" bounds in the X direction only
        to more compactly represent bounds for geometries with components on
        both sides of the antimeridian. This case is denoted by xmin > xmax.
        """
        return self.statistics.get().lower_bound()[0] if self._x_valid() else None

    @property
    def xmax(self):
        """Maximum X value or None if not available

        Note that Parquet permits "wraparound" bounds in the X direction only
        (see xmin).
        """
        return self.statistics.get().upper_bound()[0] if self._x_valid() else None

    @property
    def ymin(self):
        """Minimum Y value or None if not available"""
        return self.statistics.get().lower_bound()[1] if self._y_valid() else None

    @property
    def ymax(self):
        """Maximum Y value or None if not available"""
        return self.statistics.get().upper_bound()[1] if self._y_valid() else None

    @property
    def zmin(self):
        """Minimum Z value or None if not available"""
        return self.statistics.get().lower_bound()[2] if self._z_valid() else None

    @property
    def zmax(self):
        """Maximum Z value or None if not available"""
        return self.statistics.get().upper_bound()[2] if self._z_valid() else None

    @property
    def mmin(self):
        """Minimum M value or None if not available"""
        return self.statistics.get().lower_bound()[3] if self._m_valid() else None

    @property
    def mmax(self):
        """Maximum M value or None if not available"""
        return self.statistics.get().upper_bound()[3] if self._m_valid() else None

    # Helpers to calculate the availability of a given dimension. For statistics
    # read from a file, dimension_empty should always be false because there is
    # no way to represent an empty range in Thrift; however, we check to be safe.
    def _x_valid(self):
        return self.statistics.get().dimension_valid()[0] \
            and not self.statistics.get().dimension_empty()[0]

    def _y_valid(self):
        return self.statistics.get().dimension_valid()[1] \
            and not self.statistics.get().dimension_empty()[1]

    def _z_valid(self):
        return self.statistics.get().dimension_valid()[2] \
            and not self.statistics.get().dimension_empty()[2]

    def _m_valid(self):
        return self.statistics.get().dimension_valid()[3] \
            and not self.statistics.get().dimension_empty()[3]
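

# Example of the wraparound convention described in GeoStatistics.xmin: a
# geometry crossing the antimeridian may be stored with xmin=170.0 and
# xmax=-170.0 (xmin > xmax), meaning the X range runs eastward from 170
# through +/-180 to -170.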


cdef class ColumnChunkMetaData(_Weakrefable):
    """Column metadata for a single row group."""

    def __init__(self):
        raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly")

    def __cinit__(self):
        pass

    def __repr__(self):
        statistics = indent(repr(self.statistics), 4 * ' ')
        geo_statistics = indent(repr(self.geo_statistics), 4 * ' ')
        return f"""{object.__repr__(self)}
  file_offset: {self.file_offset}
  file_path: {self.file_path}
  physical_type: {self.physical_type}
  num_values: {self.num_values}
  path_in_schema: {self.path_in_schema}
  is_stats_set: {self.is_stats_set}
  statistics:
{statistics}
  geo_statistics:
{geo_statistics}
  compression: {self.compression}
  encodings: {self.encodings}
  has_dictionary_page: {self.has_dictionary_page}
  dictionary_page_offset: {self.dictionary_page_offset}
  data_page_offset: {self.data_page_offset}
  total_compressed_size: {self.total_compressed_size}
  total_uncompressed_size: {self.total_uncompressed_size}"""

    def to_dict(self):
        """
        Get dictionary representation of the column chunk metadata.

        Returns
        -------
        dict
            Dictionary with a key for each attribute of this class.
        """
        statistics = self.statistics.to_dict() if self.is_stats_set else None
        if self.is_geo_stats_set:
            geo_statistics = self.geo_statistics.to_dict()
        else:
            geo_statistics = None

        d = dict(
            file_offset=self.file_offset,
            file_path=self.file_path,
            physical_type=self.physical_type,
            num_values=self.num_values,
            path_in_schema=self.path_in_schema,
            is_stats_set=self.is_stats_set,
            statistics=statistics,
            geo_statistics=geo_statistics,
            compression=self.compression,
            encodings=self.encodings,
            has_dictionary_page=self.has_dictionary_page,
            dictionary_page_offset=self.dictionary_page_offset,
            data_page_offset=self.data_page_offset,
            total_compressed_size=self.total_compressed_size,
            total_uncompressed_size=self.total_uncompressed_size
        )
        return d

    def __eq__(self, other):
        try:
            return self.equals(other)
        except TypeError:
            return NotImplemented

    def equals(self, ColumnChunkMetaData other):
        """
        Return whether the two column chunk metadata objects are equal.

        Parameters
        ----------
        other : ColumnChunkMetaData
            Metadata to compare against.

        Returns
        -------
        are_equal : bool
        """
        return self.metadata.Equals(deref(other.metadata))

    @property
    def file_offset(self):
        """Offset into file where column chunk is located (int)."""
        return self.metadata.file_offset()

    @property
    def file_path(self):
        """Optional file path if set (str or None)."""
        return frombytes(self.metadata.file_path())

    @property
    def physical_type(self):
        """Physical type of column (str)."""
        return physical_type_name_from_enum(self.metadata.type())

    @property
    def num_values(self):
        """Total number of values (int)."""
        return self.metadata.num_values()

    @property
    def path_in_schema(self):
        """Nested path to field, separated by periods (str)."""
        path = self.metadata.path_in_schema().get().ToDotString()
        return frombytes(path)

    @property
    def is_stats_set(self):
        """Whether or not statistics are present in metadata (bool)."""
        return self.metadata.is_stats_set()

    @property
    def statistics(self):
        """Statistics for column chunk (:class:`Statistics`)."""
        if not self.metadata.is_stats_set():
            return None
        cdef Statistics statistics = Statistics.__new__(Statistics)
        statistics.init(self.metadata.statistics(), self)
        return statistics

    @property
    def is_geo_stats_set(self):
        """Whether or not geometry statistics are present in metadata (bool)."""
        return self.metadata.is_geo_stats_set()

    @property
    def geo_statistics(self):
        """Geospatial statistics for column chunk (:class:`GeoStatistics`)."""
        c_geo_statistics = self.metadata.geo_statistics()
        if not c_geo_statistics or not c_geo_statistics.get().is_valid():
            return None
        cdef GeoStatistics geo_statistics = GeoStatistics.__new__(GeoStatistics)
        geo_statistics.init(c_geo_statistics, self)
        return geo_statistics

    @property
    def compression(self):
        """
        Type of compression used for column (str).

        One of 'UNCOMPRESSED', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', 'ZSTD',
        or 'UNKNOWN'.
        """
        return compression_name_from_enum(self.metadata.compression())

    @property
    def encodings(self):
        """
        Encodings used for column (tuple of str).

        One of 'PLAIN', 'BIT_PACKED', 'RLE', 'BYTE_STREAM_SPLIT', 'DELTA_BINARY_PACKED',
        'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'.
        """
        return tuple(map(encoding_name_from_enum, self.metadata.encodings()))

    @property
    def has_dictionary_page(self):
        """Whether there is dictionary data present in the column chunk (bool)."""
        return bool(self.metadata.has_dictionary_page())

    @property
    def dictionary_page_offset(self):
        """Offset of dictionary page relative to beginning of the file (int)."""
        if self.has_dictionary_page:
            return self.metadata.dictionary_page_offset()
        else:
            return None

    @property
    def data_page_offset(self):
        """Offset of data page relative to beginning of the file (int)."""
        return self.metadata.data_page_offset()

    @property
    def has_index_page(self):
        """Not yet supported."""
        raise NotImplementedError('not supported in parquet-cpp')

    @property
    def index_page_offset(self):
        """Not yet supported."""
        raise NotImplementedError("parquet-cpp doesn't return valid values")

    @property
    def total_compressed_size(self):
        """Compressed size in bytes (int)."""
        return self.metadata.total_compressed_size()

    @property
    def total_uncompressed_size(self):
        """Uncompressed size in bytes (int)."""
        return self.metadata.total_uncompressed_size()

    @property
    def has_offset_index(self):
        """Whether the column chunk has an offset index"""
        return self.metadata.GetOffsetIndexLocation().has_value()

    @property
    def has_column_index(self):
        """Whether the column chunk has a column index"""
        return self.metadata.GetColumnIndexLocation().has_value()

    @property
    def metadata(self):
        """Additional metadata as key value pairs (dict[bytes, bytes])."""
        cdef:
            unordered_map[c_string, c_string] metadata
            const CKeyValueMetadata* underlying_metadata
        underlying_metadata = self.metadata.key_value_metadata().get()
        if underlying_metadata != NULL:
            underlying_metadata.ToUnorderedMap(&metadata)
            return metadata
        else:
            return None
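

# Illustrative usage (a sketch; 'example.parquet' is a placeholder path):
# per-column compression ratios can be computed from the sizes exposed above.
#
#   import pyarrow.parquet as pq
#   col = pq.read_metadata('example.parquet').row_group(0).column(0)
#   ratio = col.total_compressed_size / col.total_uncompressed_size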


cdef class SortingColumn:
    """
    Sorting specification for a single column.

    Returned by :meth:`RowGroupMetaData.sorting_columns` and used in
    :class:`ParquetWriter` to specify the sort order of the data.

    Parameters
    ----------
    column_index : int
        Index of column that data is sorted by.
    descending : bool, default False
        Whether column is sorted in descending order.
    nulls_first : bool, default False
        Whether null values appear before valid values.

    Notes
    -----

    Column indices are zero-based, refer only to leaf fields, and are in
    depth-first order. This may make the column indices for nested schemas
    different from what you expect. In most cases, it will be easier to
    specify the sort order using column names instead of column indices
    and converting using the ``from_ordering`` method.

    Examples
    --------

    In other APIs, sort order is specified by names, such as:

    >>> sort_order = [('id', 'ascending'), ('timestamp', 'descending')]

    For Parquet, the column index must be used instead:

    >>> import pyarrow.parquet as pq
    >>> [pq.SortingColumn(0), pq.SortingColumn(1, descending=True)]
    [SortingColumn(column_index=0, descending=False, nulls_first=False), SortingColumn(column_index=1, descending=True, nulls_first=False)]

    Convert the sort_order into the list of sorting columns with
    ``from_ordering`` (note that the schema must be provided as well):

    >>> import pyarrow as pa
    >>> schema = pa.schema([('id', pa.int64()), ('timestamp', pa.timestamp('ms'))])
    >>> sorting_columns = pq.SortingColumn.from_ordering(schema, sort_order)
    >>> sorting_columns
    (SortingColumn(column_index=0, descending=False, nulls_first=False), SortingColumn(column_index=1, descending=True, nulls_first=False))

    Convert back to the sort order with ``to_ordering``:

    >>> pq.SortingColumn.to_ordering(schema, sorting_columns)
    ((('id', 'ascending'), ('timestamp', 'descending')), 'at_end')

    See Also
    --------
    RowGroupMetaData.sorting_columns
    """
    cdef int column_index
    cdef c_bool descending
    cdef c_bool nulls_first

    def __init__(self, int column_index, c_bool descending=False, c_bool nulls_first=False):
        self.column_index = column_index
        self.descending = descending
        self.nulls_first = nulls_first

    @classmethod
    def from_ordering(cls, Schema schema, sort_keys, null_placement='at_end'):
        """
        Create a tuple of SortingColumn objects from the same arguments as
        :class:`pyarrow.compute.SortOptions`.

        Parameters
        ----------
        schema : Schema
            Schema of the input data.
        sort_keys : Sequence of (name, order) tuples
            Names of field/column keys (str) to sort the input on,
            along with the order each field/column is sorted in.
            Accepted values for `order` are "ascending", "descending".
        null_placement : {'at_start', 'at_end'}, default 'at_end'
            Where null values should appear in the sort order.

        Returns
        -------
        sorting_columns : tuple of SortingColumn
        """
        if null_placement == 'at_start':
            nulls_first = True
        elif null_placement == 'at_end':
            nulls_first = False
        else:
            raise ValueError('null_placement must be "at_start" or "at_end"')

        col_map = _name_to_index_map(schema)

        sorting_columns = []

        for sort_key in sort_keys:
            if isinstance(sort_key, str):
                name = sort_key
                descending = False
            elif (isinstance(sort_key, tuple) and len(sort_key) == 2 and
                  isinstance(sort_key[0], str) and
                  isinstance(sort_key[1], str)):
                name, descending = sort_key
                if descending == "descending":
                    descending = True
                elif descending == "ascending":
                    descending = False
                else:
                    raise ValueError(f"Invalid sort key direction: {descending}")
            else:
                raise ValueError(f"Invalid sort key: {sort_key}")

            try:
                column_index = col_map[name]
            except KeyError:
                raise ValueError(
                    f"Sort key name '{name}' not found in schema:\n{schema}")

            sorting_columns.append(
                cls(column_index, descending=descending, nulls_first=nulls_first)
            )

        return tuple(sorting_columns)

    @staticmethod
    def to_ordering(Schema schema, sorting_columns):
        """
        Convert a tuple of SortingColumn objects to the same format as
        :class:`pyarrow.compute.SortOptions`.

        Parameters
        ----------
        schema : Schema
            Schema of the input data.
        sorting_columns : tuple of SortingColumn
            Columns to sort the input on.

        Returns
        -------
        sort_keys : tuple of (name, order) tuples
        null_placement : {'at_start', 'at_end'}
        """
        col_map = {i: name for name, i in _name_to_index_map(schema).items()}

        sort_keys = []
        nulls_first = None

        for sorting_column in sorting_columns:
            name = col_map[sorting_column.column_index]
            if sorting_column.descending:
                order = "descending"
            else:
                order = "ascending"
            sort_keys.append((name, order))
            if nulls_first is None:
                nulls_first = sorting_column.nulls_first
            elif nulls_first != sorting_column.nulls_first:
                raise ValueError("Sorting columns have inconsistent null placement")

        if nulls_first:
            null_placement = "at_start"
        else:
            null_placement = "at_end"

        return tuple(sort_keys), null_placement

    def __repr__(self):
        return f"{self.__class__.__name__}(column_index={self.column_index}, " \
            f"descending={self.descending}, nulls_first={self.nulls_first})"

    def __eq__(self, SortingColumn other):
        return (self.column_index == other.column_index and
                self.descending == other.descending and
                self.nulls_first == other.nulls_first)

    def __hash__(self):
        return hash((self.column_index, self.descending, self.nulls_first))

    @property
    def column_index(self):
        """Index of column that data is sorted by (int)."""
        return self.column_index

    @property
    def descending(self):
        """Whether column is sorted in descending order (bool)."""
        return self.descending

    @property
    def nulls_first(self):
        """Whether null values appear before valid values (bool)."""
        return self.nulls_first

    def to_dict(self):
        """
        Get dictionary representation of the SortingColumn.

        Returns
        -------
        dict
            Dictionary with a key for each attribute of this class.
        """
        d = dict(
            column_index=self.column_index,
            descending=self.descending,
            nulls_first=self.nulls_first
        )
        return d
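

# Illustrative writer usage (a sketch; assumes a pyarrow Table `table` sorted
# accordingly, plus the `schema` and `sort_order` from the docstring above):
#
#   pq.write_table(table, 'example.parquet',
#                  sorting_columns=pq.SortingColumn.from_ordering(schema, sort_order))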


cdef class RowGroupMetaData(_Weakrefable):
    """Metadata for a single row group."""

    def __init__(self):
        raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly")

    def __cinit__(self):
        pass

    def __reduce__(self):
        return RowGroupMetaData, (self.parent, self.index)

    def __eq__(self, other):
        try:
            return self.equals(other)
        except TypeError:
            return NotImplemented

    def equals(self, RowGroupMetaData other):
        """
        Return whether the two row group metadata objects are equal.

        Parameters
        ----------
        other : RowGroupMetaData
            Metadata to compare against.

        Returns
        -------
        are_equal : bool
        """
        return self.metadata.Equals(deref(other.metadata))

    def column(self, int i):
        """
        Get column metadata at given index.

        Parameters
        ----------
        i : int
            Index of column to get metadata for.

        Returns
        -------
        ColumnChunkMetaData
            Metadata for column within this chunk.
        """
        if i < 0 or i >= self.num_columns:
            raise IndexError(f'{i} out of bounds')
        cdef ColumnChunkMetaData chunk = ColumnChunkMetaData.__new__(ColumnChunkMetaData)
        chunk.init(self, i)
        return chunk

    def __repr__(self):
        return f"""{object.__repr__(self)}
  num_columns: {self.num_columns}
  num_rows: {self.num_rows}
  total_byte_size: {self.total_byte_size}
  sorting_columns: {self.sorting_columns}"""

    def to_dict(self):
        """
        Get dictionary representation of the row group metadata.

        Returns
        -------
        dict
            Dictionary with a key for each attribute of this class.
        """
        columns = []
        d = dict(
            num_columns=self.num_columns,
            num_rows=self.num_rows,
            total_byte_size=self.total_byte_size,
            columns=columns,
            sorting_columns=[col.to_dict() for col in self.sorting_columns]
        )
        for i in range(self.num_columns):
            columns.append(self.column(i).to_dict())
        return d

    @property
    def num_columns(self):
        """Number of columns in this row group (int)."""
        return self.metadata.num_columns()

    @property
    def num_rows(self):
        """Number of rows in this row group (int)."""
        return self.metadata.num_rows()

    @property
    def total_byte_size(self):
        """Total byte size of all the uncompressed column data in this row group (int)."""
        return self.metadata.total_byte_size()

    @property
    def sorting_columns(self):
        """Columns the row group is sorted by (tuple of :class:`SortingColumn`)."""
        out = []
        cdef vector[CSortingColumn] sorting_columns = self.metadata.sorting_columns()
        for sorting_col in sorting_columns:
            out.append(SortingColumn(
                sorting_col.column_idx,
                sorting_col.descending,
                sorting_col.nulls_first
            ))
        return tuple(out)
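

# Illustrative usage (a sketch; 'example.parquet' is a placeholder path):
#
#   import pyarrow.parquet as pq
#   meta = pq.read_metadata('example.parquet')
#   for i in range(meta.num_row_groups):
#       rg = meta.row_group(i)
#       print(rg.num_rows, rg.total_byte_size, rg.sorting_columns)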


def _reconstruct_filemetadata(Buffer serialized):
    cdef:
        FileMetaData metadata = FileMetaData.__new__(FileMetaData)
        CBuffer *buffer = serialized.buffer.get()
        uint32_t metadata_len = <uint32_t>buffer.size()

    metadata.init(CFileMetaData_Make(buffer.data(), &metadata_len))

    return metadata


cdef class FileMetaData(_Weakrefable):
    """Parquet metadata for a single file."""

    def __init__(self):
        raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly")

    def __cinit__(self):
        pass

    def __reduce__(self):
        cdef:
            NativeFile sink = BufferOutputStream()
            COutputStream* c_sink = sink.get_output_stream().get()
        with nogil:
            self._metadata.WriteTo(c_sink)

        cdef Buffer buffer = sink.getvalue()
        return _reconstruct_filemetadata, (buffer,)

    def __hash__(self):
        return hash((self.schema,
                     self.num_rows,
                     self.num_row_groups,
                     self.format_version,
                     self.serialized_size))

    def __repr__(self):
        return f"""{object.__repr__(self)}
  created_by: {self.created_by}
  num_columns: {self.num_columns}
  num_rows: {self.num_rows}
  num_row_groups: {self.num_row_groups}
  format_version: {self.format_version}
  serialized_size: {self.serialized_size}"""

    def to_dict(self):
        """
        Get dictionary representation of the file metadata.

        Returns
        -------
        dict
            Dictionary with a key for each attribute of this class.
        """
        row_groups = []
        d = dict(
            created_by=self.created_by,
            num_columns=self.num_columns,
            num_rows=self.num_rows,
            num_row_groups=self.num_row_groups,
            row_groups=row_groups,
            format_version=self.format_version,
            serialized_size=self.serialized_size
        )
        for i in range(self.num_row_groups):
            row_groups.append(self.row_group(i).to_dict())
        return d

    def __eq__(self, other):
        try:
            return self.equals(other)
        except TypeError:
            return NotImplemented

    def equals(self, FileMetaData other not None):
        """
        Return whether the two file metadata objects are equal.

        Parameters
        ----------
        other : FileMetaData
            Metadata to compare against.

        Returns
        -------
        are_equal : bool
        """
        return self._metadata.Equals(deref(other._metadata))

    @property
    def schema(self):
        """Schema of the file (:class:`ParquetSchema`)."""
        if self._schema is None:
            self._schema = ParquetSchema(self)
        return self._schema

    @property
    def serialized_size(self):
        """Size of the original thrift encoded metadata footer (int)."""
        return self._metadata.size()

    @property
    def num_columns(self):
        """Number of columns in file (int)."""
        return self._metadata.num_columns()

    @property
    def num_rows(self):
        """Total number of rows in file (int)."""
        return self._metadata.num_rows()

    @property
    def num_row_groups(self):
        """Number of row groups in file (int)."""
        return self._metadata.num_row_groups()

    @property
    def format_version(self):
        """
        Parquet format version used in file (str, such as '1.0', '2.4').

        If version is missing or unparsable, will default to assuming '2.6'.
        """
        cdef ParquetVersion version = self._metadata.version()
        if version == ParquetVersion_V1:
            return '1.0'
        elif version == ParquetVersion_V2_4:
            return '2.4'
        elif version == ParquetVersion_V2_6:
            return '2.6'
        else:
            warnings.warn(f'Unrecognized file version, assuming 2.6: {version}')
            return '2.6'

    @property
    def created_by(self):
        """
        String describing source of the parquet file (str).

        This typically includes library name and version number. For example, Arrow 7.0's
        writer returns 'parquet-cpp-arrow version 7.0.0'.
        """
        return frombytes(self._metadata.created_by())

    @property
    def metadata(self):
        """Additional metadata as key value pairs (dict[bytes, bytes])."""
        cdef:
            unordered_map[c_string, c_string] metadata
            const CKeyValueMetadata* underlying_metadata
        underlying_metadata = self._metadata.key_value_metadata().get()
        if underlying_metadata != NULL:
            underlying_metadata.ToUnorderedMap(&metadata)
            return metadata
        else:
            return None

    def row_group(self, int i):
        """
        Get metadata for row group at index i.

        Parameters
        ----------
        i : int
            Row group index to get.

        Returns
        -------
        row_group_metadata : RowGroupMetaData
        """
        cdef RowGroupMetaData row_group = RowGroupMetaData.__new__(RowGroupMetaData)
        row_group.init(self, i)
        return row_group

    def set_file_path(self, path):
        """
        Set ColumnChunk file paths to the given value.

        This method modifies the ``file_path`` field of each ColumnChunk
        in the FileMetaData to be a particular value.

        Parameters
        ----------
        path : str
            The file path to set on all ColumnChunks.
        """
        cdef:
            c_string c_path = tobytes(path)
        self._metadata.set_file_path(c_path)

    def append_row_groups(self, FileMetaData other):
        """
        Append row groups from other FileMetaData object.

        Parameters
        ----------
        other : FileMetaData
            Other metadata to append row groups from.
        """
        cdef shared_ptr[CFileMetaData] c_metadata

        c_metadata = other.sp_metadata
        self._metadata.AppendRowGroups(deref(c_metadata))

    def write_metadata_file(self, where):
        """
        Write the metadata to a metadata-only Parquet file.

        Parameters
        ----------
        where : path or file-like object
            Where to write the metadata. Should be a writable path on
            the local filesystem, or a writable file-like object.
        """
        cdef:
            shared_ptr[COutputStream] sink
            c_string c_where

        try:
            where = _stringify_path(where)
        except TypeError:
            get_writer(where, &sink)
        else:
            c_where = tobytes(where)
            with nogil:
                sink = GetResultValue(FileOutputStream.Open(c_where))

        with nogil:
            check_status(
                WriteMetaDataFile(deref(self._metadata), sink.get()))
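

# Sketch of the common "_metadata" sidecar pattern built from the methods
# above (file names are placeholders):
#
#   import pyarrow.parquet as pq
#   merged = pq.read_metadata('part-0.parquet')
#   merged.set_file_path('part-0.parquet')
#   other = pq.read_metadata('part-1.parquet')
#   other.set_file_path('part-1.parquet')
#   merged.append_row_groups(other)
#   merged.write_metadata_file('_metadata')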


cdef class ParquetSchema(_Weakrefable):
    """A Parquet schema."""

    def __cinit__(self, FileMetaData container):
        self.parent = container
        self.schema = container._metadata.schema()

    def __repr__(self):
        return f"{object.__repr__(self)}\n{frombytes(self.schema.ToString(), safe=True)}"

    def __reduce__(self):
        return ParquetSchema, (self.parent,)

    def __len__(self):
        return self.schema.num_columns()

    def __getitem__(self, i):
        return self.column(i)

    def __hash__(self):
        return hash(self.schema.ToString())

    @property
    def names(self):
        """Name of each field (list of str)."""
        return [self[i].name for i in range(len(self))]

    def to_arrow_schema(self):
        """
        Convert Parquet schema to effective Arrow schema.

        Returns
        -------
        schema : Schema
        """
        cdef shared_ptr[CSchema] sp_arrow_schema

        with nogil:
            check_status(FromParquetSchema(
                self.schema, default_arrow_reader_properties(),
                self.parent._metadata.key_value_metadata(),
                &sp_arrow_schema))

        return pyarrow_wrap_schema(sp_arrow_schema)

    def __eq__(self, other):
        try:
            return self.equals(other)
        except TypeError:
            return NotImplemented

    def equals(self, ParquetSchema other):
        """
        Return whether the two schemas are equal.

        Parameters
        ----------
        other : ParquetSchema
            Schema to compare against.

        Returns
        -------
        are_equal : bool
        """
        return self.schema.Equals(deref(other.schema))

    def column(self, i):
        """
        Return the schema for a single column.

        Parameters
        ----------
        i : int
            Index of column in schema.

        Returns
        -------
        column_schema : ColumnSchema
        """
        if i < 0 or i >= len(self):
            raise IndexError(f'{i} out of bounds')

        return ColumnSchema(self, i)
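

# Illustrative usage (a sketch; 'example.parquet' is a placeholder path):
#
#   import pyarrow.parquet as pq
#   pq_schema = pq.read_metadata('example.parquet').schema
#   arrow_schema = pq_schema.to_arrow_schema()  # effective Arrow schema
#   first_leaf = pq_schema.column(0)            # ColumnSchema of first leaf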


cdef class ColumnSchema(_Weakrefable):
    """Schema for a single column."""
    cdef:
        int index
        ParquetSchema parent
        const ColumnDescriptor* descr

    def __cinit__(self, ParquetSchema schema, int index):
        self.parent = schema
        self.index = index  # for pickling support
        self.descr = schema.schema.Column(index)

    def __eq__(self, other):
        try:
            return self.equals(other)
        except TypeError:
            return NotImplemented

    def __reduce__(self):
        return ColumnSchema, (self.parent, self.index)

    def equals(self, ColumnSchema other):
        """
        Return whether the two column schemas are equal.

        Parameters
        ----------
        other : ColumnSchema
            Schema to compare against.

        Returns
        -------
        are_equal : bool
        """
        return self.descr.Equals(deref(other.descr))

    def __repr__(self):
        physical_type = self.physical_type
        converted_type = self.converted_type
        if converted_type == 'DECIMAL':
            converted_type = f'DECIMAL({self.precision}, {self.scale})'
        elif physical_type == 'FIXED_LEN_BYTE_ARRAY':
            converted_type = f'FIXED_LEN_BYTE_ARRAY(length={self.length})'

        return f"""<ParquetColumnSchema>
  name: {self.name}
  path: {self.path}
  max_definition_level: {self.max_definition_level}
  max_repetition_level: {self.max_repetition_level}
  physical_type: {physical_type}
  logical_type: {self.logical_type}
  converted_type (legacy): {converted_type}"""

    @property
    def name(self):
        """Name of field (str)."""
        return frombytes(self.descr.name())

    @property
    def path(self):
        """Nested path to field, separated by periods (str)."""
        return frombytes(self.descr.path().get().ToDotString())

    @property
    def max_definition_level(self):
        """Maximum definition level (int)."""
        return self.descr.max_definition_level()

    @property
    def max_repetition_level(self):
        """Maximum repetition level (int)."""
        return self.descr.max_repetition_level()

    @property
    def physical_type(self):
        """Name of physical type (str)."""
        return physical_type_name_from_enum(self.descr.physical_type())

    @property
    def logical_type(self):
        """Logical type of column (:class:`ParquetLogicalType`)."""
        return wrap_logical_type(self.descr.logical_type())

    @property
    def converted_type(self):
        """Legacy converted type (str or None)."""
        return converted_type_name_from_enum(self.descr.converted_type())

    # FIXED_LEN_BYTE_ARRAY attribute
    @property
    def length(self):
        """Array length if fixed length byte array type, None otherwise (int or None)."""
        return self.descr.type_length()

    # Decimal attributes
    @property
    def precision(self):
        """Precision if decimal type, None otherwise (int or None)."""
        return self.descr.type_precision()

    @property
    def scale(self):
        """Scale if decimal type, None otherwise (int or None)."""
        return self.descr.type_scale()


cdef physical_type_name_from_enum(ParquetType type_):
    return {
        ParquetType_BOOLEAN: 'BOOLEAN',
        ParquetType_INT32: 'INT32',
        ParquetType_INT64: 'INT64',
        ParquetType_INT96: 'INT96',
        ParquetType_FLOAT: 'FLOAT',
        ParquetType_DOUBLE: 'DOUBLE',
        ParquetType_BYTE_ARRAY: 'BYTE_ARRAY',
        ParquetType_FIXED_LEN_BYTE_ARRAY: 'FIXED_LEN_BYTE_ARRAY',
    }.get(type_, 'UNKNOWN')


cdef logical_type_name_from_enum(ParquetLogicalTypeId type_):
    return {
        ParquetLogicalType_UNDEFINED: 'UNDEFINED',
        ParquetLogicalType_STRING: 'STRING',
        ParquetLogicalType_MAP: 'MAP',
        ParquetLogicalType_LIST: 'LIST',
        ParquetLogicalType_ENUM: 'ENUM',
        ParquetLogicalType_DECIMAL: 'DECIMAL',
        ParquetLogicalType_DATE: 'DATE',
        ParquetLogicalType_TIME: 'TIME',
        ParquetLogicalType_TIMESTAMP: 'TIMESTAMP',
        ParquetLogicalType_INT: 'INT',
        ParquetLogicalType_FLOAT16: 'FLOAT16',
        ParquetLogicalType_JSON: 'JSON',
        ParquetLogicalType_BSON: 'BSON',
        ParquetLogicalType_UUID: 'UUID',
        ParquetLogicalType_NONE: 'NONE',
    }.get(type_, 'UNKNOWN')


cdef converted_type_name_from_enum(ParquetConvertedType type_):
    return {
        ParquetConvertedType_NONE: 'NONE',
        ParquetConvertedType_UTF8: 'UTF8',
        ParquetConvertedType_MAP: 'MAP',
        ParquetConvertedType_MAP_KEY_VALUE: 'MAP_KEY_VALUE',
        ParquetConvertedType_LIST: 'LIST',
        ParquetConvertedType_ENUM: 'ENUM',
        ParquetConvertedType_DECIMAL: 'DECIMAL',
        ParquetConvertedType_DATE: 'DATE',
        ParquetConvertedType_TIME_MILLIS: 'TIME_MILLIS',
        ParquetConvertedType_TIME_MICROS: 'TIME_MICROS',
        ParquetConvertedType_TIMESTAMP_MILLIS: 'TIMESTAMP_MILLIS',
        ParquetConvertedType_TIMESTAMP_MICROS: 'TIMESTAMP_MICROS',
        ParquetConvertedType_UINT_8: 'UINT_8',
        ParquetConvertedType_UINT_16: 'UINT_16',
        ParquetConvertedType_UINT_32: 'UINT_32',
        ParquetConvertedType_UINT_64: 'UINT_64',
        ParquetConvertedType_INT_8: 'INT_8',
        ParquetConvertedType_INT_16: 'INT_16',
        ParquetConvertedType_INT_32: 'INT_32',
        ParquetConvertedType_INT_64: 'INT_64',
        ParquetConvertedType_JSON: 'JSON',
        ParquetConvertedType_BSON: 'BSON',
        ParquetConvertedType_INTERVAL: 'INTERVAL',
    }.get(type_, 'UNKNOWN')


cdef encoding_name_from_enum(ParquetEncoding encoding_):
    return {
        ParquetEncoding_PLAIN: 'PLAIN',
        ParquetEncoding_PLAIN_DICTIONARY: 'PLAIN_DICTIONARY',
        ParquetEncoding_RLE: 'RLE',
        ParquetEncoding_BIT_PACKED: 'BIT_PACKED',
        ParquetEncoding_DELTA_BINARY_PACKED: 'DELTA_BINARY_PACKED',
        ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY: 'DELTA_LENGTH_BYTE_ARRAY',
        ParquetEncoding_DELTA_BYTE_ARRAY: 'DELTA_BYTE_ARRAY',
        ParquetEncoding_RLE_DICTIONARY: 'RLE_DICTIONARY',
        ParquetEncoding_BYTE_STREAM_SPLIT: 'BYTE_STREAM_SPLIT',
    }.get(encoding_, 'UNKNOWN')


cdef encoding_enum_from_name(str encoding_name):
    enc = {
        'PLAIN': ParquetEncoding_PLAIN,
        'BIT_PACKED': ParquetEncoding_BIT_PACKED,
        'RLE': ParquetEncoding_RLE,
        'BYTE_STREAM_SPLIT': ParquetEncoding_BYTE_STREAM_SPLIT,
        'DELTA_BINARY_PACKED': ParquetEncoding_DELTA_BINARY_PACKED,
        'DELTA_LENGTH_BYTE_ARRAY': ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY,
        'DELTA_BYTE_ARRAY': ParquetEncoding_DELTA_BYTE_ARRAY,
        'RLE_DICTIONARY': 'dict',
        'PLAIN_DICTIONARY': 'dict',
    }.get(encoding_name, None)
    if enc is None:
        raise ValueError(f"Unsupported column encoding: {encoding_name!r}")
    elif enc == 'dict':
        raise ValueError(f"{encoding_name!r} is already used by default.")
    else:
        return enc
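

# Note: 'RLE_DICTIONARY' and 'PLAIN_DICTIONARY' above intentionally map to the
# 'dict' sentinel so that requesting them as an explicit column encoding raises
# a clear error; dictionary encoding is controlled by the separate
# `use_dictionary` writer option, which is already on by default.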


cdef compression_name_from_enum(ParquetCompression compression_):
    return {
        ParquetCompression_UNCOMPRESSED: 'UNCOMPRESSED',
        ParquetCompression_SNAPPY: 'SNAPPY',
        ParquetCompression_GZIP: 'GZIP',
        ParquetCompression_LZO: 'LZO',
        ParquetCompression_BROTLI: 'BROTLI',
        ParquetCompression_LZ4: 'LZ4',
        ParquetCompression_ZSTD: 'ZSTD',
    }.get(compression_, 'UNKNOWN')


cdef int check_compression_name(name) except -1:
    if name.upper() not in {'NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4',
                            'ZSTD'}:
        raise ArrowException("Unsupported compression: " + name)
    return 0


cdef ParquetCompression compression_from_name(name):
    name = name.upper()
    if name == 'SNAPPY':
        return ParquetCompression_SNAPPY
    elif name == 'GZIP':
        return ParquetCompression_GZIP
    elif name == 'LZO':
        return ParquetCompression_LZO
    elif name == 'BROTLI':
        return ParquetCompression_BROTLI
    elif name == 'LZ4':
        return ParquetCompression_LZ4
    elif name == 'ZSTD':
        return ParquetCompression_ZSTD
    else:
        return ParquetCompression_UNCOMPRESSED
|
|
|
|
|
|
cdef class ParquetReader(_Weakrefable):
|
|
cdef:
|
|
object source
|
|
CMemoryPool* pool
|
|
UniquePtrNoGIL[FileReader] reader
|
|
FileMetaData _metadata
|
|
shared_ptr[CRandomAccessFile] rd_handle
|
|
|
|
cdef public:
|
|
_column_idx_map
|
|
|
|
def __cinit__(self, MemoryPool memory_pool=None):
|
|
self.pool = maybe_unbox_memory_pool(memory_pool)
|
|
self._metadata = None
|
|
|
|
def open(self, object source not None, *, bint use_memory_map=False,
|
|
read_dictionary=None, binary_type=None, list_type=None,
|
|
FileMetaData metadata=None,
|
|
int buffer_size=0, bint pre_buffer=False,
|
|
coerce_int96_timestamp_unit=None,
|
|
FileDecryptionProperties decryption_properties=None,
|
|
thrift_string_size_limit=None,
|
|
thrift_container_size_limit=None,
|
|
page_checksum_verification=False,
|
|
arrow_extensions_enabled=False):
|
|
"""
|
|
Open a parquet file for reading.
|
|
|
|
Parameters
|
|
----------
|
|
source : str, pathlib.Path, pyarrow.NativeFile, or file-like object
|
|
use_memory_map : bool, default False
|
|
read_dictionary : iterable[int or str], optional
|
|
binary_type : pyarrow.DataType, optional
|
|
list_type : subclass of pyarrow.DataType, optional
|
|
metadata : FileMetaData, optional
|
|
buffer_size : int, default 0
|
|
pre_buffer : bool, default False
|
|
coerce_int96_timestamp_unit : str, optional
|
|
decryption_properties : FileDecryptionProperties, optional
|
|
thrift_string_size_limit : int, optional
|
|
thrift_container_size_limit : int, optional
|
|
page_checksum_verification : bool, default False
|
|
arrow_extensions_enabled : bool, default False
|
|
"""
|
|
cdef:
|
|
shared_ptr[CFileMetaData] c_metadata
|
|
CReaderProperties properties = default_reader_properties()
|
|
ArrowReaderProperties arrow_props = (
|
|
default_arrow_reader_properties())
|
|
FileReaderBuilder builder
|
|
|
|
if pre_buffer and not is_threading_enabled():
|
|
pre_buffer = False
|
|
|
|
if metadata is not None:
|
|
c_metadata = metadata.sp_metadata
|
|
|
|
if buffer_size > 0:
|
|
properties.enable_buffered_stream()
|
|
properties.set_buffer_size(buffer_size)
|
|
elif buffer_size == 0:
|
|
properties.disable_buffered_stream()
|
|
else:
|
|
raise ValueError('Buffer size must be larger than zero')
|
|
|
|
if thrift_string_size_limit is not None:
|
|
if thrift_string_size_limit <= 0:
|
|
raise ValueError("thrift_string_size_limit "
|
|
"must be larger than zero")
|
|
properties.set_thrift_string_size_limit(thrift_string_size_limit)
|
|
if thrift_container_size_limit is not None:
|
|
if thrift_container_size_limit <= 0:
|
|
raise ValueError("thrift_container_size_limit "
|
|
"must be larger than zero")
|
|
properties.set_thrift_container_size_limit(
|
|
thrift_container_size_limit)
|
|
|
|
if decryption_properties is not None:
|
|
properties.file_decryption_properties(
|
|
decryption_properties.unwrap())
|
|
|
|
arrow_props.set_pre_buffer(pre_buffer)
|
|
|
|
properties.set_page_checksum_verification(page_checksum_verification)
|
|
|
|
if binary_type is not None:
|
|
c_binary_type = pyarrow_unwrap_data_type(binary_type)
|
|
arrow_props.set_binary_type(c_binary_type.get().id())
|
|
|
|
if list_type is not None:
|
|
arrow_props.set_list_type(_unwrap_list_type(list_type))
|
|
|
|
if coerce_int96_timestamp_unit is None:
|
|
# use the default defined in default_arrow_reader_properties()
|
|
pass
|
|
else:
|
|
arrow_props.set_coerce_int96_timestamp_unit(
|
|
string_to_timeunit(coerce_int96_timestamp_unit))
|
|
|
|
arrow_props.set_arrow_extensions_enabled(arrow_extensions_enabled)
|
|
|
|
self.source = source
|
|
get_reader(source, use_memory_map, &self.rd_handle)
|
|
|
|
with nogil:
|
|
check_status(builder.Open(self.rd_handle, properties, c_metadata))
|
|
|
|
# Set up metadata
|
|
with nogil:
|
|
c_metadata = builder.raw_reader().metadata()
|
|
cdef FileMetaData result = FileMetaData.__new__(FileMetaData)
|
|
self._metadata = result
|
|
result.init(c_metadata)
|
|
|
|
if read_dictionary is not None:
|
|
self._set_read_dictionary(read_dictionary, &arrow_props)
|
|
|
|
with nogil:
|
|
check_status(builder.memory_pool(self.pool)
|
|
.properties(arrow_props)
|
|
.Build(&self.reader))
|
|
|
|
cdef _set_read_dictionary(self, read_dictionary,
|
|
ArrowReaderProperties* props):
|
|
for column in read_dictionary:
|
|
if not isinstance(column, int):
|
|
column = self.column_name_idx(column)
|
|
props.set_read_dictionary(column, True)
|
|
|
|
@property
|
|
def column_paths(self):
|
|
cdef:
|
|
FileMetaData container = self.metadata
|
|
const CFileMetaData* metadata = container._metadata
|
|
vector[c_string] path
|
|
int i = 0
|
|
|
|
paths = []
|
|
for i in range(0, metadata.num_columns()):
|
|
path = (metadata.schema().Column(i)
|
|
.path().get().ToDotVector())
|
|
paths.append([frombytes(x) for x in path])
|
|
|
|
return paths
|
|
|
|
@property
|
|
def metadata(self):
|
|
return self._metadata
|
|
|
|
@property
|
|
def schema_arrow(self):
|
|
cdef shared_ptr[CSchema] out
|
|
with nogil:
|
|
check_status(self.reader.get().GetSchema(&out))
|
|
return pyarrow_wrap_schema(out)
|
|
|
|
@property
|
|
def num_row_groups(self):
|
|
return self.reader.get().num_row_groups()
|
|
|
|
def set_use_threads(self, bint use_threads):
|
|
"""
|
|
Parameters
|
|
----------
|
|
use_threads : bool
|
|
"""
|
|
if is_threading_enabled():
|
|
self.reader.get().set_use_threads(use_threads)
|
|
else:
|
|
self.reader.get().set_use_threads(False)
|
|
|
|
def set_batch_size(self, int64_t batch_size):
|
|
"""
|
|
Parameters
|
|
----------
|
|
batch_size : int64
|
|
"""
|
|
self.reader.get().set_batch_size(batch_size)
|
|
|
|
def iter_batches(self, int64_t batch_size, row_groups, column_indices=None,
|
|
bint use_threads=True):
|
|
"""
|
|
Parameters
|
|
----------
|
|
batch_size : int64
|
|
row_groups : list[int]
|
|
column_indices : list[int], optional
|
|
use_threads : bool, default True
|
|
|
|
Yields
|
|
------
|
|
next : RecordBatch
|
|
"""
|
|
cdef:
|
|
vector[int] c_row_groups
|
|
vector[int] c_column_indices
|
|
shared_ptr[CRecordBatch] record_batch
|
|
UniquePtrNoGIL[CRecordBatchReader] recordbatchreader
|
|
|
|
self.set_batch_size(batch_size)
|
|
|
|
if use_threads:
|
|
self.set_use_threads(use_threads)
|
|
|
|
for row_group in row_groups:
|
|
c_row_groups.push_back(row_group)
|
|
|
|
if column_indices is not None:
|
|
for index in column_indices:
|
|
c_column_indices.push_back(index)
|
|
with nogil:
|
|
recordbatchreader = GetResultValue(
|
|
self.reader.get().GetRecordBatchReader(
|
|
c_row_groups, c_column_indices
|
|
)
|
|
)
|
|
else:
|
|
with nogil:
|
|
recordbatchreader = GetResultValue(
|
|
self.reader.get().GetRecordBatchReader(
|
|
c_row_groups
|
|
)
|
|
)
|
|
|
|
while True:
|
|
with nogil:
|
|
check_status(
|
|
recordbatchreader.get().ReadNext(&record_batch)
|
|
)
|
|
if record_batch.get() == NULL:
|
|
break
|
|
|
|
yield pyarrow_wrap_batch(record_batch)
|
|
|
|
def read_row_group(self, int i, column_indices=None,
|
|
bint use_threads=True):
|
|
"""
|
|
Parameters
|
|
----------
|
|
i : int
|
|
column_indices : list[int], optional
|
|
use_threads : bool, default True
|
|
|
|
Returns
|
|
-------
|
|
table : pyarrow.Table
|
|
"""
|
|
return self.read_row_groups([i], column_indices, use_threads)
|
|
|
|
def read_row_groups(self, row_groups not None, column_indices=None,
|
|
bint use_threads=True):
|
|
"""
|
|
Parameters
|
|
----------
|
|
row_groups : list[int]
|
|
column_indices : list[int], optional
|
|
use_threads : bool, default True
|
|
|
|
Returns
|
|
-------
|
|
table : pyarrow.Table
|
|
"""
|
|
cdef:
|
|
shared_ptr[CTable] ctable
|
|
vector[int] c_row_groups
|
|
vector[int] c_column_indices
|
|
|
|
self.set_use_threads(use_threads)
|
|
|
|
for row_group in row_groups:
|
|
c_row_groups.push_back(row_group)
|
|
|
|
if column_indices is not None:
|
|
for index in column_indices:
|
|
c_column_indices.push_back(index)
|
|
|
|
with nogil:
|
|
check_status(self.reader.get()
|
|
.ReadRowGroups(c_row_groups, c_column_indices,
|
|
&ctable))
|
|
else:
|
|
# Read all columns
|
|
with nogil:
|
|
check_status(self.reader.get()
|
|
.ReadRowGroups(c_row_groups, &ctable))
|
|
return pyarrow_wrap_table(ctable)
|
|
|
|
def read_all(self, column_indices=None, bint use_threads=True):
|
|
"""
|
|
Parameters
|
|
----------
|
|
column_indices : list[int], optional
|
|
use_threads : bool, default True
|
|
|
|
Returns
|
|
-------
|
|
table : pyarrow.Table
|
|
"""
|
|
cdef:
|
|
shared_ptr[CTable] ctable
|
|
vector[int] c_column_indices
|
|
|
|
self.set_use_threads(use_threads)
|
|
|
|
if column_indices is not None:
|
|
for index in column_indices:
|
|
c_column_indices.push_back(index)
|
|
|
|
with nogil:
|
|
check_status(self.reader.get()
|
|
.ReadTable(c_column_indices, &ctable))
|
|
else:
|
|
# Read all columns
|
|
with nogil:
|
|
check_status(self.reader.get()
|
|
.ReadTable(&ctable))
|
|
return pyarrow_wrap_table(ctable)
|
|
|
|
def scan_contents(self, column_indices=None, batch_size=65536):
|
|
"""
|
|
Parameters
|
|
----------
|
|
column_indices : list[int], optional
|
|
batch_size : int32, default 65536
|
|
|
|
Returns
|
|
-------
|
|
num_rows : int64
|
|
"""
|
|
cdef:
|
|
vector[int] c_column_indices
|
|
int32_t c_batch_size
|
|
int64_t c_num_rows
|
|
|
|
if column_indices is not None:
|
|
for index in column_indices:
|
|
c_column_indices.push_back(index)
|
|
|
|
c_batch_size = batch_size
|
|
|
|
with nogil:
|
|
check_status(self.reader.get()
|
|
.ScanContents(c_column_indices, c_batch_size,
|
|
&c_num_rows))
|
|
|
|
return c_num_rows
|
|
|
|
def column_name_idx(self, column_name):
|
|
"""
|
|
Find the index of a column by its name.
|
|
|
|
Parameters
|
|
----------
|
|
column_name : str
|
|
Name of the column; separation of nesting levels is done via ".".
|
|
|
|
Returns
|
|
-------
|
|
column_idx : int
|
|
Integer index of the column in the schema.
|
|
"""
|
|
cdef:
|
|
FileMetaData container = self.metadata
|
|
const CFileMetaData* metadata = container._metadata
|
|
int i = 0
|
|
|
|
if self._column_idx_map is None:
|
|
self._column_idx_map = {}
|
|
for i in range(0, metadata.num_columns()):
|
|
col_bytes = tobytes(metadata.schema().Column(i)
|
|
.path().get().ToDotString())
|
|
self._column_idx_map[col_bytes] = i
|
|
|
|
return self._column_idx_map[tobytes(column_name)]
|
|
|
|
    def read_column(self, int column_index):
        """
        Read the column at the specified index.

        Parameters
        ----------
        column_index : int
            Index of the column.

        Returns
        -------
        column : pyarrow.ChunkedArray
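
        Examples
        --------
        A sketch only; ``reader`` stands for an already-open
        ``ParquetReader``. ``column_name_idx`` can supply the index.

        >>> col = reader.read_column(reader.column_name_idx("x"))  # doctest: +SKIP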
"""
|
|
cdef shared_ptr[CChunkedArray] out
|
|
with nogil:
|
|
check_status(self.reader.get()
|
|
.ReadColumn(column_index, &out))
|
|
return pyarrow_wrap_chunked_array(out)
|
|
|
|
    def close(self):
        if not self.closed:
            with nogil:
                check_status(self.rd_handle.get().Close())

    @property
    def closed(self):
        if self.rd_handle == NULL:
            return True
        with nogil:
            closed = self.rd_handle.get().closed()
        return closed


cdef CSortingColumn _convert_sorting_column(SortingColumn sorting_column):
    cdef CSortingColumn c_sorting_column

    c_sorting_column.column_idx = sorting_column.column_index
    c_sorting_column.descending = sorting_column.descending
    c_sorting_column.nulls_first = sorting_column.nulls_first

    return c_sorting_column


cdef vector[CSortingColumn] _convert_sorting_columns(sorting_columns) except *:
    if not (isinstance(sorting_columns, Sequence)
            and all(isinstance(col, SortingColumn) for col in sorting_columns)):
        raise ValueError(
            "'sorting_columns' must be a list of `SortingColumn`")

    cdef vector[CSortingColumn] c_sorting_columns = [_convert_sorting_column(col)
                                                     for col in sorting_columns]

    return c_sorting_columns


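# Usage sketch (hypothetical values): declare that row groups are sorted by
# the first column, descending, with nulls last. ``SortingColumn`` is the
# Python wrapper defined earlier in this module.
#
#     sorting = [SortingColumn(0, descending=True, nulls_first=False)]
#     c_cols = _convert_sorting_columns(sorting)

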
cdef shared_ptr[WriterProperties] _create_writer_properties(
        use_dictionary=None,
        compression=None,
        version=None,
        write_statistics=None,
        data_page_size=None,
        compression_level=None,
        use_byte_stream_split=False,
        column_encoding=None,
        data_page_version=None,
        FileEncryptionProperties encryption_properties=None,
        write_batch_size=None,
        dictionary_pagesize_limit=None,
        write_page_index=False,
        write_page_checksum=False,
        sorting_columns=None,
        store_decimal_as_integer=False,
        use_content_defined_chunking=False) except *:
    """General writer properties"""
    cdef:
        shared_ptr[WriterProperties] properties
        WriterProperties.Builder props
        CdcOptions cdc_options

    # data_page_version

    if data_page_version is not None:
        if data_page_version == "1.0":
            props.data_page_version(ParquetDataPageVersion_V1)
        elif data_page_version == "2.0":
            props.data_page_version(ParquetDataPageVersion_V2)
        else:
            raise ValueError(
                f"Unsupported Parquet data page version: {data_page_version}")

    # version

    if version is not None:
        if version == "1.0":
            props.version(ParquetVersion_V1)
        elif version == "2.4":
            props.version(ParquetVersion_V2_4)
        elif version == "2.6":
            props.version(ParquetVersion_V2_6)
        else:
            raise ValueError(f"Unsupported Parquet format version: {version}")

    # compression

    if isinstance(compression, str):
        check_compression_name(compression)
        props.compression(compression_from_name(compression))
    elif compression is not None:
        # Per-column mapping of column name to codec name
        for column, codec in compression.items():
            check_compression_name(codec)
            props.compression(tobytes(column), compression_from_name(codec))

    if isinstance(compression_level, int):
        props.compression_level(compression_level)
    elif compression_level is not None:
        # Per-column mapping of column name to compression level
        for column, level in compression_level.items():
            props.compression_level(tobytes(column), level)

    # use_dictionary

    if isinstance(use_dictionary, bool):
        if use_dictionary:
            props.enable_dictionary()
            if column_encoding is not None:
                raise ValueError(
                    "To use 'column_encoding' set 'use_dictionary' to False")
        else:
            props.disable_dictionary()
    elif use_dictionary is not None:
        # Deactivate dictionary encoding by default
        props.disable_dictionary()
        for column in use_dictionary:
            props.enable_dictionary(tobytes(column))
            if (column_encoding is not None and
                    column_encoding.get(column) is not None):
                raise ValueError(
                    "To use 'column_encoding' set 'use_dictionary' to False")

    # write_statistics

    if isinstance(write_statistics, bool):
        if write_statistics:
            props.enable_statistics()
        else:
            props.disable_statistics()
    elif write_statistics is not None:
        # Deactivate statistics by default and enable for specified columns
        props.disable_statistics()
        for column in write_statistics:
            props.enable_statistics(tobytes(column))

    # sorting_columns

    if sorting_columns is not None:
        props.set_sorting_columns(_convert_sorting_columns(sorting_columns))

    # use_byte_stream_split

    if isinstance(use_byte_stream_split, bool):
        if use_byte_stream_split:
            if column_encoding is not None:
                raise ValueError(
                    "'use_byte_stream_split' cannot be passed "
                    "together with 'column_encoding'")
            else:
                props.encoding(ParquetEncoding_BYTE_STREAM_SPLIT)
    elif use_byte_stream_split is not None:
        for column in use_byte_stream_split:
            if column_encoding is None:
                column_encoding = {column: 'BYTE_STREAM_SPLIT'}
            elif column_encoding.get(column, None) is None:
                column_encoding[column] = 'BYTE_STREAM_SPLIT'
            else:
                raise ValueError(
                    "'use_byte_stream_split' cannot be passed "
                    "together with 'column_encoding'")

    # store_decimal_as_integer

    if isinstance(store_decimal_as_integer, bool):
        if store_decimal_as_integer:
            props.enable_store_decimal_as_integer()
        else:
            props.disable_store_decimal_as_integer()
    else:
        raise TypeError("'store_decimal_as_integer' must be a boolean")

    # column_encoding
    # encoding map - encode individual columns

    if column_encoding is not None:
        if isinstance(column_encoding, dict):
            for column, _encoding in column_encoding.items():
                props.encoding(tobytes(column),
                               encoding_enum_from_name(_encoding))
        elif isinstance(column_encoding, str):
            props.encoding(encoding_enum_from_name(column_encoding))
        else:
            raise TypeError(
                "'column_encoding' should be a dictionary or a string")

    # size limits
    if data_page_size is not None:
        props.data_pagesize(data_page_size)

    if write_batch_size is not None:
        props.write_batch_size(write_batch_size)

    if dictionary_pagesize_limit is not None:
        props.dictionary_pagesize_limit(dictionary_pagesize_limit)

    # content defined chunking

    if use_content_defined_chunking is True:
        props.enable_content_defined_chunking()
    elif use_content_defined_chunking is False:
        props.disable_content_defined_chunking()
    elif isinstance(use_content_defined_chunking, dict):
        defined_keys = use_content_defined_chunking.keys()
        mandatory_keys = {"min_chunk_size", "max_chunk_size"}
        allowed_keys = {"min_chunk_size", "max_chunk_size", "norm_level"}
        unknown_keys = defined_keys - allowed_keys
        missing_keys = mandatory_keys - defined_keys
        if unknown_keys:
            raise ValueError(
                f"Unknown options in 'use_content_defined_chunking': {unknown_keys}")
        if missing_keys:
            raise ValueError(
                f"Missing options in 'use_content_defined_chunking': {missing_keys}")
        cdc_options.min_chunk_size = use_content_defined_chunking["min_chunk_size"]
        cdc_options.max_chunk_size = use_content_defined_chunking["max_chunk_size"]
        cdc_options.norm_level = use_content_defined_chunking.get("norm_level", 0)
        props.enable_content_defined_chunking()
        props.content_defined_chunking_options(cdc_options)
    else:
        raise TypeError(
            "'use_content_defined_chunking' should be either boolean or a dictionary")

    # encryption

    if encryption_properties is not None:
        props.encryption(
            (<FileEncryptionProperties>encryption_properties).unwrap())

    # For backwards compatibility reasons we cap the maximum row group size
    # at 64Mi rows. This could be changed in the future, though it would be
    # a breaking change.
    #
    # The user can always specify a smaller row group size (and the default
    # is smaller) when calling write_table. If the call to write_table uses
    # a size larger than this then it will be latched to this value.
    props.max_row_group_length(_MAX_ROW_GROUP_SIZE)

    # checksum

    if write_page_checksum:
        props.enable_page_checksum()
    else:
        props.disable_page_checksum()

    # page index

    if write_page_index:
        props.enable_write_page_index()
    else:
        props.disable_write_page_index()

    properties = props.build()

    return properties


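# Usage sketch (hypothetical values): build properties with a per-column
# compression mapping and dict-form content-defined chunking. The CDC dict
# must provide "min_chunk_size" and "max_chunk_size" and may provide
# "norm_level".
#
#     props = _create_writer_properties(
#         compression={'a': 'snappy', 'b': 'zstd'},
#         compression_level={'b': 7},
#         use_content_defined_chunking={
#             'min_chunk_size': 256 * 1024,
#             'max_chunk_size': 1024 * 1024,
#         },
#     )

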
cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties(
        use_deprecated_int96_timestamps=False,
        coerce_timestamps=None,
        allow_truncated_timestamps=False,
        writer_engine_version=None,
        use_compliant_nested_type=True,
        store_schema=True) except *:
    """Arrow writer properties"""
    cdef:
        shared_ptr[ArrowWriterProperties] arrow_properties
        ArrowWriterProperties.Builder arrow_props

    # Store the original Arrow schema so things like dictionary types can
    # be automatically reconstructed
    if store_schema:
        arrow_props.store_schema()

    # int96 support

    if use_deprecated_int96_timestamps:
        arrow_props.enable_deprecated_int96_timestamps()
    else:
        arrow_props.disable_deprecated_int96_timestamps()

    # coerce_timestamps

    if coerce_timestamps == 'ms':
        arrow_props.coerce_timestamps(TimeUnit_MILLI)
    elif coerce_timestamps == 'us':
        arrow_props.coerce_timestamps(TimeUnit_MICRO)
    elif coerce_timestamps is not None:
        raise ValueError(f'Invalid value for coerce_timestamps: {coerce_timestamps}')

    # allow_truncated_timestamps

    if allow_truncated_timestamps:
        arrow_props.allow_truncated_timestamps()
    else:
        arrow_props.disallow_truncated_timestamps()

    # use_compliant_nested_type

    if use_compliant_nested_type:
        arrow_props.enable_compliant_nested_types()
    else:
        arrow_props.disable_compliant_nested_types()

    # writer_engine_version

    if writer_engine_version == "V1":
        warnings.warn("V1 parquet writer engine is a no-op. Use V2.")
        arrow_props.set_engine_version(ArrowWriterEngineVersion.V1)
    elif writer_engine_version != "V2":
        raise ValueError(f"Unsupported Writer Engine Version: {writer_engine_version}")

    arrow_properties = arrow_props.build()

    return arrow_properties


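# Usage sketch (hypothetical values): coerce nanosecond timestamps down to
# microseconds, tolerating truncation, with the V2 engine (the version
# callers must pass, since anything else raises above).
#
#     arrow_props = _create_arrow_writer_properties(
#         coerce_timestamps='us',
#         allow_truncated_timestamps=True,
#         writer_engine_version='V2',
#     )

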
cdef _name_to_index_map(Schema arrow_schema):
    cdef:
        shared_ptr[CSchema] sp_arrow_schema
        shared_ptr[SchemaDescriptor] sp_parquet_schema
        shared_ptr[WriterProperties] props = _create_writer_properties()
        shared_ptr[ArrowWriterProperties] arrow_props = _create_arrow_writer_properties(
            use_deprecated_int96_timestamps=False,
            coerce_timestamps=None,
            allow_truncated_timestamps=False,
            writer_engine_version="V2"
        )

    sp_arrow_schema = pyarrow_unwrap_schema(arrow_schema)

    with nogil:
        check_status(ToParquetSchema(
            sp_arrow_schema.get(), deref(props.get()), deref(arrow_props.get()), &sp_parquet_schema))

    out = dict()

    cdef SchemaDescriptor* parquet_schema = sp_parquet_schema.get()

    for i in range(parquet_schema.num_columns()):
        name = frombytes(parquet_schema.Column(i).path().get().ToDotString())
        out[name] = i

    return out


cdef class ParquetWriter(_Weakrefable):
    cdef:
        unique_ptr[FileWriter] writer
        shared_ptr[COutputStream] sink
        bint own_sink

    def __cinit__(self, where, Schema schema not None, use_dictionary=None,
                  compression=None, version=None,
                  write_statistics=None,
                  MemoryPool memory_pool=None,
                  use_deprecated_int96_timestamps=False,
                  coerce_timestamps=None,
                  data_page_size=None,
                  allow_truncated_timestamps=False,
                  compression_level=None,
                  use_byte_stream_split=False,
                  column_encoding=None,
                  writer_engine_version=None,
                  data_page_version=None,
                  use_compliant_nested_type=True,
                  encryption_properties=None,
                  write_batch_size=None,
                  dictionary_pagesize_limit=None,
                  store_schema=True,
                  write_page_index=False,
                  write_page_checksum=False,
                  sorting_columns=None,
                  store_decimal_as_integer=False,
                  use_content_defined_chunking=False):
        cdef:
            shared_ptr[WriterProperties] properties
            shared_ptr[ArrowWriterProperties] arrow_properties
            c_string c_where
            CMemoryPool* pool

        try:
            where = _stringify_path(where)
        except TypeError:
            get_writer(where, &self.sink)
            self.own_sink = False
        else:
            c_where = tobytes(where)
            with nogil:
                self.sink = GetResultValue(FileOutputStream.Open(c_where))
            self.own_sink = True

        properties = _create_writer_properties(
            use_dictionary=use_dictionary,
            compression=compression,
            version=version,
            write_statistics=write_statistics,
            data_page_size=data_page_size,
            compression_level=compression_level,
            use_byte_stream_split=use_byte_stream_split,
            column_encoding=column_encoding,
            data_page_version=data_page_version,
            encryption_properties=encryption_properties,
            write_batch_size=write_batch_size,
            dictionary_pagesize_limit=dictionary_pagesize_limit,
            write_page_index=write_page_index,
            write_page_checksum=write_page_checksum,
            sorting_columns=sorting_columns,
            store_decimal_as_integer=store_decimal_as_integer,
            use_content_defined_chunking=use_content_defined_chunking
        )
        arrow_properties = _create_arrow_writer_properties(
            use_deprecated_int96_timestamps=use_deprecated_int96_timestamps,
            coerce_timestamps=coerce_timestamps,
            allow_truncated_timestamps=allow_truncated_timestamps,
            writer_engine_version=writer_engine_version,
            use_compliant_nested_type=use_compliant_nested_type,
            store_schema=store_schema,
        )

        pool = maybe_unbox_memory_pool(memory_pool)
        with nogil:
            self.writer = move(GetResultValue(
                FileWriter.Open(deref(schema.schema), pool,
                                self.sink, properties, arrow_properties)))

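    # Usage sketch (hypothetical path and data; the higher-level
    # pyarrow.parquet API is usually preferred over this low-level class):
    #
    #     import pyarrow as pa
    #     table = pa.table({'x': [1, 2, 3]})
    #     writer = ParquetWriter('/tmp/example.parquet', table.schema,
    #                            compression='snappy')
    #     writer.write_table(table, row_group_size=2)
    #     writer.close()
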
    def close(self):
        with nogil:
            check_status(self.writer.get().Close())
            if self.own_sink:
                check_status(self.sink.get().Close())

    def write_table(self, Table table, row_group_size=None):
        cdef:
            CTable* ctable = table.table
            int64_t c_row_group_size

        if row_group_size is None or row_group_size == -1:
            c_row_group_size = min(ctable.num_rows(), _DEFAULT_ROW_GROUP_SIZE)
        elif row_group_size == 0:
            raise ValueError('Row group size cannot be 0')
        else:
            c_row_group_size = row_group_size

        with nogil:
            check_status(self.writer.get()
                         .WriteTable(deref(ctable), c_row_group_size))

    def add_key_value_metadata(self, key_value_metadata):
        cdef:
            shared_ptr[const CKeyValueMetadata] c_metadata

        c_metadata = pyarrow_unwrap_metadata(KeyValueMetadata(key_value_metadata))
        with nogil:
            check_status(self.writer.get()
                         .AddKeyValueMetadata(c_metadata))

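    # Usage sketch (hypothetical keys); typically called before close() so
    # the entries end up in the file footer metadata:
    #
    #     writer.add_key_value_metadata({'origin': 'sensor-7'})
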
    @property
    def metadata(self):
        cdef:
            shared_ptr[CFileMetaData] metadata
            FileMetaData result
        with nogil:
            metadata = self.writer.get().metadata()
        if metadata:
            result = FileMetaData.__new__(FileMetaData)
            result.init(metadata)
            return result
        raise RuntimeError(
            'file metadata is only available after writer close')