// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
|
|
#pragma once
|
|
|
|
#include <cstdint>
|
|
#include <memory>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include "arrow/io/caching.h"
|
|
#include "arrow/util/type_fwd.h"
|
|
#include "parquet/metadata.h" // IWYU pragma: keep
|
|
#include "parquet/platform.h"
|
|
#include "parquet/properties.h"
|
|
|
|
namespace parquet {
|
|
|
|
// Forward declarations of types that are only used through pointers,
// references or smart pointers in the declarations that follow.
class ColumnReader;
class FileMetaData;
class PageIndexReader;
class BloomFilterReader;
class PageReader;
class RowGroupMetaData;

namespace internal {
class RecordReader;
}  // namespace internal
|
|
|
|
class PARQUET_EXPORT RowGroupReader {
|
|
public:
|
|
// Forward declare a virtual class 'Contents' to aid dependency injection and more
|
|
// easily create test fixtures
|
|
// An implementation of the Contents class is defined in the .cc file
|
|
struct Contents {
|
|
virtual ~Contents() {}
|
|
virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0;
|
|
virtual const RowGroupMetaData* metadata() const = 0;
|
|
virtual const ReaderProperties* properties() const = 0;
|
|
};
|
|
|
|
explicit RowGroupReader(std::unique_ptr<Contents> contents);
|
|
|
|
// Returns the rowgroup metadata
|
|
const RowGroupMetaData* metadata() const;
|
|
|
|
// Construct a ColumnReader for the indicated row group-relative
|
|
// column. Ownership is shared with the RowGroupReader.
|
|
std::shared_ptr<ColumnReader> Column(int i);
|
|
|
|
// EXPERIMENTAL: Construct a RecordReader for the indicated column of the row group.
|
|
// Ownership is shared with the RowGroupReader.
|
|
std::shared_ptr<internal::RecordReader> RecordReader(int i,
|
|
bool read_dictionary = false);
|
|
|
|
// Construct a ColumnReader, trying to enable exposed encoding.
|
|
//
|
|
// For dictionary encoding, currently we only support column chunks that are fully
|
|
// dictionary encoded, i.e., all data pages in the column chunk are dictionary encoded.
|
|
// If a column chunk uses dictionary encoding but then falls back to plain encoding, the
|
|
// encoding will not be exposed.
|
|
//
|
|
// The returned column reader provides an API GetExposedEncoding() for the
|
|
// users to check the exposed encoding and determine how to read the batches.
|
|
//
|
|
// \note API EXPERIMENTAL
|
|
std::shared_ptr<ColumnReader> ColumnWithExposeEncoding(
|
|
int i, ExposedEncoding encoding_to_expose);
|
|
|
|
// Construct a RecordReader, trying to enable exposed encoding.
|
|
//
|
|
// For dictionary encoding, currently we only support column chunks that are
|
|
// fully dictionary encoded byte arrays. The caller should verify if the reader can read
|
|
// and expose the dictionary by checking the reader's read_dictionary(). If a column
|
|
// chunk uses dictionary encoding but then falls back to plain encoding, the returned
|
|
// reader will read decoded data without exposing the dictionary.
|
|
//
|
|
// \note API EXPERIMENTAL
|
|
std::shared_ptr<internal::RecordReader> RecordReaderWithExposeEncoding(
|
|
int i, ExposedEncoding encoding_to_expose);
|
|
|
|
std::unique_ptr<PageReader> GetColumnPageReader(int i);
|
|
|
|
private:
|
|
// Holds a pointer to an instance of Contents implementation
|
|
std::unique_ptr<Contents> contents_;
|
|
};
|
|
|
|
class PARQUET_EXPORT ParquetFileReader {
|
|
public:
|
|
// Declare a virtual class 'Contents' to aid dependency injection and more
|
|
// easily create test fixtures
|
|
// An implementation of the Contents class is defined in the .cc file
|
|
struct PARQUET_EXPORT Contents {
|
|
static std::unique_ptr<Contents> Open(
|
|
std::shared_ptr<::arrow::io::RandomAccessFile> source,
|
|
const ReaderProperties& props = default_reader_properties(),
|
|
std::shared_ptr<FileMetaData> metadata = NULLPTR);
|
|
|
|
static ::arrow::Future<std::unique_ptr<Contents>> OpenAsync(
|
|
std::shared_ptr<::arrow::io::RandomAccessFile> source,
|
|
const ReaderProperties& props = default_reader_properties(),
|
|
std::shared_ptr<FileMetaData> metadata = NULLPTR);
|
|
|
|
virtual ~Contents() = default;
|
|
// Perform any cleanup associated with the file contents
|
|
virtual void Close() = 0;
|
|
virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0;
|
|
virtual std::shared_ptr<FileMetaData> metadata() const = 0;
|
|
virtual std::shared_ptr<PageIndexReader> GetPageIndexReader() = 0;
|
|
virtual BloomFilterReader& GetBloomFilterReader() = 0;
|
|
};
|
|
|
|
ParquetFileReader();
|
|
~ParquetFileReader();
|
|
|
|
// Create a file reader instance from an Arrow file object. Thread-safety is
|
|
// the responsibility of the file implementation
|
|
static std::unique_ptr<ParquetFileReader> Open(
|
|
std::shared_ptr<::arrow::io::RandomAccessFile> source,
|
|
const ReaderProperties& props = default_reader_properties(),
|
|
std::shared_ptr<FileMetaData> metadata = NULLPTR);
|
|
|
|
// API Convenience to open a serialized Parquet file on disk, using Arrow IO
|
|
// interfaces.
|
|
static std::unique_ptr<ParquetFileReader> OpenFile(
|
|
const std::string& path, bool memory_map = false,
|
|
const ReaderProperties& props = default_reader_properties(),
|
|
std::shared_ptr<FileMetaData> metadata = NULLPTR);
|
|
|
|
// Asynchronously open a file reader from an Arrow file object.
|
|
// Does not throw - all errors are reported through the Future.
|
|
static ::arrow::Future<std::unique_ptr<ParquetFileReader>> OpenAsync(
|
|
std::shared_ptr<::arrow::io::RandomAccessFile> source,
|
|
const ReaderProperties& props = default_reader_properties(),
|
|
std::shared_ptr<FileMetaData> metadata = NULLPTR);
|
|
|
|
void Open(std::unique_ptr<Contents> contents);
|
|
void Close();
|
|
|
|
// The RowGroupReader is owned by the FileReader
|
|
std::shared_ptr<RowGroupReader> RowGroup(int i);
|
|
|
|
// Returns the file metadata. Only one instance is ever created
|
|
std::shared_ptr<FileMetaData> metadata() const;
|
|
|
|
/// Returns the PageIndexReader. Only one instance is ever created.
|
|
///
|
|
/// If the file does not have the page index, nullptr may be returned.
|
|
/// Because it pays to check existence of page index in the file, it
|
|
/// is possible to return a non null value even if page index does
|
|
/// not exist. It is the caller's responsibility to check the return
|
|
/// value and follow-up calls to PageIndexReader.
|
|
///
|
|
/// WARNING: The returned PageIndexReader must not outlive the ParquetFileReader.
|
|
/// Initialize GetPageIndexReader() is not thread-safety.
|
|
std::shared_ptr<PageIndexReader> GetPageIndexReader();
|
|
|
|
/// Returns the BloomFilterReader. Only one instance is ever created.
|
|
///
|
|
/// WARNING: The returned BloomFilterReader must not outlive the ParquetFileReader.
|
|
/// Initialize GetBloomFilterReader() is not thread-safety.
|
|
BloomFilterReader& GetBloomFilterReader();
|
|
|
|
/// Pre-buffer the specified column indices in all row groups.
|
|
///
|
|
/// Readers can optionally call this to cache the necessary slices
|
|
/// of the file in-memory before deserialization. Arrow readers can
|
|
/// automatically do this via an option. This is intended to
|
|
/// increase performance when reading from high-latency filesystems
|
|
/// (e.g. Amazon S3).
|
|
///
|
|
/// After calling this, creating readers for row groups/column
|
|
/// indices that were not buffered may fail. Creating multiple
|
|
/// readers for the a subset of the buffered regions is
|
|
/// acceptable. This may be called again to buffer a different set
|
|
/// of row groups/columns.
|
|
///
|
|
/// If memory usage is a concern, note that data will remain
|
|
/// buffered in memory until either \a PreBuffer() is called again,
|
|
/// or the reader itself is destructed. Reading - and buffering -
|
|
/// only one row group at a time may be useful.
|
|
///
|
|
/// This method may throw.
|
|
void PreBuffer(const std::vector<int>& row_groups,
|
|
const std::vector<int>& column_indices,
|
|
const ::arrow::io::IOContext& ctx,
|
|
const ::arrow::io::CacheOptions& options);
|
|
|
|
/// Retrieve the list of byte ranges that would need to be read to retrieve
|
|
/// the data for the specified row groups and column indices.
|
|
///
|
|
/// A reader can optionally call this if they wish to handle their own
|
|
/// caching and management of file reads (or offload them to other readers).
|
|
/// Unlike PreBuffer, this method will not perform any actual caching or
|
|
/// reads, instead just using the file metadata to determine the byte ranges
|
|
/// that would need to be read if you were to consume the entirety of the column
|
|
/// chunks for the provided columns in the specified row groups.
|
|
///
|
|
/// If row_groups or column_indices are empty, then the result of this will be empty.
|
|
///
|
|
/// hole_size_limit represents the maximum distance, in bytes, between two
|
|
/// consecutive ranges; beyond this value, ranges will not be combined. The default
|
|
/// value is 1MB.
|
|
///
|
|
/// range_size_limit is the maximum size in bytes of a combined range; if combining
|
|
/// two consecutive ranges would produce a range larger than this, they are not
|
|
/// combined. The default values is 64MB. This *must* be larger than hole_size_limit.
|
|
///
|
|
/// This will not take into account page indexes or any other predicate push down
|
|
/// benefits that may be available.
|
|
::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges(
|
|
const std::vector<int>& row_groups, const std::vector<int>& column_indices,
|
|
int64_t hole_size_limit = 1024 * 1024, int64_t range_size_limit = 64 * 1024 * 1024);
|
|
|
|
/// Wait for the specified row groups and column indices to be pre-buffered.
|
|
///
|
|
/// After the returned Future completes, reading the specified row
|
|
/// groups/columns will not block.
|
|
///
|
|
/// PreBuffer must be called first. This method does not throw.
|
|
::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
|
|
const std::vector<int>& column_indices) const;
|
|
|
|
private:
|
|
// Holds a pointer to an instance of Contents implementation
|
|
std::unique_ptr<Contents> contents_;
|
|
};
|
|
|
|
// Read only Parquet file metadata
|
|
std::shared_ptr<FileMetaData> PARQUET_EXPORT
|
|
ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source);
|
|
|
|
/// \brief Scan all values in file. Useful for performance testing
|
|
/// \param[in] columns the column numbers to scan. If empty scans all
|
|
/// \param[in] column_batch_size number of values to read at a time when scanning column
|
|
/// \param[in] reader a ParquetFileReader instance
|
|
/// \return number of semantic rows in file
|
|
PARQUET_EXPORT
|
|
int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
|
|
ParquetFileReader* reader);
|
|
|
|
} // namespace parquet
|