993 lines
33 KiB
C++
993 lines
33 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
#pragma once
|
|
|
|
#include <array>
|
|
#include <cstddef>
|
|
#include <cstdint>
|
|
#include <cstring>
|
|
#include <limits>
|
|
#include <memory>
|
|
#include <numeric>
|
|
#include <string>
|
|
#include <string_view>
|
|
#include <vector>
|
|
|
|
#include "arrow/array/array_base.h"
|
|
#include "arrow/array/array_binary.h"
|
|
#include "arrow/array/builder_base.h"
|
|
#include "arrow/array/data.h"
|
|
#include "arrow/buffer.h"
|
|
#include "arrow/buffer_builder.h"
|
|
#include "arrow/status.h"
|
|
#include "arrow/type.h"
|
|
#include "arrow/util/binary_view_util.h"
|
|
#include "arrow/util/macros.h"
|
|
#include "arrow/util/visibility.h"
|
|
|
|
namespace arrow {
|
|
|
|
/// \addtogroup binary-builders
|
|
///
|
|
/// @{
|
|
|
|
// ----------------------------------------------------------------------
|
|
// Binary and String
|
|
|
|
template <typename TYPE>
|
|
class BaseBinaryBuilder
|
|
: public ArrayBuilder,
|
|
public internal::ArrayBuilderExtraOps<BaseBinaryBuilder<TYPE>, std::string_view> {
|
|
public:
|
|
using TypeClass = TYPE;
|
|
using offset_type = typename TypeClass::offset_type;
|
|
|
|
explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool(),
|
|
int64_t alignment = kDefaultBufferAlignment)
|
|
: ArrayBuilder(pool, alignment),
|
|
offsets_builder_(pool, alignment),
|
|
value_data_builder_(pool, alignment) {}
|
|
|
|
BaseBinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
|
|
: BaseBinaryBuilder(pool) {}
|
|
|
|
Status Append(const uint8_t* value, offset_type length) {
|
|
ARROW_RETURN_NOT_OK(Reserve(1));
|
|
UnsafeAppendNextOffset();
|
|
// Safety check for UBSAN.
|
|
if (ARROW_PREDICT_TRUE(length > 0)) {
|
|
ARROW_RETURN_NOT_OK(ValidateOverflow(length));
|
|
ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
|
|
}
|
|
|
|
UnsafeAppendToBitmap(true);
|
|
return Status::OK();
|
|
}
|
|
|
|
Status Append(const char* value, offset_type length) {
|
|
return Append(reinterpret_cast<const uint8_t*>(value), length);
|
|
}
|
|
|
|
Status Append(std::string_view value) {
|
|
return Append(value.data(), static_cast<offset_type>(value.size()));
|
|
}
|
|
|
|
/// Extend the last appended value by appending more data at the end
|
|
///
|
|
/// Unlike Append, this does not create a new offset.
|
|
Status ExtendCurrent(const uint8_t* value, offset_type length) {
|
|
// Safety check for UBSAN.
|
|
if (ARROW_PREDICT_TRUE(length > 0)) {
|
|
ARROW_RETURN_NOT_OK(ValidateOverflow(length));
|
|
ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
|
|
}
|
|
return Status::OK();
|
|
}
|
|
|
|
Status ExtendCurrent(std::string_view value) {
|
|
return ExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
|
|
static_cast<offset_type>(value.size()));
|
|
}
|
|
|
|
Status AppendNulls(int64_t length) final {
|
|
const int64_t num_bytes = value_data_builder_.length();
|
|
ARROW_RETURN_NOT_OK(Reserve(length));
|
|
for (int64_t i = 0; i < length; ++i) {
|
|
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
|
|
}
|
|
UnsafeAppendToBitmap(length, false);
|
|
return Status::OK();
|
|
}
|
|
|
|
Status AppendNull() final {
|
|
ARROW_RETURN_NOT_OK(Reserve(1));
|
|
UnsafeAppendNextOffset();
|
|
UnsafeAppendToBitmap(false);
|
|
return Status::OK();
|
|
}
|
|
|
|
Status AppendEmptyValue() final {
|
|
ARROW_RETURN_NOT_OK(Reserve(1));
|
|
UnsafeAppendNextOffset();
|
|
UnsafeAppendToBitmap(true);
|
|
return Status::OK();
|
|
}
|
|
|
|
Status AppendEmptyValues(int64_t length) final {
|
|
const int64_t num_bytes = value_data_builder_.length();
|
|
ARROW_RETURN_NOT_OK(Reserve(length));
|
|
for (int64_t i = 0; i < length; ++i) {
|
|
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
|
|
}
|
|
UnsafeAppendToBitmap(length, true);
|
|
return Status::OK();
|
|
}
|
|
|
|
/// \brief Append without checking capacity
|
|
///
|
|
/// Offsets and data should have been presized using Reserve() and
|
|
/// ReserveData(), respectively.
|
|
void UnsafeAppend(const uint8_t* value, offset_type length) {
|
|
UnsafeAppendNextOffset();
|
|
value_data_builder_.UnsafeAppend(value, length);
|
|
UnsafeAppendToBitmap(true);
|
|
}
|
|
|
|
void UnsafeAppend(const char* value, offset_type length) {
|
|
UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
|
|
}
|
|
|
|
void UnsafeAppend(const std::string& value) {
|
|
UnsafeAppend(value.c_str(), static_cast<offset_type>(value.size()));
|
|
}
|
|
|
|
void UnsafeAppend(std::string_view value) {
|
|
UnsafeAppend(value.data(), static_cast<offset_type>(value.size()));
|
|
}
|
|
|
|
/// Like ExtendCurrent, but do not check capacity
|
|
void UnsafeExtendCurrent(const uint8_t* value, offset_type length) {
|
|
value_data_builder_.UnsafeAppend(value, length);
|
|
}
|
|
|
|
void UnsafeExtendCurrent(std::string_view value) {
|
|
UnsafeExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
|
|
static_cast<offset_type>(value.size()));
|
|
}
|
|
|
|
void UnsafeAppendNull() {
|
|
const int64_t num_bytes = value_data_builder_.length();
|
|
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
|
|
UnsafeAppendToBitmap(false);
|
|
}
|
|
|
|
void UnsafeAppendEmptyValue() {
|
|
const int64_t num_bytes = value_data_builder_.length();
|
|
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
|
|
UnsafeAppendToBitmap(true);
|
|
}
|
|
|
|
/// \brief Append a sequence of strings in one shot.
|
|
///
|
|
/// \param[in] values a vector of strings
|
|
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
|
|
/// indicates a valid (non-null) value
|
|
/// \return Status
|
|
Status AppendValues(const std::vector<std::string>& values,
|
|
const uint8_t* valid_bytes = NULLPTR) {
|
|
std::size_t total_length = std::accumulate(
|
|
values.begin(), values.end(), 0ULL,
|
|
[](uint64_t sum, const std::string& str) { return sum + str.size(); });
|
|
ARROW_RETURN_NOT_OK(Reserve(values.size()));
|
|
ARROW_RETURN_NOT_OK(ReserveData(total_length));
|
|
|
|
if (valid_bytes != NULLPTR) {
|
|
for (std::size_t i = 0; i < values.size(); ++i) {
|
|
UnsafeAppendNextOffset();
|
|
if (valid_bytes[i]) {
|
|
value_data_builder_.UnsafeAppend(
|
|
reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
|
|
}
|
|
}
|
|
} else {
|
|
for (const auto& value : values) {
|
|
UnsafeAppendNextOffset();
|
|
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()),
|
|
value.size());
|
|
}
|
|
}
|
|
|
|
UnsafeAppendToBitmap(valid_bytes, values.size());
|
|
return Status::OK();
|
|
}
|
|
|
|
/// \brief Append a sequence of nul-terminated strings in one shot.
|
|
/// If one of the values is NULL, it is processed as a null
|
|
/// value even if the corresponding valid_bytes entry is 1.
|
|
///
|
|
/// \param[in] values a contiguous C array of nul-terminated char *
|
|
/// \param[in] length the number of values to append
|
|
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
|
|
/// indicates a valid (non-null) value
|
|
/// \return Status
|
|
Status AppendValues(const char** values, int64_t length,
|
|
const uint8_t* valid_bytes = NULLPTR) {
|
|
std::size_t total_length = 0;
|
|
std::vector<std::size_t> value_lengths(length);
|
|
bool have_null_value = false;
|
|
for (int64_t i = 0; i < length; ++i) {
|
|
if (values[i] != NULLPTR) {
|
|
auto value_length = strlen(values[i]);
|
|
value_lengths[i] = value_length;
|
|
total_length += value_length;
|
|
} else {
|
|
have_null_value = true;
|
|
}
|
|
}
|
|
ARROW_RETURN_NOT_OK(Reserve(length));
|
|
ARROW_RETURN_NOT_OK(ReserveData(total_length));
|
|
|
|
if (valid_bytes) {
|
|
int64_t valid_bytes_offset = 0;
|
|
for (int64_t i = 0; i < length; ++i) {
|
|
UnsafeAppendNextOffset();
|
|
if (valid_bytes[i]) {
|
|
if (values[i]) {
|
|
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
|
|
value_lengths[i]);
|
|
} else {
|
|
UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset,
|
|
i - valid_bytes_offset);
|
|
UnsafeAppendToBitmap(false);
|
|
valid_bytes_offset = i + 1;
|
|
}
|
|
}
|
|
}
|
|
UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
|
|
} else {
|
|
if (have_null_value) {
|
|
std::vector<uint8_t> valid_vector(length, 0);
|
|
for (int64_t i = 0; i < length; ++i) {
|
|
UnsafeAppendNextOffset();
|
|
if (values[i]) {
|
|
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
|
|
value_lengths[i]);
|
|
valid_vector[i] = 1;
|
|
}
|
|
}
|
|
UnsafeAppendToBitmap(valid_vector.data(), length);
|
|
} else {
|
|
for (int64_t i = 0; i < length; ++i) {
|
|
UnsafeAppendNextOffset();
|
|
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
|
|
value_lengths[i]);
|
|
}
|
|
UnsafeAppendToBitmap(NULLPTR, length);
|
|
}
|
|
}
|
|
return Status::OK();
|
|
}
|
|
|
|
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
|
|
int64_t length) override {
|
|
auto bitmap = array.GetValues<uint8_t>(0, 0);
|
|
auto offsets = array.GetValues<offset_type>(1);
|
|
auto data = array.GetValues<uint8_t>(2, 0);
|
|
auto total_length = offsets[offset + length] - offsets[offset];
|
|
ARROW_RETURN_NOT_OK(Reserve(length));
|
|
ARROW_RETURN_NOT_OK(ReserveData(total_length));
|
|
for (int64_t i = 0; i < length; i++) {
|
|
if (!bitmap || bit_util::GetBit(bitmap, array.offset + offset + i)) {
|
|
const offset_type start = offsets[offset + i];
|
|
const offset_type end = offsets[offset + i + 1];
|
|
UnsafeAppend(data + start, end - start);
|
|
} else {
|
|
UnsafeAppendNull();
|
|
}
|
|
}
|
|
return Status::OK();
|
|
}
|
|
|
|
void Reset() override {
|
|
ArrayBuilder::Reset();
|
|
offsets_builder_.Reset();
|
|
value_data_builder_.Reset();
|
|
}
|
|
|
|
Status ValidateOverflow(int64_t new_bytes) {
|
|
auto new_size = value_data_builder_.length() + new_bytes;
|
|
if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
|
|
return Status::CapacityError("array cannot contain more than ", memory_limit(),
|
|
" bytes, have ", new_size);
|
|
} else {
|
|
return Status::OK();
|
|
}
|
|
}
|
|
|
|
Status Resize(int64_t capacity) override {
|
|
ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
|
|
// One more than requested for offsets
|
|
ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
|
|
return ArrayBuilder::Resize(capacity);
|
|
}
|
|
|
|
/// \brief Ensures there is enough allocated capacity to append the indicated
|
|
/// number of bytes to the value data buffer without additional allocations
|
|
Status ReserveData(int64_t elements) {
|
|
ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
|
|
return value_data_builder_.Reserve(elements);
|
|
}
|
|
|
|
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
|
|
// Write final offset (values length)
|
|
ARROW_RETURN_NOT_OK(AppendNextOffset());
|
|
|
|
// These buffers' padding zeroed by BufferBuilder
|
|
std::shared_ptr<Buffer> offsets, value_data, null_bitmap;
|
|
ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
|
|
ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
|
|
ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
|
|
|
|
*out = ArrayData::Make(type(), length_, {null_bitmap, offsets, value_data},
|
|
null_count_, 0);
|
|
Reset();
|
|
return Status::OK();
|
|
}
|
|
|
|
/// \return data pointer of the value date builder
|
|
const uint8_t* value_data() const { return value_data_builder_.data(); }
|
|
/// \return size of values buffer so far
|
|
int64_t value_data_length() const { return value_data_builder_.length(); }
|
|
/// \return capacity of values buffer
|
|
int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
|
|
|
|
/// \return data pointer of the value date builder
|
|
const offset_type* offsets_data() const { return offsets_builder_.data(); }
|
|
|
|
/// Temporary access to a value.
|
|
///
|
|
/// This pointer becomes invalid on the next modifying operation.
|
|
const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
|
|
const offset_type* offsets = offsets_builder_.data();
|
|
const auto offset = offsets[i];
|
|
if (i == (length_ - 1)) {
|
|
*out_length = static_cast<offset_type>(value_data_builder_.length()) - offset;
|
|
} else {
|
|
*out_length = offsets[i + 1] - offset;
|
|
}
|
|
return value_data_builder_.data() + offset;
|
|
}
|
|
|
|
offset_type offset(int64_t i) const { return offsets_data()[i]; }
|
|
|
|
/// Temporary access to a value.
|
|
///
|
|
/// This view becomes invalid on the next modifying operation.
|
|
std::string_view GetView(int64_t i) const {
|
|
offset_type value_length;
|
|
const uint8_t* value_data = GetValue(i, &value_length);
|
|
return std::string_view(reinterpret_cast<const char*>(value_data), value_length);
|
|
}
|
|
|
|
// Cannot make this a static attribute because of linking issues
|
|
static constexpr int64_t memory_limit() {
|
|
return std::numeric_limits<offset_type>::max() - 1;
|
|
}
|
|
|
|
protected:
|
|
TypedBufferBuilder<offset_type> offsets_builder_;
|
|
TypedBufferBuilder<uint8_t> value_data_builder_;
|
|
|
|
Status AppendNextOffset() {
|
|
const int64_t num_bytes = value_data_builder_.length();
|
|
return offsets_builder_.Append(static_cast<offset_type>(num_bytes));
|
|
}
|
|
|
|
void UnsafeAppendNextOffset() {
|
|
const int64_t num_bytes = value_data_builder_.length();
|
|
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
|
|
}
|
|
};
|
|
|
|
/// \class BinaryBuilder
|
|
/// \brief Builder class for variable-length binary data
|
|
class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder<BinaryType> {
|
|
public:
|
|
using BaseBinaryBuilder::BaseBinaryBuilder;
|
|
|
|
/// \cond FALSE
|
|
using ArrayBuilder::Finish;
|
|
/// \endcond
|
|
|
|
Status Finish(std::shared_ptr<BinaryArray>* out) { return FinishTyped(out); }
|
|
|
|
std::shared_ptr<DataType> type() const override { return binary(); }
|
|
};
|
|
|
|
/// \class StringBuilder
|
|
/// \brief Builder class for UTF8 strings
|
|
class ARROW_EXPORT StringBuilder : public BinaryBuilder {
|
|
public:
|
|
using BinaryBuilder::BinaryBuilder;
|
|
|
|
/// \cond FALSE
|
|
using ArrayBuilder::Finish;
|
|
/// \endcond
|
|
|
|
Status Finish(std::shared_ptr<StringArray>* out) { return FinishTyped(out); }
|
|
|
|
std::shared_ptr<DataType> type() const override { return utf8(); }
|
|
};
|
|
|
|
/// \class LargeBinaryBuilder
|
|
/// \brief Builder class for large variable-length binary data
|
|
class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder<LargeBinaryType> {
|
|
public:
|
|
using BaseBinaryBuilder::BaseBinaryBuilder;
|
|
|
|
/// \cond FALSE
|
|
using ArrayBuilder::Finish;
|
|
/// \endcond
|
|
|
|
Status Finish(std::shared_ptr<LargeBinaryArray>* out) { return FinishTyped(out); }
|
|
|
|
std::shared_ptr<DataType> type() const override { return large_binary(); }
|
|
};
|
|
|
|
/// \class LargeStringBuilder
|
|
/// \brief Builder class for large UTF8 strings
|
|
class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
|
|
public:
|
|
using LargeBinaryBuilder::LargeBinaryBuilder;
|
|
|
|
/// \cond FALSE
|
|
using ArrayBuilder::Finish;
|
|
/// \endcond
|
|
|
|
Status Finish(std::shared_ptr<LargeStringArray>* out) { return FinishTyped(out); }
|
|
|
|
std::shared_ptr<DataType> type() const override { return large_utf8(); }
|
|
};
|
|
|
|
// ----------------------------------------------------------------------
|
|
// BinaryViewBuilder, StringViewBuilder
|
|
//
|
|
// These builders do not support building raw pointer view arrays.
|
|
|
|
namespace internal {
|
|
|
|
// We allocate medium-sized memory chunks and accumulate data in those, which
|
|
// may result in some waste if there are many large-ish strings. If a string
|
|
// comes along that does not fit into a block, we allocate a new block and
|
|
// write into that.
|
|
//
|
|
// Later we can implement optimizations to continuing filling underfull blocks
|
|
// after encountering a large string that required allocating a new block.
|
|
class ARROW_EXPORT StringHeapBuilder {
|
|
public:
|
|
static constexpr int64_t kDefaultBlocksize = 32 << 10; // 32KB
|
|
|
|
StringHeapBuilder(MemoryPool* pool, int64_t alignment)
|
|
: pool_(pool), alignment_(alignment) {}
|
|
|
|
void SetBlockSize(int64_t blocksize) { blocksize_ = blocksize; }
|
|
|
|
using c_type = BinaryViewType::c_type;
|
|
|
|
template <bool Safe>
|
|
std::conditional_t<Safe, Result<c_type>, c_type> Append(const uint8_t* value,
|
|
int64_t length) {
|
|
if (length <= BinaryViewType::kInlineSize) {
|
|
return util::ToInlineBinaryView(value, static_cast<int32_t>(length));
|
|
}
|
|
|
|
if constexpr (Safe) {
|
|
ARROW_RETURN_NOT_OK(Reserve(length));
|
|
}
|
|
|
|
auto v = util::ToNonInlineBinaryView(value, static_cast<int32_t>(length),
|
|
static_cast<int32_t>(blocks_.size() - 1),
|
|
current_offset_);
|
|
|
|
memcpy(current_out_buffer_, value, static_cast<size_t>(length));
|
|
current_out_buffer_ += length;
|
|
current_remaining_bytes_ -= length;
|
|
current_offset_ += static_cast<int32_t>(length);
|
|
return v;
|
|
}
|
|
|
|
static constexpr int64_t ValueSizeLimit() {
|
|
return std::numeric_limits<int32_t>::max();
|
|
}
|
|
|
|
/// \brief Ensure that the indicated number of bytes can be appended via
|
|
/// UnsafeAppend operations without the need to allocate more memory
|
|
Status Reserve(int64_t num_bytes) {
|
|
if (ARROW_PREDICT_FALSE(num_bytes > ValueSizeLimit())) {
|
|
return Status::CapacityError(
|
|
"BinaryView or StringView elements cannot reference "
|
|
"strings larger than 2GB");
|
|
}
|
|
if (num_bytes > current_remaining_bytes_) {
|
|
ARROW_RETURN_NOT_OK(FinishLastBlock());
|
|
current_remaining_bytes_ = num_bytes > blocksize_ ? num_bytes : blocksize_;
|
|
ARROW_ASSIGN_OR_RAISE(
|
|
std::shared_ptr<ResizableBuffer> new_block,
|
|
AllocateResizableBuffer(current_remaining_bytes_, alignment_, pool_));
|
|
current_offset_ = 0;
|
|
current_out_buffer_ = new_block->mutable_data();
|
|
blocks_.emplace_back(std::move(new_block));
|
|
}
|
|
return Status::OK();
|
|
}
|
|
|
|
void Reset() {
|
|
current_offset_ = 0;
|
|
current_out_buffer_ = NULLPTR;
|
|
current_remaining_bytes_ = 0;
|
|
blocks_.clear();
|
|
}
|
|
|
|
int64_t current_remaining_bytes() const { return current_remaining_bytes_; }
|
|
|
|
Result<std::vector<std::shared_ptr<ResizableBuffer>>> Finish() {
|
|
if (!blocks_.empty()) {
|
|
ARROW_RETURN_NOT_OK(FinishLastBlock());
|
|
}
|
|
current_offset_ = 0;
|
|
current_out_buffer_ = NULLPTR;
|
|
current_remaining_bytes_ = 0;
|
|
return std::move(blocks_);
|
|
}
|
|
|
|
private:
|
|
Status FinishLastBlock() {
|
|
if (current_remaining_bytes_ > 0) {
|
|
// Avoid leaking uninitialized bytes from the allocator
|
|
ARROW_RETURN_NOT_OK(
|
|
blocks_.back()->Resize(blocks_.back()->size() - current_remaining_bytes_,
|
|
/*shrink_to_fit=*/true));
|
|
blocks_.back()->ZeroPadding();
|
|
}
|
|
return Status::OK();
|
|
}
|
|
|
|
MemoryPool* pool_;
|
|
int64_t alignment_;
|
|
int64_t blocksize_ = kDefaultBlocksize;
|
|
std::vector<std::shared_ptr<ResizableBuffer>> blocks_;
|
|
|
|
int32_t current_offset_ = 0;
|
|
uint8_t* current_out_buffer_ = NULLPTR;
|
|
int64_t current_remaining_bytes_ = 0;
|
|
};
|
|
|
|
} // namespace internal
|
|
|
|
/// \brief Builder class for binary view arrays
///
/// One view per element is accumulated in `data_builder_`; bytes of values
/// too long to be stored inline are copied into `data_heap_builder_`.
class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
 public:
  using TypeClass = BinaryViewType;

  // this constructor provided for MakeBuilder compatibility
  BinaryViewBuilder(const std::shared_ptr<DataType>&, MemoryPool* pool);

  explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(),
                             int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        data_builder_(pool, alignment),
        data_heap_builder_(pool, alignment) {}

  /// Set the size for future preallocated data buffers.
  ///
  /// The default size is 32KB, so after each 32KB of string data appended to the builder
  /// a new data buffer will be allocated. Adjust this to a larger value to decrease the
  /// frequency of allocation, or to a smaller value to lower the overhead of each
  /// allocation.
  void SetBlockSize(int64_t blocksize) { data_heap_builder_.SetBlockSize(blocksize); }

  /// The number of bytes which can be appended to this builder without allocating another
  /// data buffer.
  int64_t current_block_bytes_remaining() const {
    return data_heap_builder_.current_remaining_bytes();
  }

  /// \brief Append a single non-null value
  ///
  /// \param[in] value pointer to the bytes to copy
  /// \param[in] length number of bytes to copy
  /// \return Status; if storing the value bytes fails, neither a validity bit
  /// nor a view has been appended
  Status Append(const uint8_t* value, int64_t length) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    // Store the value bytes first: this is the only step that can still fail,
    // and failing after the validity bit had been written would leave the
    // bitmap one element ahead of the views buffer.
    ARROW_ASSIGN_OR_RAISE(auto v,
                          data_heap_builder_.Append</*Safe=*/true>(value, length));
    UnsafeAppendToBitmap(true);
    data_builder_.UnsafeAppend(v);
    return Status::OK();
  }

  Status Append(const char* value, int64_t length) {
    return Append(reinterpret_cast<const uint8_t*>(value), length);
  }

  Status Append(std::string_view value) {
    return Append(value.data(), static_cast<int64_t>(value.size()));
  }

  /// \brief Append without checking capacity
  ///
  /// Builder should have been presized using Reserve() and ReserveData(),
  /// respectively, and the value must not be larger than 2GB
  void UnsafeAppend(const uint8_t* value, int64_t length) {
    UnsafeAppendToBitmap(true);
    auto v = data_heap_builder_.Append</*Safe=*/false>(value, length);
    data_builder_.UnsafeAppend(v);
  }

  void UnsafeAppend(const char* value, int64_t length) {
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
  }

  void UnsafeAppend(const std::string& value) {
    UnsafeAppend(value.c_str(), static_cast<int64_t>(value.size()));
  }

  void UnsafeAppend(std::string_view value) {
    UnsafeAppend(value.data(), static_cast<int64_t>(value.size()));
  }

  /// \brief Ensures there is enough allocated available capacity in the
  /// out-of-line data heap to append the indicated number of bytes without
  /// additional allocations
  Status ReserveData(int64_t length);

  /// \brief Append several null elements
  ///
  /// Each null slot is filled with a value-initialized view.
  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, BinaryViewType::c_type{});
    UnsafeSetNull(length);
    return Status::OK();
  }

  /// \brief Append a single null element
  Status AppendNull() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(false);
    return Status::OK();
  }

  /// \brief Append an empty element (length-0 inline string)
  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(true);
    return Status::OK();
  }

  /// \brief Append several empty elements
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, BinaryViewType::c_type{});
    UnsafeSetNotNull(length);
    return Status::OK();
  }

  /// \brief Append a null element without checking capacity
  void UnsafeAppendNull() {
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(false);
  }

  /// \brief Append an empty element without checking capacity
  void UnsafeAppendEmptyValue() {
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(true);
  }

  /// \brief Append a slice of a BinaryViewArray passed as an ArraySpan. Copies
  /// the underlying out-of-line string memory to avoid memory lifetime issues
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override;

  void Reset() override;

  /// \brief Resize the views buffer and the base builder to `capacity`
  /// elements, but never to fewer than kMinBuilderCapacity
  Status Resize(int64_t capacity) override {
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
    capacity = std::max(capacity, kMinBuilderCapacity);
    ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
    return ArrayBuilder::Resize(capacity);
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  std::shared_ptr<DataType> type() const override { return binary_view(); }

 protected:
  // One view per appended element
  TypedBufferBuilder<BinaryViewType::c_type> data_builder_;

  // Accumulates out-of-line data in fixed-size chunks which are then attached
  // to the resulting ArrayData
  internal::StringHeapBuilder data_heap_builder_;
};
|
|
|
|
/// \brief Builder class for UTF8 string view arrays
///
/// Reuses the BinaryViewBuilder constructors and implementation; only the
/// reported data type differs.
class ARROW_EXPORT StringViewBuilder : public BinaryViewBuilder {
 public:
  using BinaryViewBuilder::BinaryViewBuilder;

  /// \return the data type produced by this builder, i.e. utf8_view()
  std::shared_ptr<DataType> type() const override {
    return utf8_view();
  }
};
|
|
|
|
// ----------------------------------------------------------------------
|
|
// FixedSizeBinaryBuilder
|
|
|
|
/// \brief Builder class for fixed-size binary arrays
///
/// Every element occupies exactly byte_width() bytes in `byte_builder_`.
class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
 public:
  using TypeClass = FixedSizeBinaryType;

  explicit FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
                                  MemoryPool* pool = default_memory_pool(),
                                  int64_t alignment = kDefaultBufferAlignment);

  /// \brief Append a single value, copying byte_width() bytes from `value`
  Status Append(const uint8_t* value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(value);
    return Status::OK();
  }

  Status Append(const char* value) {
    return Append(reinterpret_cast<const uint8_t*>(value));
  }

  /// \brief Append a single value from a view
  ///
  /// The size of the view is only checked (via CheckValueSize) when NDEBUG is
  /// not defined; byte_width() bytes are copied from its start in all builds.
  Status Append(std::string_view view) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(view);
    return Status::OK();
  }

  Status Append(const std::string& s) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(s);
    return Status::OK();
  }

  Status Append(const Buffer& s) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(s);
    return Status::OK();
  }

  Status Append(const std::shared_ptr<Buffer>& s) { return Append(*s); }

  template <size_t NBYTES>
  Status Append(const std::array<uint8_t, NBYTES>& value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(
        std::string_view(reinterpret_cast<const char*>(value.data()), value.size()));
    return Status::OK();
  }

  Status AppendValues(const uint8_t* data, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* validity,
                      int64_t bitmap_offset);

  Status AppendNull() final;
  Status AppendNulls(int64_t length) final;

  Status AppendEmptyValue() final;
  Status AppendEmptyValues(int64_t length) final;

  /// \brief Append `length` elements of `array`, starting at element `offset`
  ///
  /// Buffer 1 of the span is read as the value data and buffer 0 as the
  /// validity bitmap; both are addressed relative to array.offset + offset.
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override {
    return AppendValues(
        array.GetValues<uint8_t>(1, 0) + ((array.offset + offset) * byte_width_), length,
        array.GetValues<uint8_t>(0, 0), array.offset + offset);
  }

  /// \brief Append a value without checking capacity
  void UnsafeAppend(const uint8_t* value) {
    UnsafeAppendToBitmap(true);
    // Nothing to copy for zero-width values
    if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
      byte_builder_.UnsafeAppend(value, byte_width_);
    }
  }

  void UnsafeAppend(const char* value) {
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value));
  }

  void UnsafeAppend(std::string_view value) {
#ifndef NDEBUG
    // CheckValueSize takes an int64_t, so convert the unsigned size explicitly
    // instead of relying on an implicit unsigned-to-signed conversion.
    CheckValueSize(static_cast<int64_t>(value.size()));
#endif
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()));
  }

  void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view{s}); }

  void UnsafeAppend(const std::shared_ptr<Buffer>& s) { UnsafeAppend(*s); }

  /// \brief Append a null without checking capacity
  ///
  /// The slot is filled with byte_width() zero bytes.
  void UnsafeAppendNull() {
    UnsafeAppendToBitmap(false);
    byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
  }

  /// \brief Check that appending `new_bytes` more bytes of value data would
  /// not exceed memory_limit()
  ///
  /// \return Status::CapacityError if the limit would be exceeded
  Status ValidateOverflow(int64_t new_bytes) const {
    auto new_size = byte_builder_.length() + new_bytes;
    if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
      return Status::CapacityError("array cannot contain more than ", memory_limit(),
                                   " bytes, have ", new_size);
    } else {
      return Status::OK();
    }
  }

  /// \brief Ensures there is enough allocated capacity to append the indicated
  /// number of bytes to the value data buffer without additional allocations
  Status ReserveData(int64_t elements) {
    ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
    return byte_builder_.Reserve(elements);
  }

  void Reset() override;
  Status Resize(int64_t capacity) override;
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<FixedSizeBinaryArray>* out) { return FinishTyped(out); }

  /// \return size of values buffer so far
  int64_t value_data_length() const { return byte_builder_.length(); }

  /// \return the width in bytes of each value
  int32_t byte_width() const { return byte_width_; }

  /// Temporary access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  const uint8_t* GetValue(int64_t i) const;

  /// Temporary mutable access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  uint8_t* GetMutableValue(int64_t i) {
    uint8_t* data_ptr = byte_builder_.mutable_data();
    return data_ptr + i * byte_width_;
  }

  /// Temporary access to a value.
  ///
  /// This view becomes invalid on the next modifying operation.
  std::string_view GetView(int64_t i) const;

  /// Advance builder without allocating nor writing any values
  ///
  /// The internal pointer is advanced by `length` values and the same number
  /// of non-null entries are appended to the validity bitmap.
  /// This method assumes that the `length` values were populated directly,
  /// for example using `GetMutableValue`.
  void UnsafeAdvance(int64_t length) {
    byte_builder_.UnsafeAdvance(length * byte_width_);
    UnsafeAppendToBitmap(length, true);
  }

  /// Advance builder without allocating nor writing any values
  ///
  /// The internal pointer is advanced by `length` values and the same number
  /// of validity bits are appended to the validity bitmap.
  /// This method assumes that the `length` values were populated directly,
  /// for example using `GetMutableValue`.
  void UnsafeAdvance(int64_t length, const uint8_t* validity, int64_t valid_bits_offset) {
    byte_builder_.UnsafeAdvance(length * byte_width_);
    UnsafeAppendToBitmap(validity, valid_bits_offset, length);
  }

  /// \return the largest total value data size accepted by ValidateOverflow()
  static constexpr int64_t memory_limit() {
    return std::numeric_limits<int64_t>::max() - 1;
  }

  std::shared_ptr<DataType> type() const override {
    return fixed_size_binary(byte_width_);
  }

 protected:
  int32_t byte_width_;
  BufferBuilder byte_builder_;

  void CheckValueSize(int64_t size);
};
|
|
|
|
/// @}
|
|
|
|
// ----------------------------------------------------------------------
|
|
// Chunked builders: build a sequence of BinaryArray or StringArray that are
|
|
// limited to a particular size (to the upper limit of 2GB)
|
|
|
|
namespace internal {
|
|
|
|
class ARROW_EXPORT ChunkedBinaryBuilder {
|
|
public:
|
|
explicit ChunkedBinaryBuilder(int32_t max_chunk_value_length,
|
|
MemoryPool* pool = default_memory_pool());
|
|
|
|
ChunkedBinaryBuilder(int32_t max_chunk_value_length, int32_t max_chunk_length,
|
|
MemoryPool* pool = default_memory_pool());
|
|
|
|
virtual ~ChunkedBinaryBuilder() = default;
|
|
|
|
Status Append(const uint8_t* value, int32_t length) {
|
|
if (ARROW_PREDICT_FALSE(length + builder_->value_data_length() >
|
|
max_chunk_value_length_)) {
|
|
if (builder_->value_data_length() == 0) {
|
|
// The current item is larger than max_chunk_size_;
|
|
// this chunk will be oversize and hold *only* this item
|
|
ARROW_RETURN_NOT_OK(builder_->Append(value, length));
|
|
return NextChunk();
|
|
}
|
|
// The current item would cause builder_->value_data_length() to exceed
|
|
// max_chunk_size_, so finish this chunk and append the current item to the next
|
|
// chunk
|
|
ARROW_RETURN_NOT_OK(NextChunk());
|
|
return Append(value, length);
|
|
}
|
|
|
|
if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
|
|
// The current item would cause builder_->length() to exceed max_chunk_length_, so
|
|
// finish this chunk and append the current item to the next chunk
|
|
ARROW_RETURN_NOT_OK(NextChunk());
|
|
}
|
|
|
|
return builder_->Append(value, length);
|
|
}
|
|
|
|
Status Append(std::string_view value) {
|
|
return Append(reinterpret_cast<const uint8_t*>(value.data()),
|
|
static_cast<int32_t>(value.size()));
|
|
}
|
|
|
|
Status AppendNull() {
|
|
if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
|
|
ARROW_RETURN_NOT_OK(NextChunk());
|
|
}
|
|
return builder_->AppendNull();
|
|
}
|
|
|
|
Status Reserve(int64_t values);
|
|
|
|
virtual Status Finish(ArrayVector* out);
|
|
|
|
protected:
|
|
Status NextChunk();
|
|
|
|
// maximum total character data size per chunk
|
|
int64_t max_chunk_value_length_;
|
|
|
|
// maximum elements allowed per chunk
|
|
int64_t max_chunk_length_ = kListMaximumElements;
|
|
|
|
// when Reserve() would cause builder_ to exceed its max_chunk_length_,
|
|
// add to extra_capacity_ instead and wait to reserve until the next chunk
|
|
int64_t extra_capacity_ = 0;
|
|
|
|
std::unique_ptr<BinaryBuilder> builder_;
|
|
std::vector<std::shared_ptr<Array>> chunks_;
|
|
};
|
|
|
|
/// \brief Chunked builder that reuses the ChunkedBinaryBuilder constructors
/// and appending logic, and overrides only Finish()
class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder {
 public:
  using ChunkedBinaryBuilder::ChunkedBinaryBuilder;

  /// Defined out of line; overrides ChunkedBinaryBuilder::Finish.
  Status Finish(ArrayVector* out) override;
};
|
|
|
|
} // namespace internal
|
|
|
|
} // namespace arrow
|