198 lines
7.3 KiB
C++
198 lines
7.3 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
#pragma once
|
|
|
|
#include <memory>
|
|
#include <vector>
|
|
|
|
#include "arrow/compute/kernel.h"
|
|
#include "arrow/compute/visibility.h"
|
|
#include "arrow/datum.h"
|
|
#include "arrow/result.h"
|
|
#include "arrow/util/visibility.h"
|
|
|
|
namespace arrow {
|
|
namespace compute {
|
|
|
|
/// \brief A segment
|
|
/// A segment group is a chunk of continuous rows that have the same segment key. (For
|
|
/// example, in ordered time series processing, segment key can be "date", and a segment
|
|
/// group can be all the rows that belong to the same date.) A segment group can span
|
|
/// across multiple exec batches. A segment is a chunk of continuous rows that has the
|
|
/// same segment key within a given batch. When a segment group span cross batches, it
|
|
/// will have multiple segments. A segment never spans cross batches. The segment data
|
|
/// structure only makes sense when used along with a exec batch.
|
|
struct ARROW_COMPUTE_EXPORT Segment {
|
|
/// \brief the offset into the batch where the segment starts
|
|
int64_t offset;
|
|
/// \brief the length of the segment
|
|
int64_t length;
|
|
/// \brief whether the segment may be extended by a next one
|
|
bool is_open;
|
|
/// \brief whether the segment extends a preceeding one
|
|
bool extends;
|
|
};
|
|
|
|
inline bool operator==(const Segment& segment1, const Segment& segment2) {
|
|
return segment1.offset == segment2.offset && segment1.length == segment2.length &&
|
|
segment1.is_open == segment2.is_open && segment1.extends == segment2.extends;
|
|
}
|
|
inline bool operator!=(const Segment& segment1, const Segment& segment2) {
|
|
return !(segment1 == segment2);
|
|
}
|
|
|
|
/// \brief a helper class to divide a batch into segments of equal values
|
|
///
|
|
/// For example, given a batch with two columns specifed as segment keys:
|
|
///
|
|
/// A A [other columns]...
|
|
/// A A ...
|
|
/// A B ...
|
|
/// A B ...
|
|
/// A A ...
|
|
///
|
|
/// Then the batch could be divided into 3 segments. The first would be rows 0 & 1,
|
|
/// the second would be rows 2 & 3, and the third would be row 4.
|
|
///
|
|
/// Further, a segmenter keeps track of the last value seen. This allows it to calculate
|
|
/// segments which span batches. In our above example the last batch we emit would set
|
|
/// the "open" flag, which indicates whether the segment may extend into the next batch.
|
|
///
|
|
/// If the next call to the segmenter starts with `A A` then that segment would set the
|
|
/// "extends" flag, which indicates whether the segment continues the last open batch.
|
|
class ARROW_COMPUTE_EXPORT RowSegmenter {
|
|
public:
|
|
virtual ~RowSegmenter() = default;
|
|
|
|
/// \brief Construct a Segmenter which segments on the specified key types
|
|
///
|
|
/// \param[in] key_types the specified key types
|
|
/// \param[in] nullable_keys whether values of the specified keys may be null
|
|
/// \param[in] ctx the execution context to use
|
|
static Result<std::unique_ptr<RowSegmenter>> Make(
|
|
const std::vector<TypeHolder>& key_types, bool nullable_keys, ExecContext* ctx);
|
|
|
|
/// \brief Return the key types of this segmenter
|
|
virtual const std::vector<TypeHolder>& key_types() const = 0;
|
|
|
|
/// \brief Reset this segmenter
|
|
///
|
|
/// A segmenter normally extends (see `Segment`) a segment from one batch to the next.
|
|
/// If segment-extension is undesirable, for example when each batch is processed
|
|
/// independently, then `Reset` should be invoked before processing the next batch.
|
|
virtual Status Reset() = 0;
|
|
|
|
/// \brief Get all segments for the given batch
|
|
virtual Result<std::vector<Segment>> GetSegments(const ExecSpan& batch) = 0;
|
|
};
|
|
|
|
/// Consumes batches of keys and yields batches of the group ids.
|
|
class ARROW_COMPUTE_EXPORT Grouper {
|
|
public:
|
|
virtual ~Grouper() = default;
|
|
|
|
/// Construct a Grouper which receives the specified key types
|
|
static Result<std::unique_ptr<Grouper>> Make(const std::vector<TypeHolder>& key_types,
|
|
ExecContext* ctx = default_exec_context());
|
|
|
|
/// Reset all intermediate state, make the grouper logically as just `Make`ed.
|
|
/// The underlying buffers, if any, may or may not be released though.
|
|
virtual Status Reset() = 0;
|
|
|
|
/// Consume a batch of keys, producing the corresponding group ids as an integer array,
|
|
/// over a slice defined by an offset and length, which defaults to the batch length.
|
|
/// Currently only uint32 indices will be produced, eventually the bit width will only
|
|
/// be as wide as necessary.
|
|
virtual Result<Datum> Consume(const ExecSpan& batch, int64_t offset = 0,
|
|
int64_t length = -1) = 0;
|
|
|
|
/// Like Consume, but groups not already encountered emit null instead of
|
|
/// generating a new group id.
|
|
virtual Result<Datum> Lookup(const ExecSpan& batch, int64_t offset = 0,
|
|
int64_t length = -1) = 0;
|
|
|
|
/// Like Consume, but only populates the Grouper without returning the group ids.
|
|
virtual Status Populate(const ExecSpan& batch, int64_t offset = 0,
|
|
int64_t length = -1) = 0;
|
|
|
|
/// Get current unique keys. May be called multiple times.
|
|
virtual Result<ExecBatch> GetUniques() = 0;
|
|
|
|
/// Get the current number of groups.
|
|
virtual uint32_t num_groups() const = 0;
|
|
|
|
/// \brief Assemble lists of indices of identical elements.
|
|
///
|
|
/// \param[in] ids An unsigned, all-valid integral array which will be
|
|
/// used as grouping criteria.
|
|
/// \param[in] num_groups An upper bound for the elements of ids
|
|
/// \param[in] ctx Execution context to use during the operation
|
|
/// \return A num_groups-long ListArray where the slot at i contains a
|
|
/// list of indices where i appears in ids.
|
|
///
|
|
/// MakeGroupings([
|
|
/// 2,
|
|
/// 2,
|
|
/// 5,
|
|
/// 5,
|
|
/// 2,
|
|
/// 3
|
|
/// ], 8) == [
|
|
/// [],
|
|
/// [],
|
|
/// [0, 1, 4],
|
|
/// [5],
|
|
/// [],
|
|
/// [2, 3],
|
|
/// [],
|
|
/// []
|
|
/// ]
|
|
static Result<std::shared_ptr<ListArray>> MakeGroupings(
|
|
const UInt32Array& ids, uint32_t num_groups,
|
|
ExecContext* ctx = default_exec_context());
|
|
|
|
/// \brief Produce a ListArray whose slots are selections of `array` which correspond to
|
|
/// the provided groupings.
|
|
///
|
|
/// For example,
|
|
/// ApplyGroupings([
|
|
/// [],
|
|
/// [],
|
|
/// [0, 1, 4],
|
|
/// [5],
|
|
/// [],
|
|
/// [2, 3],
|
|
/// [],
|
|
/// []
|
|
/// ], [2, 2, 5, 5, 2, 3]) == [
|
|
/// [],
|
|
/// [],
|
|
/// [2, 2, 2],
|
|
/// [3],
|
|
/// [],
|
|
/// [5, 5],
|
|
/// [],
|
|
/// []
|
|
/// ]
|
|
static Result<std::shared_ptr<ListArray>> ApplyGroupings(
|
|
const ListArray& groupings, const Array& array,
|
|
ExecContext* ctx = default_exec_context());
|
|
};
|
|
|
|
} // namespace compute
|
|
} // namespace arrow
|