Initial commit
This commit is contained in:
@@ -0,0 +1,162 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <optional>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/acero/visibility.h"
|
||||
#include "arrow/compute/exec.h"
|
||||
#include "arrow/result.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace acero {
|
||||
namespace util {
|
||||
|
||||
using arrow::compute::ExecBatch;
|
||||
|
||||
/// \brief A container that accumulates batches until they are ready to
|
||||
/// be processed.
|
||||
class ARROW_ACERO_EXPORT AccumulationQueue {
|
||||
public:
|
||||
AccumulationQueue() : row_count_(0) {}
|
||||
~AccumulationQueue() = default;
|
||||
|
||||
// We should never be copying ExecBatch around
|
||||
AccumulationQueue(const AccumulationQueue&) = delete;
|
||||
AccumulationQueue& operator=(const AccumulationQueue&) = delete;
|
||||
|
||||
AccumulationQueue(AccumulationQueue&& that);
|
||||
AccumulationQueue& operator=(AccumulationQueue&& that);
|
||||
|
||||
void Concatenate(AccumulationQueue&& that);
|
||||
void InsertBatch(ExecBatch batch);
|
||||
int64_t row_count() { return row_count_; }
|
||||
size_t batch_count() { return batches_.size(); }
|
||||
bool empty() const { return batches_.empty(); }
|
||||
void Clear();
|
||||
ExecBatch& operator[](size_t i);
|
||||
|
||||
private:
|
||||
int64_t row_count_;
|
||||
std::vector<ExecBatch> batches_;
|
||||
};
|
||||
|
||||
/// A queue that sequences incoming batches
|
||||
///
|
||||
/// This can be used when a node needs to do some kind of ordered processing on
|
||||
/// the stream.
|
||||
///
|
||||
/// Batches can be inserted in any order. The process_callback will be called on
|
||||
/// the batches, in order, without reentrant calls. For this reason the callback
|
||||
/// should be quick.
|
||||
///
|
||||
/// For example, in a top-n node, the process callback should determine how many
|
||||
/// rows need to be delivered for the given batch, and then return a task to actually
|
||||
/// deliver those rows.
|
||||
class ARROW_ACERO_EXPORT SequencingQueue {
|
||||
public:
|
||||
using Task = std::function<Status()>;
|
||||
|
||||
/// Strategy that describes how to handle items
|
||||
class Processor {
|
||||
public:
|
||||
/// Process the batch, potentially generating a task
|
||||
///
|
||||
/// This method will be called on each batch in order. Calls to this method
|
||||
/// will be serialized and it will not be called reentrantly. This makes it
|
||||
/// safe to do things that rely on order but minimal time should be spent here
|
||||
/// to avoid becoming a bottleneck.
|
||||
///
|
||||
/// \return a follow-up task that will be scheduled. The follow-up task(s) are
|
||||
/// is not guaranteed to run in any particular order. If nullopt is
|
||||
/// returned then nothing will be scheduled.
|
||||
virtual Result<std::optional<Task>> Process(ExecBatch batch) = 0;
|
||||
/// Schedule a task
|
||||
virtual void Schedule(Task task) = 0;
|
||||
};
|
||||
|
||||
virtual ~SequencingQueue() = default;
|
||||
|
||||
/// Insert a batch into the queue
|
||||
///
|
||||
/// This will insert the batch into the queue. If this batch was the next batch
|
||||
/// to deliver then this will trigger 1+ calls to the process callback to generate
|
||||
/// 1+ tasks.
|
||||
///
|
||||
/// The task generated by this call will be executed immediately. The remaining
|
||||
/// tasks will be scheduled using the schedule callback.
|
||||
///
|
||||
/// From a data pipeline perspective the sequencing queue is a "sometimes" breaker. If
|
||||
/// a task arrives in order then this call will usually execute the downstream pipeline.
|
||||
/// If this task arrives early then this call will only queue the data.
|
||||
virtual Status InsertBatch(ExecBatch batch) = 0;
|
||||
|
||||
/// Create a queue
|
||||
/// \param processor describes how to process the batches, must outlive the queue
|
||||
static std::unique_ptr<SequencingQueue> Make(Processor* processor);
|
||||
};
|
||||
|
||||
/// A queue that sequences incoming batches
|
||||
///
|
||||
/// Unlike SequencingQueue the Process method is not expected to schedule new tasks.
|
||||
///
|
||||
/// If a batch arrives and another thread is currently processing then the batch
|
||||
/// will be queued and control will return. In other words, delivery of batches will
|
||||
/// not block on the Process method.
|
||||
///
|
||||
/// It can be helpful to think of this as if a dedicated thread is running Process as
|
||||
/// batches arrive
|
||||
class ARROW_ACERO_EXPORT SerialSequencingQueue {
|
||||
public:
|
||||
/// Strategy that describes how to handle items
|
||||
class Processor {
|
||||
public:
|
||||
virtual ~Processor() = default;
|
||||
/// Process the batch
|
||||
///
|
||||
/// This method will be called on each batch in order. Calls to this method
|
||||
/// will be serialized and it will not be called reentrantly. This makes it
|
||||
/// safe to do things that rely on order.
|
||||
///
|
||||
/// If this falls behind then data may accumulate
|
||||
///
|
||||
/// TODO: Could add backpressure if needed but right now all uses of this should
|
||||
/// be pretty fast and so are unlikely to block.
|
||||
virtual Status Process(ExecBatch batch) = 0;
|
||||
};
|
||||
|
||||
virtual ~SerialSequencingQueue() = default;
|
||||
|
||||
/// Insert a batch into the queue
|
||||
///
|
||||
/// This will insert the batch into the queue. If this batch was the next batch
|
||||
/// to deliver then this may trigger calls to the processor which will be run
|
||||
/// as part of this call.
|
||||
virtual Status InsertBatch(ExecBatch batch) = 0;
|
||||
|
||||
/// Create a queue
|
||||
/// \param processor describes how to process the batches, must outlive the queue
|
||||
static std::unique_ptr<SerialSequencingQueue> Make(Processor* processor);
|
||||
};
|
||||
|
||||
} // namespace util
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,58 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/acero/visibility.h"
|
||||
#include "arrow/compute/api_aggregate.h"
|
||||
#include "arrow/compute/test_util_internal.h"
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace acero {
|
||||
namespace aggregate {
|
||||
|
||||
using compute::Aggregate;
|
||||
using compute::default_exec_context;
|
||||
using compute::ExecContext;
|
||||
|
||||
/// \brief Make the output schema of an aggregate node
|
||||
///
|
||||
/// The output schema is determined by the aggregation kernels, which may depend on the
|
||||
/// ExecContext argument. To guarantee correct results, the same ExecContext argument
|
||||
/// should be used in execution.
|
||||
///
|
||||
/// \param[in] input_schema the schema of the input to the node
|
||||
/// \param[in] keys the grouping keys for the aggregation
|
||||
/// \param[in] segment_keys the segmenting keys for the aggregation
|
||||
/// \param[in] aggregates the aggregates for the aggregation
|
||||
/// \param[in] exec_ctx the execution context for the aggregation
|
||||
ARROW_ACERO_EXPORT Result<std::shared_ptr<Schema>> MakeOutputSchema(
|
||||
const std::shared_ptr<Schema>& input_schema, const std::vector<FieldRef>& keys,
|
||||
const std::vector<FieldRef>& segment_keys, const std::vector<Aggregate>& aggregates,
|
||||
ExecContext* exec_ctx = default_exec_context());
|
||||
|
||||
} // namespace aggregate
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,32 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// NOTE: API is EXPERIMENTAL and will change without going through a
|
||||
// deprecation cycle
|
||||
|
||||
#pragma once
|
||||
|
||||
/// \defgroup acero-api Utilities for creating and executing execution plans
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup acero-nodes Options classes for the various exec nodes
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
#include "arrow/acero/exec_plan.h"
|
||||
#include "arrow/acero/options.h"
|
||||
@@ -0,0 +1,41 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/acero/options.h"
|
||||
#include "arrow/acero/visibility.h"
|
||||
#include "arrow/compute/exec.h"
|
||||
#include "arrow/type.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace acero {
|
||||
namespace asofjoin {
|
||||
|
||||
using AsofJoinKeys = AsofJoinNodeOptions::Keys;
|
||||
|
||||
/// \brief Make the output schema of an as-of-join node
|
||||
///
|
||||
/// \param[in] input_schema the schema of each input to the node
|
||||
/// \param[in] input_keys the key of each input to the node
|
||||
ARROW_ACERO_EXPORT Result<std::shared_ptr<Schema>> MakeOutputSchema(
|
||||
const std::vector<std::shared_ptr<Schema>>& input_schema,
|
||||
const std::vector<AsofJoinKeys>& input_keys);
|
||||
|
||||
} // namespace asofjoin
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,64 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
#include "arrow/acero/exec_plan.h"
|
||||
#include "arrow/acero/options.h"
|
||||
|
||||
#include <memory>
|
||||
|
||||
namespace arrow::acero {
|
||||
|
||||
class BackpressureHandler {
|
||||
private:
|
||||
BackpressureHandler(size_t low_threshold, size_t high_threshold,
|
||||
std::unique_ptr<BackpressureControl> backpressure_control)
|
||||
: low_threshold_(low_threshold),
|
||||
high_threshold_(high_threshold),
|
||||
backpressure_control_(std::move(backpressure_control)) {}
|
||||
|
||||
public:
|
||||
static Result<BackpressureHandler> Make(
|
||||
size_t low_threshold, size_t high_threshold,
|
||||
std::unique_ptr<BackpressureControl> backpressure_control) {
|
||||
if (low_threshold >= high_threshold) {
|
||||
return Status::Invalid("low threshold (", low_threshold,
|
||||
") must be less than high threshold (", high_threshold, ")");
|
||||
}
|
||||
if (backpressure_control == NULLPTR) {
|
||||
return Status::Invalid("null backpressure control parameter");
|
||||
}
|
||||
BackpressureHandler backpressure_handler(low_threshold, high_threshold,
|
||||
std::move(backpressure_control));
|
||||
return backpressure_handler;
|
||||
}
|
||||
|
||||
void Handle(size_t start_level, size_t end_level) {
|
||||
if (start_level < high_threshold_ && end_level >= high_threshold_) {
|
||||
backpressure_control_->Pause();
|
||||
} else if (start_level > low_threshold_ && end_level <= low_threshold_) {
|
||||
backpressure_control_->Resume();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
size_t low_threshold_;
|
||||
size_t high_threshold_;
|
||||
std::unique_ptr<BackpressureControl> backpressure_control_;
|
||||
};
|
||||
|
||||
} // namespace arrow::acero
|
||||
@@ -0,0 +1,48 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "benchmark/benchmark.h"
|
||||
|
||||
#include "arrow/acero/exec_plan.h"
|
||||
#include "arrow/acero/test_util_internal.h"
|
||||
#include "arrow/compute/exec.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace acero {
|
||||
|
||||
/// Benchmark helper; declared here, defined in the corresponding source file.
///
/// NOTE(review): judging by the name, this presumably measures the overhead of
/// running `data` through the nodes in `node_declarations` -- confirm against
/// the definition.
///
/// \param state the Google Benchmark state for the running benchmark
/// \param num_batches batch count (presumably of `data` -- TODO confirm)
/// \param batch_size batch size (presumably rows per batch -- TODO confirm)
/// \param data the batches, with their schema, used as input
/// \param node_declarations declarations of the nodes under test
/// \param pool memory pool to use; defaults to default_memory_pool()
Status BenchmarkNodeOverhead(benchmark::State& state, int32_t num_batches,
                             int32_t batch_size, arrow::acero::BatchesWithSchema data,
                             std::vector<arrow::acero::Declaration>& node_declarations,
                             arrow::MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// Benchmark helper; declared here, defined in the corresponding source file.
///
/// NOTE(review): judging by the name, this presumably measures the overhead of
/// a single node created via `factory_name` in isolation -- confirm against the
/// definition.
///
/// \param state the Google Benchmark state for the running benchmark
/// \param expr a compute expression (its role is not visible here -- TODO confirm)
/// \param num_batches batch count (presumably of `data` -- TODO confirm)
/// \param batch_size batch size (presumably rows per batch -- TODO confirm)
/// \param data the batches, with their schema, used as input
/// \param factory_name name of the node factory to use
/// \param options options passed for the node
/// \param pool memory pool to use; defaults to default_memory_pool()
Status BenchmarkIsolatedNodeOverhead(benchmark::State& state,
                                     arrow::compute::Expression expr,
                                     int32_t num_batches, int32_t batch_size,
                                     arrow::acero::BatchesWithSchema data,
                                     std::string factory_name,
                                     arrow::acero::ExecNodeOptions& options,
                                     arrow::MemoryPool* pool = default_memory_pool());
|
||||
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,323 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/acero/partition_util.h"
|
||||
#include "arrow/acero/util.h"
|
||||
#include "arrow/memory_pool.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/simd.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace acero {
|
||||
|
||||
// A set of pre-generated bit masks from a 64-bit word.
|
||||
//
|
||||
// It is used to map selected bits of hash to a bit mask that will be used in
|
||||
// a Bloom filter.
|
||||
//
|
||||
// These bit masks need to look random and need to have a similar fractions of
|
||||
// bits set in order for a Bloom filter to have a low false positives rate.
|
||||
//
|
||||
struct ARROW_ACERO_EXPORT BloomFilterMasks {
|
||||
// Generate all masks as a single bit vector. Each bit offset in this bit
|
||||
// vector corresponds to a single mask.
|
||||
// In each consecutive kBitsPerMask bits, there must be between
|
||||
// kMinBitsSet and kMaxBitsSet bits set.
|
||||
//
|
||||
BloomFilterMasks();
|
||||
|
||||
inline uint64_t mask(int bit_offset) {
|
||||
#if ARROW_LITTLE_ENDIAN
|
||||
return (arrow::util::SafeLoadAs<uint64_t>(masks_ + bit_offset / 8) >>
|
||||
(bit_offset % 8)) &
|
||||
kFullMask;
|
||||
#else
|
||||
return (BYTESWAP(arrow::util::SafeLoadAs<uint64_t>(masks_ + bit_offset / 8)) >>
|
||||
(bit_offset % 8)) &
|
||||
kFullMask;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Masks are 57 bits long because then they can be accessed at an
|
||||
// arbitrary bit offset using a single unaligned 64-bit load instruction.
|
||||
//
|
||||
static constexpr int kBitsPerMask = 57;
|
||||
static constexpr uint64_t kFullMask = (1ULL << kBitsPerMask) - 1;
|
||||
|
||||
// Minimum and maximum number of bits set in each mask.
|
||||
// This constraint is enforced when generating the bit masks.
|
||||
// Values should be close to each other and chosen as to minimize a Bloom
|
||||
// filter false positives rate.
|
||||
//
|
||||
static constexpr int kMinBitsSet = 4;
|
||||
static constexpr int kMaxBitsSet = 5;
|
||||
|
||||
// Number of generated masks.
|
||||
// Having more masks to choose will improve false positives rate of Bloom
|
||||
// filter but will also use more memory, which may lead to more CPU cache
|
||||
// misses.
|
||||
// The chosen value results in using only a few cache-lines for mask lookups,
|
||||
// while providing a good variety of available bit masks.
|
||||
//
|
||||
static constexpr int kLogNumMasks = 10;
|
||||
static constexpr int kNumMasks = 1 << kLogNumMasks;
|
||||
|
||||
// Data of masks. Masks are stored in a single bit vector. Nth mask is
|
||||
// kBitsPerMask bits starting at bit offset N.
|
||||
//
|
||||
static constexpr int kTotalBytes = (kNumMasks + 64) / 8;
|
||||
uint8_t masks_[kTotalBytes];
|
||||
};
|
||||
|
||||
// A variant of a blocked Bloom filter implementation.
|
||||
// A Bloom filter is a data structure that provides approximate membership test
|
||||
// functionality based only on the hash of the key. Membership test may return
|
||||
// false positives but not false negatives. Approximation of the result allows
|
||||
// in general case (for arbitrary data types of keys) to save on both memory and
|
||||
// lookup cost compared to the accurate membership test.
|
||||
// The accurate test may sometimes still be cheaper for a specific data types
|
||||
// and inputs, e.g. integers from a small range.
|
||||
//
|
||||
// This blocked Bloom filter is optimized for use in hash joins, to achieve a
|
||||
// good balance between the size of the filter, the cost of its building and
|
||||
// querying and the rate of false positives.
|
||||
//
|
||||
class ARROW_ACERO_EXPORT BlockedBloomFilter {
|
||||
friend class BloomFilterBuilder_SingleThreaded;
|
||||
friend class BloomFilterBuilder_Parallel;
|
||||
|
||||
public:
|
||||
BlockedBloomFilter() : log_num_blocks_(0), num_blocks_(0), blocks_(NULLPTR) {}
|
||||
|
||||
inline bool Find(uint64_t hash) const {
|
||||
uint64_t m = mask(hash);
|
||||
uint64_t b = blocks_[block_id(hash)];
|
||||
return (b & m) == m;
|
||||
}
|
||||
|
||||
// Uses SIMD if available for smaller Bloom filters.
|
||||
// Uses memory prefetching for larger Bloom filters.
|
||||
//
|
||||
void Find(int64_t hardware_flags, int64_t num_rows, const uint32_t* hashes,
|
||||
uint8_t* result_bit_vector, bool enable_prefetch = true) const;
|
||||
void Find(int64_t hardware_flags, int64_t num_rows, const uint64_t* hashes,
|
||||
uint8_t* result_bit_vector, bool enable_prefetch = true) const;
|
||||
|
||||
int log_num_blocks() const { return log_num_blocks_; }
|
||||
|
||||
int NumHashBitsUsed() const;
|
||||
|
||||
bool IsSameAs(const BlockedBloomFilter* other) const;
|
||||
|
||||
int64_t NumBitsSet() const;
|
||||
|
||||
// Folding of a block Bloom filter after the initial version
|
||||
// has been built.
|
||||
//
|
||||
// One of the parameters for creation of Bloom filter is the number
|
||||
// of bits allocated for it. The more bits allocated, the lower the
|
||||
// probability of false positives. A good heuristic is to aim for
|
||||
// half of the bits set in the constructed Bloom filter. This should
|
||||
// result in a good trade off between size (and following cost of
|
||||
// memory accesses) and false positives rate.
|
||||
//
|
||||
// There might have been many duplicate keys in the input provided
|
||||
// to Bloom filter builder. In that case the resulting bit vector
|
||||
// would be more sparse then originally intended. It is possible to
|
||||
// easily correct that and cut in half the size of Bloom filter
|
||||
// after it has already been constructed. The process to do that is
|
||||
// approximately equal to OR-ing bits from upper and lower half (the
|
||||
// way we address these bits when inserting or querying a hash makes
|
||||
// such folding in half possible).
|
||||
//
|
||||
// We will keep folding as long as the fraction of bits set is less
|
||||
// than 1/4. The resulting bit vector density should be in the [1/4,
|
||||
// 1/2) range.
|
||||
//
|
||||
void Fold();
|
||||
|
||||
private:
|
||||
Status CreateEmpty(int64_t num_rows_to_insert, MemoryPool* pool);
|
||||
|
||||
inline void Insert(uint64_t hash) {
|
||||
uint64_t m = mask(hash);
|
||||
uint64_t& b = blocks_[block_id(hash)];
|
||||
b |= m;
|
||||
}
|
||||
|
||||
void Insert(int64_t hardware_flags, int64_t num_rows, const uint32_t* hashes);
|
||||
void Insert(int64_t hardware_flags, int64_t num_rows, const uint64_t* hashes);
|
||||
|
||||
inline uint64_t mask(uint64_t hash) const {
|
||||
// The lowest bits of hash are used to pick mask index.
|
||||
//
|
||||
int mask_id = static_cast<int>(hash & (BloomFilterMasks::kNumMasks - 1));
|
||||
uint64_t result = masks_.mask(mask_id);
|
||||
|
||||
// The next set of hash bits is used to pick the amount of bit
|
||||
// rotation of the mask.
|
||||
//
|
||||
int rotation = (hash >> BloomFilterMasks::kLogNumMasks) & 63;
|
||||
result = ROTL64(result, rotation);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
inline int64_t block_id(uint64_t hash) const {
|
||||
// The next set of hash bits following the bits used to select a
|
||||
// mask is used to pick block id (index of 64-bit word in a bit
|
||||
// vector).
|
||||
//
|
||||
return (hash >> (BloomFilterMasks::kLogNumMasks + 6)) & (num_blocks_ - 1);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void InsertImp(int64_t num_rows, const T* hashes);
|
||||
|
||||
template <typename T>
|
||||
inline void FindImp(int64_t num_rows, const T* hashes, uint8_t* result_bit_vector,
|
||||
bool enable_prefetch) const;
|
||||
|
||||
void SingleFold(int num_folds);
|
||||
|
||||
#if defined(ARROW_HAVE_RUNTIME_AVX2)
|
||||
inline __m256i mask_avx2(__m256i hash) const;
|
||||
inline __m256i block_id_avx2(__m256i hash) const;
|
||||
int64_t Insert_avx2(int64_t num_rows, const uint32_t* hashes);
|
||||
int64_t Insert_avx2(int64_t num_rows, const uint64_t* hashes);
|
||||
template <typename T>
|
||||
int64_t InsertImp_avx2(int64_t num_rows, const T* hashes);
|
||||
int64_t Find_avx2(int64_t num_rows, const uint32_t* hashes,
|
||||
uint8_t* result_bit_vector) const;
|
||||
int64_t Find_avx2(int64_t num_rows, const uint64_t* hashes,
|
||||
uint8_t* result_bit_vector) const;
|
||||
template <typename T>
|
||||
int64_t FindImp_avx2(int64_t num_rows, const T* hashes,
|
||||
uint8_t* result_bit_vector) const;
|
||||
#endif
|
||||
|
||||
bool UsePrefetch() const {
|
||||
return num_blocks_ * sizeof(uint64_t) > kPrefetchLimitBytes;
|
||||
}
|
||||
|
||||
static constexpr int64_t kPrefetchLimitBytes = 256 * 1024;
|
||||
|
||||
static BloomFilterMasks masks_;
|
||||
|
||||
// Total number of bits used by block Bloom filter must be a power
|
||||
// of 2.
|
||||
//
|
||||
int log_num_blocks_;
|
||||
int64_t num_blocks_;
|
||||
|
||||
// Buffer allocated to store an array of power of 2 64-bit blocks.
|
||||
//
|
||||
std::shared_ptr<Buffer> buf_;
|
||||
// Pointer to mutable data owned by Buffer
|
||||
//
|
||||
uint64_t* blocks_;
|
||||
};
|
||||
|
||||
// Selects one of the two Bloom filter build implementations: single-threaded
// or multi-threaded.
//
// The single-threaded implementation is useful for two reasons:
// a) Tests can use it to verify the parallel implementation (it is the simpler
// of the two and can serve as the source of truth).
// b) It is the better choice for small and medium size Bloom filters, since it
// avoids the extra synchronization-related steps of the parallel variant
// (partitioning and taking locks).
//
enum class BloomFilterBuildStrategy : int {
  // Build using the single-threaded implementation.
  SINGLE_THREADED = 0,
  // Build using the multi-threaded implementation.
  PARALLEL = 1,
};
|
||||
|
||||
class ARROW_ACERO_EXPORT BloomFilterBuilder {
|
||||
public:
|
||||
virtual ~BloomFilterBuilder() = default;
|
||||
virtual Status Begin(size_t num_threads, int64_t hardware_flags, MemoryPool* pool,
|
||||
int64_t num_rows, int64_t num_batches,
|
||||
BlockedBloomFilter* build_target) = 0;
|
||||
virtual int64_t num_tasks() const { return 0; }
|
||||
virtual Status PushNextBatch(size_t thread_index, int64_t num_rows,
|
||||
const uint32_t* hashes) = 0;
|
||||
virtual Status PushNextBatch(size_t thread_index, int64_t num_rows,
|
||||
const uint64_t* hashes) = 0;
|
||||
virtual void CleanUp() {}
|
||||
static std::unique_ptr<BloomFilterBuilder> Make(BloomFilterBuildStrategy strategy);
|
||||
};
|
||||
|
||||
// Single-threaded BloomFilterBuilder.
//
// The thread index passed to PushNextBatch is unnamed in these declarations.
//
class ARROW_ACERO_EXPORT BloomFilterBuilder_SingleThreaded : public BloomFilterBuilder {
 public:
  Status Begin(size_t num_threads, int64_t hardware_flags, MemoryPool* pool,
               int64_t num_rows, int64_t num_batches,
               BlockedBloomFilter* build_target) override;

  Status PushNextBatch(size_t /*thread_index*/, int64_t num_rows,
                       const uint32_t* hashes) override;

  Status PushNextBatch(size_t /*thread_index*/, int64_t num_rows,
                       const uint64_t* hashes) override;

 private:
  // Shared implementation behind both PushNextBatch overloads.
  template <typename T>
  void PushNextBatchImp(int64_t num_rows, const T* hashes);

  // Default member initializers give these a determinate value from
  // construction onwards rather than leaving them indeterminate.
  int64_t hardware_flags_ = 0;
  BlockedBloomFilter* build_target_ = NULLPTR;
};
|
||||
|
||||
// Multi-threaded BloomFilterBuilder.
//
// Keeps per-thread scratch state plus a set of partition locks.
// NOTE(review): presumably hashes are partitioned per thread and each partition
// is inserted under its lock -- confirm against the implementation.
//
class ARROW_ACERO_EXPORT BloomFilterBuilder_Parallel : public BloomFilterBuilder {
 public:
  Status Begin(size_t num_threads, int64_t hardware_flags, MemoryPool* pool,
               int64_t num_rows, int64_t num_batches,
               BlockedBloomFilter* build_target) override;

  Status PushNextBatch(size_t thread_id, int64_t num_rows,
                       const uint32_t* hashes) override;

  Status PushNextBatch(size_t thread_id, int64_t num_rows,
                       const uint64_t* hashes) override;

  void CleanUp() override;

 private:
  // Shared implementation behind both PushNextBatch overloads.
  template <typename T>
  void PushNextBatchImp(size_t thread_id, int64_t num_rows, const T* hashes);

  // Default member initializers give these a determinate value from
  // construction onwards rather than leaving them indeterminate.
  int64_t hardware_flags_ = 0;
  BlockedBloomFilter* build_target_ = NULLPTR;
  int log_num_prtns_ = 0;

  // Scratch buffers kept separately for each thread.
  struct ThreadLocalState {
    std::vector<uint32_t> partitioned_hashes_32;
    std::vector<uint64_t> partitioned_hashes_64;
    std::vector<uint16_t> partition_ranges;
    std::vector<int> unprocessed_partition_ids;
  };
  std::vector<ThreadLocalState> thread_local_states_;
  PartitionLocks prtn_locks_;
};
|
||||
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,819 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/acero/type_fwd.h"
|
||||
#include "arrow/acero/visibility.h"
|
||||
#include "arrow/compute/api_vector.h"
|
||||
#include "arrow/compute/exec.h"
|
||||
#include "arrow/compute/ordering.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/future.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/tracing.h"
|
||||
#include "arrow/util/type_fwd.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using compute::ExecBatch;
|
||||
using compute::ExecContext;
|
||||
using compute::FunctionRegistry;
|
||||
using compute::GetFunctionRegistry;
|
||||
using compute::Ordering;
|
||||
using compute::threaded_exec_context;
|
||||
|
||||
namespace acero {
|
||||
|
||||
/// \addtogroup acero-internals
|
||||
/// @{
|
||||
|
||||
/// \brief A collection of ExecNodes together with plan-wide lifecycle controls
///
/// Nodes are registered with AddNode()/EmplaceNode() and listed by nodes(). The plan
/// as a whole is driven with StartProducing()/StopProducing(), and completion is
/// observed through finished(). Plans are created with the static Make() factories,
/// which hand back a std::shared_ptr (the class derives from
/// std::enable_shared_from_this).
class ARROW_ACERO_EXPORT ExecPlan : public std::enable_shared_from_this<ExecPlan> {
|
||||
public:
|
||||
// This allows operators to rely on signed 16-bit indices
// (1 << 15 == 32768 rows, so row indices 0..32767 fit in an int16_t)
|
||||
static const uint32_t kMaxBatchSize = 1 << 15;
|
||||
/// \brief A list of non-owning node pointers
using NodeVector = std::vector<ExecNode*>;
|
||||
|
||||
virtual ~ExecPlan() = default;
|
||||
|
||||
/// \brief Return the QueryContext associated with this plan
///
/// NOTE(review): ownership and lifetime of the returned pointer are not visible in
/// this header -- presumably it lives as long as the plan; confirm.
QueryContext* query_context();
|
||||
|
||||
/// \brief retrieve the nodes in the plan
|
||||
const NodeVector& nodes() const;
|
||||
|
||||
/// Make an empty exec plan
///
/// This overload takes explicit QueryOptions, an ExecContext by value (defaulting to
/// the threaded exec context) and optional metadata to attach to the plan.
|
||||
static Result<std::shared_ptr<ExecPlan>> Make(
|
||||
QueryOptions options, ExecContext exec_context = *threaded_exec_context(),
|
||||
std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
|
||||
|
||||
/// Make an empty exec plan without passing QueryOptions
/// (presumably default-constructed options are used -- confirm in the definition)
static Result<std::shared_ptr<ExecPlan>> Make(
|
||||
ExecContext exec_context = *threaded_exec_context(),
|
||||
std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
|
||||
|
||||
/// Make an empty exec plan from explicit QueryOptions and a pointer to an ExecContext
static Result<std::shared_ptr<ExecPlan>> Make(
|
||||
QueryOptions options, ExecContext* exec_context,
|
||||
std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
|
||||
|
||||
/// Make an empty exec plan from a pointer to an ExecContext, without QueryOptions
static Result<std::shared_ptr<ExecPlan>> Make(
|
||||
ExecContext* exec_context,
|
||||
std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
|
||||
|
||||
/// \brief Add a node to the plan, transferring ownership of it
///
/// \return a non-owning pointer (presumably to the node just added -- confirm)
ExecNode* AddNode(std::unique_ptr<ExecNode> node);
|
||||
|
||||
/// \brief Construct a node of type `Node` in place and add it to the plan
///
/// The arguments are forwarded to `Node`'s brace-initialization. The raw pointer is
/// captured before ownership is handed to AddNode(), so the caller gets back a
/// pointer of the concrete node type rather than ExecNode*.
template <typename Node, typename... Args>
|
||||
Node* EmplaceNode(Args&&... args) {
|
||||
std::unique_ptr<Node> node{new Node{std::forward<Args>(args)...}};
|
||||
// Grab the typed pointer now; `node` is empty after the move below.
auto out = node.get();
|
||||
AddNode(std::move(node));
|
||||
return out;
|
||||
}
|
||||
|
||||
/// \brief Check the plan for validity, reporting problems through the returned Status
///
/// NOTE(review): the exact checks live in the out-of-line definition.
Status Validate();
|
||||
|
||||
/// \brief Start producing on all nodes
|
||||
///
|
||||
/// Nodes are started in reverse topological order, such that any node
|
||||
/// is started before all of its inputs.
|
||||
void StartProducing();
|
||||
|
||||
/// \brief Stop producing on all nodes
|
||||
///
|
||||
/// Triggers all sources to stop producing new data. In order to stop cleanly, the plan
|
||||
/// will continue to run any tasks that are already in progress. The caller should
|
||||
/// still wait for `finished` to complete before destroying the plan.
|
||||
void StopProducing();
|
||||
|
||||
/// \brief A future which will be marked finished when all tasks have finished.
|
||||
Future<> finished();
|
||||
|
||||
/// \brief Return whether the plan has non-empty metadata
|
||||
bool HasMetadata() const;
|
||||
|
||||
/// \brief Return the plan's attached metadata
|
||||
std::shared_ptr<const KeyValueMetadata> metadata() const;
|
||||
|
||||
/// \brief Return a string representation of the plan
std::string ToString() const;
|
||||
};
|
||||
|
||||
// Acero can be extended by providing custom implementations of ExecNode. The methods
|
||||
// below are documented in detail and provide careful instruction on how to fulfill the
|
||||
// ExecNode contract. It's suggested you familiarize yourself with the Acero
|
||||
// documentation in the C++ user guide.
|
||||
/// \brief Abstract base class for a single node of an ExecPlan
///
/// A node has zero or more inputs, at most one output, and an output schema (absent
/// for sinks). Concrete nodes implement the pure virtual methods declared below.
class ARROW_ACERO_EXPORT ExecNode {
|
||||
public:
|
||||
/// \brief A list of non-owning node pointers
using NodeVector = std::vector<ExecNode*>;
|
||||
|
||||
virtual ~ExecNode() = default;
|
||||
|
||||
/// \brief A name identifying the kind of node; every concrete node must provide one
virtual const char* kind_name() const = 0;
|
||||
|
||||
// The number of inputs expected by this node
|
||||
int num_inputs() const { return static_cast<int>(inputs_.size()); }
|
||||
|
||||
/// This node's predecessors in the exec plan
|
||||
const NodeVector& inputs() const { return inputs_; }
|
||||
|
||||
/// True if the node has no output schema (is a sink)
|
||||
bool is_sink() const { return !output_schema_; }
|
||||
|
||||
/// \brief Labels identifying the function of each input.
|
||||
const std::vector<std::string>& input_labels() const { return input_labels_; }
|
||||
|
||||
/// This node's successor in the exec plan
|
||||
const ExecNode* output() const { return output_; }
|
||||
|
||||
/// The datatypes for batches produced by this node
|
||||
const std::shared_ptr<Schema>& output_schema() const { return output_schema_; }
|
||||
|
||||
/// This node's exec plan
|
||||
ExecPlan* plan() { return plan_; }
|
||||
|
||||
/// \brief An optional label, for display and debugging
|
||||
///
|
||||
/// There is no guarantee that this value is non-empty or unique.
|
||||
const std::string& label() const { return label_; }
|
||||
/// \brief Replace this node's label
void SetLabel(std::string label) { label_ = std::move(label); }
|
||||
|
||||
/// \brief Check the node for validity, reporting problems through the returned Status
///
/// NOTE(review): the exact checks live in the out-of-line definition.
virtual Status Validate() const;
|
||||
|
||||
/// \brief the ordering of the output batches
|
||||
///
|
||||
/// This does not guarantee the batches will be emitted by this node
|
||||
/// in order. Instead it guarantees that the batches will have their
|
||||
/// ExecBatch::index property set in a way that respects this ordering.
|
||||
///
|
||||
/// In other words, given the ordering {{"x", SortOrder::Ascending}} we
|
||||
/// know that all values of x in a batch with index N will be less than
|
||||
/// or equal to all values of x in a batch with index N+k (assuming k > 0).
|
||||
/// Furthermore, we also know that values will be sorted within a batch.
|
||||
/// Any row N will have a value of x that is less than the value for
|
||||
/// any row N+k.
|
||||
///
|
||||
/// Note that an ordering can be both Ordering::Unordered and Ordering::Implicit.
|
||||
/// A node's output should be marked Ordering::Unordered if the order is
|
||||
/// non-deterministic. For example, a hash-join has no predictable output order.
|
||||
///
|
||||
/// If the ordering is Ordering::Implicit then there is a meaningful order but that
|
||||
/// ordering is not represented by any column in the data. The most common case for
|
||||
/// this is when reading data from an in-memory table. The data has an implicit "row
|
||||
/// order" which is not necessarily represented in the data set.
|
||||
///
|
||||
/// A filter or project node will not modify the ordering. Nothing needs to be done
|
||||
/// other than ensure the index assigned to output batches is the same as the
|
||||
/// input batch that was mapped.
|
||||
///
|
||||
/// Other nodes may introduce order. For example, an order-by node will emit
|
||||
/// a brand new ordering independent of the input ordering.
|
||||
///
|
||||
/// Finally, as described above, some nodes such as a hash-join or aggregation may
|
||||
/// destroy ordering (although these nodes could also choose to establish a
|
||||
/// new ordering based on the hash keys).
|
||||
///
|
||||
/// Some nodes will require an ordering. For example, a fetch node or an
|
||||
/// asof join node will only function if the input data is ordered (for fetch
|
||||
/// it is enough to be implicitly ordered. For an asof join the ordering must
|
||||
/// be explicit and compatible with the on key.)
|
||||
///
|
||||
/// Nodes that maintain ordering should be careful to avoid introducing gaps
|
||||
/// in the batch index. This may require emitting empty batches in order to
|
||||
/// maintain continuity.
|
||||
virtual const Ordering& ordering() const;
|
||||
|
||||
/// Upstream API:
|
||||
/// These functions are called by input nodes that want to inform this node
|
||||
/// about an updated condition (a new input batch or an impending
|
||||
/// end of stream).
|
||||
///
|
||||
/// Implementation rules:
|
||||
/// - these may be called anytime after StartProducing() has succeeded
|
||||
/// (and even during or after StopProducing())
|
||||
/// - these may be called concurrently
|
||||
/// - these are allowed to call back into PauseProducing(), ResumeProducing()
|
||||
/// and StopProducing()
|
||||
|
||||
/// Transfer input batch to ExecNode
|
||||
///
|
||||
/// A node will typically perform some kind of operation on the batch
|
||||
/// and then call InputReceived on its outputs with the result.
|
||||
///
|
||||
/// Other nodes may need to accumulate some number of inputs before any
|
||||
/// output can be produced. These nodes will add the batch to some kind
|
||||
/// of in-memory accumulation queue and return.
|
||||
virtual Status InputReceived(ExecNode* input, ExecBatch batch) = 0;
|
||||
|
||||
/// Mark the inputs finished after the given number of batches.
|
||||
///
|
||||
/// This may be called before all inputs are received. This simply fixes
|
||||
/// the total number of incoming batches for an input, so that the ExecNode
|
||||
/// knows when it has received all input, regardless of order.
|
||||
virtual Status InputFinished(ExecNode* input, int total_batches) = 0;
|
||||
|
||||
/// \brief Perform any needed initialization
|
||||
///
|
||||
/// This hook performs any actions in between creation of ExecPlan and the call to
|
||||
/// StartProducing. An example could be Bloom filter pushdown. The order of ExecNodes
|
||||
/// that executes this method is undefined, but the calls are made synchronously.
|
||||
///
|
||||
/// At this point a node can rely on all inputs & outputs (and the input schemas)
|
||||
/// being well defined.
|
||||
virtual Status Init();
|
||||
|
||||
/// Lifecycle API:
|
||||
/// - start / stop to initiate and terminate production
|
||||
/// - pause / resume to apply backpressure
|
||||
///
|
||||
/// Implementation rules:
|
||||
/// - StartProducing() should not recurse into the inputs, as it is
|
||||
/// handled by ExecPlan::StartProducing()
|
||||
/// - PauseProducing(), ResumeProducing(), StopProducing() may be called
|
||||
/// concurrently, potentially even before the call to StartProducing
|
||||
/// has finished.
|
||||
/// - PauseProducing(), ResumeProducing(), StopProducing() may be called
|
||||
/// by the downstream nodes' InputReceived(), InputFinished() methods
|
||||
///
|
||||
/// StopProducing may be called due to an error, by the user (e.g. cancel), or
|
||||
/// because a node has all the data it needs (e.g. limit, top-k on sorted data).
|
||||
/// This means the method may be called multiple times and we have the following
|
||||
/// additional rules
|
||||
/// - StopProducing() must be idempotent
|
||||
/// - StopProducing() must be forwarded to inputs (this is needed for the limit/top-k
|
||||
/// case because we may not be stopping the entire plan)
|
||||
|
||||
// Right now, since synchronous calls happen in both directions (input to
|
||||
// output and then output to input), a node must be careful to be reentrant
|
||||
// against synchronous calls from its output, *and* also concurrent calls from
|
||||
// other threads. The most reliable solution is to update the internal state
|
||||
// first, and notify outputs only at the end.
|
||||
//
|
||||
// Concurrent calls to PauseProducing and ResumeProducing can be hard to sequence
|
||||
// as they may travel at different speeds through the plan.
|
||||
//
|
||||
// For example, consider a resume that comes quickly after a pause. If the source
|
||||
// receives the resume before the pause the source may think the destination is full
|
||||
// and halt production which would lead to deadlock.
|
||||
//
|
||||
// To resolve this a counter is sent for all calls to pause/resume. Only the call with
|
||||
// the highest counter value is valid. So if a call to PauseProducing(5) comes after
|
||||
// a call to ResumeProducing(6) then the source should continue producing.
|
||||
|
||||
/// \brief Start producing
|
||||
///
|
||||
/// This must only be called once.
|
||||
///
|
||||
/// This is typically called automatically by ExecPlan::StartProducing().
|
||||
virtual Status StartProducing() = 0;
|
||||
|
||||
/// \brief Pause producing temporarily
|
||||
///
|
||||
/// \param output Pointer to the output that is full
|
||||
/// \param counter Counter used to sequence calls to pause/resume
|
||||
///
|
||||
/// This call is a hint that an output node is currently not willing
|
||||
/// to receive data.
|
||||
///
|
||||
/// This may be called any number of times.
|
||||
/// However, the node is still free to produce data (which may be difficult
|
||||
/// to prevent anyway if data is produced using multiple threads).
|
||||
virtual void PauseProducing(ExecNode* output, int32_t counter) = 0;
|
||||
|
||||
/// \brief Resume producing after a temporary pause
|
||||
///
|
||||
/// \param output Pointer to the output that is now free
|
||||
/// \param counter Counter used to sequence calls to pause/resume
|
||||
///
|
||||
/// This call is a hint that an output node is willing to receive data again.
|
||||
///
|
||||
/// This may be called any number of times.
|
||||
virtual void ResumeProducing(ExecNode* output, int32_t counter) = 0;
|
||||
|
||||
/// \brief Stop producing new data
|
||||
///
|
||||
/// If this node is a source then the source should stop generating data
|
||||
/// as quickly as possible. If this node is not a source then there is typically
|
||||
/// nothing that needs to be done although a node may choose to start ignoring incoming
|
||||
/// data.
|
||||
///
|
||||
/// This method will be called when an error occurs in the plan
|
||||
/// This method may also be called by the user if they wish to end a plan early
|
||||
/// Finally, this method may be called if a node determines it no longer needs any more
|
||||
/// input (for example, a limit node).
|
||||
///
|
||||
/// This method may be called multiple times.
|
||||
///
|
||||
/// This is not a pause. There will be no way to start the source again after this has
|
||||
/// been called.
|
||||
virtual Status StopProducing();
|
||||
|
||||
/// \brief Return a string representation of this node
///
/// \param indent presumably the indentation to apply to the output -- confirm against
///        the out-of-line definition
std::string ToString(int indent = 0) const;
|
||||
|
||||
protected:
|
||||
/// Protected constructor: only subclasses create ExecNodes. The parameters correspond
/// to the plan_, inputs_, input_labels_ and output_schema_ members declared below.
ExecNode(ExecPlan* plan, NodeVector inputs, std::vector<std::string> input_labels,
|
||||
std::shared_ptr<Schema> output_schema);
|
||||
|
||||
/// Node-specific stop logic that every concrete node must supply.
/// NOTE(review): presumably invoked by the non-pure StopProducing() above, with
/// stopped_ guarding against repeated calls -- confirm in the implementation file.
virtual Status StopProducingImpl() = 0;
|
||||
|
||||
/// Provide extra info to include in the string representation.
|
||||
virtual std::string ToStringExtra(int indent = 0) const;
|
||||
|
||||
// Atomic flag; by name, records that the node has been stopped -- TODO confirm.
// NOTE(review): <atomic> is not among this header's direct includes; confirm it is
// reliably provided by one of the included Arrow headers.
std::atomic<bool> stopped_;
|
||||
// The plan this node belongs to (non-owning pointer, returned by plan())
ExecPlan* plan_;
|
||||
// Optional display/debugging label, see label()/SetLabel()
std::string label_;
|
||||
|
||||
// Predecessors of this node (non-owning pointers), see inputs()
NodeVector inputs_;
|
||||
// One label per input, see input_labels()
std::vector<std::string> input_labels_;
|
||||
|
||||
// Schema of the batches this node produces; null for sinks, see is_sink()
std::shared_ptr<Schema> output_schema_;
|
||||
// Successor of this node (non-owning pointer); null until one is assigned
ExecNode* output_ = NULLPTR;
|
||||
};
|
||||
|
||||
/// \brief An extensible registry for factories of ExecNodes
|
||||
class ARROW_ACERO_EXPORT ExecFactoryRegistry {
|
||||
public:
|
||||
/// \brief Signature of a node factory
///
/// A factory receives the plan, the input nodes and the node options, and returns
/// either a pointer to the constructed node or an error.
using Factory = std::function<Result<ExecNode*>(ExecPlan*, std::vector<ExecNode*>,
|
||||
const ExecNodeOptions&)>;
|
||||
|
||||
virtual ~ExecFactoryRegistry() = default;
|
||||
|
||||
/// \brief Get the named factory from this registry
|
||||
///
|
||||
/// Returns an error if factory_name is not found
|
||||
virtual Result<Factory> GetFactory(const std::string& factory_name) = 0;
|
||||
|
||||
/// \brief Add a factory to this registry with the provided name
|
||||
///
|
||||
/// Returns an error if factory_name is already in the registry
|
||||
virtual Status AddFactory(std::string factory_name, Factory factory) = 0;
|
||||
};
|
||||
|
||||
/// The default registry, which includes built-in factories.
|
||||
ARROW_ACERO_EXPORT
|
||||
ExecFactoryRegistry* default_exec_factory_registry();
|
||||
|
||||
/// \brief Construct an ExecNode using the named factory
|
||||
inline Result<ExecNode*> MakeExecNode(
|
||||
const std::string& factory_name, ExecPlan* plan, std::vector<ExecNode*> inputs,
|
||||
const ExecNodeOptions& options,
|
||||
ExecFactoryRegistry* registry = default_exec_factory_registry()) {
|
||||
ARROW_ASSIGN_OR_RAISE(auto factory, registry->GetFactory(factory_name));
|
||||
return factory(plan, std::move(inputs), options);
|
||||
}
|
||||
|
||||
/// @}
|
||||
|
||||
/// \addtogroup acero-api
|
||||
/// @{
|
||||
|
||||
/// \brief Helper class for declaring execution nodes
|
||||
///
|
||||
/// A Declaration represents an unconstructed ExecNode (and potentially an entire graph
|
||||
/// since its inputs may also be Declarations)
|
||||
///
|
||||
/// A Declaration can be converted to a plan and executed using one of the
|
||||
/// DeclarationToXyz methods.
|
||||
///
|
||||
/// For more direct control, a Declaration can be added to an existing execution
|
||||
/// plan with Declaration::AddToPlan, which will recursively construct any inputs as
|
||||
/// necessary.
|
||||
struct ARROW_ACERO_EXPORT Declaration {
|
||||
/// \brief An input is either an already constructed node or another declaration
///
/// NOTE(review): <variant> is not among this header's direct includes; confirm it is
/// reliably provided by one of the included Arrow headers.
using Input = std::variant<ExecNode*, Declaration>;
|
||||
|
||||
/// \brief construct an empty declaration (all members default-initialized)
Declaration() {}
|
||||
|
||||
/// \brief construct a declaration
|
||||
/// \param factory_name the name of the exec node to construct. The node must have
|
||||
/// been added to the exec node registry with this name.
|
||||
/// \param inputs the inputs to the node, these should be other declarations
|
||||
/// \param options options that control the behavior of the node. You must use
|
||||
/// the appropriate subclass. For example, if `factory_name` is
|
||||
/// "project" then `options` should be ProjectNodeOptions.
|
||||
/// \param label a label to give the node. Can be used to distinguish it from other
|
||||
/// nodes of the same type in the plan.
|
||||
Declaration(std::string factory_name, std::vector<Input> inputs,
|
||||
std::shared_ptr<ExecNodeOptions> options, std::string label)
|
||||
: factory_name{std::move(factory_name)},
|
||||
inputs{std::move(inputs)},
|
||||
options{std::move(options)},
|
||||
label{std::move(label)} {}
|
||||
|
||||
/// \brief construct a declaration from an options object passed by value
///
/// The options are moved into a std::make_shared<Options> and stored as a
/// shared_ptr<ExecNodeOptions>, then forwarded to the constructor above.
template <typename Options>
|
||||
Declaration(std::string factory_name, std::vector<Input> inputs, Options options,
|
||||
std::string label)
|
||||
: Declaration{std::move(factory_name), std::move(inputs),
|
||||
std::shared_ptr<ExecNodeOptions>(
|
||||
std::make_shared<Options>(std::move(options))),
|
||||
std::move(label)} {}
|
||||
|
||||
/// \brief construct a declaration with inputs and options, using an empty label
template <typename Options>
|
||||
Declaration(std::string factory_name, std::vector<Input> inputs, Options options)
|
||||
: Declaration{std::move(factory_name), std::move(inputs), std::move(options),
|
||||
/*label=*/""} {}
|
||||
|
||||
/// \brief construct a declaration with options only: no inputs and an empty label
template <typename Options>
|
||||
Declaration(std::string factory_name, Options options)
|
||||
: Declaration{std::move(factory_name), {}, std::move(options), /*label=*/""} {}
|
||||
|
||||
/// \brief construct a declaration with options and a label but no inputs
template <typename Options>
|
||||
Declaration(std::string factory_name, Options options, std::string label)
|
||||
: Declaration{std::move(factory_name), {}, std::move(options), std::move(label)} {}
|
||||
|
||||
/// \brief Convenience factory for the common case of a simple sequence of nodes.
|
||||
///
|
||||
/// Each of decls will be appended to the inputs of the subsequent declaration,
|
||||
/// and the final modified declaration will be returned.
|
||||
///
|
||||
/// Without this convenience factory, constructing a sequence would require explicit,
|
||||
/// difficult-to-read nesting:
|
||||
///
|
||||
///     Declaration{"n3",
|
||||
///                 {
|
||||
///                     Declaration{"n2",
|
||||
///                                 {
|
||||
///                                     Declaration{"n1",
|
||||
///                                                 {
|
||||
///                                                     Declaration{"n0", N0Opts{}},
|
||||
///                                                 },
|
||||
///                                                 N1Opts{}},
|
||||
///                                 },
|
||||
///                                 N2Opts{}},
|
||||
///                 },
|
||||
///                 N3Opts{}};
|
||||
///
|
||||
/// An equivalent Declaration can be constructed more tersely using Sequence:
|
||||
///
|
||||
///     Declaration::Sequence({
|
||||
///         {"n0", N0Opts{}},
|
||||
///         {"n1", N1Opts{}},
|
||||
///         {"n2", N2Opts{}},
|
||||
///         {"n3", N3Opts{}},
|
||||
///     });
|
||||
static Declaration Sequence(std::vector<Declaration> decls);
|
||||
|
||||
/// \brief add the declaration to an already created execution plan
|
||||
/// \param plan the plan to add the node to
|
||||
/// \param registry the registry to use to lookup the node factory
|
||||
///
|
||||
/// This method will recursively call AddToPlan on all of the declaration's inputs.
|
||||
/// This method is only for advanced use when the DeclarationToXyz methods are not
|
||||
/// sufficient.
|
||||
///
|
||||
/// \return the instantiated execution node
|
||||
Result<ExecNode*> AddToPlan(ExecPlan* plan, ExecFactoryRegistry* registry =
|
||||
default_exec_factory_registry()) const;
|
||||
|
||||
// Validate a declaration
|
||||
bool IsValid(ExecFactoryRegistry* registry = default_exec_factory_registry()) const;
|
||||
|
||||
/// \brief the name of the factory to use when creating a node
|
||||
std::string factory_name;
|
||||
/// \brief the declaration's inputs
|
||||
std::vector<Input> inputs;
|
||||
/// \brief options to control the behavior of the node
|
||||
std::shared_ptr<ExecNodeOptions> options;
|
||||
/// \brief a label to give the node in the plan
|
||||
std::string label;
|
||||
};
|
||||
|
||||
/// \brief How to handle unaligned buffers
///
/// - kWarn: check whether buffers are unaligned and emit a warning if any are
/// - kIgnore: do not check buffer alignment at all
/// - kReallocate: copy unaligned buffers into newly allocated, suitably aligned buffers
/// - kError: abort the plan when an unaligned buffer is found
|
||||
enum class UnalignedBufferHandling { kWarn, kIgnore, kReallocate, kError };
|
||||
|
||||
/// \brief get the default behavior of unaligned buffer handling
|
||||
///
|
||||
/// This is configurable via the ACERO_ALIGNMENT_HANDLING environment variable which
|
||||
/// can be set to "warn", "ignore", "reallocate", or "error". If the environment
|
||||
/// variable is not set, or is set to an invalid value, this will return kWarn
|
||||
UnalignedBufferHandling GetDefaultUnalignedBufferHandling();
|
||||
|
||||
/// \brief plan-wide options that can be specified when executing an execution plan
|
||||
struct ARROW_ACERO_EXPORT QueryOptions {
|
||||
/// \brief Should the plan use a legacy batching strategy
|
||||
///
|
||||
/// This is currently in place only to support the Scanner::ToTable
|
||||
/// method. This method relies on batch indices from the scanner
|
||||
/// remaining consistent. This is impractical in the ExecPlan which
|
||||
/// might slice batches as needed (e.g. for a join)
|
||||
///
|
||||
/// However, it still works for simple plans and this is the only way
|
||||
/// we have at the moment for maintaining implicit order.
|
||||
bool use_legacy_batching = false;
|
||||
|
||||
/// If the output has a meaningful order then sequence the output of the plan
|
||||
///
|
||||
/// The default behavior (std::nullopt) will sequence output batches if there
|
||||
/// is a meaningful ordering in the final node and will emit batches immediately
|
||||
/// otherwise.
|
||||
///
|
||||
/// If explicitly set to true then plan execution will fail if there is no
|
||||
/// meaningful ordering. This can be useful to validate a query that should
|
||||
/// be emitting ordered results.
|
||||
///
|
||||
/// If explicitly set to false then batches will be emitted immediately even if there
|
||||
/// is a meaningful ordering. This could cause batches to be emitted out of order but
|
||||
/// may offer a small decrease to latency.
|
||||
std::optional<bool> sequence_output = std::nullopt;
|
||||
|
||||
/// \brief should the plan use multiple background threads for CPU-intensive work
|
||||
///
|
||||
/// If this is false then all CPU work will be done on the calling thread. I/O tasks
|
||||
/// will still happen on the I/O executor and may be multi-threaded (but should not use
|
||||
/// significant CPU resources).
|
||||
///
|
||||
/// Will be ignored if custom_cpu_executor is set
|
||||
bool use_threads = true;
|
||||
|
||||
/// \brief custom executor to use for CPU-intensive work
|
||||
///
|
||||
/// Must be null or remain valid for the duration of the plan. If this is null then
|
||||
/// a default thread pool will be chosen whose behavior will be controlled by
|
||||
/// the `use_threads` option.
|
||||
::arrow::internal::Executor* custom_cpu_executor = NULLPTR;
|
||||
|
||||
/// \brief custom executor to use for IO work
|
||||
///
|
||||
/// Must be null or remain valid for the duration of the plan. If this is null then
|
||||
/// the global io thread pool will be chosen whose behavior will be controlled by
|
||||
/// the "ARROW_IO_THREADS" environment variable.
|
||||
::arrow::internal::Executor* custom_io_executor = NULLPTR;
|
||||
|
||||
/// \brief a memory pool to use for allocations
|
||||
///
|
||||
/// Must remain valid for the duration of the plan.
|
||||
MemoryPool* memory_pool = default_memory_pool();
|
||||
|
||||
/// \brief a function registry to use for the plan
|
||||
///
|
||||
/// Must remain valid for the duration of the plan.
|
||||
FunctionRegistry* function_registry = GetFunctionRegistry();
|
||||
/// \brief the names of the output columns
|
||||
///
|
||||
/// If this is empty then names will be generated based on the input columns
|
||||
///
|
||||
/// If set then the number of names must equal the number of output columns
|
||||
std::vector<std::string> field_names;
|
||||
|
||||
/// \brief Policy for unaligned buffers in source data
|
||||
///
|
||||
/// Various compute functions and acero internals will type pun array
|
||||
/// buffers from uint8_t* to some kind of value type (e.g. we might
|
||||
/// cast to int32_t* to add two int32 arrays)
|
||||
///
|
||||
/// If the buffer is poorly aligned (e.g. an int32 array is not aligned
|
||||
/// on a 4-byte boundary) then this is technically undefined behavior in C++.
|
||||
/// However, most modern compilers and CPUs are fairly tolerant of this
|
||||
/// behavior and nothing bad (beyond a small hit to performance) is likely
|
||||
/// to happen.
|
||||
///
|
||||
/// Note that this only applies to source buffers. All buffers allocated internally
|
||||
/// by Acero will be suitably aligned.
|
||||
///
|
||||
/// If this field is set to kWarn then Acero will check if any buffers are unaligned
|
||||
/// and, if they are, will emit a warning.
|
||||
///
|
||||
/// If this field is set to kReallocate then Acero will allocate a new, suitably aligned
|
||||
/// buffer and copy the contents from the old buffer into this new buffer.
|
||||
///
|
||||
/// If this field is set to kError then Acero will gracefully abort the plan instead.
|
||||
///
|
||||
/// If this field is set to kIgnore then Acero will not even check if the buffers are
|
||||
/// unaligned.
|
||||
///
|
||||
/// If this field is not set then it will be treated as kWarn unless overridden
|
||||
/// by the ACERO_ALIGNMENT_HANDLING environment variable
|
||||
std::optional<UnalignedBufferHandling> unaligned_buffer_handling;
|
||||
};
|
||||
|
||||
/// \brief Calculate the output schema of a declaration
|
||||
///
|
||||
/// This does not actually execute the plan. This operation may fail if the
|
||||
/// declaration represents an invalid plan (e.g. a project node with multiple inputs)
|
||||
///
|
||||
/// \param declaration A declaration describing an execution plan
|
||||
/// \param function_registry The function registry to use for function execution. If null
|
||||
/// then the default function registry will be used.
|
||||
///
|
||||
/// \return the schema that batches would have after going through the execution plan
|
||||
ARROW_ACERO_EXPORT Result<std::shared_ptr<Schema>> DeclarationToSchema(
|
||||
const Declaration& declaration, FunctionRegistry* function_registry = NULLPTR);
|
||||
|
||||
/// \brief Create a string representation of a plan
|
||||
///
|
||||
/// This representation is for debug purposes only.
|
||||
///
|
||||
/// Conversion to a string may fail if the declaration represents an
|
||||
/// invalid plan.
|
||||
///
|
||||
/// Use Substrait for complete serialization of plans
|
||||
///
|
||||
/// \param declaration A declaration describing an execution plan
|
||||
/// \param function_registry The function registry to use for function execution. If null
|
||||
/// then the default function registry will be used.
|
||||
///
|
||||
/// \return a string representation of the plan suitable for debugging output
|
||||
ARROW_ACERO_EXPORT Result<std::string> DeclarationToString(
|
||||
const Declaration& declaration, FunctionRegistry* function_registry = NULLPTR);
|
||||
|
||||
/// \brief Utility method to run a declaration and collect the results into a table
|
||||
///
|
||||
/// \param declaration A declaration describing the plan to run
|
||||
/// \param use_threads If `use_threads` is false then all CPU work will be done on the
|
||||
/// calling thread. I/O tasks will still happen on the I/O executor
|
||||
/// and may be multi-threaded (but should not use significant CPU
|
||||
/// resources).
|
||||
/// \param memory_pool The memory pool to use for allocations made while running the plan.
|
||||
/// \param function_registry The function registry to use for function execution. If null
|
||||
/// then the default function registry will be used.
|
||||
///
|
||||
/// This method will add a sink node to the declaration to collect results into a
|
||||
/// table. It will then create an ExecPlan from the declaration, start the exec plan,
|
||||
/// block until the plan has finished, and return the created table.
|
||||
ARROW_ACERO_EXPORT Result<std::shared_ptr<Table>> DeclarationToTable(
|
||||
Declaration declaration, bool use_threads = true,
|
||||
MemoryPool* memory_pool = default_memory_pool(),
|
||||
FunctionRegistry* function_registry = NULLPTR);
|
||||
|
||||
/// \brief Overload of \see DeclarationToTable accepting plan-wide QueryOptions
///
/// The QueryOptions struct carries, among other settings, the use_threads,
/// memory_pool and function_registry values that the other overload takes as
/// individual parameters.
ARROW_ACERO_EXPORT Result<std::shared_ptr<Table>> DeclarationToTable(
|
||||
Declaration declaration, QueryOptions query_options);
|
||||
|
||||
/// \brief Asynchronous version of \see DeclarationToTable
|
||||
///
|
||||
/// \param declaration A declaration describing the plan to run
|
||||
/// \param use_threads The behavior of use_threads is slightly different than the
|
||||
/// synchronous version since we cannot run synchronously on the
|
||||
/// calling thread. Instead, if use_threads=false then a new thread
|
||||
/// pool will be created with a single thread and this will be used for
|
||||
/// all compute work.
|
||||
/// \param memory_pool The memory pool to use for allocations made while running the plan.
|
||||
/// \param function_registry The function registry to use for function execution. If null
|
||||
/// then the default function registry will be used.
|
||||
ARROW_ACERO_EXPORT Future<std::shared_ptr<Table>> DeclarationToTableAsync(
|
||||
Declaration declaration, bool use_threads = true,
|
||||
MemoryPool* memory_pool = default_memory_pool(),
|
||||
FunctionRegistry* function_registry = NULLPTR);
|
||||
|
||||
/// \brief Overload of \see DeclarationToTableAsync accepting a custom exec context
|
||||
///
|
||||
/// The executor must be specified (cannot be null) and must be kept alive until the
|
||||
/// returned future finishes.
|
||||
ARROW_ACERO_EXPORT Future<std::shared_ptr<Table>> DeclarationToTableAsync(
|
||||
Declaration declaration, ExecContext custom_exec_context);
|
||||
|
||||
/// \brief a collection of exec batches with a common schema
|
||||
struct BatchesWithCommonSchema {
|
||||
/// \brief the collected batches
std::vector<ExecBatch> batches;
|
||||
/// \brief the schema shared by every batch in `batches`
std::shared_ptr<Schema> schema;
|
||||
};
|
||||
|
||||
/// \brief Utility method to run a declaration and collect the results into ExecBatch
|
||||
/// vector
|
||||
///
|
||||
/// \see DeclarationToTable for details on threading & execution
|
||||
ARROW_ACERO_EXPORT Result<BatchesWithCommonSchema> DeclarationToExecBatches(
|
||||
Declaration declaration, bool use_threads = true,
|
||||
MemoryPool* memory_pool = default_memory_pool(),
|
||||
FunctionRegistry* function_registry = NULLPTR);
|
||||
|
||||
/// \brief Overload of \see DeclarationToExecBatches accepting plan-wide QueryOptions
///
/// The QueryOptions struct carries, among other settings, the use_threads,
/// memory_pool and function_registry values that the other overload takes as
/// individual parameters.
ARROW_ACERO_EXPORT Result<BatchesWithCommonSchema> DeclarationToExecBatches(
|
||||
Declaration declaration, QueryOptions query_options);
|
||||
|
||||
/// \brief Asynchronous version of \see DeclarationToExecBatches
|
||||
///
|
||||
/// \see DeclarationToTableAsync for details on threading & execution
|
||||
ARROW_ACERO_EXPORT Future<BatchesWithCommonSchema> DeclarationToExecBatchesAsync(
|
||||
Declaration declaration, bool use_threads = true,
|
||||
MemoryPool* memory_pool = default_memory_pool(),
|
||||
FunctionRegistry* function_registry = NULLPTR);
|
||||
|
||||
/// \brief Overload of \see DeclarationToExecBatchesAsync accepting a custom exec context
|
||||
///
|
||||
/// \see DeclarationToTableAsync for details on threading & execution
|
||||
ARROW_ACERO_EXPORT Future<BatchesWithCommonSchema> DeclarationToExecBatchesAsync(
|
||||
Declaration declaration, ExecContext custom_exec_context);
|
||||
|
||||
/// \brief Utility method to run a declaration and collect the results into a vector
|
||||
///
|
||||
/// \see DeclarationToTable for details on threading & execution
|
||||
ARROW_ACERO_EXPORT Result<std::vector<std::shared_ptr<RecordBatch>>> DeclarationToBatches(
|
||||
Declaration declaration, bool use_threads = true,
|
||||
MemoryPool* memory_pool = default_memory_pool(),
|
||||
FunctionRegistry* function_registry = NULLPTR);
|
||||
|
||||
ARROW_ACERO_EXPORT Result<std::vector<std::shared_ptr<RecordBatch>>> DeclarationToBatches(
|
||||
Declaration declaration, QueryOptions query_options);
|
||||
|
||||
/// \brief Asynchronous version of \see DeclarationToBatches
|
||||
///
|
||||
/// \see DeclarationToTableAsync for details on threading & execution
|
||||
ARROW_ACERO_EXPORT Future<std::vector<std::shared_ptr<RecordBatch>>>
|
||||
DeclarationToBatchesAsync(Declaration declaration, bool use_threads = true,
|
||||
MemoryPool* memory_pool = default_memory_pool(),
|
||||
FunctionRegistry* function_registry = NULLPTR);
|
||||
|
||||
/// \brief Overload of \see DeclarationToBatchesAsync accepting a custom exec context
|
||||
///
|
||||
/// \see DeclarationToTableAsync for details on threading & execution
|
||||
ARROW_ACERO_EXPORT Future<std::vector<std::shared_ptr<RecordBatch>>>
|
||||
DeclarationToBatchesAsync(Declaration declaration, ExecContext exec_context);
|
||||
|
||||
/// \brief Utility method to run a declaration and return results as a RecordBatchReader
|
||||
///
|
||||
/// If an exec context is not provided then a default exec context will be used based
|
||||
/// on the value of `use_threads`. If `use_threads` is false then the CPU executor will
|
||||
/// be a serial executor and all CPU work will be done on the calling thread. I/O tasks
|
||||
/// will still happen on the I/O executor and may be multi-threaded.
|
||||
///
|
||||
/// If `use_threads` is false then all CPU work will happen during the calls to
|
||||
/// RecordBatchReader::Next and no CPU work will happen in the background. If
|
||||
/// `use_threads` is true then CPU work will happen on the CPU thread pool and tasks may
|
||||
/// run in between calls to RecordBatchReader::Next. If the returned reader is not
|
||||
/// consumed quickly enough then the plan will eventually pause as the backpressure queue
|
||||
/// fills up.
|
||||
///
|
||||
/// If a custom exec context is provided then the value of `use_threads` will be ignored.
|
||||
///
|
||||
/// The returned RecordBatchReader can be closed early to cancel the computation of record
|
||||
/// batches. In this case, only errors encountered by the computation may be reported. In
|
||||
/// particular, no cancellation error may be reported.
|
||||
ARROW_ACERO_EXPORT Result<std::unique_ptr<RecordBatchReader>> DeclarationToReader(
|
||||
Declaration declaration, bool use_threads = true,
|
||||
MemoryPool* memory_pool = default_memory_pool(),
|
||||
FunctionRegistry* function_registry = NULLPTR);
|
||||
|
||||
ARROW_ACERO_EXPORT Result<std::unique_ptr<RecordBatchReader>> DeclarationToReader(
|
||||
Declaration declaration, QueryOptions query_options);
|
||||
|
||||
/// \brief Utility method to run a declaration and ignore results
|
||||
///
|
||||
/// This can be useful when the data are consumed as part of the plan itself, for
|
||||
/// example, when the plan ends with a write node.
|
||||
///
|
||||
/// \see DeclarationToTable for details on threading & execution
|
||||
ARROW_ACERO_EXPORT Status
|
||||
DeclarationToStatus(Declaration declaration, bool use_threads = true,
|
||||
MemoryPool* memory_pool = default_memory_pool(),
|
||||
FunctionRegistry* function_registry = NULLPTR);
|
||||
|
||||
ARROW_ACERO_EXPORT Status DeclarationToStatus(Declaration declaration,
|
||||
QueryOptions query_options);
|
||||
|
||||
/// \brief Asynchronous version of \see DeclarationToStatus
|
||||
///
|
||||
/// This can be useful when the data are consumed as part of the plan itself, for
|
||||
/// example, when the plan ends with a write node.
|
||||
///
|
||||
/// \see DeclarationToTableAsync for details on threading & execution
|
||||
ARROW_ACERO_EXPORT Future<> DeclarationToStatusAsync(
|
||||
Declaration declaration, bool use_threads = true,
|
||||
MemoryPool* memory_pool = default_memory_pool(),
|
||||
FunctionRegistry* function_registry = NULLPTR);
|
||||
|
||||
/// \brief Overload of \see DeclarationToStatusAsync accepting a custom exec context
|
||||
///
|
||||
/// \see DeclarationToTableAsync for details on threading & execution
|
||||
ARROW_ACERO_EXPORT Future<> DeclarationToStatusAsync(Declaration declaration,
|
||||
ExecContext exec_context);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \brief Wrap an ExecBatch generator in a RecordBatchReader.
|
||||
///
|
||||
/// The RecordBatchReader does not impose any ordering on emitted batches.
|
||||
ARROW_ACERO_EXPORT
|
||||
std::shared_ptr<RecordBatchReader> MakeGeneratorReader(
|
||||
std::shared_ptr<Schema>, std::function<Future<std::optional<ExecBatch>>()>,
|
||||
MemoryPool*);
|
||||
|
||||
// Default values of the `max_q` and `q_restart` arguments of MakeReaderGenerator.
// Declared `inline` so that every translation unit including this header refers to
// the same entity.
inline constexpr int kDefaultBackgroundMaxQ = 32;
inline constexpr int kDefaultBackgroundQRestart = 16;
|
||||
|
||||
/// \brief Make a generator of RecordBatchReaders
|
||||
///
|
||||
/// Useful as a source node for an Exec plan
|
||||
ARROW_ACERO_EXPORT
|
||||
Result<std::function<Future<std::optional<ExecBatch>>()>> MakeReaderGenerator(
|
||||
std::shared_ptr<RecordBatchReader> reader, arrow::internal::Executor* io_executor,
|
||||
int max_q = kDefaultBackgroundMaxQ, int q_restart = kDefaultBackgroundQRestart);
|
||||
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,75 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/acero/accumulation_queue.h"
|
||||
#include "arrow/acero/bloom_filter.h"
|
||||
#include "arrow/acero/options.h"
|
||||
#include "arrow/acero/query_context.h"
|
||||
#include "arrow/acero/schema_util.h"
|
||||
#include "arrow/acero/task_util.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/tracing.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace acero {
|
||||
|
||||
using util::AccumulationQueue;
|
||||
|
||||
class ARROW_ACERO_EXPORT HashJoinImpl {
|
||||
public:
|
||||
using OutputBatchCallback = std::function<Status(int64_t, ExecBatch)>;
|
||||
using BuildFinishedCallback = std::function<Status(size_t)>;
|
||||
using FinishedCallback = std::function<Status(int64_t)>;
|
||||
using RegisterTaskGroupCallback = std::function<int(
|
||||
std::function<Status(size_t, int64_t)>, std::function<Status(size_t)>)>;
|
||||
using StartTaskGroupCallback = std::function<Status(int, int64_t)>;
|
||||
using AbortContinuationImpl = std::function<void()>;
|
||||
|
||||
virtual ~HashJoinImpl() = default;
|
||||
virtual Status Init(QueryContext* ctx, JoinType join_type, size_t num_threads,
|
||||
const HashJoinProjectionMaps* proj_map_left,
|
||||
const HashJoinProjectionMaps* proj_map_right,
|
||||
std::vector<JoinKeyCmp> key_cmp, Expression filter,
|
||||
RegisterTaskGroupCallback register_task_group_callback,
|
||||
StartTaskGroupCallback start_task_group_callback,
|
||||
OutputBatchCallback output_batch_callback,
|
||||
FinishedCallback finished_callback) = 0;
|
||||
|
||||
virtual Status BuildHashTable(size_t thread_index, AccumulationQueue batches,
|
||||
BuildFinishedCallback on_finished) = 0;
|
||||
virtual Status ProbeSingleBatch(size_t thread_index, ExecBatch batch) = 0;
|
||||
virtual Status ProbingFinished(size_t thread_index) = 0;
|
||||
virtual void Abort(TaskScheduler::AbortContinuationImpl pos_abort_callback) = 0;
|
||||
virtual std::string ToString() const = 0;
|
||||
|
||||
static Result<std::unique_ptr<HashJoinImpl>> MakeBasic();
|
||||
static Result<std::unique_ptr<HashJoinImpl>> MakeSwiss();
|
||||
|
||||
protected:
|
||||
arrow::util::tracing::Span span_;
|
||||
};
|
||||
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,318 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "arrow/acero/schema_util.h"
|
||||
#include "arrow/compute/exec.h"
|
||||
#include "arrow/compute/row/row_encoder_internal.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
|
||||
// This file contains hash join logic related to handling of dictionary encoded key
|
||||
// columns.
|
||||
//
|
||||
// A key column from probe side of the join can be matched against a key column from build
|
||||
// side of the join, as long as the underlying value types are equal. That means that:
|
||||
// - both scalars and arrays can be used and even mixed in the same column
|
||||
// - dictionary column can be matched against non-dictionary column if underlying value
|
||||
// types are equal
|
||||
// - dictionary column can be matched against dictionary column with a different index
|
||||
// type, and potentially using a different dictionary, if underlying value types are equal
|
||||
//
|
||||
// We currently require in hash join that for all dictionary encoded columns, the same
|
||||
// dictionary is used in all input exec batches.
|
||||
//
|
||||
// In order to allow matching columns with different dictionaries, different dictionary
|
||||
// index types, and dictionary key against non-dictionary key, internally comparisons will
|
||||
// be evaluated after remapping values on both sides of the join to a common
|
||||
// representation (which will be called "unified representation"). This common
|
||||
// representation is a column of int32() type (not a dictionary column). It represents an
|
||||
// index in the unified dictionary computed for the (only) dictionary present on build
|
||||
// side (an empty dictionary is still created for an empty build side). Null value is
|
||||
// always represented in this common representation as null int32 value, unified
|
||||
// dictionary will never contain a null value (so there is no ambiguity of representing
|
||||
// nulls as either index to a null entry in the dictionary or null index).
|
||||
//
|
||||
// Unified dictionary represents values present on build side. There may be values on
|
||||
// probe side that are not present in it. All such values, that are not null, are mapped
|
||||
// in the common representation to a special constant kMissingValueId.
|
||||
//
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using compute::ExecBatch;
|
||||
using compute::ExecContext;
|
||||
using compute::internal::RowEncoder;
|
||||
|
||||
namespace acero {
|
||||
|
||||
/// Helper class with operations that are stateless and common to processing of dictionary
|
||||
/// keys on both build and probe side.
|
||||
class HashJoinDictUtil {
|
||||
public:
|
||||
// Null values in unified representation are always represented as null that has
|
||||
// corresponding integer set to this constant
|
||||
static constexpr int32_t kNullId = 0;
|
||||
// Constant representing a value, that is not null, missing on the build side, in
|
||||
// unified representation.
|
||||
static constexpr int32_t kMissingValueId = -1;
|
||||
|
||||
// Check if data types of corresponding pair of key column on build and probe side are
|
||||
// compatible
|
||||
static bool KeyDataTypesValid(const std::shared_ptr<DataType>& probe_data_type,
|
||||
const std::shared_ptr<DataType>& build_data_type);
|
||||
|
||||
// Input must be dictionary array or dictionary scalar.
|
||||
// A precomputed and provided here lookup table in the form of int32() array will be
|
||||
// used to remap input indices to unified representation.
|
||||
//
|
||||
static Result<std::shared_ptr<ArrayData>> IndexRemapUsingLUT(
|
||||
ExecContext* ctx, const Datum& indices, int64_t batch_length,
|
||||
const std::shared_ptr<ArrayData>& map_array,
|
||||
const std::shared_ptr<DataType>& data_type);
|
||||
|
||||
// Return int32() array that contains indices of input dictionary array or scalar after
|
||||
// type casting.
|
||||
static Result<std::shared_ptr<ArrayData>> ConvertToInt32(
|
||||
const std::shared_ptr<DataType>& from_type, const Datum& input,
|
||||
int64_t batch_length, ExecContext* ctx);
|
||||
|
||||
// Return an array that contains elements of input int32() array after casting to a
|
||||
// given integer type. This is used for mapping unified representation stored in the
|
||||
// hash table on build side back to original input data type of hash join, when
|
||||
// outputting hash join results to parent exec node.
|
||||
//
|
||||
static Result<std::shared_ptr<ArrayData>> ConvertFromInt32(
|
||||
const std::shared_ptr<DataType>& to_type, const Datum& input, int64_t batch_length,
|
||||
ExecContext* ctx);
|
||||
|
||||
// Return dictionary referenced in either dictionary array or dictionary scalar
|
||||
static std::shared_ptr<Array> ExtractDictionary(const Datum& data);
|
||||
};
|
||||
|
||||
/// Implements processing of dictionary arrays/scalars in key columns on the build side of
|
||||
/// a hash join.
|
||||
/// Each instance of this class corresponds to a single column and stores and
|
||||
/// processes only the information related to that column.
|
||||
/// Const methods are thread-safe, non-const methods are not (the caller must make sure
|
||||
/// that only one thread at any time will access them).
|
||||
///
|
||||
class HashJoinDictBuild {
|
||||
public:
|
||||
// Returns true if the key column (described in input by its data type) requires any
|
||||
// pre- or post-processing related to handling dictionaries.
|
||||
//
|
||||
static bool KeyNeedsProcessing(const std::shared_ptr<DataType>& build_data_type) {
|
||||
return (build_data_type->id() == Type::DICTIONARY);
|
||||
}
|
||||
|
||||
// Data type of unified representation
|
||||
static std::shared_ptr<DataType> DataTypeAfterRemapping() { return int32(); }
|
||||
|
||||
// Should be called only once in hash join, before processing any build or probe
|
||||
// batches.
|
||||
//
|
||||
// Takes a pointer to the dictionary for a corresponding key column on the build side as
|
||||
// an input. If the build side is empty, it still needs to be called, but with
|
||||
// dictionary pointer set to null.
|
||||
//
|
||||
// Currently it is required that all input batches on build side share the same
|
||||
// dictionary. For each input batch during its pre-processing, dictionary will be
|
||||
// checked and error will be returned if it is different then the one provided in the
|
||||
// call to this method.
|
||||
//
|
||||
// Unifies the dictionary. The order of the values is still preserved.
|
||||
// Null and duplicate entries are removed. If the dictionary is already unified, its
|
||||
// copy will be produced and stored within this class.
|
||||
//
|
||||
// Prepares the mapping from ids within original dictionary to the ids in the resulting
|
||||
// dictionary. This is used later on to pre-process (map to unified representation) key
|
||||
// column on build side.
|
||||
//
|
||||
// Prepares the reverse mapping (in the form of hash table) from values to the ids in
|
||||
// the resulting dictionary. This will be used later on to pre-process (map to unified
|
||||
// representation) key column on probe side. Values on probe side that are not present
|
||||
// in the original dictionary will be mapped to a special constant kMissingValueId. The
|
||||
// exception is made for nulls, which get always mapped to nulls (both when null is
|
||||
// represented as a dictionary id pointing to a null and a null dictionary id).
|
||||
//
|
||||
Status Init(ExecContext* ctx, std::shared_ptr<Array> dictionary,
|
||||
std::shared_ptr<DataType> index_type, std::shared_ptr<DataType> value_type);
|
||||
|
||||
// Remap array or scalar values into unified representation (array of int32()).
|
||||
// Outputs kMissingValueId if input value is not found in the unified dictionary.
|
||||
// Outputs null for null input value (with corresponding data set to kNullId).
|
||||
//
|
||||
Result<std::shared_ptr<ArrayData>> RemapInputValues(ExecContext* ctx,
|
||||
const Datum& values,
|
||||
int64_t batch_length) const;
|
||||
|
||||
// Remap dictionary array or dictionary scalar on build side to unified representation.
|
||||
// Dictionary referenced in the input must match the dictionary that was
|
||||
// given during initialization.
|
||||
// The output is a dictionary array that references unified dictionary.
|
||||
//
|
||||
Result<std::shared_ptr<ArrayData>> RemapInput(
|
||||
ExecContext* ctx, const Datum& indices, int64_t batch_length,
|
||||
const std::shared_ptr<DataType>& data_type) const;
|
||||
|
||||
// Outputs dictionary array referencing unified dictionary, given an array with 32-bit
|
||||
// ids.
|
||||
// Used to post-process values looked up in a hash table on build side of the hash join
|
||||
// before outputting to the parent exec node.
|
||||
//
|
||||
Result<std::shared_ptr<ArrayData>> RemapOutput(const ArrayData& indices32Bit,
|
||||
ExecContext* ctx) const;
|
||||
|
||||
// Release shared pointers and memory
|
||||
void CleanUp();
|
||||
|
||||
private:
|
||||
// Data type of dictionary ids for the input dictionary on build side
|
||||
std::shared_ptr<DataType> index_type_;
|
||||
// Data type of values for the input dictionary on build side
|
||||
std::shared_ptr<DataType> value_type_;
|
||||
// Mapping from (encoded as string) values to the ids in unified dictionary
|
||||
std::unordered_map<std::string, int32_t> hash_table_;
|
||||
// Mapping from input dictionary ids to unified dictionary ids
|
||||
std::shared_ptr<ArrayData> remapped_ids_;
|
||||
// Input dictionary
|
||||
std::shared_ptr<Array> dictionary_;
|
||||
// Unified dictionary
|
||||
std::shared_ptr<ArrayData> unified_dictionary_;
|
||||
};
|
||||
|
||||
/// Implements processing of dictionary arrays/scalars in key columns on the probe side of
|
||||
/// a hash join.
|
||||
/// Each instance of this class corresponds to a single column and stores and
|
||||
/// processes only the information related to that column.
|
||||
/// It is not thread-safe - every participating thread should use its own instance of
|
||||
/// this class.
|
||||
///
|
||||
class HashJoinDictProbe {
|
||||
public:
|
||||
static bool KeyNeedsProcessing(const std::shared_ptr<DataType>& probe_data_type,
|
||||
const std::shared_ptr<DataType>& build_data_type);
|
||||
|
||||
// Data type of the result of remapping input key column.
|
||||
//
|
||||
// The result of remapping is what is used in hash join for matching keys on build and
|
||||
// probe side. The exact data types may be different, as described below, and therefore
|
||||
// a common representation is needed for simplifying comparisons of pairs of keys on
|
||||
// both sides.
|
||||
//
|
||||
// We support matching key that is of non-dictionary type with key that is of dictionary
|
||||
// type, as long as the underlying value types are equal. We support matching when both
|
||||
// keys are of dictionary type, regardless whether underlying dictionary index types are
|
||||
// the same or not.
|
||||
//
|
||||
static std::shared_ptr<DataType> DataTypeAfterRemapping(
|
||||
const std::shared_ptr<DataType>& build_data_type);
|
||||
|
||||
// Should only be called if KeyNeedsProcessing method returns true for a pair of
|
||||
// corresponding key columns from build and probe side.
|
||||
// Converts values in order to match the common representation for
|
||||
// both build and probe side used in hash table comparison.
|
||||
// Supports arrays and scalars as input.
|
||||
// Argument opt_build_side should be null if dictionary key on probe side is matched
|
||||
// with non-dictionary key on build side.
|
||||
//
|
||||
Result<std::shared_ptr<ArrayData>> RemapInput(
|
||||
const HashJoinDictBuild* opt_build_side, const Datum& data, int64_t batch_length,
|
||||
const std::shared_ptr<DataType>& probe_data_type,
|
||||
const std::shared_ptr<DataType>& build_data_type, ExecContext* ctx);
|
||||
|
||||
void CleanUp();
|
||||
|
||||
private:
|
||||
// May be null if probe side key is non-dictionary. Otherwise it is used to verify that
|
||||
// only a single dictionary is referenced in exec batch on probe side of hash join.
|
||||
std::shared_ptr<Array> dictionary_;
|
||||
// Mapping from dictionary on probe side of hash join (if it is used) to unified
|
||||
// representation.
|
||||
std::shared_ptr<ArrayData> remapped_ids_;
|
||||
// Encoder of key columns that uses unified representation instead of original data type
|
||||
// for key columns that need to use it (have dictionaries on either side of the join).
|
||||
RowEncoder encoder_;
|
||||
};
|
||||
|
||||
// Encapsulates dictionary handling logic for build side of hash join.
|
||||
//
|
||||
class HashJoinDictBuildMulti {
|
||||
public:
|
||||
Status Init(const SchemaProjectionMaps<HashJoinProjection>& proj_map,
|
||||
const ExecBatch* opt_non_empty_batch, ExecContext* ctx);
|
||||
static void InitEncoder(const SchemaProjectionMaps<HashJoinProjection>& proj_map,
|
||||
RowEncoder* encoder, ExecContext* ctx);
|
||||
Status EncodeBatch(size_t thread_index,
|
||||
const SchemaProjectionMaps<HashJoinProjection>& proj_map,
|
||||
const ExecBatch& batch, RowEncoder* encoder, ExecContext* ctx) const;
|
||||
Status PostDecode(const SchemaProjectionMaps<HashJoinProjection>& proj_map,
|
||||
ExecBatch* decoded_key_batch, ExecContext* ctx);
|
||||
const HashJoinDictBuild& get_dict_build(int icol) const { return remap_imp_[icol]; }
|
||||
|
||||
private:
|
||||
std::vector<bool> needs_remap_;
|
||||
std::vector<HashJoinDictBuild> remap_imp_;
|
||||
};
|
||||
|
||||
// Encapsulates dictionary handling logic for probe side of hash join
|
||||
//
|
||||
class HashJoinDictProbeMulti {
|
||||
public:
|
||||
void Init(size_t num_threads);
|
||||
bool BatchRemapNeeded(size_t thread_index,
|
||||
const SchemaProjectionMaps<HashJoinProjection>& proj_map_probe,
|
||||
const SchemaProjectionMaps<HashJoinProjection>& proj_map_build,
|
||||
ExecContext* ctx);
|
||||
Status EncodeBatch(size_t thread_index,
|
||||
const SchemaProjectionMaps<HashJoinProjection>& proj_map_probe,
|
||||
const SchemaProjectionMaps<HashJoinProjection>& proj_map_build,
|
||||
const HashJoinDictBuildMulti& dict_build, const ExecBatch& batch,
|
||||
RowEncoder** out_encoder, ExecBatch* opt_out_key_batch,
|
||||
ExecContext* ctx);
|
||||
|
||||
private:
|
||||
void InitLocalStateIfNeeded(
|
||||
size_t thread_index, const SchemaProjectionMaps<HashJoinProjection>& proj_map_probe,
|
||||
const SchemaProjectionMaps<HashJoinProjection>& proj_map_build, ExecContext* ctx);
|
||||
static void InitEncoder(const SchemaProjectionMaps<HashJoinProjection>& proj_map_probe,
|
||||
const SchemaProjectionMaps<HashJoinProjection>& proj_map_build,
|
||||
RowEncoder* encoder, ExecContext* ctx);
|
||||
struct ThreadLocalState {
|
||||
bool is_initialized;
|
||||
// Whether any key column needs remapping (because of dictionaries used) before doing
|
||||
// join hash table lookups
|
||||
bool any_needs_remap;
|
||||
// Whether each key column needs remapping before doing join hash table lookups
|
||||
std::vector<bool> needs_remap;
|
||||
std::vector<HashJoinDictProbe> remap_imp;
|
||||
// Encoder of key columns that uses unified representation instead of original data
|
||||
// type for key columns that need to use it (have dictionaries on either side of the
|
||||
// join).
|
||||
RowEncoder post_remap_encoder;
|
||||
};
|
||||
std::vector<ThreadLocalState> local_states_;
|
||||
};
|
||||
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,103 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/acero/options.h"
|
||||
#include "arrow/acero/schema_util.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using compute::ExecContext;
|
||||
|
||||
namespace acero {
|
||||
|
||||
class ARROW_ACERO_EXPORT HashJoinSchema {
|
||||
public:
|
||||
Status Init(JoinType join_type, const Schema& left_schema,
|
||||
const std::vector<FieldRef>& left_keys, const Schema& right_schema,
|
||||
const std::vector<FieldRef>& right_keys, const Expression& filter,
|
||||
const std::string& left_field_name_prefix,
|
||||
const std::string& right_field_name_prefix);
|
||||
|
||||
Status Init(JoinType join_type, const Schema& left_schema,
|
||||
const std::vector<FieldRef>& left_keys,
|
||||
const std::vector<FieldRef>& left_output, const Schema& right_schema,
|
||||
const std::vector<FieldRef>& right_keys,
|
||||
const std::vector<FieldRef>& right_output, const Expression& filter,
|
||||
const std::string& left_field_name_prefix,
|
||||
const std::string& right_field_name_prefix);
|
||||
|
||||
static Status ValidateSchemas(JoinType join_type, const Schema& left_schema,
|
||||
const std::vector<FieldRef>& left_keys,
|
||||
const std::vector<FieldRef>& left_output,
|
||||
const Schema& right_schema,
|
||||
const std::vector<FieldRef>& right_keys,
|
||||
const std::vector<FieldRef>& right_output,
|
||||
const std::string& left_field_name_prefix,
|
||||
const std::string& right_field_name_prefix);
|
||||
|
||||
bool HasDictionaries() const;
|
||||
|
||||
bool HasLargeBinary() const;
|
||||
|
||||
Result<Expression> BindFilter(Expression filter, const Schema& left_schema,
|
||||
const Schema& right_schema, ExecContext* exec_context);
|
||||
std::shared_ptr<Schema> MakeOutputSchema(const std::string& left_field_name_suffix,
|
||||
const std::string& right_field_name_suffix);
|
||||
|
||||
bool LeftPayloadIsEmpty() const { return PayloadIsEmpty(0); }
|
||||
|
||||
bool RightPayloadIsEmpty() const { return PayloadIsEmpty(1); }
|
||||
|
||||
static int kMissingField() {
|
||||
return SchemaProjectionMaps<HashJoinProjection>::kMissingField;
|
||||
}
|
||||
|
||||
SchemaProjectionMaps<HashJoinProjection> proj_maps[2];
|
||||
|
||||
private:
|
||||
static bool IsTypeSupported(const DataType& type);
|
||||
|
||||
Status CollectFilterColumns(std::vector<FieldRef>& left_filter,
|
||||
std::vector<FieldRef>& right_filter,
|
||||
const Expression& filter, const Schema& left_schema,
|
||||
const Schema& right_schema);
|
||||
|
||||
Expression RewriteFilterToUseFilterSchema(int right_filter_offset,
|
||||
const SchemaProjectionMap& left_to_filter,
|
||||
const SchemaProjectionMap& right_to_filter,
|
||||
const Expression& filter);
|
||||
|
||||
bool PayloadIsEmpty(int side) const {
|
||||
assert(side == 0 || side == 1);
|
||||
return proj_maps[side].num_cols(HashJoinProjection::PAYLOAD) == 0;
|
||||
}
|
||||
|
||||
static Result<std::vector<FieldRef>> ComputePayload(const Schema& schema,
|
||||
const std::vector<FieldRef>& output,
|
||||
const std::vector<FieldRef>& filter,
|
||||
const std::vector<FieldRef>& key);
|
||||
};
|
||||
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,81 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/acero/exec_plan.h"
|
||||
#include "arrow/acero/util.h"
|
||||
#include "arrow/acero/visibility.h"
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/cancel.h"
|
||||
#include "arrow/util/type_fwd.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace acero {
|
||||
|
||||
/// A utility base class for simple exec nodes with one input
|
||||
///
|
||||
/// Pause/Resume Producing are forwarded appropriately
|
||||
/// There is nothing to do in StopProducingImpl
|
||||
///
|
||||
/// An AtomicCounter is used to keep track of when all data has arrived. When it
|
||||
/// has the Finish() method will be invoked
|
||||
class ARROW_ACERO_EXPORT MapNode : public ExecNode, public TracedNode {
|
||||
public:
|
||||
MapNode(ExecPlan* plan, std::vector<ExecNode*> inputs,
|
||||
std::shared_ptr<Schema> output_schema);
|
||||
|
||||
Status InputFinished(ExecNode* input, int total_batches) override;
|
||||
|
||||
Status StartProducing() override;
|
||||
|
||||
void PauseProducing(ExecNode* output, int32_t counter) override;
|
||||
|
||||
void ResumeProducing(ExecNode* output, int32_t counter) override;
|
||||
|
||||
Status InputReceived(ExecNode* input, ExecBatch batch) override;
|
||||
|
||||
const Ordering& ordering() const override;
|
||||
|
||||
protected:
|
||||
Status StopProducingImpl() override;
|
||||
|
||||
/// Transform a batch
|
||||
///
|
||||
/// The output batch will have the same guarantee as the input batch
|
||||
/// If this was the last batch this call may trigger Finish()
|
||||
virtual Result<ExecBatch> ProcessBatch(ExecBatch batch) = 0;
|
||||
|
||||
/// Function called after all data has been received
|
||||
///
|
||||
/// By default this does nothing. Override this to provide a custom implementation.
|
||||
virtual void Finish();
|
||||
|
||||
protected:
|
||||
// Counter for the number of batches received
|
||||
AtomicCounter input_counter_;
|
||||
};
|
||||
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,874 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/acero/type_fwd.h"
|
||||
#include "arrow/acero/visibility.h"
|
||||
#include "arrow/compute/api_aggregate.h"
|
||||
#include "arrow/compute/api_vector.h"
|
||||
#include "arrow/compute/exec.h"
|
||||
#include "arrow/compute/expression.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/util/future.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using compute::Aggregate;
|
||||
using compute::ExecBatch;
|
||||
using compute::Expression;
|
||||
using compute::literal;
|
||||
using compute::Ordering;
|
||||
using compute::SelectKOptions;
|
||||
using compute::SortOptions;
|
||||
|
||||
namespace internal {
|
||||
|
||||
class Executor;
|
||||
|
||||
} // namespace internal
|
||||
|
||||
namespace acero {
|
||||
|
||||
/// \brief This must not be used in release-mode
|
||||
struct DebugOptions;
|
||||
|
||||
/// \brief A nullary callable whose every invocation returns a
/// Future<std::optional<ExecBatch>>
using AsyncExecBatchGenerator = std::function<Future<std::optional<ExecBatch>>()>;
|
||||
|
||||
/// \addtogroup acero-nodes
|
||||
/// @{
|
||||
|
||||
/// \brief A base class for all options objects
|
||||
///
|
||||
/// The only time this is used directly is when a node has no configuration
|
||||
class ARROW_ACERO_EXPORT ExecNodeOptions {
|
||||
public:
|
||||
virtual ~ExecNodeOptions() = default;
|
||||
|
||||
/// \brief This must not be used in release-mode
|
||||
std::shared_ptr<DebugOptions> debug_opts;
|
||||
};
|
||||
|
||||
/// \brief A node representing a generic source of data for Acero
|
||||
///
|
||||
/// The source node will start calling `generator` during StartProducing. An initial
|
||||
/// task will be created that will call `generator`. It will not call `generator`
|
||||
/// reentrantly. If the source can be read in parallel then those details should be
|
||||
/// encapsulated within `generator`.
|
||||
///
|
||||
/// For each batch received a new task will be created to push that batch downstream.
|
||||
/// This task will slice smaller units of size `ExecPlan::kMaxBatchSize` from the
|
||||
/// parent batch and call InputReceived. Thus, if the `generator` yields a large
|
||||
/// batch it may result in several calls to InputReceived.
|
||||
///
|
||||
/// The SourceNode will, by default, assign an implicit ordering to outgoing batches.
|
||||
/// This is valid as long as the generator generates batches in a deterministic fashion.
|
||||
/// Currently, the only way to override this is to subclass the SourceNode.
|
||||
///
|
||||
/// This node is not generally used directly but can serve as the basis for various
|
||||
/// specialized nodes.
|
||||
class ARROW_ACERO_EXPORT SourceNodeOptions : public ExecNodeOptions {
|
||||
public:
|
||||
/// Create an instance from values
|
||||
SourceNodeOptions(std::shared_ptr<Schema> output_schema,
|
||||
std::function<Future<std::optional<ExecBatch>>()> generator,
|
||||
Ordering ordering = Ordering::Unordered())
|
||||
: output_schema(std::move(output_schema)),
|
||||
generator(std::move(generator)),
|
||||
ordering(std::move(ordering)) {}
|
||||
|
||||
/// \brief the schema for batches that will be generated by this source
|
||||
std::shared_ptr<Schema> output_schema;
|
||||
/// \brief an asynchronous stream of batches ending with std::nullopt
|
||||
std::function<Future<std::optional<ExecBatch>>()> generator;
|
||||
/// \brief the order of the data, defaults to Ordering::Unordered
|
||||
Ordering ordering;
|
||||
};
|
||||
|
||||
/// \brief a node that generates data from a table already loaded in memory
|
||||
///
|
||||
/// The table source node will slice off chunks, defined by `max_batch_size`
|
||||
/// for parallel processing. The table source node extends source node and so these
|
||||
/// chunks will be iteratively processed in small batches. \see SourceNodeOptions
|
||||
/// for details.
|
||||
class ARROW_ACERO_EXPORT TableSourceNodeOptions : public ExecNodeOptions {
|
||||
public:
|
||||
static constexpr int64_t kDefaultMaxBatchSize = 1 << 20;
|
||||
|
||||
/// Create an instance from values
|
||||
TableSourceNodeOptions(std::shared_ptr<Table> table,
|
||||
int64_t max_batch_size = kDefaultMaxBatchSize)
|
||||
: table(std::move(table)), max_batch_size(max_batch_size) {}
|
||||
|
||||
/// \brief a table which acts as the data source
|
||||
std::shared_ptr<Table> table;
|
||||
/// \brief size of batches to emit from this node
|
||||
/// If the table is larger the node will emit multiple batches from the
|
||||
/// the table to be processed in parallel.
|
||||
int64_t max_batch_size;
|
||||
};
|
||||
|
||||
/// \brief define a lazily resolved Arrow table.
|
||||
///
|
||||
/// The table uniquely identified by the names can typically be resolved at the time when
|
||||
/// the plan is to be consumed.
|
||||
///
|
||||
/// This node is for serialization purposes only and can never be executed.
|
||||
class ARROW_ACERO_EXPORT NamedTableNodeOptions : public ExecNodeOptions {
|
||||
public:
|
||||
/// Create an instance from values
|
||||
NamedTableNodeOptions(std::vector<std::string> names, std::shared_ptr<Schema> schema)
|
||||
: names(std::move(names)), schema(std::move(schema)) {}
|
||||
|
||||
/// \brief the names to put in the serialized plan
|
||||
std::vector<std::string> names;
|
||||
/// \brief the output schema of the table
|
||||
std::shared_ptr<Schema> schema;
|
||||
};
|
||||
|
||||
/// \brief a source node which feeds data from a synchronous iterator of batches
|
||||
///
|
||||
/// ItMaker is a maker of an iterator of tabular data.
|
||||
///
|
||||
/// The node can be configured to use an I/O executor. If set then each time the
|
||||
/// iterator is polled a new I/O thread task will be created to do the polling. This
|
||||
/// allows a blocking iterator to stay off the CPU thread pool.
|
||||
template <typename ItMaker>
|
||||
class ARROW_ACERO_EXPORT SchemaSourceNodeOptions : public ExecNodeOptions {
|
||||
public:
|
||||
/// Create an instance that will create a new task on io_executor for each iteration
|
||||
SchemaSourceNodeOptions(std::shared_ptr<Schema> schema, ItMaker it_maker,
|
||||
arrow::internal::Executor* io_executor)
|
||||
: schema(std::move(schema)),
|
||||
it_maker(std::move(it_maker)),
|
||||
io_executor(io_executor),
|
||||
requires_io(true) {}
|
||||
|
||||
/// Create an instance that will either iterate synchronously or use the default I/O
|
||||
/// executor
|
||||
SchemaSourceNodeOptions(std::shared_ptr<Schema> schema, ItMaker it_maker,
|
||||
bool requires_io = false)
|
||||
: schema(std::move(schema)),
|
||||
it_maker(std::move(it_maker)),
|
||||
io_executor(NULLPTR),
|
||||
requires_io(requires_io) {}
|
||||
|
||||
/// \brief The schema of the record batches from the iterator
|
||||
std::shared_ptr<Schema> schema;
|
||||
|
||||
/// \brief A maker of an iterator which acts as the data source
|
||||
ItMaker it_maker;
|
||||
|
||||
/// \brief The executor to use for scanning the iterator
|
||||
///
|
||||
/// Defaults to the default I/O executor. Only used if requires_io is true.
|
||||
/// If requires_io is false then this MUST be nullptr.
|
||||
arrow::internal::Executor* io_executor;
|
||||
|
||||
/// \brief If true then items will be fetched from the iterator on a dedicated I/O
|
||||
/// thread to keep I/O off the CPU thread
|
||||
bool requires_io;
|
||||
};
|
||||
|
||||
/// a source node that reads from a RecordBatchReader
|
||||
///
|
||||
/// Each iteration of the RecordBatchReader will be run on a new thread task created
|
||||
/// on the I/O thread pool.
|
||||
class ARROW_ACERO_EXPORT RecordBatchReaderSourceNodeOptions : public ExecNodeOptions {
|
||||
public:
|
||||
/// Create an instance from values
|
||||
RecordBatchReaderSourceNodeOptions(std::shared_ptr<RecordBatchReader> reader,
|
||||
arrow::internal::Executor* io_executor = NULLPTR)
|
||||
: reader(std::move(reader)), io_executor(io_executor) {}
|
||||
|
||||
/// \brief The RecordBatchReader which acts as the data source
|
||||
std::shared_ptr<RecordBatchReader> reader;
|
||||
|
||||
/// \brief The executor to use for the reader
|
||||
///
|
||||
/// Defaults to the default I/O executor.
|
||||
arrow::internal::Executor* io_executor;
|
||||
};
|
||||
|
||||
/// a source node that reads from an iterator of array vectors
|
||||
using ArrayVectorIteratorMaker = std::function<Iterator<std::shared_ptr<ArrayVector>>()>;
|
||||
/// \brief An extended Source node which accepts a schema and array-vectors
|
||||
class ARROW_ACERO_EXPORT ArrayVectorSourceNodeOptions
|
||||
: public SchemaSourceNodeOptions<ArrayVectorIteratorMaker> {
|
||||
using SchemaSourceNodeOptions::SchemaSourceNodeOptions;
|
||||
};
|
||||
|
||||
/// a source node that reads from an iterator of ExecBatch
|
||||
using ExecBatchIteratorMaker = std::function<Iterator<std::shared_ptr<ExecBatch>>()>;
|
||||
/// \brief An extended Source node which accepts a schema and exec-batches
|
||||
class ARROW_ACERO_EXPORT ExecBatchSourceNodeOptions
|
||||
: public SchemaSourceNodeOptions<ExecBatchIteratorMaker> {
|
||||
public:
|
||||
using SchemaSourceNodeOptions::SchemaSourceNodeOptions;
|
||||
ExecBatchSourceNodeOptions(std::shared_ptr<Schema> schema,
|
||||
std::vector<ExecBatch> batches,
|
||||
::arrow::internal::Executor* io_executor);
|
||||
ExecBatchSourceNodeOptions(std::shared_ptr<Schema> schema,
|
||||
std::vector<ExecBatch> batches, bool requires_io = false);
|
||||
};
|
||||
|
||||
using RecordBatchIteratorMaker = std::function<Iterator<std::shared_ptr<RecordBatch>>()>;
|
||||
/// a source node that reads from an iterator of RecordBatch
|
||||
class ARROW_ACERO_EXPORT RecordBatchSourceNodeOptions
|
||||
: public SchemaSourceNodeOptions<RecordBatchIteratorMaker> {
|
||||
using SchemaSourceNodeOptions::SchemaSourceNodeOptions;
|
||||
};
|
||||
|
||||
/// \brief a node which excludes some rows from batches passed through it
|
||||
///
|
||||
/// filter_expression will be evaluated against each batch which is pushed to
|
||||
/// this node. Any rows for which filter_expression does not evaluate to `true` will be
|
||||
/// excluded in the batch emitted by this node.
|
||||
///
|
||||
/// This node will emit empty batches if all rows are excluded. This is done
|
||||
/// to avoid gaps in the ordering.
|
||||
class ARROW_ACERO_EXPORT FilterNodeOptions : public ExecNodeOptions {
|
||||
public:
|
||||
/// \brief create an instance from values
|
||||
explicit FilterNodeOptions(Expression filter_expression)
|
||||
: filter_expression(std::move(filter_expression)) {}
|
||||
|
||||
/// \brief the expression to filter batches
|
||||
///
|
||||
/// The return type of this expression must be boolean
|
||||
Expression filter_expression;
|
||||
};
|
||||
|
||||
/// \brief a node which selects a specified subset from the input
|
||||
class ARROW_ACERO_EXPORT FetchNodeOptions : public ExecNodeOptions {
|
||||
public:
|
||||
static constexpr std::string_view kName = "fetch";
|
||||
/// \brief create an instance from values
|
||||
FetchNodeOptions(int64_t offset, int64_t count) : offset(offset), count(count) {}
|
||||
/// \brief the number of rows to skip
|
||||
int64_t offset;
|
||||
/// \brief the number of rows to keep (not counting skipped rows)
|
||||
int64_t count;
|
||||
};
|
||||
|
||||
/// \brief a node which executes expressions on input batches, producing batches
|
||||
/// of the same length with new columns.
|
||||
///
|
||||
/// Each expression will be evaluated against each batch which is pushed to
|
||||
/// this node to produce a corresponding output column.
|
||||
///
|
||||
/// If names are not provided, the string representations of exprs will be used.
|
||||
class ARROW_ACERO_EXPORT ProjectNodeOptions : public ExecNodeOptions {
|
||||
public:
|
||||
/// \brief create an instance from values
|
||||
explicit ProjectNodeOptions(std::vector<Expression> expressions,
|
||||
std::vector<std::string> names = {})
|
||||
: expressions(std::move(expressions)), names(std::move(names)) {}
|
||||
|
||||
/// \brief the expressions to run on the batches
|
||||
///
|
||||
/// The output will have one column for each expression. If you wish to keep any of
|
||||
/// the columns from the input then you should create a simple field_ref expression
|
||||
/// for that column.
|
||||
std::vector<Expression> expressions;
|
||||
/// \brief the names of the output columns
|
||||
///
|
||||
/// If this is not specified then the result of calling ToString on the expression will
|
||||
/// be used instead
|
||||
///
|
||||
/// This list should either be empty or have the same length as `expressions`
|
||||
std::vector<std::string> names;
|
||||
};
|
||||
|
||||
/// \brief a node which aggregates input batches and calculates summary statistics
|
||||
///
|
||||
/// The node can summarize the entire input or it can group the input with grouping keys
|
||||
/// and segment keys.
|
||||
///
|
||||
/// By default, the aggregate node is a pipeline breaker. It must accumulate all input
|
||||
/// before any output is produced. Segment keys are a performance optimization. If
|
||||
/// you know your input is already partitioned by one or more columns then you can
|
||||
/// specify these as segment keys. At each change in the segment keys the node will
|
||||
/// emit values for all data seen so far.
|
||||
///
|
||||
/// Segment keys are currently limited to single-threaded mode.
|
||||
///
|
||||
/// Both keys and segment-keys determine the group. However segment-keys are also used
|
||||
/// for determining grouping segments, which should be large, and allow streaming a
|
||||
/// partial aggregation result after processing each segment. One common use-case for
|
||||
/// segment-keys is ordered aggregation, in which the segment-key attribute specifies a
|
||||
/// column with non-decreasing values or a lexicographically-ordered set of such columns.
|
||||
///
|
||||
/// If the keys attribute is a non-empty vector, then each aggregate in `aggregates` is
|
||||
/// expected to be a HashAggregate function. If the keys attribute is an empty vector,
|
||||
/// then each aggregate is assumed to be a ScalarAggregate function.
|
||||
///
|
||||
/// If the segment_keys attribute is a non-empty vector, then segmented aggregation, as
|
||||
/// described above, applies.
|
||||
///
|
||||
/// The keys and segment_keys vectors must be disjoint.
|
||||
///
|
||||
/// If no measures are provided then you will simply get the list of unique keys.
|
||||
///
|
||||
/// This node outputs segment keys first, followed by regular keys, followed by one
|
||||
/// column for each aggregate.
|
||||
class ARROW_ACERO_EXPORT AggregateNodeOptions : public ExecNodeOptions {
|
||||
public:
|
||||
/// \brief create an instance from values
|
||||
explicit AggregateNodeOptions(std::vector<Aggregate> aggregates,
|
||||
std::vector<FieldRef> keys = {},
|
||||
std::vector<FieldRef> segment_keys = {})
|
||||
: aggregates(std::move(aggregates)),
|
||||
keys(std::move(keys)),
|
||||
segment_keys(std::move(segment_keys)) {}
|
||||
|
||||
// aggregations which will be applied to the targeted fields
|
||||
std::vector<Aggregate> aggregates;
|
||||
// keys by which aggregations will be grouped (optional)
|
||||
std::vector<FieldRef> keys;
|
||||
// keys by which aggregations will be segmented (optional)
|
||||
std::vector<FieldRef> segment_keys;
|
||||
};
|
||||
|
||||
/// \brief default threshold at which backpressure is applied (1GiB)
constexpr int32_t kDefaultBackpressureHighBytes = 1024 * 1024 * 1024;

/// \brief default threshold at which backpressure is removed (256MiB)
constexpr int32_t kDefaultBackpressureLowBytes = 256 * 1024 * 1024;
|
||||
|
||||
/// \brief an interface that can be queried for backpressure statistics
|
||||
class ARROW_ACERO_EXPORT BackpressureMonitor {
|
||||
public:
|
||||
virtual ~BackpressureMonitor() = default;
|
||||
/// \brief fetches the number of bytes currently queued up
|
||||
virtual uint64_t bytes_in_use() = 0;
|
||||
/// \brief checks to see if backpressure is currently applied
|
||||
virtual bool is_paused() = 0;
|
||||
};
|
||||
|
||||
/// \brief Options to control backpressure behavior
|
||||
struct ARROW_ACERO_EXPORT BackpressureOptions {
|
||||
/// \brief Create default options that perform no backpressure
|
||||
BackpressureOptions() : resume_if_below(0), pause_if_above(0) {}
|
||||
/// \brief Create options that will perform backpressure
|
||||
///
|
||||
/// \param resume_if_below The producer should resume producing if the backpressure
|
||||
/// queue has fewer than resume_if_below items.
|
||||
/// \param pause_if_above The producer should pause producing if the backpressure
|
||||
/// queue has more than pause_if_above items
|
||||
BackpressureOptions(uint64_t resume_if_below, uint64_t pause_if_above)
|
||||
: resume_if_below(resume_if_below), pause_if_above(pause_if_above) {}
|
||||
|
||||
/// \brief create an instance using default values for backpressure limits
|
||||
static BackpressureOptions DefaultBackpressure() {
|
||||
return BackpressureOptions(kDefaultBackpressureLowBytes,
|
||||
kDefaultBackpressureHighBytes);
|
||||
}
|
||||
|
||||
/// \brief helper method to determine if backpressure is disabled
|
||||
/// \return true if pause_if_above is greater than zero, false otherwise
|
||||
bool should_apply_backpressure() const { return pause_if_above > 0; }
|
||||
|
||||
/// \brief the number of bytes at which the producer should resume producing
|
||||
uint64_t resume_if_below;
|
||||
/// \brief the number of bytes at which the producer should pause producing
|
||||
///
|
||||
/// If this is <= 0 then backpressure will be disabled
|
||||
uint64_t pause_if_above;
|
||||
};
|
||||
|
||||
/// \brief a sink node which collects results in a queue
|
||||
///
|
||||
/// Emitted batches will only be ordered if there is a meaningful ordering
|
||||
/// and sequence_output is not set to false.
|
||||
class ARROW_ACERO_EXPORT SinkNodeOptions : public ExecNodeOptions {
|
||||
public:
|
||||
explicit SinkNodeOptions(std::function<Future<std::optional<ExecBatch>>()>* generator,
|
||||
std::shared_ptr<Schema>* schema,
|
||||
BackpressureOptions backpressure = {},
|
||||
BackpressureMonitor** backpressure_monitor = NULLPTR,
|
||||
std::optional<bool> sequence_output = std::nullopt)
|
||||
: generator(generator),
|
||||
schema(schema),
|
||||
backpressure(backpressure),
|
||||
backpressure_monitor(backpressure_monitor),
|
||||
sequence_output(sequence_output) {}
|
||||
|
||||
explicit SinkNodeOptions(std::function<Future<std::optional<ExecBatch>>()>* generator,
|
||||
BackpressureOptions backpressure = {},
|
||||
BackpressureMonitor** backpressure_monitor = NULLPTR,
|
||||
std::optional<bool> sequence_output = std::nullopt)
|
||||
: generator(generator),
|
||||
schema(NULLPTR),
|
||||
backpressure(std::move(backpressure)),
|
||||
backpressure_monitor(backpressure_monitor),
|
||||
sequence_output(sequence_output) {}
|
||||
|
||||
/// \brief A pointer to a generator of batches.
|
||||
///
|
||||
/// This will be set when the node is added to the plan and should be used to consume
|
||||
/// data from the plan. If this function is not called frequently enough then the sink
|
||||
/// node will start to accumulate data and may apply backpressure.
|
||||
std::function<Future<std::optional<ExecBatch>>()>* generator;
|
||||
/// \brief A pointer which will be set to the schema of the generated batches
|
||||
///
|
||||
/// This is optional, if nullptr is passed in then it will be ignored.
|
||||
/// This will be set when the node is added to the plan, before StartProducing is called
|
||||
std::shared_ptr<Schema>* schema;
|
||||
/// \brief Options to control when to apply backpressure
|
||||
///
|
||||
/// This is optional, the default is to never apply backpressure. If the plan is not
|
||||
/// consumed quickly enough the system may eventually run out of memory.
|
||||
BackpressureOptions backpressure;
|
||||
/// \brief A pointer to a backpressure monitor
|
||||
///
|
||||
/// This will be set when the node is added to the plan. This can be used to inspect
|
||||
/// the amount of data currently queued in the sink node. This is an optional utility
|
||||
/// and backpressure can be applied even if this is not used.
|
||||
BackpressureMonitor** backpressure_monitor;
|
||||
/// \brief Controls whether batches should be emitted immediately or sequenced in order
|
||||
///
|
||||
/// \see QueryOptions for more details
|
||||
std::optional<bool> sequence_output;
|
||||
};
|
||||
|
||||
/// \brief Control used by a SinkNodeConsumer to pause & resume
|
||||
///
|
||||
/// Callers should ensure that they do not call Pause and Resume simultaneously and they
|
||||
/// should sequence things so that a call to Pause() is always followed by an eventual
|
||||
/// call to Resume()
|
||||
class ARROW_ACERO_EXPORT BackpressureControl {
|
||||
public:
|
||||
virtual ~BackpressureControl() = default;
|
||||
/// \brief Ask the input to pause
|
||||
///
|
||||
/// This is best effort, batches may continue to arrive
|
||||
/// Must eventually be followed by a call to Resume() or deadlock will occur
|
||||
virtual void Pause() = 0;
|
||||
/// \brief Ask the input to resume
|
||||
virtual void Resume() = 0;
|
||||
};
|
||||
|
||||
/// \brief a sink node that consumes the data as part of the plan using callbacks
|
||||
class ARROW_ACERO_EXPORT SinkNodeConsumer {
|
||||
public:
|
||||
virtual ~SinkNodeConsumer() = default;
|
||||
/// \brief Prepare any consumer state
|
||||
///
|
||||
/// This will be run once the schema is finalized as the plan is starting and
|
||||
/// before any calls to Consume. A common use is to save off the schema so that
|
||||
/// batches can be interpreted.
|
||||
virtual Status Init(const std::shared_ptr<Schema>& schema,
|
||||
BackpressureControl* backpressure_control, ExecPlan* plan) = 0;
|
||||
/// \brief Consume a batch of data
|
||||
virtual Status Consume(ExecBatch batch) = 0;
|
||||
/// \brief Signal to the consumer that the last batch has been delivered
|
||||
///
|
||||
/// The returned future should only finish when all outstanding tasks have completed
|
||||
///
|
||||
/// If the plan is ended early or aborts due to an error then this will not be
|
||||
/// called.
|
||||
virtual Future<> Finish() = 0;
|
||||
};
|
||||
|
||||
/// \brief Add a sink node which consumes data within the exec plan run
|
||||
class ARROW_ACERO_EXPORT ConsumingSinkNodeOptions : public ExecNodeOptions {
|
||||
public:
|
||||
explicit ConsumingSinkNodeOptions(std::shared_ptr<SinkNodeConsumer> consumer,
|
||||
std::vector<std::string> names = {},
|
||||
std::optional<bool> sequence_output = std::nullopt)
|
||||
: consumer(std::move(consumer)),
|
||||
names(std::move(names)),
|
||||
sequence_output(sequence_output) {}
|
||||
|
||||
std::shared_ptr<SinkNodeConsumer> consumer;
|
||||
/// \brief Names to rename the sink's schema fields to
|
||||
///
|
||||
/// If specified then names must be provided for all fields. Currently, only a flat
|
||||
/// schema is supported (see GH-31875).
|
||||
///
|
||||
/// If not specified then names will be generated based on the source data.
|
||||
std::vector<std::string> names;
|
||||
/// \brief Controls whether batches should be emitted immediately or sequenced in order
|
||||
///
|
||||
/// \see QueryOptions for more details
|
||||
std::optional<bool> sequence_output;
|
||||
};
|
||||
|
||||
/// \brief Make a node which sorts rows passed through it
|
||||
///
|
||||
/// All batches pushed to this node will be accumulated, then sorted, by the given
|
||||
/// fields. Then sorted batches will be forwarded to the generator in sorted order.
|
||||
class ARROW_ACERO_EXPORT OrderBySinkNodeOptions : public SinkNodeOptions {
|
||||
public:
|
||||
/// \brief create an instance from values
|
||||
explicit OrderBySinkNodeOptions(
|
||||
SortOptions sort_options,
|
||||
std::function<Future<std::optional<ExecBatch>>()>* generator)
|
||||
: SinkNodeOptions(generator), sort_options(std::move(sort_options)) {}
|
||||
|
||||
/// \brief options describing which columns and direction to sort
|
||||
SortOptions sort_options;
|
||||
};
|
||||
|
||||
/// \brief Apply a new ordering to data
|
||||
///
|
||||
/// Currently this node works by accumulating all data, sorting, and then emitting
|
||||
/// the new data with an updated batch index.
|
||||
///
|
||||
/// Larger-than-memory sort is not currently supported.
|
||||
class ARROW_ACERO_EXPORT OrderByNodeOptions : public ExecNodeOptions {
|
||||
public:
|
||||
static constexpr std::string_view kName = "order_by";
|
||||
explicit OrderByNodeOptions(Ordering ordering) : ordering(std::move(ordering)) {}
|
||||
|
||||
/// \brief The new ordering to apply to outgoing data
|
||||
Ordering ordering;
|
||||
};
|
||||
|
||||
/// \brief The type of join selected through HashJoinNodeOptions::join_type
///
/// Enumerator values are spelled out; they match the implicit numbering.
enum class JoinType {
  LEFT_SEMI = 0,
  RIGHT_SEMI = 1,
  LEFT_ANTI = 2,
  RIGHT_ANTI = 3,
  INNER = 4,
  LEFT_OUTER = 5,
  RIGHT_OUTER = 6,
  FULL_OUTER = 7
};
|
||||
|
||||
/// \brief Return a string for the given join type
///
/// NOTE(review): defined outside this header; the exact text produced for each
/// enumerator is not visible here.
std::string ToString(JoinType t);
|
||||
|
||||
/// \brief Key comparison function used by the hash join
///
/// It determines whether a null key is equal to another null key or not.
/// NOTE(review): which enumerator treats nulls as equal is not stated here.
enum class JoinKeyCmp {
  EQ = 0,
  IS = 1
};
|
||||
|
||||
/// \brief a node which implements a join operation using a hash table
|
||||
class ARROW_ACERO_EXPORT HashJoinNodeOptions : public ExecNodeOptions {
|
||||
public:
|
||||
static constexpr const char* default_output_suffix_for_left = "";
|
||||
static constexpr const char* default_output_suffix_for_right = "";
|
||||
/// \brief create an instance from values that outputs all columns
|
||||
HashJoinNodeOptions(
|
||||
JoinType in_join_type, std::vector<FieldRef> in_left_keys,
|
||||
std::vector<FieldRef> in_right_keys, Expression filter = literal(true),
|
||||
std::string output_suffix_for_left = default_output_suffix_for_left,
|
||||
std::string output_suffix_for_right = default_output_suffix_for_right,
|
||||
bool disable_bloom_filter = false)
|
||||
: join_type(in_join_type),
|
||||
left_keys(std::move(in_left_keys)),
|
||||
right_keys(std::move(in_right_keys)),
|
||||
output_all(true),
|
||||
output_suffix_for_left(std::move(output_suffix_for_left)),
|
||||
output_suffix_for_right(std::move(output_suffix_for_right)),
|
||||
filter(std::move(filter)),
|
||||
disable_bloom_filter(disable_bloom_filter) {
|
||||
this->key_cmp.resize(this->left_keys.size());
|
||||
for (size_t i = 0; i < this->left_keys.size(); ++i) {
|
||||
this->key_cmp[i] = JoinKeyCmp::EQ;
|
||||
}
|
||||
}
|
||||
/// \brief create an instance from keys
|
||||
///
|
||||
/// This will create an inner join that outputs all columns and has no post join filter
|
||||
///
|
||||
/// `in_left_keys` should have the same length and types as `in_right_keys`
|
||||
/// @param in_left_keys the keys in the left input
|
||||
/// @param in_right_keys the keys in the right input
|
||||
HashJoinNodeOptions(std::vector<FieldRef> in_left_keys,
|
||||
std::vector<FieldRef> in_right_keys)
|
||||
: left_keys(std::move(in_left_keys)), right_keys(std::move(in_right_keys)) {
|
||||
this->join_type = JoinType::INNER;
|
||||
this->output_all = true;
|
||||
this->output_suffix_for_left = default_output_suffix_for_left;
|
||||
this->output_suffix_for_right = default_output_suffix_for_right;
|
||||
this->key_cmp.resize(this->left_keys.size());
|
||||
for (size_t i = 0; i < this->left_keys.size(); ++i) {
|
||||
this->key_cmp[i] = JoinKeyCmp::EQ;
|
||||
}
|
||||
this->filter = literal(true);
|
||||
}
|
||||
/// \brief create an instance from values using JoinKeyCmp::EQ for all comparisons
|
||||
HashJoinNodeOptions(
|
||||
JoinType join_type, std::vector<FieldRef> left_keys,
|
||||
std::vector<FieldRef> right_keys, std::vector<FieldRef> left_output,
|
||||
std::vector<FieldRef> right_output, Expression filter = literal(true),
|
||||
std::string output_suffix_for_left = default_output_suffix_for_left,
|
||||
std::string output_suffix_for_right = default_output_suffix_for_right,
|
||||
bool disable_bloom_filter = false)
|
||||
: join_type(join_type),
|
||||
left_keys(std::move(left_keys)),
|
||||
right_keys(std::move(right_keys)),
|
||||
output_all(false),
|
||||
left_output(std::move(left_output)),
|
||||
right_output(std::move(right_output)),
|
||||
output_suffix_for_left(std::move(output_suffix_for_left)),
|
||||
output_suffix_for_right(std::move(output_suffix_for_right)),
|
||||
filter(std::move(filter)),
|
||||
disable_bloom_filter(disable_bloom_filter) {
|
||||
this->key_cmp.resize(this->left_keys.size());
|
||||
for (size_t i = 0; i < this->left_keys.size(); ++i) {
|
||||
this->key_cmp[i] = JoinKeyCmp::EQ;
|
||||
}
|
||||
}
|
||||
/// \brief create an instance from values
|
||||
HashJoinNodeOptions(
|
||||
JoinType join_type, std::vector<FieldRef> left_keys,
|
||||
std::vector<FieldRef> right_keys, std::vector<FieldRef> left_output,
|
||||
std::vector<FieldRef> right_output, std::vector<JoinKeyCmp> key_cmp,
|
||||
Expression filter = literal(true),
|
||||
std::string output_suffix_for_left = default_output_suffix_for_left,
|
||||
std::string output_suffix_for_right = default_output_suffix_for_right,
|
||||
bool disable_bloom_filter = false)
|
||||
: join_type(join_type),
|
||||
left_keys(std::move(left_keys)),
|
||||
right_keys(std::move(right_keys)),
|
||||
output_all(false),
|
||||
left_output(std::move(left_output)),
|
||||
right_output(std::move(right_output)),
|
||||
key_cmp(std::move(key_cmp)),
|
||||
output_suffix_for_left(std::move(output_suffix_for_left)),
|
||||
output_suffix_for_right(std::move(output_suffix_for_right)),
|
||||
filter(std::move(filter)),
|
||||
disable_bloom_filter(disable_bloom_filter) {}
|
||||
|
||||
HashJoinNodeOptions() = default;
|
||||
|
||||
// type of join (inner, left, semi...)
|
||||
JoinType join_type = JoinType::INNER;
|
||||
// key fields from left input
|
||||
std::vector<FieldRef> left_keys;
|
||||
// key fields from right input
|
||||
std::vector<FieldRef> right_keys;
|
||||
// if set all valid fields from both left and right input will be output
|
||||
// (and field ref vectors for output fields will be ignored)
|
||||
bool output_all = false;
|
||||
// output fields passed from left input
|
||||
std::vector<FieldRef> left_output;
|
||||
// output fields passed from right input
|
||||
std::vector<FieldRef> right_output;
|
||||
// key comparison function (determines whether a null key is equal another null
|
||||
// key or not)
|
||||
std::vector<JoinKeyCmp> key_cmp;
|
||||
// suffix added to names of output fields coming from left input (used to distinguish,
|
||||
// if necessary, between fields of the same name in left and right input and can be left
|
||||
// empty if there are no name collisions)
|
||||
std::string output_suffix_for_left;
|
||||
// suffix added to names of output fields coming from right input
|
||||
std::string output_suffix_for_right;
|
||||
// residual filter which is applied to matching rows. Rows that do not match
|
||||
// the filter are not included. The filter is applied against the
|
||||
// concatenated input schema (left fields then right fields) and can reference
|
||||
// fields that are not included in the output.
|
||||
Expression filter = literal(true);
|
||||
// whether or not to disable Bloom filters in this join
|
||||
bool disable_bloom_filter = false;
|
||||
};
|
||||
|
||||
/// \brief a node which implements the asof join operation
|
||||
///
|
||||
/// Note, this API is experimental and will change in the future
|
||||
///
|
||||
/// This node takes one left table and any number of right tables, and asof joins them
|
||||
/// together. Batches produced by each input must be ordered by the "on" key.
|
||||
/// This node will output one row for each row in the left table.
|
||||
class ARROW_ACERO_EXPORT AsofJoinNodeOptions : public ExecNodeOptions {
|
||||
public:
|
||||
/// \brief Keys for one input table of the AsofJoin operation
|
||||
///
|
||||
/// The keys must be consistent across the input tables:
|
||||
/// Each "on" key must refer to a field of the same type and units across the tables.
|
||||
/// Each "by" key must refer to a list of fields of the same types across the tables.
|
||||
struct Keys {
|
||||
/// \brief "on" key for the join.
|
||||
///
|
||||
/// The input table must be sorted by the "on" key. Must be a single field of a common
|
||||
/// type. An inexact match is used on the "on" key, i.e. a row is considered a
|
||||
/// match if and only if `right.on - left.on` is in the range
|
||||
/// `[min(0, tolerance), max(0, tolerance)]`.
|
||||
/// Currently, the "on" key must be of an integer, date, or timestamp type.
|
||||
FieldRef on_key;
|
||||
/// \brief "by" key for the join.
|
||||
///
|
||||
/// Each input table must have each field of the "by" key. Exact equality is used for
|
||||
/// each field of the "by" key.
|
||||
/// Currently, each field of the "by" key must be of an integer, date, timestamp, or
|
||||
/// base-binary type.
|
||||
std::vector<FieldRef> by_key;
|
||||
};
|
||||
|
||||
AsofJoinNodeOptions(std::vector<Keys> input_keys, int64_t tolerance)
|
||||
: input_keys(std::move(input_keys)), tolerance(tolerance) {}
|
||||
|
||||
/// \brief AsofJoin keys per input table. At least two keys must be given. The first key
|
||||
/// corresponds to a left table and all other keys correspond to right tables for the
|
||||
/// as-of-join.
|
||||
///
|
||||
/// \see `Keys` for details.
|
||||
std::vector<Keys> input_keys;
|
||||
/// \brief Tolerance for inexact "on" key matching. A right row is considered a match
|
||||
/// with a left row if `right.on - left.on` is in the range
|
||||
/// `[min(0, tolerance), max(0, tolerance)]`. `tolerance` may be:
|
||||
/// - negative, in which case a past-as-of-join occurs (match iff
|
||||
/// `tolerance <= right.on - left.on <= 0`);
|
||||
/// - or positive, in which case a future-as-of-join occurs (match iff
|
||||
/// `0 <= right.on - left.on <= tolerance`);
|
||||
/// - or zero, in which case an exact-as-of-join occurs (match iff
|
||||
/// `right.on == left.on`).
|
||||
///
|
||||
/// The tolerance is interpreted in the same units as the "on" key.
|
||||
int64_t tolerance;
|
||||
};
|
||||
|
||||
/// \brief a node which select top_k/bottom_k rows passed through it
|
||||
///
|
||||
/// All batches pushed to this node will be accumulated, then selected, by the given
|
||||
/// fields. Then sorted batches will be forwarded to the generator in sorted order.
|
||||
class ARROW_ACERO_EXPORT SelectKSinkNodeOptions : public SinkNodeOptions {
|
||||
public:
|
||||
explicit SelectKSinkNodeOptions(
|
||||
SelectKOptions select_k_options,
|
||||
std::function<Future<std::optional<ExecBatch>>()>* generator)
|
||||
: SinkNodeOptions(generator), select_k_options(std::move(select_k_options)) {}
|
||||
|
||||
/// SelectK options
|
||||
SelectKOptions select_k_options;
|
||||
};
|
||||
|
||||
/// \brief a sink node which accumulates all output into a table
|
||||
class ARROW_ACERO_EXPORT TableSinkNodeOptions : public ExecNodeOptions {
|
||||
public:
|
||||
/// \brief create an instance from values
|
||||
explicit TableSinkNodeOptions(std::shared_ptr<Table>* output_table,
|
||||
std::optional<bool> sequence_output = std::nullopt)
|
||||
: output_table(output_table), sequence_output(sequence_output) {}
|
||||
|
||||
/// \brief an "out parameter" specifying the table that will be created
|
||||
///
|
||||
/// Must not be null and remain valid for the entirety of the plan execution. After the
|
||||
/// plan has completed this will be set to point to the result table
|
||||
std::shared_ptr<Table>* output_table;
|
||||
/// \brief Controls whether batches should be emitted immediately or sequenced in order
|
||||
///
|
||||
/// \see QueryOptions for more details
|
||||
std::optional<bool> sequence_output;
|
||||
/// \brief Custom names to use for the columns.
|
||||
///
|
||||
/// If specified then names must be provided for all fields. Currently, only a flat
|
||||
/// schema is supported (see GH-31875).
|
||||
///
|
||||
/// If not specified then names will be generated based on the source data.
|
||||
std::vector<std::string> names;
|
||||
};
|
||||
|
||||
/// \brief a row template that describes one row that will be generated for each input row
|
||||
struct ARROW_ACERO_EXPORT PivotLongerRowTemplate {
|
||||
PivotLongerRowTemplate(std::vector<std::string> feature_values,
|
||||
std::vector<std::optional<FieldRef>> measurement_values)
|
||||
: feature_values(std::move(feature_values)),
|
||||
measurement_values(std::move(measurement_values)) {}
|
||||
/// A (typically unique) set of feature values for the template, usually derived from a
|
||||
/// column name
|
||||
///
|
||||
/// These will be used to populate the feature columns
|
||||
std::vector<std::string> feature_values;
|
||||
/// The fields containing the measurements to use for this row
|
||||
///
|
||||
/// These will be used to populate the measurement columns. If nullopt then nulls
|
||||
/// will be inserted for the given value.
|
||||
std::vector<std::optional<FieldRef>> measurement_values;
|
||||
};
|
||||
|
||||
/// \brief Reshape a table by turning some columns into additional rows
|
||||
///
|
||||
/// This operation is sometimes also referred to as UNPIVOT
|
||||
///
|
||||
/// This is typically done when there are multiple observations in each row in order to
|
||||
/// transform to a table containing a single observation per row.
|
||||
///
|
||||
/// For example:
|
||||
///
|
||||
/// | time | left_temp | right_temp |
|
||||
/// | ---- | --------- | ---------- |
|
||||
/// | 1 | 10 | 20 |
|
||||
/// | 2 | 15 | 18 |
|
||||
///
|
||||
/// The above table contains two observations per row. There is an implicit feature
|
||||
/// "location" (left vs right) and a measurement "temp". What we really want is:
|
||||
///
|
||||
/// | time | location | temp |
|
||||
/// | --- | --- | --- |
|
||||
/// | 1 | left | 10 |
|
||||
/// | 1 | right | 20 |
|
||||
/// | 2 | left | 15 |
|
||||
/// | 2 | right | 18 |
|
||||
///
|
||||
/// For a more complex example consider:
|
||||
///
|
||||
/// | time | ax1 | ay1 | bx1 | ay2 |
|
||||
/// | ---- | --- | --- | --- | --- |
|
||||
/// | 0 | 1 | 2 | 3 | 4 |
|
||||
///
|
||||
/// We can pretend a vs b and x vs y are features while 1 and 2 are two different
|
||||
/// kinds of measurements. We thus want to pivot to
|
||||
///
|
||||
/// | time | a/b | x/y | f1 | f2 |
|
||||
/// | ---- | --- | --- | ---- | ---- |
|
||||
/// | 0 | a | x | 1 | null |
|
||||
/// | 0 | a | y | 2 | 4 |
|
||||
/// | 0 | b | x | 3 | null |
|
||||
///
|
||||
/// To do this we create a row template for each combination of features. One should
|
||||
/// be able to do this purely by looking at the column names. For example, given the
|
||||
/// above columns "ax1", "ay1", "bx1", and "ay2" we know we have three feature
|
||||
/// combinations (a, x), (a, y), and (b, x). Similarly, we know we have two possible
|
||||
/// measurements, "1" and "2".
|
||||
///
|
||||
/// For each combination of features we create a row template. In each row template we
|
||||
/// describe the combination and then list which columns to use for the measurements.
|
||||
/// If a measurement doesn't exist for a given combination then we use nullopt.
|
||||
///
|
||||
/// So, for our above example, we have:
|
||||
///
|
||||
/// (a, x): names={"a", "x"}, values={"ax1", nullopt}
|
||||
/// (a, y): names={"a", "y"}, values={"ay1", "ay2"}
|
||||
/// (b, x): names={"b", "x"}, values={"bx1", nullopt}
|
||||
///
|
||||
/// Finishing it off we name our new columns:
|
||||
/// feature_field_names={"a/b","x/y"}
|
||||
/// measurement_field_names={"f1", "f2"}
|
||||
class ARROW_ACERO_EXPORT PivotLongerNodeOptions : public ExecNodeOptions {
|
||||
public:
|
||||
static constexpr std::string_view kName = "pivot_longer";
|
||||
/// One or more row templates to create new output rows
|
||||
///
|
||||
/// Normally there are at least two row templates. The output # of rows
|
||||
/// will be the input # of rows * the number of row templates
|
||||
std::vector<PivotLongerRowTemplate> row_templates;
|
||||
/// The names of the columns which describe the new features
|
||||
std::vector<std::string> feature_field_names;
|
||||
/// The names of the columns which represent the measurements
|
||||
std::vector<std::string> measurement_field_names;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,56 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/acero/options.h"
|
||||
#include "arrow/record_batch.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using compute::ExecContext;
|
||||
|
||||
namespace acero {
|
||||
|
||||
class OrderByImpl {
|
||||
public:
|
||||
virtual ~OrderByImpl() = default;
|
||||
|
||||
virtual void InputReceived(const std::shared_ptr<RecordBatch>& batch) = 0;
|
||||
|
||||
virtual Result<Datum> DoFinish() = 0;
|
||||
|
||||
virtual std::string ToString() const = 0;
|
||||
|
||||
static Result<std::unique_ptr<OrderByImpl>> MakeSort(
|
||||
ExecContext* ctx, const std::shared_ptr<Schema>& output_schema,
|
||||
const SortOptions& options);
|
||||
|
||||
static Result<std::unique_ptr<OrderByImpl>> MakeSelectK(
|
||||
ExecContext* ctx, const std::shared_ptr<Schema>& output_schema,
|
||||
const SelectKOptions& options);
|
||||
};
|
||||
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,186 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <random>
|
||||
|
||||
#include "arrow/acero/util.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/util/pcg_random.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace acero {
|
||||
|
||||
class PartitionSort {
|
||||
public:
|
||||
/// \brief Bucket sort rows on partition ids in O(num_rows) time.
|
||||
///
|
||||
/// Include in the output exclusive cumulative sum of bucket sizes.
|
||||
/// This corresponds to ranges in the sorted array containing all row ids for
|
||||
/// each of the partitions.
|
||||
///
|
||||
/// prtn_ranges must be initialized and have at least num_prtns + 1 elements
|
||||
/// when this method returns prtn_ranges[i] will contains the total number of
|
||||
/// elements in partitions 0 through i. prtn_ranges[0] will be 0.
|
||||
///
|
||||
/// prtn_id_impl must be a function that takes in a row id (int) and returns
|
||||
/// a partition id (int). The returned partition id must be between 0 and
|
||||
/// num_prtns (exclusive).
|
||||
///
|
||||
/// output_pos_impl is a function that takes in a row id (int) and a position (int)
|
||||
/// in the bucket sorted output. The function should insert the row in the
|
||||
/// output.
|
||||
///
|
||||
/// For example:
|
||||
///
|
||||
/// in_arr: [5, 7, 2, 3, 5, 4]
|
||||
/// num_prtns: 3
|
||||
/// prtn_id_impl: [&in_arr] (int row_id) { return in_arr[row_id] / 3; }
|
||||
/// output_pos_impl: [&sorted_row_ids] (int row_id, int pos) { sorted_row_ids[pos] =
|
||||
/// row_id; }
|
||||
///
|
||||
/// After Execution
|
||||
/// sorted_row_ids: [2, 0, 3, 4, 5, 1]
|
||||
/// prtn_ranges: [0, 1, 5, 6]
|
||||
template <class INPUT_PRTN_ID_FN, class OUTPUT_POS_FN>
|
||||
static void Eval(int64_t num_rows, int num_prtns, uint16_t* prtn_ranges,
|
||||
INPUT_PRTN_ID_FN prtn_id_impl, OUTPUT_POS_FN output_pos_impl) {
|
||||
ARROW_DCHECK(num_rows > 0 && num_rows <= (1 << 15));
|
||||
ARROW_DCHECK(num_prtns >= 1 && num_prtns <= (1 << 15));
|
||||
|
||||
memset(prtn_ranges, 0, (num_prtns + 1) * sizeof(uint16_t));
|
||||
|
||||
for (int64_t i = 0; i < num_rows; ++i) {
|
||||
int prtn_id = static_cast<int>(prtn_id_impl(i));
|
||||
++prtn_ranges[prtn_id + 1];
|
||||
}
|
||||
|
||||
uint16_t sum = 0;
|
||||
for (int i = 0; i < num_prtns; ++i) {
|
||||
uint16_t sum_next = sum + prtn_ranges[i + 1];
|
||||
prtn_ranges[i + 1] = sum;
|
||||
sum = sum_next;
|
||||
}
|
||||
|
||||
for (int64_t i = 0; i < num_rows; ++i) {
|
||||
int prtn_id = static_cast<int>(prtn_id_impl(i));
|
||||
int pos = prtn_ranges[prtn_id + 1]++;
|
||||
output_pos_impl(i, pos);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/// \brief A control for synchronizing threads on a partitionable workload
|
||||
class PartitionLocks {
|
||||
public:
|
||||
PartitionLocks();
|
||||
~PartitionLocks();
|
||||
/// \brief Initializes the control, must be called before use
|
||||
///
|
||||
/// \param num_threads Maximum number of threads that will access the partitions
|
||||
/// \param num_prtns Number of partitions to synchronize
|
||||
void Init(size_t num_threads, int num_prtns);
|
||||
/// \brief Cleans up the control, it should not be used after this call
|
||||
void CleanUp();
|
||||
/// \brief Acquire a partition to work on one
|
||||
///
|
||||
/// \param thread_id The index of the thread trying to acquire the partition lock
|
||||
/// \param num_prtns Length of prtns_to_try, must be <= num_prtns used in Init
|
||||
/// \param prtns_to_try An array of partitions that still have remaining work
|
||||
/// \param limit_retries If false, this method will spinwait forever until success
|
||||
/// \param max_retries Max times to attempt checking out work before returning false
|
||||
/// \param[out] locked_prtn_id The id of the partition locked
|
||||
/// \param[out] locked_prtn_id_pos The index of the partition locked in prtns_to_try
|
||||
/// \return True if a partition was locked, false if max_retries was attempted
|
||||
/// without successfully acquiring a lock
|
||||
///
|
||||
/// This method is thread safe
|
||||
bool AcquirePartitionLock(size_t thread_id, int num_prtns, const int* prtns_to_try,
|
||||
bool limit_retries, int max_retries, int* locked_prtn_id,
|
||||
int* locked_prtn_id_pos);
|
||||
/// \brief Release a partition so that other threads can work on it
|
||||
void ReleasePartitionLock(int prtn_id);
|
||||
|
||||
// Executes (synchronously and using current thread) the same operation on a set of
|
||||
// multiple partitions. Tries to minimize partition locking overhead by randomizing and
|
||||
// adjusting order in which partitions are processed.
|
||||
//
|
||||
// PROCESS_PRTN_FN is a callback which will be executed for each partition after
|
||||
// acquiring the lock for that partition. It gets partition id as an argument.
|
||||
// IS_PRTN_EMPTY_FN is a callback which filters out (when returning true) partitions
|
||||
// with specific ids from processing.
|
||||
//
|
||||
template <typename IS_PRTN_EMPTY_FN, typename PROCESS_PRTN_FN>
|
||||
Status ForEachPartition(size_t thread_id,
|
||||
/*scratch space buffer with space for one element per partition;
|
||||
dirty in and dirty out*/
|
||||
int* temp_unprocessed_prtns, IS_PRTN_EMPTY_FN is_prtn_empty_fn,
|
||||
PROCESS_PRTN_FN process_prtn_fn) {
|
||||
int num_unprocessed_partitions = 0;
|
||||
for (int i = 0; i < num_prtns_; ++i) {
|
||||
bool is_prtn_empty = is_prtn_empty_fn(i);
|
||||
if (!is_prtn_empty) {
|
||||
temp_unprocessed_prtns[num_unprocessed_partitions++] = i;
|
||||
}
|
||||
}
|
||||
while (num_unprocessed_partitions > 0) {
|
||||
int locked_prtn_id;
|
||||
int locked_prtn_id_pos;
|
||||
AcquirePartitionLock(thread_id, num_unprocessed_partitions, temp_unprocessed_prtns,
|
||||
/*limit_retries=*/false, /*max_retries=*/-1, &locked_prtn_id,
|
||||
&locked_prtn_id_pos);
|
||||
{
|
||||
class AutoReleaseLock {
|
||||
public:
|
||||
AutoReleaseLock(PartitionLocks* locks, int prtn_id)
|
||||
: locks(locks), prtn_id(prtn_id) {}
|
||||
~AutoReleaseLock() { locks->ReleasePartitionLock(prtn_id); }
|
||||
PartitionLocks* locks;
|
||||
int prtn_id;
|
||||
} auto_release_lock(this, locked_prtn_id);
|
||||
ARROW_RETURN_NOT_OK(process_prtn_fn(locked_prtn_id));
|
||||
}
|
||||
if (locked_prtn_id_pos < num_unprocessed_partitions - 1) {
|
||||
temp_unprocessed_prtns[locked_prtn_id_pos] =
|
||||
temp_unprocessed_prtns[num_unprocessed_partitions - 1];
|
||||
}
|
||||
--num_unprocessed_partitions;
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
private:
|
||||
std::atomic<bool>* lock_ptr(int prtn_id);
|
||||
int random_int(size_t thread_id, int num_values);
|
||||
|
||||
struct PartitionLock {
|
||||
static constexpr int kCacheLineBytes = 64;
|
||||
std::atomic<bool> lock;
|
||||
uint8_t padding[kCacheLineBytes];
|
||||
};
|
||||
int num_prtns_;
|
||||
std::unique_ptr<PartitionLock[]> locks_;
|
||||
std::unique_ptr<arrow::random::pcg32_fast[]> rngs_;
|
||||
};
|
||||
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,151 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
#pragma once
|
||||
|
||||
#include <string_view>
|
||||
|
||||
#include "arrow/acero/exec_plan.h"
|
||||
#include "arrow/acero/task_util.h"
|
||||
#include "arrow/acero/util.h"
|
||||
#include "arrow/compute/exec.h"
|
||||
#include "arrow/io/interfaces.h"
|
||||
#include "arrow/util/async_util.h"
|
||||
#include "arrow/util/type_fwd.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using compute::default_exec_context;
|
||||
using io::IOContext;
|
||||
|
||||
namespace acero {
|
||||
|
||||
class ARROW_ACERO_EXPORT QueryContext {
|
||||
public:
|
||||
QueryContext(QueryOptions opts = {},
|
||||
ExecContext exec_context = *default_exec_context());
|
||||
|
||||
Status Init(arrow::util::AsyncTaskScheduler* scheduler);
|
||||
|
||||
const ::arrow::internal::CpuInfo* cpu_info() const;
|
||||
int64_t hardware_flags() const;
|
||||
const QueryOptions& options() const { return options_; }
|
||||
MemoryPool* memory_pool() const { return exec_context_.memory_pool(); }
|
||||
::arrow::internal::Executor* executor() const { return exec_context_.executor(); }
|
||||
ExecContext* exec_context() { return &exec_context_; }
|
||||
IOContext* io_context() { return &io_context_; }
|
||||
TaskScheduler* scheduler() { return task_scheduler_.get(); }
|
||||
arrow::util::AsyncTaskScheduler* async_scheduler() { return async_scheduler_; }
|
||||
|
||||
size_t GetThreadIndex();
|
||||
size_t max_concurrency() const;
|
||||
|
||||
/// \brief Start an external task
|
||||
///
|
||||
/// This should be avoided if possible. It is kept in for now for legacy
|
||||
/// purposes. This should be called before the external task is started. If
|
||||
/// a valid future is returned then it should be marked complete when the
|
||||
/// external task has finished.
|
||||
///
|
||||
/// \param name A name to give the task for traceability and debugging
|
||||
///
|
||||
/// \return an invalid future if the plan has already ended, otherwise this
|
||||
/// returns a future that must be completed when the external task
|
||||
/// finishes.
|
||||
Result<Future<>> BeginExternalTask(std::string_view name);
|
||||
|
||||
/// \brief Add a single function as a task to the query's task group
|
||||
/// on the compute threadpool.
|
||||
///
|
||||
/// \param fn The task to run. Takes no arguments and returns a Status.
|
||||
/// \param name A name to give the task for traceability and debugging
|
||||
void ScheduleTask(std::function<Status()> fn, std::string_view name);
|
||||
/// \brief Add a single function as a task to the query's task group
|
||||
/// on the compute threadpool.
|
||||
///
|
||||
/// \param fn The task to run. Takes the thread index and returns a Status.
|
||||
/// \param name A name to give the task for traceability and debugging
|
||||
void ScheduleTask(std::function<Status(size_t)> fn, std::string_view name);
|
||||
/// \brief Add a single function as a task to the query's task group on
|
||||
/// the IO thread pool
|
||||
///
|
||||
/// \param fn The task to run. Returns a status.
|
||||
/// \param name A name to give the task for traceability and debugging
|
||||
void ScheduleIOTask(std::function<Status()> fn, std::string_view name);
|
||||
|
||||
// Register/Start TaskGroup is a way of performing a "Parallel For" pattern:
|
||||
// - The task function takes the thread index and the index of the task
|
||||
// - The on_finished function takes the thread index
|
||||
// Returns an integer ID that will be used to reference the task group in
|
||||
// StartTaskGroup. At runtime, call StartTaskGroup with the ID and the number of times
|
||||
// you'd like the task to be executed. The need to register a task group before use will
|
||||
// be removed after we rewrite the scheduler.
|
||||
/// \brief Register a "parallel for" task group with the scheduler
|
||||
///
|
||||
/// \param task The function implementing the task. Takes the thread_index and
|
||||
/// the task index.
|
||||
/// \param on_finished The function that gets run once all tasks have been completed.
|
||||
/// Takes the thread_index.
|
||||
///
|
||||
/// Must be called inside of ExecNode::Init.
|
||||
int RegisterTaskGroup(std::function<Status(size_t, int64_t)> task,
|
||||
std::function<Status(size_t)> on_finished);
|
||||
|
||||
/// \brief Start the task group with the specified ID. This can only
|
||||
/// be called once per task_group_id.
|
||||
///
|
||||
/// \param task_group_id The ID of the task group to run
|
||||
/// \param num_tasks The number of times to run the task
|
||||
Status StartTaskGroup(int task_group_id, int64_t num_tasks);
|
||||
|
||||
// This is an RAII class for keeping track of in-flight file IO. Useful for getting
|
||||
// an estimate of memory use, and how much memory we expect to be freed soon.
|
||||
// Returned by ReportTempFileIO.
|
||||
struct [[nodiscard]] TempFileIOMark {
|
||||
QueryContext* ctx_;
|
||||
size_t bytes_;
|
||||
|
||||
TempFileIOMark(QueryContext* ctx, size_t bytes) : ctx_(ctx), bytes_(bytes) {
|
||||
ctx_->in_flight_bytes_to_disk_.fetch_add(bytes_, std::memory_order_acquire);
|
||||
}
|
||||
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(TempFileIOMark);
|
||||
|
||||
~TempFileIOMark() {
|
||||
ctx_->in_flight_bytes_to_disk_.fetch_sub(bytes_, std::memory_order_release);
|
||||
}
|
||||
};
|
||||
|
||||
TempFileIOMark ReportTempFileIO(size_t bytes) { return {this, bytes}; }
|
||||
|
||||
size_t GetCurrentTempFileIO() { return in_flight_bytes_to_disk_.load(); }
|
||||
|
||||
private:
|
||||
QueryOptions options_;
|
||||
// To be replaced with Acero-specific context once scheduler is done and
|
||||
// we don't need ExecContext for kernels
|
||||
ExecContext exec_context_;
|
||||
IOContext io_context_;
|
||||
|
||||
arrow::util::AsyncTaskScheduler* async_scheduler_ = NULLPTR;
|
||||
std::unique_ptr<TaskScheduler> task_scheduler_ = TaskScheduler::Make();
|
||||
|
||||
ThreadIndexer thread_indexer_;
|
||||
|
||||
std::atomic<size_t> in_flight_bytes_to_disk_{0};
|
||||
};
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,226 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "arrow/type.h"  // for DataType, FieldRef, Field and Schema
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using internal::checked_cast;
|
||||
|
||||
namespace acero {
|
||||
|
||||
// Names every row schema that takes part in a join. The underlying values are
// fixed explicitly.
enum class HashJoinProjection : int {
  INPUT = 0,
  KEY = 1,
  PAYLOAD = 2,
  FILTER = 3,
  OUTPUT = 4,
};
|
||||
|
||||
/// Non-owning view that translates a column index of a source projection into
/// a column index of a target projection by going through two lookup tables.
struct SchemaProjectionMap {
  /// Table value that marks a field as absent
  static constexpr int kMissingField = -1;
  /// Number of valid indices accepted by get()
  int num_cols;
  /// Source column index -> base index
  const int* source_to_base;
  /// Base index -> target column index
  const int* base_to_target;

  /// Translate source column `i`; `i` must be in [0, num_cols) and must have a
  /// base entry (both conditions are only checked by assert).
  inline int get(int i) const {
    assert(i >= 0 && i < num_cols);
    const int base_index = source_to_base[i];
    assert(base_index != kMissingField);
    return base_to_target[base_index];
  }
};
|
||||
|
||||
/// Helper class for managing different projections of the same row schema.
|
||||
/// Used to efficiently map any field in one projection to a corresponding field in
|
||||
/// another projection.
|
||||
/// Materialized mappings are generated lazily at the time of the first access.
|
||||
/// Thread-safe apart from initialization.
|
||||
template <typename ProjectionIdEnum>
|
||||
class SchemaProjectionMaps {
|
||||
public:
|
||||
static constexpr int kMissingField = -1;
|
||||
|
||||
/// Registers the full schema under `full_schema_handle`, then each projection
/// under the handle at the same position of `projection_handles`, and ends with
/// RegisterEnd(). The first failing registration is returned as is.
/// `projection_handles` and `projections` must have equal length (assert only).
Status Init(ProjectionIdEnum full_schema_handle, const Schema& schema,
            const std::vector<ProjectionIdEnum>& projection_handles,
            const std::vector<const std::vector<FieldRef>*>& projections) {
  assert(projection_handles.size() == projections.size());
  ARROW_RETURN_NOT_OK(RegisterSchema(full_schema_handle, schema));
  const size_t num_projections = projections.size();
  for (size_t idx = 0; idx != num_projections; ++idx) {
    const std::vector<FieldRef>& selected = *(projections[idx]);
    ARROW_RETURN_NOT_OK(
        RegisterProjectedSchema(projection_handles[idx], selected, schema));
  }
  RegisterEnd();
  return Status::OK();
}
|
||||
|
||||
/// Number of columns registered for the given schema handle
int num_cols(ProjectionIdEnum schema_handle) const {
  const int idx = schema_id(schema_handle);
  const auto& types = schemas_[idx].second.data_types;
  return static_cast<int>(types.size());
}
|
||||
|
||||
/// True when the schema registered under the handle has no columns
bool is_empty(ProjectionIdEnum schema_handle) const {
  const int column_count = num_cols(schema_handle);
  return column_count == 0;
}
|
||||
|
||||
/// Name of column `field_id` in the given schema; `field_id` is not range checked
const std::string& field_name(ProjectionIdEnum schema_handle, int field_id) const {
  const int idx = schema_id(schema_handle);
  const auto& names = schemas_[idx].second.field_names;
  return names[field_id];
}
|
||||
|
||||
/// Type of column `field_id` in the given schema; `field_id` is not range checked
const std::shared_ptr<DataType>& data_type(ProjectionIdEnum schema_handle,
                                           int field_id) const {
  const int idx = schema_id(schema_handle);
  const auto& types = schemas_[idx].second.data_types;
  return types[field_id];
}
|
||||
|
||||
/// Types of all columns of the given schema, in column order
const std::vector<std::shared_ptr<DataType>>& data_types(
    ProjectionIdEnum schema_handle) const {
  const int idx = schema_id(schema_handle);
  const auto& infos = schemas_[idx].second;
  return infos.data_types;
}
|
||||
|
||||
/// Build a lookup view from projection `from` to projection `to`. The view
/// points into mappings_ and inverse_mappings_ and does not own any data.
SchemaProjectionMap map(ProjectionIdEnum from, ProjectionIdEnum to) const {
  const int from_idx = schema_id(from);
  const int to_idx = schema_id(to);
  SchemaProjectionMap view;
  view.num_cols = num_cols(from);
  view.source_to_base = mappings_[from_idx].data();
  view.base_to_target = inverse_mappings_[to_idx].data();
  return view;
}
|
||||
|
||||
protected:
|
||||
struct FieldInfos {
|
||||
std::vector<int> field_paths;
|
||||
std::vector<std::string> field_names;
|
||||
std::vector<std::shared_ptr<DataType>> data_types;
|
||||
};
|
||||
|
||||
/// \brief Record every field of `schema` under `handle`.
///
/// Each column's path is its own position, i.e. the schema maps onto itself.
/// Always returns Status::OK().
Status RegisterSchema(ProjectionIdEnum handle, const Schema& schema) {
  FieldInfos out_fields;
  const FieldVector& in_fields = schema.fields();
  out_fields.field_paths.resize(in_fields.size());
  out_fields.field_names.resize(in_fields.size());
  out_fields.data_types.resize(in_fields.size());
  for (size_t i = 0; i < in_fields.size(); ++i) {
    out_fields.field_paths[i] = static_cast<int>(i);
    out_fields.field_names[i] = in_fields[i]->name();
    out_fields.data_types[i] = in_fields[i]->type();
  }
  // Move instead of copying the three vectors into the registry.
  schemas_.emplace_back(handle, std::move(out_fields));
  return Status::OK();
}
|
||||
|
||||
/// \brief Record the projection `selected_fields` of `full_schema` under `handle`.
///
/// \param handle identifier of the projection being registered
/// \param selected_fields references to the fields making up the projection
/// \param full_schema schema in which every reference is resolved
/// \return the error from FieldRef::FindOne if a reference cannot be resolved
Status RegisterProjectedSchema(ProjectionIdEnum handle,
                               const std::vector<FieldRef>& selected_fields,
                               const Schema& full_schema) {
  FieldInfos out_fields;
  const FieldVector& in_fields = full_schema.fields();
  out_fields.field_paths.resize(selected_fields.size());
  out_fields.field_names.resize(selected_fields.size());
  out_fields.data_types.resize(selected_fields.size());
  for (size_t i = 0; i < selected_fields.size(); ++i) {
    // All fields must be found in schema without ambiguity
    ARROW_ASSIGN_OR_RAISE(auto match, selected_fields[i].FindOne(full_schema));
    // Only the first component of the matched path is used (top-level column).
    const int column = match[0];
    out_fields.field_paths[i] = column;
    out_fields.field_names[i] = in_fields[column]->name();
    out_fields.data_types[i] = in_fields[column]->type();
  }
  // Move instead of copying the three vectors into the registry.
  schemas_.emplace_back(handle, std::move(out_fields));
  return Status::OK();
}
|
||||
|
||||
void RegisterEnd() {
|
||||
size_t size = schemas_.size();
|
||||
mappings_.resize(size);
|
||||
inverse_mappings_.resize(size);
|
||||
int id_base = 0;
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
GenerateMapForProjection(static_cast<int>(i), id_base);
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Position in `schemas_` of the entry registered under `schema_handle`.
///
/// Linear search. An unknown handle trips the assert in debug builds and yields -1
/// otherwise.
int schema_id(ProjectionIdEnum schema_handle) const {
  for (size_t i = 0; i < schemas_.size(); ++i) {
    if (schemas_[i].first == schema_handle) {
      return static_cast<int>(i);
    }
  }
  // We should never get here
  assert(false);
  return -1;
}
|
||||
|
||||
void GenerateMapForProjection(int id_proj, int id_base) {
|
||||
int num_cols_proj = static_cast<int>(schemas_[id_proj].second.data_types.size());
|
||||
int num_cols_base = static_cast<int>(schemas_[id_base].second.data_types.size());
|
||||
|
||||
std::vector<int>& mapping = mappings_[id_proj];
|
||||
std::vector<int>& inverse_mapping = inverse_mappings_[id_proj];
|
||||
mapping.resize(num_cols_proj);
|
||||
inverse_mapping.resize(num_cols_base);
|
||||
|
||||
if (id_proj == id_base) {
|
||||
for (int i = 0; i < num_cols_base; ++i) {
|
||||
mapping[i] = inverse_mapping[i] = i;
|
||||
}
|
||||
} else {
|
||||
const FieldInfos& fields_proj = schemas_[id_proj].second;
|
||||
const FieldInfos& fields_base = schemas_[id_base].second;
|
||||
for (int i = 0; i < num_cols_base; ++i) {
|
||||
inverse_mapping[i] = SchemaProjectionMap::kMissingField;
|
||||
}
|
||||
for (int i = 0; i < num_cols_proj; ++i) {
|
||||
int field_id = SchemaProjectionMap::kMissingField;
|
||||
for (int j = 0; j < num_cols_base; ++j) {
|
||||
if (fields_proj.field_paths[i] == fields_base.field_paths[j]) {
|
||||
field_id = j;
|
||||
// If there are multiple matches for the same input field,
|
||||
// it will be mapped to the first match.
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert(field_id != SchemaProjectionMap::kMissingField);
|
||||
mapping[i] = field_id;
|
||||
inverse_mapping[field_id] = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// vector used as a mapping from ProjectionIdEnum to fields
|
||||
std::vector<std::pair<ProjectionIdEnum, FieldInfos>> schemas_;
|
||||
std::vector<std::vector<int>> mappings_;
|
||||
std::vector<std::vector<int>> inverse_mappings_;
|
||||
};
|
||||
|
||||
// SchemaProjectionMaps specialised for the HashJoinProjection identifiers.
using HashJoinProjectionMaps = SchemaProjectionMaps<HashJoinProjection>;
|
||||
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,102 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/acero/visibility.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/config.h"
|
||||
#include "arrow/util/logging.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace acero {
|
||||
|
||||
// Atomic value surrounded by padding bytes to avoid cache line invalidation
// whenever it is modified by a concurrent thread on a different CPU core.
//
// The padding arrays are never read or written; they only keep `value` at least
// kCacheLineSize bytes away from neighbouring objects on both sides.
template <typename T>
class AtomicWithPadding {
 private:
  static constexpr int kCacheLineSize = 64;
  uint8_t padding_before[kCacheLineSize];

 public:
  std::atomic<T> value;

 private:
  uint8_t padding_after[kCacheLineSize];
};
|
||||
|
||||
// Used for asynchronous execution of operations that can be broken into
|
||||
// a fixed number of symmetric tasks that can be executed concurrently.
|
||||
//
|
||||
// Implements priorities between multiple such operations, called task groups.
|
||||
//
|
||||
// Allows to specify the maximum number of in-flight tasks at any moment.
|
||||
//
|
||||
// Also allows for executing next pending tasks immediately using a caller thread.
|
||||
//
|
||||
class ARROW_ACERO_EXPORT TaskScheduler {
|
||||
public:
|
||||
using TaskImpl = std::function<Status(size_t, int64_t)>;
|
||||
using TaskGroupContinuationImpl = std::function<Status(size_t)>;
|
||||
using ScheduleImpl = std::function<Status(TaskGroupContinuationImpl)>;
|
||||
using AbortContinuationImpl = std::function<void()>;
|
||||
|
||||
virtual ~TaskScheduler() = default;
|
||||
|
||||
// Order in which task groups are registered represents priorities of their tasks
|
||||
// (the first group has the highest priority).
|
||||
//
|
||||
// Returns task group identifier that is used to request operations on the task group.
|
||||
virtual int RegisterTaskGroup(TaskImpl task_impl,
|
||||
TaskGroupContinuationImpl cont_impl) = 0;
|
||||
|
||||
virtual void RegisterEnd() = 0;
|
||||
|
||||
// total_num_tasks may be zero, in which case task group continuation will be executed
|
||||
// immediately
|
||||
virtual Status StartTaskGroup(size_t thread_id, int group_id,
|
||||
int64_t total_num_tasks) = 0;
|
||||
|
||||
// Execute given number of tasks immediately using caller thread
|
||||
virtual Status ExecuteMore(size_t thread_id, int num_tasks_to_execute,
|
||||
bool execute_all) = 0;
|
||||
|
||||
// Begin scheduling tasks using provided callback and
|
||||
// the limit on the number of in-flight tasks at any moment.
|
||||
//
|
||||
// Scheduling will continue as long as there are waiting tasks.
|
||||
//
|
||||
// It will automatically resume whenever new task group gets started.
|
||||
virtual Status StartScheduling(size_t thread_id, ScheduleImpl schedule_impl,
|
||||
int num_concurrent_tasks, bool use_sync_execution) = 0;
|
||||
|
||||
// Abort scheduling and execution.
|
||||
// Used in case of being notified about unrecoverable error for the entire query.
|
||||
virtual void Abort(AbortContinuationImpl impl) = 0;
|
||||
|
||||
static std::unique_ptr<TaskScheduler> Make();
|
||||
};
|
||||
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,86 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "arrow/acero/options.h"
|
||||
#include "arrow/acero/test_util_internal.h"
|
||||
#include "arrow/testing/random.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace acero {
|
||||
|
||||
// \brief Make a delaying source that is optionally noisy (prints when it emits)
|
||||
AsyncGenerator<std::optional<ExecBatch>> MakeDelayedGen(
|
||||
Iterator<std::optional<ExecBatch>> src, std::string label, double delay_sec,
|
||||
bool noisy = false);
|
||||
|
||||
// \brief Make a delaying source that is optionally noisy (prints when it emits)
|
||||
AsyncGenerator<std::optional<ExecBatch>> MakeDelayedGen(
|
||||
AsyncGenerator<std::optional<ExecBatch>> src, std::string label, double delay_sec,
|
||||
bool noisy = false);
|
||||
|
||||
// \brief Make a delaying source that is optionally noisy (prints when it emits)
|
||||
AsyncGenerator<std::optional<ExecBatch>> MakeDelayedGen(BatchesWithSchema src,
|
||||
std::string label,
|
||||
double delay_sec,
|
||||
bool noisy = false);
|
||||
|
||||
/// A node that slightly resequences the input at random
|
||||
struct JitterNodeOptions : public ExecNodeOptions {
|
||||
random::SeedType seed;
|
||||
/// The max amount to add to a node's "cost".
|
||||
int max_jitter_modifier;
|
||||
|
||||
explicit JitterNodeOptions(random::SeedType seed, int max_jitter_modifier = 5)
|
||||
: seed(seed), max_jitter_modifier(max_jitter_modifier) {}
|
||||
static constexpr std::string_view kName = "jitter";
|
||||
};
|
||||
|
||||
class GateImpl;
|
||||
|
||||
class Gate {
|
||||
public:
|
||||
static std::shared_ptr<Gate> Make();
|
||||
|
||||
Gate();
|
||||
virtual ~Gate();
|
||||
|
||||
void ReleaseAllBatches();
|
||||
void ReleaseOneBatch();
|
||||
Future<> WaitForNextReleasedBatch();
|
||||
|
||||
private:
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(Gate);
|
||||
|
||||
GateImpl* impl_;
|
||||
};
|
||||
|
||||
// A node that holds all input batches until a given gate is released
|
||||
struct GatedNodeOptions : public ExecNodeOptions {
|
||||
explicit GatedNodeOptions(Gate* gate) : gate(gate) {}
|
||||
Gate* gate;
|
||||
|
||||
static constexpr std::string_view kName = "gated";
|
||||
};
|
||||
|
||||
void RegisterTestNodes();
|
||||
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,31 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/record_batch.h"
|
||||
#include "arrow/type_traits.h"
|
||||
|
||||
namespace arrow::acero {
|
||||
|
||||
// normalize the value to unsigned 64-bits while preserving ordering of values
|
||||
template <typename T, enable_if_t<std::is_integral<T>::value, bool> = true>
|
||||
uint64_t NormalizeTime(T t);
|
||||
|
||||
uint64_t GetTime(const RecordBatch* batch, Type::type time_type, int col, uint64_t row);
|
||||
|
||||
} // namespace arrow::acero
|
||||
@@ -0,0 +1,65 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/acero/type_fwd.h"
|
||||
#include "arrow/acero/visibility.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace acero {
|
||||
namespace internal {
|
||||
|
||||
class ARROW_ACERO_EXPORT TpchGen {
|
||||
public:
|
||||
virtual ~TpchGen() = default;
|
||||
|
||||
/*
|
||||
* \brief Create a factory for nodes that generate TPC-H data
|
||||
*
|
||||
* Note: Individual tables will reference each other. It is important that you only
|
||||
* create a single TpchGen instance for each plan and then you can create nodes for each
|
||||
* table from that single TpchGen instance. Note: Every batch will be scheduled as a new
|
||||
* task using the ExecPlan's scheduler.
|
||||
*/
|
||||
static Result<std::unique_ptr<TpchGen>> Make(
|
||||
ExecPlan* plan, double scale_factor = 1.0, int64_t batch_size = 4096,
|
||||
std::optional<int64_t> seed = std::nullopt);
|
||||
|
||||
// The below methods will create and add an ExecNode to the plan that generates
|
||||
// data for the desired table. If columns is empty, all columns will be generated.
|
||||
// The methods return the added ExecNode, which should be used for inputs.
|
||||
virtual Result<ExecNode*> Supplier(std::vector<std::string> columns = {}) = 0;
|
||||
virtual Result<ExecNode*> Part(std::vector<std::string> columns = {}) = 0;
|
||||
virtual Result<ExecNode*> PartSupp(std::vector<std::string> columns = {}) = 0;
|
||||
virtual Result<ExecNode*> Customer(std::vector<std::string> columns = {}) = 0;
|
||||
virtual Result<ExecNode*> Orders(std::vector<std::string> columns = {}) = 0;
|
||||
virtual Result<ExecNode*> Lineitem(std::vector<std::string> columns = {}) = 0;
|
||||
virtual Result<ExecNode*> Nation(std::vector<std::string> columns = {}) = 0;
|
||||
virtual Result<ExecNode*> Region(std::vector<std::string> columns = {}) = 0;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,36 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace acero {
|
||||
|
||||
// Forward declarations of acero types.
class ExecNode;
class ExecPlan;
class ExecNodeOptions;
class ExecFactoryRegistry;
class QueryContext;
struct QueryOptions;
struct Declaration;
class SinkNodeConsumer;
|
||||
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,184 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/acero/options.h"
|
||||
#include "arrow/acero/type_fwd.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/compute/expression.h"
|
||||
#include "arrow/compute/util.h"
|
||||
#include "arrow/memory_pool.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/cpu_info.h"
|
||||
#include "arrow/util/logging.h"
|
||||
#include "arrow/util/mutex.h"
|
||||
#include "arrow/util/thread_pool.h"
|
||||
#include "arrow/util/type_fwd.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace acero {
|
||||
|
||||
// NOTE(review): presumably checks that `inputs` holds `expected_num_inputs` nodes of
// `plan`, with `kind_name` used in the error message - confirm against the definition.
ARROW_ACERO_EXPORT
Status ValidateExecNodeInputs(ExecPlan* plan, const std::vector<ExecNode*>& inputs,
                              int expected_num_inputs, const char* kind_name);

// Builds a Table with the given schema from a list of ExecBatch.
ARROW_ACERO_EXPORT
Result<std::shared_ptr<Table>> TableFromExecBatches(
    const std::shared_ptr<Schema>& schema, const std::vector<ExecBatch>& exec_batches);
|
||||
|
||||
class ARROW_ACERO_EXPORT AtomicCounter {
|
||||
public:
|
||||
AtomicCounter() = default;
|
||||
|
||||
int count() const { return count_.load(); }
|
||||
|
||||
std::optional<int> total() const {
|
||||
int total = total_.load();
|
||||
if (total == -1) return {};
|
||||
return total;
|
||||
}
|
||||
|
||||
// return true if the counter is complete
|
||||
bool Increment() {
|
||||
ARROW_DCHECK_NE(count_.load(), total_.load());
|
||||
int count = count_.fetch_add(1) + 1;
|
||||
if (count != total_.load()) return false;
|
||||
return DoneOnce();
|
||||
}
|
||||
|
||||
// return true if the counter is complete
|
||||
bool SetTotal(int total) {
|
||||
total_.store(total);
|
||||
if (count_.load() != total) return false;
|
||||
return DoneOnce();
|
||||
}
|
||||
|
||||
// return true if the counter has not already been completed
|
||||
bool Cancel() { return DoneOnce(); }
|
||||
|
||||
// return true if the counter has finished or been cancelled
|
||||
bool Completed() { return complete_.load(); }
|
||||
|
||||
private:
|
||||
// ensure there is only one true return from Increment(), SetTotal(), or Cancel()
|
||||
bool DoneOnce() {
|
||||
bool expected = false;
|
||||
return complete_.compare_exchange_strong(expected, true);
|
||||
}
|
||||
|
||||
std::atomic<int> count_{0}, total_{-1};
|
||||
std::atomic<bool> complete_{false};
|
||||
};
|
||||
|
||||
class ARROW_ACERO_EXPORT ThreadIndexer {
|
||||
public:
|
||||
size_t operator()();
|
||||
|
||||
static size_t Capacity();
|
||||
|
||||
private:
|
||||
static size_t Check(size_t thread_index);
|
||||
|
||||
arrow::util::Mutex mutex_;
|
||||
std::unordered_map<std::thread::id, size_t> id_to_index_;
|
||||
};
|
||||
|
||||
/// \brief A consumer that collects results into an in-memory table
|
||||
struct ARROW_ACERO_EXPORT TableSinkNodeConsumer : public SinkNodeConsumer {
|
||||
public:
|
||||
TableSinkNodeConsumer(std::shared_ptr<Table>* out, MemoryPool* pool)
|
||||
: out_(out), pool_(pool) {}
|
||||
Status Init(const std::shared_ptr<Schema>& schema,
|
||||
BackpressureControl* backpressure_control, ExecPlan* plan) override;
|
||||
Status Consume(ExecBatch batch) override;
|
||||
Future<> Finish() override;
|
||||
|
||||
private:
|
||||
std::shared_ptr<Table>* out_;
|
||||
MemoryPool* pool_;
|
||||
std::shared_ptr<Schema> schema_;
|
||||
std::vector<std::shared_ptr<RecordBatch>> batches_;
|
||||
arrow::util::Mutex consume_mutex_;
|
||||
};
|
||||
|
||||
// A consumer that ignores every batch it is given: all hooks return success at once.
class ARROW_ACERO_EXPORT NullSinkNodeConsumer : public SinkNodeConsumer {
 public:
  Status Init(const std::shared_ptr<Schema>&, BackpressureControl*,
              ExecPlan* plan) override {
    return Status::OK();
  }
  Status Consume(ExecBatch exec_batch) override { return Status::OK(); }
  Future<> Finish() override { return Status::OK(); }

  static std::shared_ptr<NullSinkNodeConsumer> Make() {
    return std::make_shared<NullSinkNodeConsumer>();
  }
};
|
||||
|
||||
/// CRTP helper for tracing helper functions
|
||||
|
||||
class ARROW_ACERO_EXPORT TracedNode {
|
||||
public:
|
||||
// All nodes should call TraceStartProducing or NoteStartProducing exactly once
|
||||
// Most nodes will be fine with a call to NoteStartProducing since the StartProducing
|
||||
// call is usually fairly cheap and simply schedules tasks to fetch the actual data.
|
||||
|
||||
explicit TracedNode(ExecNode* node) : node_(node) {}
|
||||
|
||||
// Create a span to record the StartProducing work
|
||||
[[nodiscard]] ::arrow::internal::tracing::Scope TraceStartProducing(
|
||||
std::string extra_details) const;
|
||||
|
||||
// Record a call to StartProducing without creating with a span
|
||||
void NoteStartProducing(std::string extra_details) const;
|
||||
|
||||
// All nodes should call TraceInputReceived for each batch they receive. This call
|
||||
// should track the time spent processing the batch. NoteInputReceived is available
|
||||
// but usually won't be used unless a node is simply adding batches to a trivial queue.
|
||||
|
||||
// Create a span to record the InputReceived work
|
||||
[[nodiscard]] ::arrow::internal::tracing::Scope TraceInputReceived(
|
||||
const ExecBatch& batch) const;
|
||||
|
||||
// Record a call to InputReceived without creating with a span
|
||||
void NoteInputReceived(const ExecBatch& batch) const;
|
||||
|
||||
// Create a span to record any "finish" work. This should NOT be called as part of
|
||||
// InputFinished and many nodes may not need to call this at all. This should be used
|
||||
// when a node has some extra work that has to be done once it has received all of its
|
||||
// data. For example, an aggregation node calculating aggregations. This will
|
||||
// typically be called as a result of InputFinished OR InputReceived.
|
||||
[[nodiscard]] ::arrow::internal::tracing::Scope TraceFinish() const;
|
||||
|
||||
private:
|
||||
ExecNode* node_;
|
||||
};
|
||||
|
||||
} // namespace acero
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,50 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
// Symbol visibility macros: ARROW_ACERO_EXPORT marks symbols that belong to the
// library's public ABI, ARROW_ACERO_NO_EXPORT marks symbols to keep hidden.
#if defined(_WIN32) || defined(__CYGWIN__)
#  if defined(_MSC_VER)
#    pragma warning(push)
#    pragma warning(disable : 4251)
#  else
#    pragma GCC diagnostic ignored "-Wattributes"
#  endif

#  ifdef ARROW_ACERO_STATIC
#    define ARROW_ACERO_EXPORT
#  elif defined(ARROW_ACERO_EXPORTING)
#    define ARROW_ACERO_EXPORT __declspec(dllexport)
#  else
#    define ARROW_ACERO_EXPORT __declspec(dllimport)
#  endif

#  define ARROW_ACERO_NO_EXPORT
#else  // Not Windows
#  ifndef ARROW_ACERO_EXPORT
#    define ARROW_ACERO_EXPORT __attribute__((visibility("default")))
#  endif
#  ifndef ARROW_ACERO_NO_EXPORT
#    define ARROW_ACERO_NO_EXPORT __attribute__((visibility("hidden")))
#  endif
#endif  // Not-Windows

// Matches the push above (MSVC only).
#if defined(_MSC_VER)
#  pragma warning(pop)
#endif
|
||||
@@ -0,0 +1,323 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/adapters/orc/options.h"
|
||||
#include "arrow/io/interfaces.h"
|
||||
#include "arrow/memory_pool.h"
|
||||
#include "arrow/record_batch.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace adapters {
|
||||
namespace orc {
|
||||
|
||||
/// \brief Information about an ORC stripe
///
/// Still an aggregate: `StripeInformation{offset, length, num_rows, first_row_id}`
/// keeps working. Members are zero-initialized so that a default-constructed
/// instance never holds indeterminate values.
struct StripeInformation {
  /// \brief Offset of the stripe from the start of the file, in bytes
  int64_t offset = 0;
  /// \brief Length of the stripe, in bytes
  int64_t length = 0;
  /// \brief Number of rows in the stripe
  int64_t num_rows = 0;
  /// \brief Index of the first row of the stripe
  int64_t first_row_id = 0;
};
|
||||
|
||||
/// \class ORCFileReader
|
||||
/// \brief Read an Arrow Table or RecordBatch from an ORC file.
|
||||
class ARROW_EXPORT ORCFileReader {
|
||||
public:
|
||||
~ORCFileReader();
|
||||
|
||||
/// \brief Creates a new ORC reader
|
||||
///
|
||||
/// \param[in] file the data source
|
||||
/// \param[in] pool a MemoryPool to use for buffer allocations
|
||||
/// \return the returned reader object
|
||||
static Result<std::unique_ptr<ORCFileReader>> Open(
|
||||
const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool);
|
||||
|
||||
/// \brief Return the schema read from the ORC file
|
||||
///
|
||||
/// \return the returned Schema object
|
||||
Result<std::shared_ptr<Schema>> ReadSchema();
|
||||
|
||||
/// \brief Read the file as a Table
|
||||
///
|
||||
/// The table will be composed of one record batch per stripe.
|
||||
///
|
||||
/// \return the returned Table
|
||||
Result<std::shared_ptr<Table>> Read();
|
||||
|
||||
/// \brief Read the file as a Table
|
||||
///
|
||||
/// The table will be composed of one record batch per stripe.
|
||||
///
|
||||
/// \param[in] schema the Table schema
|
||||
/// \return the returned Table
|
||||
Result<std::shared_ptr<Table>> Read(const std::shared_ptr<Schema>& schema);
|
||||
|
||||
/// \brief Read the file as a Table
|
||||
///
|
||||
/// The table will be composed of one record batch per stripe.
|
||||
///
|
||||
/// \param[in] include_indices the selected field indices to read
|
||||
/// \return the returned Table
|
||||
Result<std::shared_ptr<Table>> Read(const std::vector<int>& include_indices);
|
||||
|
||||
/// \brief Read the file as a Table
|
||||
///
|
||||
/// The table will be composed of one record batch per stripe.
|
||||
///
|
||||
/// \param[in] include_names the selected field names to read
|
||||
/// \return the returned Table
|
||||
Result<std::shared_ptr<Table>> Read(const std::vector<std::string>& include_names);
|
||||
|
||||
/// \brief Read the file as a Table
|
||||
///
|
||||
/// The table will be composed of one record batch per stripe.
|
||||
///
|
||||
/// \param[in] schema the Table schema
|
||||
/// \param[in] include_indices the selected field indices to read
|
||||
/// \return the returned Table
|
||||
Result<std::shared_ptr<Table>> Read(const std::shared_ptr<Schema>& schema,
|
||||
const std::vector<int>& include_indices);
|
||||
|
||||
/// \brief Read a single stripe as a RecordBatch
|
||||
///
|
||||
/// \param[in] stripe the stripe index
|
||||
/// \return the returned RecordBatch
|
||||
Result<std::shared_ptr<RecordBatch>> ReadStripe(int64_t stripe);
|
||||
|
||||
/// \brief Read a single stripe as a RecordBatch
|
||||
///
|
||||
/// \param[in] stripe the stripe index
|
||||
/// \param[in] include_indices the selected field indices to read
|
||||
/// \return the returned RecordBatch
|
||||
Result<std::shared_ptr<RecordBatch>> ReadStripe(
|
||||
int64_t stripe, const std::vector<int>& include_indices);
|
||||
|
||||
/// \brief Read a single stripe as a RecordBatch
|
||||
///
|
||||
/// \param[in] stripe the stripe index
|
||||
/// \param[in] include_names the selected field names to read
|
||||
/// \return the returned RecordBatch
|
||||
Result<std::shared_ptr<RecordBatch>> ReadStripe(
|
||||
int64_t stripe, const std::vector<std::string>& include_names);
|
||||
|
||||
/// \brief Seek to the designated row. Invoking NextStripeReader() after the seek
|
||||
/// returns a stripe reader starting from the designated row.
|
||||
///
|
||||
/// \param[in] row_number the row number to seek to
|
||||
Status Seek(int64_t row_number);
|
||||
|
||||
/// \brief Get a stripe level record batch iterator.
|
||||
///
|
||||
/// Each record batch will have up to `batch_size` rows.
|
||||
/// NextStripeReader serves as a fine-grained alternative to ReadStripe
|
||||
/// which may cause OOM issues by loading the whole stripe into memory.
|
||||
///
|
||||
/// Note this will only read rows for the current stripe, not the entire
|
||||
/// file.
|
||||
///
|
||||
/// \param[in] batch_size the maximum number of rows in each record batch
|
||||
/// \return the returned stripe reader
|
||||
Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(int64_t batch_size);
|
||||
|
||||
/// \brief Get a stripe level record batch iterator.
|
||||
///
|
||||
/// Each record batch will have up to `batch_size` rows.
|
||||
/// NextStripeReader serves as a fine-grained alternative to ReadStripe
|
||||
/// which may cause OOM issues by loading the whole stripe into memory.
|
||||
///
|
||||
/// Note this will only read rows for the current stripe, not the entire
|
||||
/// file.
|
||||
///
|
||||
/// \param[in] batch_size the maximum number of rows in each record batch
|
||||
/// \param[in] include_indices the selected field indices to read
|
||||
/// \return the stripe reader
|
||||
Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
|
||||
int64_t batch_size, const std::vector<int>& include_indices);
|
||||
|
||||
/// \brief Get a record batch iterator for the entire file.
|
||||
///
|
||||
/// Each record batch will have up to `batch_size` rows.
|
||||
///
|
||||
/// \param[in] batch_size the maximum number of rows in each record batch
|
||||
/// \param[in] include_names the selected field names to read, if not empty
|
||||
/// (otherwise all fields are read)
|
||||
/// \return the record batch iterator
|
||||
Result<std::shared_ptr<RecordBatchReader>> GetRecordBatchReader(
|
||||
int64_t batch_size, const std::vector<std::string>& include_names);
|
||||
|
||||
/// \brief The number of stripes in the file
|
||||
int64_t NumberOfStripes();
|
||||
|
||||
/// \brief The number of rows in the file
|
||||
int64_t NumberOfRows();
|
||||
|
||||
/// \brief StripeInformation for each stripe.
|
||||
StripeInformation GetStripeInformation(int64_t stripe);
|
||||
|
||||
/// \brief Get the format version of the file.
|
||||
/// Currently known values are 0.11 and 0.12.
|
||||
///
|
||||
/// \return The FileVersion of the ORC file.
|
||||
FileVersion GetFileVersion();
|
||||
|
||||
/// \brief Get the software instance and version that wrote this file.
|
||||
///
|
||||
/// \return a user-facing string that specifies the software version
|
||||
std::string GetSoftwareVersion();
|
||||
|
||||
/// \brief Get the compression kind of the file.
|
||||
///
|
||||
/// \return The kind of compression in the ORC file.
|
||||
Result<Compression::type> GetCompression();
|
||||
|
||||
/// \brief Get the buffer size for the compression.
|
||||
///
|
||||
/// \return Number of bytes to buffer for the compression codec.
|
||||
int64_t GetCompressionSize();
|
||||
|
||||
/// \brief Get the number of rows per an entry in the row index.
|
||||
/// \return the number of rows per an entry in the row index or 0 if there
|
||||
/// is no row index.
|
||||
int64_t GetRowIndexStride();
|
||||
|
||||
/// \brief Get ID of writer that generated the file.
|
||||
///
|
||||
/// \return UNKNOWN_WRITER if the writer ID is undefined
|
||||
WriterId GetWriterId();
|
||||
|
||||
/// \brief Get the writer id value when GetWriterId() returns an unknown writer.
|
||||
///
|
||||
/// \return the integer value of the writer ID.
|
||||
int32_t GetWriterIdValue();
|
||||
|
||||
/// \brief Get the version of the writer.
|
||||
///
|
||||
/// \return the version of the writer.
|
||||
|
||||
WriterVersion GetWriterVersion();
|
||||
|
||||
/// \brief Get the number of stripe statistics in the file.
|
||||
///
|
||||
/// \return the number of stripe statistics
|
||||
int64_t GetNumberOfStripeStatistics();
|
||||
|
||||
/// \brief Get the length of the data stripes in the file.
|
||||
///
|
||||
/// \return the number of bytes in stripes
|
||||
int64_t GetContentLength();
|
||||
|
||||
/// \brief Get the length of the file stripe statistics.
|
||||
///
|
||||
/// \return the number of compressed bytes in the file stripe statistics
|
||||
int64_t GetStripeStatisticsLength();
|
||||
|
||||
/// \brief Get the length of the file footer.
|
||||
///
|
||||
/// \return the number of compressed bytes in the file footer
|
||||
int64_t GetFileFooterLength();
|
||||
|
||||
/// \brief Get the length of the file postscript.
|
||||
///
|
||||
/// \return the number of bytes in the file postscript
|
||||
int64_t GetFilePostscriptLength();
|
||||
|
||||
/// \brief Get the total length of the file.
|
||||
///
|
||||
/// \return the number of bytes in the file
|
||||
int64_t GetFileLength();
|
||||
|
||||
/// \brief Get the serialized file tail.
|
||||
/// Useful if another reader of the same file wants to avoid re-reading
|
||||
/// the file tail. See ReadOptions.SetSerializedFileTail().
|
||||
///
|
||||
/// \return a string of bytes with the file tail
|
||||
std::string GetSerializedFileTail();
|
||||
|
||||
/// \brief Return the metadata read from the ORC file
|
||||
///
|
||||
/// \return A KeyValueMetadata object containing the ORC metadata
|
||||
Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata();
|
||||
|
||||
private:
|
||||
class Impl;
|
||||
std::unique_ptr<Impl> impl_;
|
||||
ORCFileReader();
|
||||
};
|
||||
|
||||
/// \class ORCFileWriter
|
||||
/// \brief Write an Arrow Table or RecordBatch to an ORC file.
|
||||
class ARROW_EXPORT ORCFileWriter {
|
||||
public:
|
||||
~ORCFileWriter();
|
||||
/// \brief Creates a new ORC writer.
|
||||
///
|
||||
/// \param[in] output_stream a pointer to the io::OutputStream to write into
|
||||
/// \param[in] write_options the ORC writer options for Arrow
|
||||
/// \return the returned writer object
|
||||
static Result<std::unique_ptr<ORCFileWriter>> Open(
|
||||
io::OutputStream* output_stream,
|
||||
const WriteOptions& write_options = WriteOptions());
|
||||
|
||||
/// \brief Write a table. This can be called multiple times.
|
||||
///
|
||||
/// Tables passed in subsequent calls must match the schema of the table that was
|
||||
/// written first.
|
||||
///
|
||||
/// \param[in] table the Arrow table from which data is extracted.
|
||||
/// \return Status
|
||||
Status Write(const Table& table);
|
||||
|
||||
/// \brief Write a RecordBatch. This can be called multiple times.
|
||||
///
|
||||
/// RecordBatches passed in subsequent calls must match the schema of the
|
||||
/// RecordBatch that was written first.
|
||||
///
|
||||
/// \param[in] record_batch the Arrow RecordBatch from which data is extracted.
|
||||
/// \return Status
|
||||
Status Write(const RecordBatch& record_batch);
|
||||
|
||||
/// \brief Close an ORC writer (orc::Writer)
|
||||
///
|
||||
/// \return Status
|
||||
Status Close();
|
||||
|
||||
private:
|
||||
class Impl;
|
||||
std::unique_ptr<Impl> impl_;
|
||||
|
||||
private:
|
||||
ORCFileWriter();
|
||||
};
|
||||
|
||||
} // namespace orc
|
||||
} // namespace adapters
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,120 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/io/interfaces.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace adapters {
|
||||
|
||||
namespace orc {
|
||||
|
||||
/// \brief Identifier of the writer that generated an ORC file.
///
/// The numeric values are spelled out explicitly; kUnknown is the largest
/// int32_t value.
enum class WriterId : int32_t {
  kOrcJava = 0,
  kOrcCpp = 1,
  kPresto = 2,
  kScritchleyGo = 3,
  kTrino = 4,
  // Sentinel used when the writer is not one of the ids listed above.
  kUnknown = INT32_MAX,
};
|
||||
|
||||
/// \brief Version of the writer that generated an ORC file.
///
/// Values are consecutive starting at 0 for kOriginal; kMax is the largest
/// int32_t value.
// NOTE(review): the kHive*/kOrc* names presumably refer to upstream issue
// numbers -- confirm against the ORC documentation.
enum class WriterVersion : int32_t {
  kOriginal = 0,
  kHive8732 = 1,
  kHive4243 = 2,
  kHive12055 = 3,
  kHive13083 = 4,
  kOrc101 = 5,
  kOrc135 = 6,
  kOrc517 = 7,
  kOrc203 = 8,
  kOrc14 = 9,
  kMax = INT32_MAX,
};
|
||||
|
||||
/// \brief Compression trade-off selector: speed versus size reduction.
enum class CompressionStrategy : int32_t {
  kSpeed = 0,
  kCompression = 1,
};
|
||||
|
||||
class ARROW_EXPORT FileVersion {
|
||||
private:
|
||||
int32_t major_version_;
|
||||
int32_t minor_version_;
|
||||
|
||||
public:
|
||||
static const FileVersion& v_0_11();
|
||||
static const FileVersion& v_0_12();
|
||||
|
||||
FileVersion(int32_t major, int32_t minor)
|
||||
: major_version_(major), minor_version_(minor) {}
|
||||
|
||||
/**
|
||||
* Get major version
|
||||
*/
|
||||
int32_t major_version() const { return this->major_version_; }
|
||||
|
||||
/**
|
||||
* Get minor version
|
||||
*/
|
||||
int32_t minor_version() const { return this->minor_version_; }
|
||||
|
||||
bool operator==(const FileVersion& right) const {
|
||||
return this->major_version() == right.major_version() &&
|
||||
this->minor_version() == right.minor_version();
|
||||
}
|
||||
|
||||
bool operator!=(const FileVersion& right) const { return !(*this == right); }
|
||||
|
||||
std::string ToString() const;
|
||||
};
|
||||
|
||||
/// Options for the ORC Writer
|
||||
struct ARROW_EXPORT WriteOptions {
|
||||
/// Number of rows the ORC writer writes at a time, default 1024
|
||||
int64_t batch_size = 1024;
|
||||
/// Which ORC file version to use, default FileVersion(0, 12)
|
||||
FileVersion file_version = FileVersion(0, 12);
|
||||
/// Size of each ORC stripe in bytes, default 64 MiB
|
||||
int64_t stripe_size = 64 * 1024 * 1024;
|
||||
/// The compression codec of the ORC file, there is no compression by default
|
||||
Compression::type compression = Compression::UNCOMPRESSED;
|
||||
/// The size of each compression block in bytes, default 64 KiB
|
||||
int64_t compression_block_size = 64 * 1024;
|
||||
/// The compression strategy i.e. speed vs size reduction, default
|
||||
/// CompressionStrategy::kSpeed
|
||||
CompressionStrategy compression_strategy = CompressionStrategy::kSpeed;
|
||||
/// The number of rows per an entry in the row index, default 10000
|
||||
int64_t row_index_stride = 10000;
|
||||
/// The padding tolerance, default 0.0
|
||||
double padding_tolerance = 0.0;
|
||||
/// The dictionary key size threshold. 0 to disable dictionary encoding.
|
||||
/// 1 to always enable dictionary encoding, default 0.0
|
||||
double dictionary_key_size_threshold = 0.0;
|
||||
/// The array of columns that use the bloom filter, default empty
|
||||
std::vector<int64_t> bloom_filter_columns;
|
||||
/// The upper limit of the false-positive rate of the bloom filter, default 0.05
|
||||
double bloom_filter_fpp = 0.05;
|
||||
};
|
||||
|
||||
} // namespace orc
|
||||
} // namespace adapters
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,128 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "tensorflow/core/framework/op.h"
|
||||
|
||||
#include "arrow/type.h"
|
||||
|
||||
// These utilities are supposed to be included in TensorFlow operators
|
||||
// that need to be compiled separately from Arrow because of ABI issues.
|
||||
// They therefore need to be header-only.
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace adapters {
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
/// \brief Map a TensorFlow data type to the corresponding Arrow data type.
///
/// Declared `inline` because this header is header-only and is meant to be
/// included by separately compiled TensorFlow operators: a non-inline
/// definition would be emitted by every translation unit that includes the
/// header and fail at link time with duplicate symbols.
///
/// \param[in] dtype the TensorFlow data type to convert
/// \param[out] out receives the Arrow type; not written when the conversion fails
/// \return Status::OK() on success, Status::TypeError when dtype is not one of
///         the boolean, floating point or integer types handled below
inline Status GetArrowType(::tensorflow::DataType dtype, std::shared_ptr<DataType>* out) {
  switch (dtype) {
    case ::tensorflow::DT_BOOL:
      *out = arrow::boolean();
      break;
    case ::tensorflow::DT_FLOAT:
      *out = arrow::float32();
      break;
    case ::tensorflow::DT_DOUBLE:
      *out = arrow::float64();
      break;
    case ::tensorflow::DT_HALF:
      *out = arrow::float16();
      break;
    case ::tensorflow::DT_INT8:
      *out = arrow::int8();
      break;
    case ::tensorflow::DT_INT16:
      *out = arrow::int16();
      break;
    case ::tensorflow::DT_INT32:
      *out = arrow::int32();
      break;
    case ::tensorflow::DT_INT64:
      *out = arrow::int64();
      break;
    case ::tensorflow::DT_UINT8:
      *out = arrow::uint8();
      break;
    case ::tensorflow::DT_UINT16:
      *out = arrow::uint16();
      break;
    case ::tensorflow::DT_UINT32:
      *out = arrow::uint32();
      break;
    case ::tensorflow::DT_UINT64:
      *out = arrow::uint64();
      break;
    default:
      return Status::TypeError("TensorFlow data type is not supported");
  }
  return Status::OK();
}
|
||||
|
||||
/// \brief Map an Arrow data type to the corresponding TensorFlow data type.
///
/// Declared `inline` because this header is header-only and is meant to be
/// included by separately compiled TensorFlow operators: a non-inline
/// definition would be emitted by every translation unit that includes the
/// header and fail at link time with duplicate symbols.
///
/// \param[in] dtype the Arrow data type to convert
/// \param[out] out receives the TensorFlow type; not written when the conversion
///             fails
/// \return Status::OK() on success, Status::Invalid when dtype is null,
///         Status::TypeError when dtype is not one of the boolean, floating
///         point or integer types handled below
inline Status GetTensorFlowType(std::shared_ptr<DataType> dtype,
                                ::tensorflow::DataType* out) {
  // Guard against a null pointer instead of dereferencing it in the switch.
  if (dtype == nullptr) {
    return Status::Invalid("Arrow data type is null");
  }
  switch (dtype->id()) {
    case Type::BOOL:
      *out = ::tensorflow::DT_BOOL;
      break;
    case Type::UINT8:
      *out = ::tensorflow::DT_UINT8;
      break;
    case Type::INT8:
      *out = ::tensorflow::DT_INT8;
      break;
    case Type::UINT16:
      *out = ::tensorflow::DT_UINT16;
      break;
    case Type::INT16:
      *out = ::tensorflow::DT_INT16;
      break;
    case Type::UINT32:
      *out = ::tensorflow::DT_UINT32;
      break;
    case Type::INT32:
      *out = ::tensorflow::DT_INT32;
      break;
    case Type::UINT64:
      *out = ::tensorflow::DT_UINT64;
      break;
    case Type::INT64:
      *out = ::tensorflow::DT_INT64;
      break;
    case Type::HALF_FLOAT:
      *out = ::tensorflow::DT_HALF;
      break;
    case Type::FLOAT:
      *out = ::tensorflow::DT_FLOAT;
      break;
    case Type::DOUBLE:
      *out = ::tensorflow::DT_DOUBLE;
      break;
    default:
      return Status::TypeError("Arrow data type is not supported");
  }
  return Status::OK();
}
|
||||
|
||||
} // namespace tensorflow
|
||||
|
||||
} // namespace adapters
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,47 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Coarse public API while the library is in development
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/array.h" // IWYU pragma: export
|
||||
#include "arrow/array/array_run_end.h" // IWYU pragma: export
|
||||
#include "arrow/array/concatenate.h" // IWYU pragma: export
|
||||
#include "arrow/buffer.h" // IWYU pragma: export
|
||||
#include "arrow/builder.h" // IWYU pragma: export
|
||||
#include "arrow/chunked_array.h" // IWYU pragma: export
|
||||
#include "arrow/compare.h" // IWYU pragma: export
|
||||
#include "arrow/config.h" // IWYU pragma: export
|
||||
#include "arrow/datum.h" // IWYU pragma: export
|
||||
#include "arrow/extension_type.h" // IWYU pragma: export
|
||||
#include "arrow/memory_pool.h" // IWYU pragma: export
|
||||
#include "arrow/pretty_print.h" // IWYU pragma: export
|
||||
#include "arrow/record_batch.h" // IWYU pragma: export
|
||||
#include "arrow/result.h" // IWYU pragma: export
|
||||
#include "arrow/status.h" // IWYU pragma: export
|
||||
#include "arrow/table.h" // IWYU pragma: export
|
||||
#include "arrow/table_builder.h" // IWYU pragma: export
|
||||
#include "arrow/tensor.h" // IWYU pragma: export
|
||||
#include "arrow/type.h" // IWYU pragma: export
|
||||
#include "arrow/util/key_value_metadata.h" // IWYU pragma: export
|
||||
#include "arrow/visit_array_inline.h" // IWYU pragma: export
|
||||
#include "arrow/visit_scalar_inline.h" // IWYU pragma: export
|
||||
#include "arrow/visitor.h" // IWYU pragma: export
|
||||
|
||||
/// \brief Top-level namespace for Apache Arrow C++ API
///
/// The declaration below is empty; it only gives this description a
/// declaration to attach to.
// NOTE(review): presumably for Doxygen -- confirm.
|
||||
namespace arrow {}
|
||||
@@ -0,0 +1,49 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Kitchen-sink public API for arrow::Array data structures. C++ library code
|
||||
// (especially header files) in Apache Arrow should use more specific headers
|
||||
// unless it's a file that uses most or all Array types in which case using
|
||||
// arrow/array.h is fine.
|
||||
|
||||
#pragma once
|
||||
|
||||
/// \defgroup numeric-arrays Concrete classes for numeric arrays
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup binary-arrays Concrete classes for binary/string arrays
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup nested-arrays Concrete classes for nested arrays
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup run-end-encoded-arrays Concrete classes for run-end encoded arrays
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
#include "arrow/array/array_base.h" // IWYU pragma: keep
|
||||
#include "arrow/array/array_binary.h" // IWYU pragma: keep
|
||||
#include "arrow/array/array_decimal.h" // IWYU pragma: keep
|
||||
#include "arrow/array/array_dict.h" // IWYU pragma: keep
|
||||
#include "arrow/array/array_nested.h" // IWYU pragma: keep
|
||||
#include "arrow/array/array_primitive.h" // IWYU pragma: keep
|
||||
#include "arrow/array/array_run_end.h" // IWYU pragma: keep
|
||||
#include "arrow/array/data.h" // IWYU pragma: keep
|
||||
#include "arrow/array/util.h" // IWYU pragma: keep
|
||||
@@ -0,0 +1,323 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <iosfwd>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/compare.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
#include "arrow/visitor.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// User array accessor types
|
||||
|
||||
/// \brief Array base type
|
||||
/// Immutable data array with some logical type and some length.
|
||||
///
|
||||
/// Any memory is owned by the respective Buffer instance (or its parents).
|
||||
///
|
||||
/// The base class is only required to have a null bitmap buffer if the null
|
||||
/// count is greater than 0
|
||||
///
|
||||
/// If known, the null count can be provided in the base Array constructor. If
|
||||
/// the null count is not known, pass -1 to indicate that the null count is to
|
||||
/// be computed on the first call to null_count()
|
||||
class ARROW_EXPORT Array {
|
||||
public:
|
||||
virtual ~Array() = default;
|
||||
|
||||
/// \brief Return true if value at index is null. Does not boundscheck
|
||||
bool IsNull(int64_t i) const { return !IsValid(i); }
|
||||
|
||||
/// \brief Return true if value at index is valid (not null). Does not
|
||||
/// boundscheck
|
||||
bool IsValid(int64_t i) const {
|
||||
if (null_bitmap_data_ != NULLPTR) {
|
||||
return bit_util::GetBit(null_bitmap_data_, i + data_->offset);
|
||||
}
|
||||
// Dispatching with a few conditionals like this makes IsNull more
|
||||
// efficient for how it is used in practice. Making IsNull virtual
|
||||
// would add a vtable lookup to every call and prevent inlining +
|
||||
// a potential inner-branch removal.
|
||||
if (type_id() == Type::SPARSE_UNION) {
|
||||
return !internal::IsNullSparseUnion(*data_, i);
|
||||
}
|
||||
if (type_id() == Type::DENSE_UNION) {
|
||||
return !internal::IsNullDenseUnion(*data_, i);
|
||||
}
|
||||
if (type_id() == Type::RUN_END_ENCODED) {
|
||||
return !internal::IsNullRunEndEncoded(*data_, i);
|
||||
}
|
||||
return data_->null_count != data_->length;
|
||||
}
|
||||
|
||||
/// \brief Return a Scalar containing the value of this array at i
|
||||
Result<std::shared_ptr<Scalar>> GetScalar(int64_t i) const;
|
||||
|
||||
/// Size in the number of elements this array contains.
|
||||
int64_t length() const { return data_->length; }
|
||||
|
||||
/// A relative position into another array's data, to enable zero-copy
|
||||
/// slicing. This value defaults to zero
|
||||
int64_t offset() const { return data_->offset; }
|
||||
|
||||
/// The number of null entries in the array. If the null count was not known
|
||||
/// at time of construction (and set to a negative value), then the null
|
||||
/// count will be computed and cached on the first invocation of this
|
||||
/// function
|
||||
int64_t null_count() const;
|
||||
|
||||
/// \brief Computes the logical null count for arrays of all types including
|
||||
/// those that do not have a validity bitmap like union and run-end encoded
|
||||
/// arrays
|
||||
///
|
||||
/// If the array has a validity bitmap, this function behaves the same as
|
||||
/// null_count(). For types that have no validity bitmap, this function will
|
||||
/// recompute the null count every time it is called.
|
||||
///
|
||||
/// \see GetNullCount
|
||||
int64_t ComputeLogicalNullCount() const;
|
||||
|
||||
const std::shared_ptr<DataType>& type() const { return data_->type; }
|
||||
Type::type type_id() const { return data_->type->id(); }
|
||||
|
||||
/// Buffer for the validity (null) bitmap, if any. Note that Union types
|
||||
/// never have a null bitmap.
|
||||
///
|
||||
/// Note that for `null_count == 0` or for null type, this will be null.
|
||||
/// This buffer does not account for any slice offset
|
||||
const std::shared_ptr<Buffer>& null_bitmap() const { return data_->buffers[0]; }
|
||||
|
||||
/// Raw pointer to the null bitmap.
|
||||
///
|
||||
/// Note that for `null_count == 0` or for null type, this will be null.
|
||||
/// This buffer does not account for any slice offset
|
||||
const uint8_t* null_bitmap_data() const { return null_bitmap_data_; }
|
||||
|
||||
/// Equality comparison with another array
|
||||
///
|
||||
/// Note that arrow::ArrayStatistics is not included in the comparison.
|
||||
bool Equals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
bool Equals(const std::shared_ptr<Array>& arr,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
|
||||
/// \brief Return the formatted unified diff of arrow::Diff between this
|
||||
/// Array and another Array
|
||||
std::string Diff(const Array& other) const;
|
||||
|
||||
/// Approximate equality comparison with another array
|
||||
///
|
||||
/// epsilon is only used if this is FloatArray or DoubleArray
|
||||
///
|
||||
/// Note that arrow::ArrayStatistics is not included in the comparison.
|
||||
bool ApproxEquals(const std::shared_ptr<Array>& arr,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
bool ApproxEquals(const Array& arr,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
|
||||
/// Compare if the range of slots specified are equal for the given array and
|
||||
/// this array. end_idx exclusive. This methods does not bounds check.
|
||||
///
|
||||
/// Note that arrow::ArrayStatistics is not included in the comparison.
|
||||
bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
|
||||
const Array& other,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
|
||||
const std::shared_ptr<Array>& other,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
|
||||
int64_t other_start_idx,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
bool RangeEquals(const std::shared_ptr<Array>& other, int64_t start_idx,
|
||||
int64_t end_idx, int64_t other_start_idx,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
|
||||
/// \brief Apply the ArrayVisitor::Visit() method specialized to the array type
|
||||
Status Accept(ArrayVisitor* visitor) const;
|
||||
|
||||
/// Construct a zero-copy view of this array with the given type.
|
||||
///
|
||||
/// This method checks if the types are layout-compatible.
|
||||
/// Nested types are traversed in depth-first order. Data buffers must have
|
||||
/// the same item sizes, even though the logical types may be different.
|
||||
/// An error is returned if the types are not layout-compatible.
|
||||
Result<std::shared_ptr<Array>> View(const std::shared_ptr<DataType>& type) const;
|
||||
|
||||
/// \brief Construct a copy of the array with all buffers on destination
|
||||
/// Memory Manager
|
||||
///
|
||||
/// This method recursively copies the array's buffers and those of its children
|
||||
/// onto the destination MemoryManager device and returns the new Array.
|
||||
Result<std::shared_ptr<Array>> CopyTo(const std::shared_ptr<MemoryManager>& to) const;
|
||||
|
||||
/// \brief Construct a new array attempting to zero-copy view if possible.
|
||||
///
|
||||
/// Like CopyTo this method recursively goes through all of the array's buffers
|
||||
/// and those of it's children and first attempts to create zero-copy
|
||||
/// views on the destination MemoryManager device. If it can't, it falls back
|
||||
/// to performing a copy. See Buffer::ViewOrCopy.
|
||||
Result<std::shared_ptr<Array>> ViewOrCopyTo(
|
||||
const std::shared_ptr<MemoryManager>& to) const;
|
||||
|
||||
/// Construct a zero-copy slice of the array with the indicated offset and
|
||||
/// length
|
||||
///
|
||||
/// \param[in] offset the position of the first element in the constructed
|
||||
/// slice
|
||||
/// \param[in] length the length of the slice. If there are not enough
|
||||
/// elements in the array, the length will be adjusted accordingly
|
||||
///
|
||||
/// \return a new object wrapped in std::shared_ptr<Array>
|
||||
std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const;
|
||||
|
||||
/// Slice from offset until end of the array
|
||||
std::shared_ptr<Array> Slice(int64_t offset) const;
|
||||
|
||||
/// Input-checking variant of Array::Slice
|
||||
Result<std::shared_ptr<Array>> SliceSafe(int64_t offset, int64_t length) const;
|
||||
/// Input-checking variant of Array::Slice
|
||||
Result<std::shared_ptr<Array>> SliceSafe(int64_t offset) const;
|
||||
|
||||
const std::shared_ptr<ArrayData>& data() const { return data_; }
|
||||
|
||||
int num_fields() const { return static_cast<int>(data_->child_data.size()); }
|
||||
|
||||
/// \return PrettyPrint representation of array suitable for debugging
|
||||
std::string ToString() const;
|
||||
|
||||
/// \brief Perform cheap validation checks to determine obvious inconsistencies
|
||||
/// within the array's internal data.
|
||||
///
|
||||
/// This is O(k) where k is the number of descendents.
|
||||
///
|
||||
/// \return Status
|
||||
Status Validate() const;
|
||||
|
||||
/// \brief Perform extensive validation checks to determine inconsistencies
|
||||
/// within the array's internal data.
|
||||
///
|
||||
/// This is potentially O(k*n) where k is the number of descendents and n
|
||||
/// is the array length.
|
||||
///
|
||||
/// \return Status
|
||||
Status ValidateFull() const;
|
||||
|
||||
/// \brief Return the device_type that this array's data is allocated on
|
||||
///
|
||||
/// This just delegates to calling device_type on the underlying ArrayData
|
||||
/// object which backs this Array.
|
||||
///
|
||||
/// \return DeviceAllocationType
|
||||
DeviceAllocationType device_type() const { return data_->device_type(); }
|
||||
|
||||
/// \brief Return the statistics of this Array
|
||||
///
|
||||
/// This just delegates to calling statistics on the underlying ArrayData
|
||||
/// object which backs this Array.
|
||||
///
|
||||
/// \return const std::shared_ptr<ArrayStatistics>&
|
||||
const std::shared_ptr<ArrayStatistics>& statistics() const { return data_->statistics; }
|
||||
|
||||
protected:
|
||||
Array() = default;
|
||||
ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);
|
||||
|
||||
std::shared_ptr<ArrayData> data_;
|
||||
const uint8_t* null_bitmap_data_ = NULLPTR;
|
||||
|
||||
/// Protected method for constructors
|
||||
void SetData(const std::shared_ptr<ArrayData>& data) {
|
||||
if (data->buffers.size() > 0) {
|
||||
null_bitmap_data_ = data->GetValuesSafe<uint8_t>(0, /*offset=*/0);
|
||||
} else {
|
||||
null_bitmap_data_ = NULLPTR;
|
||||
}
|
||||
data_ = data;
|
||||
}
|
||||
|
||||
private:
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(Array);
|
||||
};
|
||||
|
||||
ARROW_EXPORT void PrintTo(const Array& x, std::ostream* os);
|
||||
|
||||
/// Stream the ToString() representation of an Array into `os`.
static inline std::ostream& operator<<(std::ostream& os, const Array& x) {
  return os << x.ToString();
}
|
||||
|
||||
/// Base class for non-nested arrays
|
||||
class ARROW_EXPORT FlatArray : public Array {
|
||||
protected:
|
||||
using Array::Array;
|
||||
};
|
||||
|
||||
/// Base class for arrays of fixed-size logical types
|
||||
class ARROW_EXPORT PrimitiveArray : public FlatArray {
|
||||
public:
|
||||
/// Does not account for any slice offset
|
||||
const std::shared_ptr<Buffer>& values() const { return data_->buffers[1]; }
|
||||
|
||||
protected:
|
||||
PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t length,
|
||||
const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
PrimitiveArray() : raw_values_(NULLPTR) {}
|
||||
|
||||
void SetData(const std::shared_ptr<ArrayData>& data) {
|
||||
this->Array::SetData(data);
|
||||
raw_values_ = data->GetValuesSafe<uint8_t>(1, /*offset=*/0);
|
||||
}
|
||||
|
||||
explicit PrimitiveArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
|
||||
|
||||
const uint8_t* raw_values_;
|
||||
};
|
||||
|
||||
/// Degenerate null type Array
|
||||
class ARROW_EXPORT NullArray : public FlatArray {
|
||||
public:
|
||||
using TypeClass = NullType;
|
||||
|
||||
explicit NullArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
|
||||
explicit NullArray(int64_t length);
|
||||
|
||||
private:
|
||||
void SetData(const std::shared_ptr<ArrayData>& data) {
|
||||
null_bitmap_data_ = NULLPTR;
|
||||
data->null_count = data->length;
|
||||
data_ = data;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,321 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Array accessor classes for Binary, LargeBinary, String, LargeString,
|
||||
// FixedSizeBinary
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/stl_iterator.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup binary-arrays
|
||||
///
|
||||
/// @{
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Binary and String
|
||||
|
||||
/// Base class for variable-sized binary arrays, regardless of offset size
|
||||
/// and logical interpretation.
|
||||
template <typename TYPE>
|
||||
class BaseBinaryArray : public FlatArray {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename TypeClass::offset_type;
|
||||
using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>;
|
||||
|
||||
/// Return the pointer to the given elements bytes
|
||||
// XXX should GetValue(int64_t i) return a string_view?
|
||||
const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
|
||||
const offset_type pos = raw_value_offsets_[i];
|
||||
*out_length = raw_value_offsets_[i + 1] - pos;
|
||||
return raw_data_ + pos;
|
||||
}
|
||||
|
||||
/// \brief Get binary value as a string_view
|
||||
///
|
||||
/// \param i the value index
|
||||
/// \return the view over the selected value
|
||||
std::string_view GetView(int64_t i) const {
|
||||
const offset_type pos = raw_value_offsets_[i];
|
||||
return std::string_view(reinterpret_cast<const char*>(raw_data_ + pos),
|
||||
raw_value_offsets_[i + 1] - pos);
|
||||
}
|
||||
|
||||
std::optional<std::string_view> operator[](int64_t i) const {
|
||||
return *IteratorType(*this, i);
|
||||
}
|
||||
|
||||
/// \brief Get binary value as a string_view
|
||||
/// Provided for consistency with other arrays.
|
||||
///
|
||||
/// \param i the value index
|
||||
/// \return the view over the selected value
|
||||
std::string_view Value(int64_t i) const { return GetView(i); }
|
||||
|
||||
/// \brief Get binary value as a std::string
|
||||
///
|
||||
/// \param i the value index
|
||||
/// \return the value copied into a std::string
|
||||
std::string GetString(int64_t i) const { return std::string(GetView(i)); }
|
||||
|
||||
/// Note that this buffer does not account for any slice offset
|
||||
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
|
||||
|
||||
/// Note that this buffer does not account for any slice offset
|
||||
std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; }
|
||||
|
||||
const offset_type* raw_value_offsets() const { return raw_value_offsets_; }
|
||||
|
||||
const uint8_t* raw_data() const { return raw_data_; }
|
||||
|
||||
/// \brief Return the data buffer absolute offset of the data for the value
|
||||
/// at the passed index.
|
||||
///
|
||||
/// Does not perform boundschecking
|
||||
offset_type value_offset(int64_t i) const { return raw_value_offsets_[i]; }
|
||||
|
||||
/// \brief Return the length of the data for the value at the passed index.
|
||||
///
|
||||
/// Does not perform boundschecking
|
||||
offset_type value_length(int64_t i) const {
|
||||
return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
|
||||
}
|
||||
|
||||
/// \brief Return the total length of the memory in the data buffer
|
||||
/// referenced by this array. If the array has been sliced then this may be
|
||||
/// less than the size of the data buffer (data_->buffers[2]).
|
||||
offset_type total_values_length() const {
|
||||
if (data_->length > 0) {
|
||||
return raw_value_offsets_[data_->length] - raw_value_offsets_[0];
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
IteratorType begin() const { return IteratorType(*this); }
|
||||
|
||||
IteratorType end() const { return IteratorType(*this, length()); }
|
||||
|
||||
protected:
|
||||
// For subclasses
|
||||
BaseBinaryArray() = default;
|
||||
|
||||
// Protected method for constructors
|
||||
void SetData(const std::shared_ptr<ArrayData>& data) {
|
||||
this->Array::SetData(data);
|
||||
raw_value_offsets_ = data->GetValuesSafe<offset_type>(1);
|
||||
raw_data_ = data->GetValuesSafe<uint8_t>(2, /*offset=*/0);
|
||||
}
|
||||
|
||||
const offset_type* raw_value_offsets_ = NULLPTR;
|
||||
const uint8_t* raw_data_ = NULLPTR;
|
||||
};
|
||||
|
||||
/// Concrete Array class for variable-size binary data
|
||||
class ARROW_EXPORT BinaryArray : public BaseBinaryArray<BinaryType> {
|
||||
public:
|
||||
explicit BinaryArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
|
||||
const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
protected:
|
||||
// For subclasses such as StringArray
|
||||
BinaryArray() : BaseBinaryArray() {}
|
||||
};
|
||||
|
||||
/// Concrete Array class for variable-size string (utf-8) data
|
||||
class ARROW_EXPORT StringArray : public BinaryArray {
|
||||
public:
|
||||
using TypeClass = StringType;
|
||||
|
||||
explicit StringArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
|
||||
const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
/// \brief Validate that this array contains only valid UTF8 entries
|
||||
///
|
||||
/// This check is also implied by ValidateFull()
|
||||
Status ValidateUTF8() const;
|
||||
};
|
||||
|
||||
/// Concrete Array class for large variable-size binary data
|
||||
class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray<LargeBinaryType> {
|
||||
public:
|
||||
explicit LargeBinaryArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
LargeBinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
|
||||
const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
protected:
|
||||
// For subclasses such as LargeStringArray
|
||||
LargeBinaryArray() : BaseBinaryArray() {}
|
||||
};
|
||||
|
||||
/// Concrete Array class for large variable-size string (utf-8) data
|
||||
class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
|
||||
public:
|
||||
using TypeClass = LargeStringType;
|
||||
|
||||
explicit LargeStringArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
LargeStringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
|
||||
const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
/// \brief Validate that this array contains only valid UTF8 entries
|
||||
///
|
||||
/// This check is also implied by ValidateFull()
|
||||
Status ValidateUTF8() const;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// BinaryView and StringView
|
||||
|
||||
/// Concrete Array class for variable-size binary view data using the
|
||||
/// BinaryViewType::c_type struct to reference in-line or out-of-line string values
|
||||
class ARROW_EXPORT BinaryViewArray : public FlatArray {
|
||||
public:
|
||||
using TypeClass = BinaryViewType;
|
||||
using IteratorType = stl::ArrayIterator<BinaryViewArray>;
|
||||
using c_type = BinaryViewType::c_type;
|
||||
|
||||
explicit BinaryViewArray(std::shared_ptr<ArrayData> data);
|
||||
|
||||
BinaryViewArray(std::shared_ptr<DataType> type, int64_t length,
|
||||
std::shared_ptr<Buffer> views, BufferVector data_buffers,
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
// For API compatibility with BinaryArray etc.
|
||||
std::string_view GetView(int64_t i) const;
|
||||
std::string GetString(int64_t i) const { return std::string{GetView(i)}; }
|
||||
|
||||
const auto& values() const { return data_->buffers[1]; }
|
||||
const c_type* raw_values() const { return raw_values_; }
|
||||
|
||||
std::optional<std::string_view> operator[](int64_t i) const {
|
||||
return *IteratorType(*this, i);
|
||||
}
|
||||
|
||||
IteratorType begin() const { return IteratorType(*this); }
|
||||
IteratorType end() const { return IteratorType(*this, length()); }
|
||||
|
||||
protected:
|
||||
using FlatArray::FlatArray;
|
||||
|
||||
void SetData(std::shared_ptr<ArrayData> data) {
|
||||
FlatArray::SetData(std::move(data));
|
||||
raw_values_ = data_->GetValuesSafe<c_type>(1);
|
||||
}
|
||||
|
||||
const c_type* raw_values_;
|
||||
};
|
||||
|
||||
/// Concrete Array class for variable-size string view (utf-8) data using
|
||||
/// BinaryViewType::c_type to reference in-line or out-of-line string values
|
||||
class ARROW_EXPORT StringViewArray : public BinaryViewArray {
|
||||
public:
|
||||
using TypeClass = StringViewType;
|
||||
|
||||
explicit StringViewArray(std::shared_ptr<ArrayData> data);
|
||||
|
||||
using BinaryViewArray::BinaryViewArray;
|
||||
|
||||
/// \brief Validate that this array contains only valid UTF8 entries
|
||||
///
|
||||
/// This check is also implied by ValidateFull()
|
||||
Status ValidateUTF8() const;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Fixed width binary
|
||||
|
||||
/// Concrete Array class for fixed-size binary data
|
||||
class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
|
||||
public:
|
||||
using TypeClass = FixedSizeBinaryType;
|
||||
using IteratorType = stl::ArrayIterator<FixedSizeBinaryArray>;
|
||||
|
||||
explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
|
||||
const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
const uint8_t* GetValue(int64_t i) const { return values_ + i * byte_width_; }
|
||||
const uint8_t* Value(int64_t i) const { return GetValue(i); }
|
||||
|
||||
std::string_view GetView(int64_t i) const {
|
||||
return std::string_view(reinterpret_cast<const char*>(GetValue(i)), byte_width_);
|
||||
}
|
||||
|
||||
std::optional<std::string_view> operator[](int64_t i) const {
|
||||
return *IteratorType(*this, i);
|
||||
}
|
||||
|
||||
std::string GetString(int64_t i) const { return std::string(GetView(i)); }
|
||||
|
||||
int32_t byte_width() const { return byte_width_; }
|
||||
|
||||
const uint8_t* raw_values() const { return values_; }
|
||||
|
||||
IteratorType begin() const { return IteratorType(*this); }
|
||||
|
||||
IteratorType end() const { return IteratorType(*this, length()); }
|
||||
|
||||
protected:
|
||||
void SetData(const std::shared_ptr<ArrayData>& data) {
|
||||
this->PrimitiveArray::SetData(data);
|
||||
byte_width_ =
|
||||
internal::checked_cast<const FixedSizeBinaryType&>(*type()).byte_width();
|
||||
values_ = raw_values_ + data_->offset * byte_width_;
|
||||
}
|
||||
|
||||
const uint8_t* values_;
|
||||
int32_t byte_width_;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,104 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "arrow/array/array_binary.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup numeric-arrays
|
||||
///
|
||||
/// @{
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Decimal32Array
|
||||
|
||||
/// Concrete Array class for 32-bit decimal data
|
||||
class ARROW_EXPORT Decimal32Array : public FixedSizeBinaryArray {
|
||||
public:
|
||||
using TypeClass = Decimal32Type;
|
||||
|
||||
using FixedSizeBinaryArray::FixedSizeBinaryArray;
|
||||
|
||||
/// \brief Construct Decimal32Array from ArrayData instance
|
||||
explicit Decimal32Array(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
std::string FormatValue(int64_t i) const;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Decimal64Array
|
||||
|
||||
/// Concrete Array class for 64-bit decimal data
|
||||
class ARROW_EXPORT Decimal64Array : public FixedSizeBinaryArray {
|
||||
public:
|
||||
using TypeClass = Decimal64Type;
|
||||
|
||||
using FixedSizeBinaryArray::FixedSizeBinaryArray;
|
||||
|
||||
/// \brief Construct Decimal64Array from ArrayData instance
|
||||
explicit Decimal64Array(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
std::string FormatValue(int64_t i) const;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Decimal128Array
|
||||
|
||||
/// Concrete Array class for 128-bit decimal data
|
||||
class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray {
|
||||
public:
|
||||
using TypeClass = Decimal128Type;
|
||||
|
||||
using FixedSizeBinaryArray::FixedSizeBinaryArray;
|
||||
|
||||
/// \brief Construct Decimal128Array from ArrayData instance
|
||||
explicit Decimal128Array(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
std::string FormatValue(int64_t i) const;
|
||||
};
|
||||
|
||||
// Backward compatibility
|
||||
using DecimalArray = Decimal128Array;
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Decimal256Array
|
||||
|
||||
/// Concrete Array class for 256-bit decimal data
|
||||
class ARROW_EXPORT Decimal256Array : public FixedSizeBinaryArray {
|
||||
public:
|
||||
using TypeClass = Decimal256Type;
|
||||
|
||||
using FixedSizeBinaryArray::FixedSizeBinaryArray;
|
||||
|
||||
/// \brief Construct Decimal256Array from ArrayData instance
|
||||
explicit Decimal256Array(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
std::string FormatValue(int64_t i) const;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,182 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// DictionaryArray
|
||||
|
||||
/// \brief Array type for dictionary-encoded data with a
|
||||
/// data-dependent dictionary
|
||||
///
|
||||
/// A dictionary array contains an array of non-negative integers (the
|
||||
/// "dictionary indices") along with a data type containing a "dictionary"
|
||||
/// corresponding to the distinct values represented in the data.
|
||||
///
|
||||
/// For example, the array
|
||||
///
|
||||
/// ["foo", "bar", "foo", "bar", "foo", "bar"]
|
||||
///
|
||||
/// with dictionary ["bar", "foo"], would have dictionary array representation
|
||||
///
|
||||
/// indices: [1, 0, 1, 0, 1, 0]
|
||||
/// dictionary: ["bar", "foo"]
|
||||
///
|
||||
/// The indices in principle may be any integer type.
|
||||
class ARROW_EXPORT DictionaryArray : public Array {
|
||||
public:
|
||||
using TypeClass = DictionaryType;
|
||||
|
||||
explicit DictionaryArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
DictionaryArray(const std::shared_ptr<DataType>& type,
|
||||
const std::shared_ptr<Array>& indices,
|
||||
const std::shared_ptr<Array>& dictionary);
|
||||
|
||||
/// \brief Construct DictionaryArray from dictionary and indices
|
||||
/// array and validate
|
||||
///
|
||||
/// This function does the validation of the indices and input type. It checks if
|
||||
/// all indices are non-negative and smaller than the size of the dictionary.
|
||||
///
|
||||
/// \param[in] type a dictionary type
|
||||
/// \param[in] dictionary the dictionary with same value type as the
|
||||
/// type object
|
||||
/// \param[in] indices an array of non-negative integers smaller than the
|
||||
/// size of the dictionary
|
||||
static Result<std::shared_ptr<Array>> FromArrays(
|
||||
const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices,
|
||||
const std::shared_ptr<Array>& dictionary);
|
||||
|
||||
static Result<std::shared_ptr<Array>> FromArrays(
|
||||
const std::shared_ptr<Array>& indices, const std::shared_ptr<Array>& dictionary) {
|
||||
return FromArrays(::arrow::dictionary(indices->type(), dictionary->type()), indices,
|
||||
dictionary);
|
||||
}
|
||||
|
||||
/// \brief Transpose this DictionaryArray
|
||||
///
|
||||
/// This method constructs a new dictionary array with the given dictionary
|
||||
/// type, transposing indices using the transpose map. The type and the
|
||||
/// transpose map are typically computed using DictionaryUnifier.
|
||||
///
|
||||
/// \param[in] type the new type object
|
||||
/// \param[in] dictionary the new dictionary
|
||||
/// \param[in] transpose_map transposition array of this array's indices
|
||||
/// into the target array's indices
|
||||
/// \param[in] pool a pool to allocate the array data from
|
||||
Result<std::shared_ptr<Array>> Transpose(
|
||||
const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
|
||||
const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const;
|
||||
|
||||
Result<std::shared_ptr<Array>> Compact(MemoryPool* pool = default_memory_pool()) const;
|
||||
|
||||
/// \brief Determine whether dictionary arrays may be compared without unification
|
||||
bool CanCompareIndices(const DictionaryArray& other) const;
|
||||
|
||||
/// \brief Return the dictionary for this array, which is stored as
|
||||
/// a member of the ArrayData internal structure
|
||||
const std::shared_ptr<Array>& dictionary() const;
|
||||
const std::shared_ptr<Array>& indices() const;
|
||||
|
||||
/// \brief Return the ith value of indices, cast to int64_t. Not recommended
|
||||
/// for use in performance-sensitive code. Does not validate whether the
|
||||
/// value is null or out-of-bounds.
|
||||
int64_t GetValueIndex(int64_t i) const;
|
||||
|
||||
const DictionaryType* dict_type() const { return dict_type_; }
|
||||
|
||||
private:
|
||||
void SetData(const std::shared_ptr<ArrayData>& data);
|
||||
const DictionaryType* dict_type_;
|
||||
std::shared_ptr<Array> indices_;
|
||||
|
||||
// Lazily initialized when invoking dictionary()
|
||||
mutable std::shared_ptr<Array> dictionary_;
|
||||
};
|
||||
|
||||
/// \brief Helper class for incremental dictionary unification
|
||||
class ARROW_EXPORT DictionaryUnifier {
|
||||
public:
|
||||
virtual ~DictionaryUnifier() = default;
|
||||
|
||||
/// \brief Construct a DictionaryUnifier
|
||||
/// \param[in] value_type the data type of the dictionaries
|
||||
/// \param[in] pool MemoryPool to use for memory allocations
|
||||
static Result<std::unique_ptr<DictionaryUnifier>> Make(
|
||||
std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Unify dictionaries across array chunks
|
||||
///
|
||||
/// The dictionaries in the array chunks will be unified, their indices
|
||||
/// accordingly transposed.
|
||||
///
|
||||
/// Only dictionaries with a primitive value type are currently supported.
|
||||
/// However, dictionaries nested inside a more complex type are correctly unified.
|
||||
static Result<std::shared_ptr<ChunkedArray>> UnifyChunkedArray(
|
||||
const std::shared_ptr<ChunkedArray>& array,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Unify dictionaries across the chunks of each table column
|
||||
///
|
||||
/// The dictionaries in each table column will be unified, their indices
|
||||
/// accordingly transposed.
|
||||
///
|
||||
/// Only dictionaries with a primitive value type are currently supported.
|
||||
/// However, dictionaries nested inside a more complex type are correctly unified.
|
||||
static Result<std::shared_ptr<Table>> UnifyTable(
|
||||
const Table& table, MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Append dictionary to the internal memo
|
||||
virtual Status Unify(const Array& dictionary) = 0;
|
||||
|
||||
/// \brief Append dictionary and compute transpose indices
|
||||
/// \param[in] dictionary the dictionary values to unify
|
||||
/// \param[out] out_transpose a Buffer containing computed transpose indices
|
||||
/// as int32_t values equal in length to the passed dictionary. The value in
|
||||
/// each slot corresponds to the new index value for each original index
|
||||
/// for a DictionaryArray with the old dictionary
|
||||
virtual Status Unify(const Array& dictionary,
|
||||
std::shared_ptr<Buffer>* out_transpose) = 0;
|
||||
|
||||
/// \brief Return a result DictionaryType with the smallest possible index
|
||||
/// type to accommodate the unified dictionary. The unifier cannot be used
|
||||
/// after this is called
|
||||
virtual Status GetResult(std::shared_ptr<DataType>* out_type,
|
||||
std::shared_ptr<Array>* out_dict) = 0;
|
||||
|
||||
/// \brief Return a unified dictionary with the given index type. If
|
||||
/// the index type is not large enough then an invalid status will be returned.
|
||||
/// The unifier cannot be used after this is called
|
||||
virtual Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
|
||||
std::shared_ptr<Array>* out_dict) = 0;
|
||||
};
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,887 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Array accessor classes for List, LargeList, ListView, LargeListView, FixedSizeList,
|
||||
// Map, Struct, and Union
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup nested-arrays
|
||||
///
|
||||
/// @{
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// VarLengthListLikeArray
|
||||
|
||||
template <typename TYPE>
|
||||
class VarLengthListLikeArray;
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Private helper for [Large]List[View]Array::SetData.
|
||||
// Unfortunately, trying to define VarLengthListLikeArray::SetData outside of this header
|
||||
// doesn't play well with MSVC.
|
||||
template <typename TYPE>
|
||||
void SetListData(VarLengthListLikeArray<TYPE>* self,
|
||||
const std::shared_ptr<ArrayData>& data,
|
||||
Type::type expected_type_id = TYPE::type_id);
|
||||
|
||||
/// \brief A version of Flatten that keeps recursively flattening until an array of
|
||||
/// non-list values is reached.
|
||||
///
|
||||
/// Array types considered to be lists by this function:
|
||||
/// - list
|
||||
/// - large_list
|
||||
/// - list_view
|
||||
/// - large_list_view
|
||||
/// - fixed_size_list
|
||||
///
|
||||
/// \see ListArray::Flatten
|
||||
ARROW_EXPORT Result<std::shared_ptr<Array>> FlattenLogicalListRecursively(
|
||||
const Array& in_array, MemoryPool* memory_pool);
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// Base class for variable-sized list and list-view arrays, regardless of offset size.
|
||||
template <typename TYPE>
|
||||
class VarLengthListLikeArray : public Array {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename TypeClass::offset_type;
|
||||
|
||||
const TypeClass* var_length_list_like_type() const { return this->list_type_; }
|
||||
|
||||
/// \brief Return array object containing the list's values
|
||||
///
|
||||
/// Note that this buffer does not account for any slice offset or length.
|
||||
const std::shared_ptr<Array>& values() const { return values_; }
|
||||
|
||||
/// Note that this buffer does not account for any slice offset or length.
|
||||
const std::shared_ptr<Buffer>& value_offsets() const { return data_->buffers[1]; }
|
||||
|
||||
const std::shared_ptr<DataType>& value_type() const { return list_type_->value_type(); }
|
||||
|
||||
/// Return pointer to raw value offsets accounting for any slice offset
|
||||
const offset_type* raw_value_offsets() const { return raw_value_offsets_; }
|
||||
|
||||
// The following functions will not perform boundschecking
|
||||
|
||||
offset_type value_offset(int64_t i) const { return raw_value_offsets_[i]; }
|
||||
|
||||
/// \brief Return the size of the value at a particular index
|
||||
///
|
||||
/// Since non-empty null lists and list-views are possible, avoid calling this
|
||||
/// function when the list at slot i is null.
|
||||
///
|
||||
/// \pre IsValid(i)
|
||||
virtual offset_type value_length(int64_t i) const = 0;
|
||||
|
||||
/// \pre IsValid(i)
|
||||
std::shared_ptr<Array> value_slice(int64_t i) const {
|
||||
return values_->Slice(value_offset(i), value_length(i));
|
||||
}
|
||||
|
||||
/// \brief Flatten all level recursively until reach a non-list type, and return
|
||||
/// a non-list type Array.
|
||||
///
|
||||
/// \see internal::FlattenLogicalListRecursively
|
||||
Result<std::shared_ptr<Array>> FlattenRecursively(
|
||||
MemoryPool* memory_pool = default_memory_pool()) const {
|
||||
return internal::FlattenLogicalListRecursively(*this, memory_pool);
|
||||
}
|
||||
|
||||
protected:
|
||||
friend void internal::SetListData<TYPE>(VarLengthListLikeArray<TYPE>* self,
|
||||
const std::shared_ptr<ArrayData>& data,
|
||||
Type::type expected_type_id);
|
||||
|
||||
const TypeClass* list_type_ = NULLPTR;
|
||||
std::shared_ptr<Array> values_;
|
||||
const offset_type* raw_value_offsets_ = NULLPTR;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// ListArray / LargeListArray
|
||||
|
||||
template <typename TYPE>
|
||||
class BaseListArray : public VarLengthListLikeArray<TYPE> {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename TYPE::offset_type;
|
||||
|
||||
const TypeClass* list_type() const { return this->var_length_list_like_type(); }
|
||||
|
||||
/// \brief Return the size of the value at a particular index
|
||||
///
|
||||
/// Since non-empty null lists are possible, avoid calling this
|
||||
/// function when the list at slot i is null.
|
||||
///
|
||||
/// \pre IsValid(i)
|
||||
offset_type value_length(int64_t i) const final {
|
||||
return this->raw_value_offsets_[i + 1] - this->raw_value_offsets_[i];
|
||||
}
|
||||
};
|
||||
|
||||
/// Concrete Array class for list data (offsets are of int32 type)
|
||||
class ARROW_EXPORT ListArray : public BaseListArray<ListType> {
|
||||
public:
|
||||
/// \brief Construct a ListArray from existing ArrayData
explicit ListArray(std::shared_ptr<ArrayData> data);
|
||||
|
||||
/// \brief Construct a ListArray from its individual components
///
/// Takes the data type, the array length, the value offsets buffer, the child
/// values array and, optionally, a validity bitmap with its null count and an
/// array offset.
ListArray(std::shared_ptr<DataType> type, int64_t length,
|
||||
std::shared_ptr<Buffer> value_offsets, std::shared_ptr<Array> values,
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
/// \brief Construct ListArray from array of offsets and child value array
|
||||
///
|
||||
/// This function does the bare minimum of validation of the offsets and
|
||||
/// input types, and will allocate a new offsets array if necessary (i.e. if
|
||||
/// the offsets contain any nulls). If the offsets do not have nulls, they
|
||||
/// are assumed to be well-formed.
|
||||
///
|
||||
/// If a null_bitmap is not provided, the nulls will be inferred from the offsets'
|
||||
/// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls.
|
||||
///
|
||||
/// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an
|
||||
/// array with offset() > 0).
|
||||
///
|
||||
/// \param[in] offsets Array containing n + 1 offsets encoding length and
|
||||
/// size. Must be of int32 type
|
||||
/// \param[in] values Array containing list values
|
||||
/// \param[in] pool MemoryPool in case new offsets array needs to be
|
||||
/// allocated because of null values
|
||||
/// \param[in] null_bitmap Optional validity bitmap
|
||||
/// \param[in] null_count Optional null count in null_bitmap
|
||||
static Result<std::shared_ptr<ListArray>> FromArrays(
|
||||
const Array& offsets, const Array& values, MemoryPool* pool = default_memory_pool(),
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount);
|
||||
|
||||
/// \brief Overload of FromArrays() that additionally takes the data type
///
/// NOTE(review): presumably `type` becomes the type of the resulting array and
/// the remaining parameters behave as in the overload above; confirm against
/// the implementation.
static Result<std::shared_ptr<ListArray>> FromArrays(
|
||||
std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount);
|
||||
|
||||
/// \brief Build a ListArray from a ListViewArray
///
/// \param[in] source The list-view array to convert
/// \param[in] pool MemoryPool handed to the conversion
|
||||
static Result<std::shared_ptr<ListArray>> FromListView(const ListViewArray& source,
|
||||
MemoryPool* pool);
|
||||
|
||||
/// \brief Return an Array that is a concatenation of the lists in this array.
|
||||
///
|
||||
/// Note that it's different from `values()` in that it takes into
|
||||
/// consideration of this array's offsets as well as null elements backed
|
||||
/// by non-empty lists (they are skipped, thus copying may be needed).
|
||||
Result<std::shared_ptr<Array>> Flatten(
|
||||
MemoryPool* memory_pool = default_memory_pool()) const;
|
||||
|
||||
/// \brief Return list offsets as an Int32Array
|
||||
///
|
||||
/// The returned array will not have a validity bitmap, so you cannot expect
|
||||
/// to pass it to ListArray::FromArrays() and get back the same list array
|
||||
/// if the original one has nulls.
|
||||
std::shared_ptr<Array> offsets() const;
|
||||
|
||||
protected:
|
||||
// This constructor defers SetData to a derived array class
|
||||
ListArray() = default;
|
||||
|
||||
// Installs `data` as this array's data.
// NOTE(review): presumably also refreshes the cached type/values/offsets
// members of the base class; confirm in the implementation file.
void SetData(const std::shared_ptr<ArrayData>& data);
|
||||
};
|
||||
|
||||
/// Concrete Array class for large list data (with 64-bit offsets)
|
||||
class ARROW_EXPORT LargeListArray : public BaseListArray<LargeListType> {
|
||||
public:
|
||||
/// \brief Construct a LargeListArray from existing ArrayData
explicit LargeListArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
/// \brief Construct a LargeListArray from its individual components
///
/// Takes the data type, the array length, the value offsets buffer, the child
/// values array and, optionally, a validity bitmap with its null count and an
/// array offset.
LargeListArray(const std::shared_ptr<DataType>& type, int64_t length,
|
||||
const std::shared_ptr<Buffer>& value_offsets,
|
||||
const std::shared_ptr<Array>& values,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
/// \brief Construct LargeListArray from array of offsets and child value array
|
||||
///
|
||||
/// This function does the bare minimum of validation of the offsets and
|
||||
/// input types, and will allocate a new offsets array if necessary (i.e. if
|
||||
/// the offsets contain any nulls). If the offsets do not have nulls, they
|
||||
/// are assumed to be well-formed.
|
||||
///
|
||||
/// If a null_bitmap is not provided, the nulls will be inferred from the offsets'
|
||||
/// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls.
|
||||
///
|
||||
/// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an
|
||||
/// array with offset() > 0).
|
||||
///
|
||||
/// \param[in] offsets Array containing n + 1 offsets encoding length and
|
||||
/// size. Must be of int64 type
|
||||
/// \param[in] values Array containing list values
|
||||
/// \param[in] pool MemoryPool in case new offsets array needs to be
|
||||
/// allocated because of null values
|
||||
/// \param[in] null_bitmap Optional validity bitmap
|
||||
/// \param[in] null_count Optional null count in null_bitmap
|
||||
static Result<std::shared_ptr<LargeListArray>> FromArrays(
|
||||
const Array& offsets, const Array& values, MemoryPool* pool = default_memory_pool(),
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount);
|
||||
|
||||
/// \brief Overload of FromArrays() that additionally takes the data type
///
/// NOTE(review): presumably `type` becomes the type of the resulting array and
/// the remaining parameters behave as in the overload above; confirm against
/// the implementation.
static Result<std::shared_ptr<LargeListArray>> FromArrays(
|
||||
std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount);
|
||||
|
||||
/// \brief Build a LargeListArray from a LargeListViewArray
///
/// \param[in] source The large list-view array to convert
/// \param[in] pool MemoryPool handed to the conversion
|
||||
static Result<std::shared_ptr<LargeListArray>> FromListView(
|
||||
const LargeListViewArray& source, MemoryPool* pool);
|
||||
|
||||
/// \brief Return an Array that is a concatenation of the lists in this array.
|
||||
///
|
||||
/// Note that it's different from `values()` in that it takes into
|
||||
/// consideration of this array's offsets as well as null elements backed
|
||||
/// by non-empty lists (they are skipped, thus copying may be needed).
|
||||
Result<std::shared_ptr<Array>> Flatten(
|
||||
MemoryPool* memory_pool = default_memory_pool()) const;
|
||||
|
||||
/// \brief Return list offsets as an Int64Array
|
||||
std::shared_ptr<Array> offsets() const;
|
||||
|
||||
protected:
|
||||
// Installs `data` as this array's data.
// NOTE(review): presumably also refreshes the cached type/values/offsets
// members of the base class; confirm in the implementation file.
void SetData(const std::shared_ptr<ArrayData>& data);
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// ListViewArray / LargeListViewArray
|
||||
|
||||
template <typename TYPE>
|
||||
class BaseListViewArray : public VarLengthListLikeArray<TYPE> {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename TYPE::offset_type;
|
||||
|
||||
const TypeClass* list_view_type() const { return this->var_length_list_like_type(); }
|
||||
|
||||
/// \brief Note that this buffer does not account for any slice offset or length.
|
||||
const std::shared_ptr<Buffer>& value_sizes() const { return this->data_->buffers[2]; }
|
||||
|
||||
/// \brief Return pointer to raw value offsets accounting for any slice offset
|
||||
const offset_type* raw_value_sizes() const { return raw_value_sizes_; }
|
||||
|
||||
/// \brief Return the size of the value at a particular index
|
||||
///
|
||||
/// This should not be called if the list-view at slot i is null.
|
||||
/// The returned size in those cases could be any value from 0 to the
|
||||
/// length of the child values array.
|
||||
///
|
||||
/// \pre IsValid(i)
|
||||
offset_type value_length(int64_t i) const final { return this->raw_value_sizes_[i]; }
|
||||
|
||||
protected:
|
||||
const offset_type* raw_value_sizes_ = NULLPTR;
|
||||
};
|
||||
|
||||
/// \brief Concrete Array class for list-view data (with int32 offsets and sizes)
|
||||
class ARROW_EXPORT ListViewArray : public BaseListViewArray<ListViewType> {
|
||||
public:
|
||||
/// \brief Construct a ListViewArray from existing ArrayData
explicit ListViewArray(std::shared_ptr<ArrayData> data);
|
||||
|
||||
/// \brief Construct a ListViewArray from its individual components
///
/// Takes the data type, the array length, the value offsets and value sizes
/// buffers, the child values array and, optionally, a validity bitmap with its
/// null count and an array offset.
ListViewArray(std::shared_ptr<DataType> type, int64_t length,
|
||||
std::shared_ptr<Buffer> value_offsets,
|
||||
std::shared_ptr<Buffer> value_sizes, std::shared_ptr<Array> values,
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
/// \brief Construct ListViewArray from array of offsets, sizes, and child
|
||||
/// value array
|
||||
///
|
||||
/// Construct a ListViewArray using buffers from offsets and sizes arrays
|
||||
/// that project views into the child values array.
|
||||
///
|
||||
/// This function does the bare minimum of validation of the offsets/sizes and
|
||||
/// input types. The offset and length of the offsets and sizes arrays must
|
||||
/// match and that will be checked, but their contents will be assumed to be
|
||||
/// well-formed.
|
||||
///
|
||||
/// If a null_bitmap is not provided, the nulls will be inferred from the
|
||||
/// offsets' null bitmap. But if a null_bitmap is provided, the offsets array
|
||||
/// can't have nulls.
|
||||
///
|
||||
/// And when a null_bitmap is provided, neither the offsets nor the sizes array
|
||||
/// can be a slice (i.e. an array with offset() > 0).
|
||||
///
|
||||
/// \param[in] offsets An array of int32 offsets into the values array. NULL values are
|
||||
/// supported if the corresponding values in sizes is NULL or 0.
|
||||
/// \param[in] sizes An array containing the int32 sizes of every view. NULL values are
|
||||
/// taken to represent a NULL list-view in the array being created.
|
||||
/// \param[in] values Array containing list values
|
||||
/// \param[in] pool MemoryPool
|
||||
/// \param[in] null_bitmap Optional validity bitmap
|
||||
/// \param[in] null_count Optional null count in null_bitmap
|
||||
static Result<std::shared_ptr<ListViewArray>> FromArrays(
|
||||
const Array& offsets, const Array& sizes, const Array& values,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount);
|
||||
|
||||
/// \brief Overload of FromArrays() that additionally takes the data type
///
/// NOTE(review): presumably `type` becomes the type of the resulting array and
/// the remaining parameters behave as in the overload above; confirm against
/// the implementation.
static Result<std::shared_ptr<ListViewArray>> FromArrays(
|
||||
std::shared_ptr<DataType> type, const Array& offsets, const Array& sizes,
|
||||
const Array& values, MemoryPool* pool = default_memory_pool(),
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount);
|
||||
|
||||
/// \brief Build a ListViewArray from a ListArray
///
/// \param[in] list_array The list array to convert
/// \param[in] pool MemoryPool handed to the conversion
|
||||
static Result<std::shared_ptr<ListViewArray>> FromList(const ListArray& list_array,
|
||||
MemoryPool* pool);
|
||||
|
||||
/// \brief Return an Array that is a concatenation of the list-views in this array.
|
||||
///
|
||||
/// Note that it's different from `values()` in that it takes into
|
||||
/// consideration this array's offsets (which can be in any order)
|
||||
/// and sizes. Nulls are skipped.
|
||||
///
|
||||
/// This function invokes Concatenate() if list-views are non-contiguous. It
|
||||
/// will try to minimize the number of array slices passed to Concatenate() by
|
||||
/// maximizing the size of each slice (containing as many contiguous
|
||||
/// list-views as possible).
|
||||
Result<std::shared_ptr<Array>> Flatten(
|
||||
MemoryPool* memory_pool = default_memory_pool()) const;
|
||||
|
||||
/// \brief Return list-view offsets as an Int32Array
|
||||
///
|
||||
/// The returned array will not have a validity bitmap, so you cannot expect
|
||||
/// to pass it to ListViewArray::FromArrays() and get back the same list-view
|
||||
/// array if the original one has nulls.
|
||||
std::shared_ptr<Array> offsets() const;
|
||||
|
||||
/// \brief Return list-view sizes as an Int32Array
|
||||
///
|
||||
/// The returned array will not have a validity bitmap, so you cannot expect
|
||||
/// to pass it to ListViewArray::FromArrays() and get back the same list
|
||||
/// array if the original one has nulls.
|
||||
std::shared_ptr<Array> sizes() const;
|
||||
|
||||
protected:
|
||||
// This constructor defers SetData to a derived array class
|
||||
ListViewArray() = default;
|
||||
|
||||
// Installs `data` as this array's data.
// NOTE(review): presumably also refreshes the cached type/values/offsets/sizes
// members of the base classes; confirm in the implementation file.
void SetData(const std::shared_ptr<ArrayData>& data);
|
||||
};
|
||||
|
||||
/// \brief Concrete Array class for large list-view data (with 64-bit offsets
|
||||
/// and sizes)
|
||||
class ARROW_EXPORT LargeListViewArray : public BaseListViewArray<LargeListViewType> {
|
||||
public:
|
||||
/// \brief Construct a LargeListViewArray from existing ArrayData
explicit LargeListViewArray(std::shared_ptr<ArrayData> data);
|
||||
|
||||
/// \brief Construct a LargeListViewArray from its individual components
///
/// Takes the data type, the array length, the value offsets and value sizes
/// buffers, the child values array and, optionally, a validity bitmap with its
/// null count and an array offset.
LargeListViewArray(std::shared_ptr<DataType> type, int64_t length,
|
||||
std::shared_ptr<Buffer> value_offsets,
|
||||
std::shared_ptr<Buffer> value_sizes, std::shared_ptr<Array> values,
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
/// \brief Construct LargeListViewArray from array of offsets, sizes, and child
|
||||
/// value array
|
||||
///
|
||||
/// Construct a LargeListViewArray using buffers from offsets and sizes arrays
|
||||
/// that project views into the values array.
|
||||
///
|
||||
/// This function does the bare minimum of validation of the offsets/sizes and
|
||||
/// input types. The offset and length of the offsets and sizes arrays must
|
||||
/// match and that will be checked, but their contents will be assumed to be
|
||||
/// well-formed.
|
||||
///
|
||||
/// If a null_bitmap is not provided, the nulls will be inferred from the offsets' or
|
||||
/// sizes' null bitmap. Only one of these two is allowed to have a null bitmap. But if a
|
||||
/// null_bitmap is provided, the offsets array and the sizes array can't have nulls.
|
||||
///
|
||||
/// And when a null_bitmap is provided, neither the offsets nor the sizes array
|
||||
/// can be a slice (i.e. an array with offset() > 0).
|
||||
///
|
||||
/// \param[in] offsets An array of int64 offsets into the values array. NULL values are
|
||||
/// supported if the corresponding values in sizes is NULL or 0.
|
||||
/// \param[in] sizes An array containing the int64 sizes of every view. NULL values are
|
||||
/// taken to represent a NULL list-view in the array being created.
|
||||
/// \param[in] values Array containing list values
|
||||
/// \param[in] pool MemoryPool
|
||||
/// \param[in] null_bitmap Optional validity bitmap
|
||||
/// \param[in] null_count Optional null count in null_bitmap
|
||||
static Result<std::shared_ptr<LargeListViewArray>> FromArrays(
|
||||
const Array& offsets, const Array& sizes, const Array& values,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount);
|
||||
|
||||
/// \brief Overload of FromArrays() that additionally takes the data type
///
/// NOTE(review): presumably `type` becomes the type of the resulting array and
/// the remaining parameters behave as in the overload above; confirm against
/// the implementation.
static Result<std::shared_ptr<LargeListViewArray>> FromArrays(
|
||||
std::shared_ptr<DataType> type, const Array& offsets, const Array& sizes,
|
||||
const Array& values, MemoryPool* pool = default_memory_pool(),
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount);
|
||||
|
||||
/// \brief Build a LargeListViewArray from a LargeListArray
///
/// \param[in] list_array The large list array to convert
/// \param[in] pool MemoryPool handed to the conversion
|
||||
static Result<std::shared_ptr<LargeListViewArray>> FromList(
|
||||
const LargeListArray& list_array, MemoryPool* pool);
|
||||
|
||||
/// \brief Return an Array that is a concatenation of the large list-views in this
|
||||
/// array.
|
||||
///
|
||||
/// Note that it's different from `values()` in that it takes into
|
||||
/// consideration this array's offsets (which can be in any order)
|
||||
/// and sizes. Nulls are skipped.
|
||||
Result<std::shared_ptr<Array>> Flatten(
|
||||
MemoryPool* memory_pool = default_memory_pool()) const;
|
||||
|
||||
/// \brief Return list-view offsets as an Int64Array
|
||||
///
|
||||
/// The returned array will not have a validity bitmap, so you cannot expect
|
||||
/// to pass it to LargeListViewArray::FromArrays() and get back the same
|
||||
/// list-view array if the original one has nulls.
|
||||
std::shared_ptr<Array> offsets() const;
|
||||
|
||||
/// \brief Return list-view sizes as an Int64Array
|
||||
///
|
||||
/// The returned array will not have a validity bitmap, so you cannot expect
|
||||
/// to pass it to LargeListViewArray::FromArrays() and get back the same list
|
||||
/// array if the original one has nulls.
|
||||
std::shared_ptr<Array> sizes() const;
|
||||
|
||||
protected:
|
||||
// This constructor defers SetData to a derived array class
|
||||
LargeListViewArray() = default;
|
||||
|
||||
// Installs `data` as this array's data.
// NOTE(review): presumably also refreshes the cached type/values/offsets/sizes
// members of the base classes; confirm in the implementation file.
void SetData(const std::shared_ptr<ArrayData>& data);
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// MapArray
|
||||
|
||||
/// Concrete Array class for map data
|
||||
///
|
||||
/// NB: "value" in this context refers to a pair of a key and the corresponding item
|
||||
class ARROW_EXPORT MapArray : public ListArray {
|
||||
public:
|
||||
using TypeClass = MapType;
|
||||
|
||||
explicit MapArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
MapArray(const std::shared_ptr<DataType>& type, int64_t length,
|
||||
const std::shared_ptr<Buffer>& value_offsets,
|
||||
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
MapArray(const std::shared_ptr<DataType>& type, int64_t length, BufferVector buffers,
|
||||
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
MapArray(const std::shared_ptr<DataType>& type, int64_t length,
|
||||
const std::shared_ptr<Buffer>& value_offsets,
|
||||
const std::shared_ptr<Array>& values,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
/// \brief Construct MapArray from array of offsets and child key, item arrays
|
||||
///
|
||||
/// This function does the bare minimum of validation of the offsets and
|
||||
/// input types, and will allocate a new offsets array if necessary (i.e. if
|
||||
/// the offsets contain any nulls). If the offsets do not have nulls, they
|
||||
/// are assumed to be well-formed
|
||||
///
|
||||
/// \param[in] offsets Array containing n + 1 offsets encoding length and
|
||||
/// size. Must be of int32 type
|
||||
/// \param[in] keys Array containing key values
|
||||
/// \param[in] items Array containing item values
|
||||
/// \param[in] pool MemoryPool in case new offsets array needs to be
|
||||
/// \param[in] null_bitmap Optional validity bitmap
|
||||
/// allocated because of null values
|
||||
static Result<std::shared_ptr<Array>> FromArrays(
|
||||
const std::shared_ptr<Array>& offsets, const std::shared_ptr<Array>& keys,
|
||||
const std::shared_ptr<Array>& items, MemoryPool* pool = default_memory_pool(),
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR);
|
||||
|
||||
static Result<std::shared_ptr<Array>> FromArrays(
|
||||
std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
|
||||
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR);
|
||||
|
||||
const MapType* map_type() const { return map_type_; }
|
||||
|
||||
/// \brief Return array object containing all map keys
|
||||
const std::shared_ptr<Array>& keys() const { return keys_; }
|
||||
|
||||
/// \brief Return array object containing all mapped items
|
||||
const std::shared_ptr<Array>& items() const { return items_; }
|
||||
|
||||
/// Validate child data before constructing the actual MapArray.
|
||||
static Status ValidateChildData(
|
||||
const std::vector<std::shared_ptr<ArrayData>>& child_data);
|
||||
|
||||
protected:
|
||||
void SetData(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
static Result<std::shared_ptr<Array>> FromArraysInternal(
|
||||
std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
|
||||
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
|
||||
MemoryPool* pool, std::shared_ptr<Buffer> null_bitmap = NULLPTR);
|
||||
|
||||
private:
|
||||
const MapType* map_type_;
|
||||
std::shared_ptr<Array> keys_, items_;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// FixedSizeListArray
|
||||
|
||||
/// Concrete Array class for fixed size list data
|
||||
class ARROW_EXPORT FixedSizeListArray : public Array {
|
||||
public:
|
||||
using TypeClass = FixedSizeListType;
|
||||
using offset_type = TypeClass::offset_type;
|
||||
|
||||
explicit FixedSizeListArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
FixedSizeListArray(const std::shared_ptr<DataType>& type, int64_t length,
|
||||
const std::shared_ptr<Array>& values,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
const FixedSizeListType* list_type() const;
|
||||
|
||||
/// \brief Return array object containing the list's values
|
||||
const std::shared_ptr<Array>& values() const;
|
||||
|
||||
const std::shared_ptr<DataType>& value_type() const;
|
||||
|
||||
// The following functions will not perform boundschecking
|
||||
int64_t value_offset(int64_t i) const {
|
||||
i += data_->offset;
|
||||
return list_size_ * i;
|
||||
}
|
||||
/// \brief Return the fixed-size of the values
|
||||
///
|
||||
/// No matter the value of the index parameter, the result is the same.
|
||||
/// So even when the value at slot i is null, this function will return a
|
||||
/// non-zero size.
|
||||
///
|
||||
/// \pre IsValid(i)
|
||||
int32_t value_length(int64_t i = 0) const {
|
||||
ARROW_UNUSED(i);
|
||||
return list_size_;
|
||||
}
|
||||
/// \pre IsValid(i)
|
||||
std::shared_ptr<Array> value_slice(int64_t i) const {
|
||||
return values_->Slice(value_offset(i), value_length(i));
|
||||
}
|
||||
|
||||
/// \brief Return an Array that is a concatenation of the lists in this array.
|
||||
///
|
||||
/// Note that it's different from `values()` in that it takes into
|
||||
/// consideration null elements (they are skipped, thus copying may be needed).
|
||||
Result<std::shared_ptr<Array>> Flatten(
|
||||
MemoryPool* memory_pool = default_memory_pool()) const;
|
||||
|
||||
/// \brief Flatten all level recursively until reach a non-list type, and return
|
||||
/// a non-list type Array.
|
||||
///
|
||||
/// \see internal::FlattenLogicalListRecursively
|
||||
Result<std::shared_ptr<Array>> FlattenRecursively(
|
||||
MemoryPool* memory_pool = default_memory_pool()) const {
|
||||
return internal::FlattenLogicalListRecursively(*this, memory_pool);
|
||||
}
|
||||
|
||||
/// \brief Construct FixedSizeListArray from child value array and value_length
|
||||
///
|
||||
/// \param[in] values Array containing list values
|
||||
/// \param[in] list_size The fixed length of each list
|
||||
/// \param[in] null_bitmap Optional validity bitmap
|
||||
/// \param[in] null_count Optional null count in null_bitmap
|
||||
/// \return Will have length equal to values.length() / list_size
|
||||
static Result<std::shared_ptr<Array>> FromArrays(
|
||||
const std::shared_ptr<Array>& values, int32_t list_size,
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount);
|
||||
|
||||
/// \brief Construct FixedSizeListArray from child value array and type
|
||||
///
|
||||
/// \param[in] values Array containing list values
|
||||
/// \param[in] type The fixed sized list type
|
||||
/// \param[in] null_bitmap Optional validity bitmap
|
||||
/// \param[in] null_count Optional null count in null_bitmap
|
||||
/// \return Will have length equal to values.length() / type.list_size()
|
||||
static Result<std::shared_ptr<Array>> FromArrays(
|
||||
const std::shared_ptr<Array>& values, std::shared_ptr<DataType> type,
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount);
|
||||
|
||||
protected:
|
||||
void SetData(const std::shared_ptr<ArrayData>& data);
|
||||
int32_t list_size_;
|
||||
|
||||
private:
|
||||
std::shared_ptr<Array> values_;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Struct
|
||||
|
||||
/// Concrete Array class for struct data
|
||||
class ARROW_EXPORT StructArray : public Array {
|
||||
public:
|
||||
using TypeClass = StructType;
|
||||
|
||||
/// \brief Construct a StructArray from existing ArrayData
explicit StructArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
/// \brief Construct a StructArray from its data type, length and child arrays,
/// with an optional validity bitmap, null count and offset
StructArray(const std::shared_ptr<DataType>& type, int64_t length,
|
||||
const std::vector<std::shared_ptr<Array>>& children,
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
/// \brief Return a StructArray from child arrays and field names.
|
||||
///
|
||||
/// The length and data type are automatically inferred from the arguments.
|
||||
/// There should be at least one child array.
|
||||
static Result<std::shared_ptr<StructArray>> Make(
|
||||
const ArrayVector& children, const std::vector<std::string>& field_names,
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
/// \brief Return a StructArray from child arrays and fields.
|
||||
///
|
||||
/// The length is automatically inferred from the arguments.
|
||||
/// There should be at least one child array. This method does not
|
||||
/// check that field types and child array types are consistent.
|
||||
static Result<std::shared_ptr<StructArray>> Make(
|
||||
const ArrayVector& children, const FieldVector& fields,
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
/// \brief Return the struct type of this array
const StructType* struct_type() const;
|
||||
|
||||
// Return the child array at position `pos`.
//
// Return a shared pointer in case the requestor desires to share ownership
|
||||
// with this array. The returned array has its offset, length and null
|
||||
// count adjusted.
|
||||
const std::shared_ptr<Array>& field(int pos) const;
|
||||
|
||||
/// \brief Return all child arrays as a vector
///
/// NOTE(review): presumably each entry is the same adjusted array that
/// field(pos) returns; confirm in the implementation file.
const ArrayVector& fields() const;
|
||||
|
||||
/// Returns null if name not found
|
||||
std::shared_ptr<Array> GetFieldByName(const std::string& name) const;
|
||||
|
||||
/// Indicate if field named `name` can be found unambiguously in the struct.
|
||||
Status CanReferenceFieldByName(const std::string& name) const;
|
||||
|
||||
/// Indicate if fields named `names` can be found unambiguously in the struct.
|
||||
Status CanReferenceFieldsByNames(const std::vector<std::string>& names) const;
|
||||
|
||||
/// \brief Flatten this array as a vector of arrays, one for each field
|
||||
///
|
||||
/// \param[in] pool The pool to allocate null bitmaps from, if necessary
|
||||
Result<ArrayVector> Flatten(MemoryPool* pool = default_memory_pool()) const;
|
||||
|
||||
/// \brief Get one of the child arrays, combining its null bitmap
|
||||
/// with the parent struct array's bitmap.
|
||||
///
|
||||
/// \param[in] index Which child array to get
|
||||
/// \param[in] pool The pool to allocate null bitmaps from, if necessary
|
||||
Result<std::shared_ptr<Array>> GetFlattenedField(
|
||||
int index, MemoryPool* pool = default_memory_pool()) const;
|
||||
|
||||
private:
|
||||
// For caching boxed child data
|
||||
// XXX This is not handled in a thread-safe manner.
|
||||
// Declared mutable so that const member functions can fill the cache.
mutable ArrayVector boxed_fields_;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Union
|
||||
|
||||
/// Base class for SparseUnionArray and DenseUnionArray
|
||||
class ARROW_EXPORT UnionArray : public Array {
|
||||
public:
|
||||
using type_code_t = int8_t;
|
||||
|
||||
/// Note that this buffer does not account for any slice offset
|
||||
const std::shared_ptr<Buffer>& type_codes() const { return data_->buffers[1]; }
|
||||
|
||||
const type_code_t* raw_type_codes() const { return raw_type_codes_; }
|
||||
|
||||
/// The logical type code of the value at index.
|
||||
type_code_t type_code(int64_t i) const { return raw_type_codes_[i]; }
|
||||
|
||||
/// The physical child id containing value at index.
|
||||
int child_id(int64_t i) const { return union_type_->child_ids()[raw_type_codes_[i]]; }
|
||||
|
||||
const UnionType* union_type() const { return union_type_; }
|
||||
|
||||
UnionMode::type mode() const { return union_type_->mode(); }
|
||||
|
||||
/// \brief Return the given field as an individual array.
|
||||
///
|
||||
/// For sparse unions, the returned array has its offset, length and null
|
||||
/// count adjusted.
|
||||
std::shared_ptr<Array> field(int pos) const;
|
||||
|
||||
protected:
|
||||
void SetData(std::shared_ptr<ArrayData> data);
|
||||
|
||||
const type_code_t* raw_type_codes_;
|
||||
const UnionType* union_type_;
|
||||
|
||||
// For caching boxed child data
|
||||
mutable std::vector<std::shared_ptr<Array>> boxed_fields_;
|
||||
};
|
||||
|
||||
/// Concrete Array class for sparse union data
|
||||
class ARROW_EXPORT SparseUnionArray : public UnionArray {
|
||||
public:
|
||||
using TypeClass = SparseUnionType;
|
||||
|
||||
explicit SparseUnionArray(std::shared_ptr<ArrayData> data);
|
||||
|
||||
SparseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
|
||||
std::shared_ptr<Buffer> type_ids, int64_t offset = 0);
|
||||
|
||||
/// \brief Construct SparseUnionArray from type_ids and children
|
||||
///
|
||||
/// This function does the bare minimum of validation of the input types.
|
||||
///
|
||||
/// \param[in] type_ids An array of logical type ids for the union type
|
||||
/// \param[in] children Vector of children Arrays containing the data for each type.
|
||||
/// \param[in] type_codes Vector of type codes.
|
||||
static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
|
||||
std::vector<type_code_t> type_codes) {
|
||||
return Make(std::move(type_ids), std::move(children), std::vector<std::string>{},
|
||||
std::move(type_codes));
|
||||
}
|
||||
|
||||
/// \brief Construct SparseUnionArray with custom field names from type_ids and children
|
||||
///
|
||||
/// This function does the bare minimum of validation of the input types.
|
||||
///
|
||||
/// \param[in] type_ids An array of logical type ids for the union type
|
||||
/// \param[in] children Vector of children Arrays containing the data for each type.
|
||||
/// \param[in] field_names Vector of strings containing the name of each field.
|
||||
/// \param[in] type_codes Vector of type codes.
|
||||
static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
|
||||
std::vector<std::string> field_names = {},
|
||||
std::vector<type_code_t> type_codes = {});
|
||||
|
||||
const SparseUnionType* union_type() const {
|
||||
return internal::checked_cast<const SparseUnionType*>(union_type_);
|
||||
}
|
||||
|
||||
/// \brief Get one of the child arrays, adjusting its null bitmap
|
||||
/// where the union array type code does not match.
|
||||
///
|
||||
/// \param[in] index Which child array to get (i.e. the physical index, not the type
|
||||
/// code) \param[in] pool The pool to allocate null bitmaps from, if necessary
|
||||
Result<std::shared_ptr<Array>> GetFlattenedField(
|
||||
int index, MemoryPool* pool = default_memory_pool()) const;
|
||||
|
||||
protected:
|
||||
void SetData(std::shared_ptr<ArrayData> data);
|
||||
};
|
||||
|
||||
/// \brief Concrete Array class for dense union data
|
||||
///
|
||||
/// Note that union types do not have a validity bitmap
|
||||
class ARROW_EXPORT DenseUnionArray : public UnionArray {
|
||||
public:
|
||||
using TypeClass = DenseUnionType;
|
||||
|
||||
explicit DenseUnionArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
DenseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
|
||||
std::shared_ptr<Buffer> type_ids,
|
||||
std::shared_ptr<Buffer> value_offsets = NULLPTR, int64_t offset = 0);
|
||||
|
||||
/// \brief Construct DenseUnionArray from type_ids, value_offsets, and children
|
||||
///
|
||||
/// This function does the bare minimum of validation of the offsets and
|
||||
/// input types.
|
||||
///
|
||||
/// \param[in] type_ids An array of logical type ids for the union type
|
||||
/// \param[in] value_offsets An array of signed int32 values indicating the
|
||||
/// relative offset into the respective child array for the type in a given slot.
|
||||
/// The respective offsets for each child value array must be in order / increasing.
|
||||
/// \param[in] children Vector of children Arrays containing the data for each type.
|
||||
/// \param[in] type_codes Vector of type codes.
|
||||
static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
|
||||
const Array& value_offsets,
|
||||
ArrayVector children,
|
||||
std::vector<type_code_t> type_codes) {
|
||||
return Make(type_ids, value_offsets, std::move(children), std::vector<std::string>{},
|
||||
std::move(type_codes));
|
||||
}
|
||||
|
||||
/// \brief Construct DenseUnionArray with custom field names from type_ids,
|
||||
/// value_offsets, and children
|
||||
///
|
||||
/// This function does the bare minimum of validation of the offsets and
|
||||
/// input types.
|
||||
///
|
||||
/// \param[in] type_ids An array of logical type ids for the union type
|
||||
/// \param[in] value_offsets An array of signed int32 values indicating the
|
||||
/// relative offset into the respective child array for the type in a given slot.
|
||||
/// The respective offsets for each child value array must be in order / increasing.
|
||||
/// \param[in] children Vector of children Arrays containing the data for each type.
|
||||
/// \param[in] field_names Vector of strings containing the name of each field.
|
||||
/// \param[in] type_codes Vector of type codes.
|
||||
static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
|
||||
const Array& value_offsets,
|
||||
ArrayVector children,
|
||||
std::vector<std::string> field_names = {},
|
||||
std::vector<type_code_t> type_codes = {});
|
||||
|
||||
const DenseUnionType* union_type() const {
|
||||
return internal::checked_cast<const DenseUnionType*>(union_type_);
|
||||
}
|
||||
|
||||
/// Note that this buffer does not account for any slice offset
|
||||
const std::shared_ptr<Buffer>& value_offsets() const { return data_->buffers[2]; }
|
||||
|
||||
int32_t value_offset(int64_t i) const { return raw_value_offsets_[i]; }
|
||||
|
||||
const int32_t* raw_value_offsets() const { return raw_value_offsets_; }
|
||||
|
||||
protected:
|
||||
const int32_t* raw_value_offsets_;
|
||||
|
||||
void SetData(const std::shared_ptr<ArrayData>& data);
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,220 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Array accessor types for primitive/C-type-based arrays, such as numbers,
|
||||
// boolean, and temporal types.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/stl_iterator.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_fwd.h" // IWYU pragma: export
|
||||
#include "arrow/type_traits.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// Concrete Array class for boolean data
|
||||
class ARROW_EXPORT BooleanArray : public PrimitiveArray {
|
||||
public:
|
||||
using TypeClass = BooleanType;
|
||||
using IteratorType = stl::ArrayIterator<BooleanArray>;
|
||||
|
||||
explicit BooleanArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
bool Value(int64_t i) const {
|
||||
return bit_util::GetBit(reinterpret_cast<const uint8_t*>(raw_values_),
|
||||
i + data_->offset);
|
||||
}
|
||||
|
||||
bool GetView(int64_t i) const { return Value(i); }
|
||||
|
||||
std::optional<bool> operator[](int64_t i) const { return *IteratorType(*this, i); }
|
||||
|
||||
/// \brief Return the number of false (0) values among the valid
|
||||
/// values. Result is not cached.
|
||||
int64_t false_count() const;
|
||||
|
||||
/// \brief Return the number of true (1) values among the valid
|
||||
/// values. Result is not cached.
|
||||
int64_t true_count() const;
|
||||
|
||||
IteratorType begin() const { return IteratorType(*this); }
|
||||
|
||||
IteratorType end() const { return IteratorType(*this, length()); }
|
||||
|
||||
protected:
|
||||
using PrimitiveArray::PrimitiveArray;
|
||||
};
|
||||
|
||||
/// \addtogroup numeric-arrays
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Concrete Array class for numeric data with a corresponding C type
|
||||
///
|
||||
/// This class is templated on the corresponding DataType subclass for the
|
||||
/// given data, for example NumericArray<Int8Type> or NumericArray<Date32Type>.
|
||||
///
|
||||
/// Note that convenience aliases are available for all accepted types
|
||||
/// (for example Int8Array for NumericArray<Int8Type>).
|
||||
template <typename TYPE>
|
||||
class NumericArray : public PrimitiveArray {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using value_type = typename TypeClass::c_type;
|
||||
using IteratorType = stl::ArrayIterator<NumericArray<TYPE>>;
|
||||
|
||||
explicit NumericArray(const std::shared_ptr<ArrayData>& data) {
|
||||
NumericArray::SetData(data);
|
||||
}
|
||||
|
||||
// Only enable this constructor without a type argument for types without additional
|
||||
// metadata
|
||||
template <typename T1 = TYPE>
|
||||
NumericArray(enable_if_parameter_free<T1, int64_t> length,
|
||||
const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0) {
|
||||
NumericArray::SetData(ArrayData::Make(TypeTraits<T1>::type_singleton(), length,
|
||||
{null_bitmap, data}, null_count, offset));
|
||||
}
|
||||
|
||||
NumericArray(std::shared_ptr<DataType> type, int64_t length,
|
||||
const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0) {
|
||||
NumericArray::SetData(ArrayData::Make(std::move(type), length, {null_bitmap, data},
|
||||
null_count, offset));
|
||||
}
|
||||
|
||||
const value_type* raw_values() const { return values_; }
|
||||
|
||||
value_type Value(int64_t i) const { return values_[i]; }
|
||||
|
||||
// For API compatibility with BinaryArray etc.
|
||||
value_type GetView(int64_t i) const { return values_[i]; }
|
||||
|
||||
std::optional<value_type> operator[](int64_t i) const {
|
||||
return *IteratorType(*this, i);
|
||||
}
|
||||
|
||||
IteratorType begin() const { return IteratorType(*this); }
|
||||
|
||||
IteratorType end() const { return IteratorType(*this, length()); }
|
||||
|
||||
protected:
|
||||
NumericArray() : values_(NULLPTR) {}
|
||||
|
||||
void SetData(const std::shared_ptr<ArrayData>& data) {
|
||||
this->PrimitiveArray::SetData(data);
|
||||
values_ = raw_values_
|
||||
? (reinterpret_cast<const value_type*>(raw_values_) + data_->offset)
|
||||
: NULLPTR;
|
||||
}
|
||||
|
||||
const value_type* values_;
|
||||
};
|
||||
|
||||
/// DayTimeArray
|
||||
/// ---------------------
|
||||
/// \brief Array of Day and Millisecond values.
|
||||
class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray {
|
||||
public:
|
||||
using TypeClass = DayTimeIntervalType;
|
||||
using IteratorType = stl::ArrayIterator<DayTimeIntervalArray>;
|
||||
|
||||
explicit DayTimeIntervalArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
DayTimeIntervalArray(const std::shared_ptr<DataType>& type, int64_t length,
|
||||
const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
DayTimeIntervalArray(int64_t length, const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
TypeClass::DayMilliseconds GetValue(int64_t i) const;
|
||||
TypeClass::DayMilliseconds Value(int64_t i) const { return GetValue(i); }
|
||||
|
||||
// For compatibility with Take kernel.
|
||||
TypeClass::DayMilliseconds GetView(int64_t i) const { return GetValue(i); }
|
||||
|
||||
IteratorType begin() const { return IteratorType(*this); }
|
||||
|
||||
IteratorType end() const { return IteratorType(*this, length()); }
|
||||
|
||||
std::optional<TypeClass::DayMilliseconds> operator[](int64_t i) const {
|
||||
return *IteratorType(*this, i);
|
||||
}
|
||||
|
||||
int32_t byte_width() const { return sizeof(TypeClass::DayMilliseconds); }
|
||||
|
||||
const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); }
|
||||
};
|
||||
|
||||
/// \brief Array of Month, Day and nanosecond values.
|
||||
class ARROW_EXPORT MonthDayNanoIntervalArray : public PrimitiveArray {
|
||||
public:
|
||||
using TypeClass = MonthDayNanoIntervalType;
|
||||
using IteratorType = stl::ArrayIterator<MonthDayNanoIntervalArray>;
|
||||
|
||||
explicit MonthDayNanoIntervalArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
MonthDayNanoIntervalArray(const std::shared_ptr<DataType>& type, int64_t length,
|
||||
const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
MonthDayNanoIntervalArray(int64_t length, const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
TypeClass::MonthDayNanos GetValue(int64_t i) const;
|
||||
TypeClass::MonthDayNanos Value(int64_t i) const { return GetValue(i); }
|
||||
|
||||
// For compatibility with Take kernel.
|
||||
TypeClass::MonthDayNanos GetView(int64_t i) const { return GetValue(i); }
|
||||
|
||||
IteratorType begin() const { return IteratorType(*this); }
|
||||
|
||||
IteratorType end() const { return IteratorType(*this, length()); }
|
||||
|
||||
std::optional<TypeClass::MonthDayNanos> operator[](int64_t i) const {
|
||||
return *IteratorType(*this, i);
|
||||
}
|
||||
|
||||
int32_t byte_width() const { return sizeof(TypeClass::MonthDayNanos); }
|
||||
|
||||
const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); }
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,133 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Array accessor classes for run-end encoded arrays
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup run-end-encoded-arrays
|
||||
///
|
||||
/// @{
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// RunEndEncoded
|
||||
|
||||
/// \brief Array type for run-end encoded data
|
||||
class ARROW_EXPORT RunEndEncodedArray : public Array {
|
||||
private:
|
||||
std::shared_ptr<Array> run_ends_array_;
|
||||
std::shared_ptr<Array> values_array_;
|
||||
|
||||
public:
|
||||
using TypeClass = RunEndEncodedType;
|
||||
|
||||
explicit RunEndEncodedArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
/// \brief Construct a RunEndEncodedArray from all parameters
|
||||
///
|
||||
/// The length and offset parameters refer to the dimensions of the logical
|
||||
/// array which is the array we would get after expanding all the runs into
|
||||
/// repeated values. As such, length can be much greater than the length of
|
||||
/// the child run_ends and values arrays.
|
||||
RunEndEncodedArray(const std::shared_ptr<DataType>& type, int64_t length,
|
||||
const std::shared_ptr<Array>& run_ends,
|
||||
const std::shared_ptr<Array>& values, int64_t offset = 0);
|
||||
|
||||
/// \brief Construct a RunEndEncodedArray from all parameters
|
||||
///
|
||||
/// The length and offset parameters refer to the dimensions of the logical
|
||||
/// array which is the array we would get after expanding all the runs into
|
||||
/// repeated values. As such, length can be much greater than the length of
|
||||
/// the child run_ends and values arrays.
|
||||
static Result<std::shared_ptr<RunEndEncodedArray>> Make(
|
||||
const std::shared_ptr<DataType>& type, int64_t logical_length,
|
||||
const std::shared_ptr<Array>& run_ends, const std::shared_ptr<Array>& values,
|
||||
int64_t logical_offset = 0);
|
||||
|
||||
/// \brief Construct a RunEndEncodedArray from values and run ends arrays
|
||||
///
|
||||
/// The data type is automatically inferred from the arguments.
|
||||
/// The run_ends and values arrays must have the same length.
|
||||
static Result<std::shared_ptr<RunEndEncodedArray>> Make(
|
||||
int64_t logical_length, const std::shared_ptr<Array>& run_ends,
|
||||
const std::shared_ptr<Array>& values, int64_t logical_offset = 0);
|
||||
|
||||
protected:
|
||||
void SetData(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
public:
|
||||
/// \brief Returns an array holding the logical indexes of each run-end
|
||||
///
|
||||
/// The physical offset to the array is applied.
|
||||
const std::shared_ptr<Array>& run_ends() const { return run_ends_array_; }
|
||||
|
||||
/// \brief Returns an array holding the values of each run
|
||||
///
|
||||
/// The physical offset to the array is applied.
|
||||
const std::shared_ptr<Array>& values() const { return values_array_; }
|
||||
|
||||
/// \brief Returns an array holding the logical indexes of each run end
|
||||
///
|
||||
/// If a non-zero logical offset is set, this function allocates a new
|
||||
/// array and rewrites all the run end values to be relative to the logical
|
||||
/// offset and cuts the end of the array to the logical length.
|
||||
Result<std::shared_ptr<Array>> LogicalRunEnds(MemoryPool* pool) const;
|
||||
|
||||
/// \brief Returns an array holding the values of each run
|
||||
///
|
||||
/// If a non-zero logical offset is set, this function allocates a new
|
||||
/// array containing only the values within the logical range.
|
||||
std::shared_ptr<Array> LogicalValues() const;
|
||||
|
||||
/// \brief Find the physical offset of this REE array
|
||||
///
|
||||
/// This function uses binary-search, so it has a O(log N) cost.
|
||||
int64_t FindPhysicalOffset() const;
|
||||
|
||||
/// \brief Find the physical length of this REE array
|
||||
///
|
||||
/// The physical length of an REE is the number of physical values (and
|
||||
/// run-ends) necessary to represent the logical range of values from offset
|
||||
/// to length.
|
||||
///
|
||||
/// Avoid calling this function if the physical length can be established in
|
||||
/// some other way (e.g. when iterating over the runs sequentially until the
|
||||
/// end). This function uses binary-search, so it has a O(log N) cost.
|
||||
int64_t FindPhysicalLength() const;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,215 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <type_traits>
|
||||
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup numeric-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief Shared implementation for the adaptive integer builders
///
/// Single-value appends are staged in a fixed-size pending buffer
/// (pending_size_ slots) and handed to CommitPendingData() once that buffer
/// is full. Bulk null / empty appends first commit whatever is pending and
/// then write directly into data_, using int_size_ bytes per element.
class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
 public:
  AdaptiveIntBuilderBase(uint8_t start_int_size, MemoryPool* pool,
                         int64_t alignment = kDefaultBufferAlignment);

  explicit AdaptiveIntBuilderBase(MemoryPool* pool,
                                  int64_t alignment = kDefaultBufferAlignment)
      : AdaptiveIntBuilderBase(sizeof(uint8_t), pool, alignment) {}

  /// \brief Append multiple nulls
  /// \param[in] length the number of nulls to append
  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(CommitPendingData());
    if (!ARROW_PREDICT_TRUE(length > 0)) {
      return Status::OK();
    }
    ARROW_RETURN_NOT_OK(Reserve(length));
    // Zero the value slots that back the new nulls.
    uint8_t* first_new_slot = data_->mutable_data() + length_ * int_size_;
    memset(first_new_slot, 0, int_size_ * length);
    UnsafeSetNull(length);
    return Status::OK();
  }

  /// \brief Stage a single null in the pending buffer
  ///
  /// length_ and null_count_ are bumped immediately; the pending buffer is
  /// committed once it is full.
  Status AppendNull() final {
    pending_data_[pending_pos_] = 0;
    pending_valid_[pending_pos_] = 0;
    pending_has_nulls_ = true;
    ++pending_pos_;
    ++length_;
    ++null_count_;

    const bool pending_full = pending_pos_ >= pending_size_;
    if (ARROW_PREDICT_FALSE(pending_full)) {
      return CommitPendingData();
    }
    return Status::OK();
  }

  /// \brief Append multiple zero-initialized, non-null values
  /// \param[in] length the number of values to append
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(CommitPendingData());
    if (!ARROW_PREDICT_TRUE(length > 0)) {
      return Status::OK();
    }
    ARROW_RETURN_NOT_OK(Reserve(length));
    uint8_t* first_new_slot = data_->mutable_data() + length_ * int_size_;
    memset(first_new_slot, 0, int_size_ * length);
    UnsafeSetNotNull(length);
    return Status::OK();
  }

  /// \brief Stage a single zero, non-null value in the pending buffer
  Status AppendEmptyValue() final {
    pending_data_[pending_pos_] = 0;
    pending_valid_[pending_pos_] = 1;
    ++pending_pos_;
    ++length_;

    const bool pending_full = pending_pos_ >= pending_size_;
    if (ARROW_PREDICT_FALSE(pending_full)) {
      return CommitPendingData();
    }
    return Status::OK();
  }

  void Reset() override;
  Status Resize(int64_t capacity) override;

 protected:
  /// \brief Stage one valid value (as raw 64 bits) in the pending buffer
  Status AppendInternal(const uint64_t val) {
    pending_data_[pending_pos_] = val;
    pending_valid_[pending_pos_] = 1;
    ++pending_pos_;
    ++length_;

    const bool pending_full = pending_pos_ >= pending_size_;
    if (ARROW_PREDICT_FALSE(pending_full)) {
      return CommitPendingData();
    }
    return Status::OK();
  }

  /// \brief Flush the staged values; implemented by the concrete builders
  virtual Status CommitPendingData() = 0;

  // Overload selected when old_type is at least as wide as new_type.
  template <typename new_type, typename old_type>
  typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
  ExpandIntSizeInternal();
  // Overload selected when new_type is strictly wider than old_type.
  template <typename new_type, typename old_type>
  typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type
  ExpandIntSizeInternal();

  std::shared_ptr<ResizableBuffer> data_;
  uint8_t* raw_data_ = NULLPTR;

  const uint8_t start_int_size_;
  uint8_t int_size_;

  // Staging area for single-value appends.
  static constexpr int32_t pending_size_ = 1024;
  uint8_t pending_valid_[pending_size_];
  uint64_t pending_data_[pending_size_];
  int32_t pending_pos_ = 0;
  bool pending_has_nulls_ = false;
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \brief Adaptive builder for unsigned integers, taking values as uint64_t
class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase {
 public:
  explicit AdaptiveUIntBuilder(uint8_t start_int_size,
                               MemoryPool* pool = default_memory_pool());

  /// \brief Construct a builder whose starting integer size is sizeof(uint8_t)
  explicit AdaptiveUIntBuilder(MemoryPool* pool = default_memory_pool())
      : AdaptiveUIntBuilder(sizeof(uint8_t), pool) {}

  using internal::AdaptiveIntBuilderBase::Reset;

  /// \brief Append a single value
  Status Append(const uint64_t val) {
    return AppendInternal(val);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous C array of values
  /// \param[in] length the number of values to append
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const uint64_t* values, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  std::shared_ptr<DataType> type() const override;

 protected:
  Status CommitPendingData() override;
  Status ExpandIntSize(uint8_t new_int_size);

  Status AppendValuesInternal(const uint64_t* values, int64_t length,
                              const uint8_t* valid_bytes);

  template <typename new_type>
  Status ExpandIntSizeN();
};
|
||||
|
||||
/// \brief Adaptive builder for signed integers, taking values as int64_t
class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase {
 public:
  explicit AdaptiveIntBuilder(uint8_t start_int_size,
                              MemoryPool* pool = default_memory_pool(),
                              int64_t alignment = kDefaultBufferAlignment);

  /// \brief Construct a builder whose starting integer size is sizeof(uint8_t)
  explicit AdaptiveIntBuilder(MemoryPool* pool = default_memory_pool(),
                              int64_t alignment = kDefaultBufferAlignment)
      : AdaptiveIntBuilder(sizeof(uint8_t), pool, alignment) {}

  using internal::AdaptiveIntBuilderBase::Reset;

  /// \brief Append a single value
  ///
  /// The value is converted to uint64_t before being staged by the base class.
  Status Append(const int64_t val) {
    const auto as_unsigned = static_cast<uint64_t>(val);
    return AppendInternal(as_unsigned);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous C array of values
  /// \param[in] length the number of values to append
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const int64_t* values, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  std::shared_ptr<DataType> type() const override;

 protected:
  Status CommitPendingData() override;
  Status ExpandIntSize(uint8_t new_int_size);

  Status AppendValuesInternal(const int64_t* values, int64_t length,
                              const uint8_t* valid_bytes);

  template <typename new_type>
  Status ExpandIntSizeN();
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,371 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm> // IWYU pragma: keep
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/array_primitive.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/buffer_builder.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template <class Builder, class V>
|
||||
class ArrayBuilderExtraOps {
|
||||
public:
|
||||
/// \brief Append a value from an optional or null if it has no value.
|
||||
Status AppendOrNull(const std::optional<V>& value) {
|
||||
auto* self = static_cast<Builder*>(this);
|
||||
return value.has_value() ? self->Append(*value) : self->AppendNull();
|
||||
}
|
||||
|
||||
/// \brief Append a value from an optional or null if it has no value.
|
||||
///
|
||||
/// Unsafe methods don't check existing size.
|
||||
void UnsafeAppendOrNull(const std::optional<V>& value) {
|
||||
auto* self = static_cast<Builder*>(this);
|
||||
return value.has_value() ? self->UnsafeAppend(*value) : self->UnsafeAppendNull();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \defgroup numeric-builders Concrete builder subclasses for numeric types
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup temporal-builders Concrete builder subclasses for temporal types
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup binary-builders Concrete builder subclasses for binary types
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup nested-builders Concrete builder subclasses for nested types
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup dictionary-builders Concrete builder subclasses for dictionary types
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup run-end-encoded-builders Concrete builder subclasses for run-end encoded
|
||||
/// arrays
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// Minimum builder capacity constant: 2^5 = 32.
constexpr int64_t kMinBuilderCapacity = int64_t{1} << 5;
|
||||
/// One less than the largest int32_t value.
constexpr int64_t kListMaximumElements =
    static_cast<int64_t>(std::numeric_limits<int32_t>::max()) - 1;
|
||||
|
||||
/// Base class for all data array builders.
|
||||
///
|
||||
/// This class provides facilities for incrementally building the null bitmap
|
||||
/// (see Append methods) and as a side effect the current number of slots and
|
||||
/// the null count.
|
||||
///
|
||||
/// \note Users are expected to use builders as one of the concrete types below.
|
||||
/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use.
|
||||
class ARROW_EXPORT ArrayBuilder {
|
||||
public:
|
||||
explicit ArrayBuilder(MemoryPool* pool, int64_t alignment = kDefaultBufferAlignment)
|
||||
: pool_(pool), alignment_(alignment), null_bitmap_builder_(pool, alignment) {}
|
||||
|
||||
ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder);
|
||||
|
||||
virtual ~ArrayBuilder() = default;
|
||||
|
||||
/// For nested types. Since the objects are owned by this class instance, we
|
||||
/// skip shared pointers and just return a raw pointer
|
||||
ArrayBuilder* child(int i) { return children_[i].get(); }
|
||||
|
||||
const std::shared_ptr<ArrayBuilder>& child_builder(int i) const { return children_[i]; }
|
||||
|
||||
int num_children() const { return static_cast<int>(children_.size()); }
|
||||
|
||||
virtual int64_t length() const { return length_; }
|
||||
int64_t null_count() const { return null_count_; }
|
||||
int64_t capacity() const { return capacity_; }
|
||||
|
||||
/// \brief Ensure that enough memory has been allocated to fit the indicated
|
||||
/// number of total elements in the builder, including any that have already
|
||||
/// been appended. Does not account for reallocations that may be due to
|
||||
/// variable size data, like binary values. To make space for incremental
|
||||
/// appends, use Reserve instead.
|
||||
///
|
||||
/// \param[in] capacity the minimum number of total array values to
|
||||
/// accommodate. Must be greater than the current capacity.
|
||||
/// \return Status
|
||||
virtual Status Resize(int64_t capacity);
|
||||
|
||||
/// \brief Ensure that there is enough space allocated to append the indicated
|
||||
/// number of elements without any further reallocation. Overallocation is
|
||||
/// used in order to minimize the impact of incremental Reserve() calls.
|
||||
/// Note that additional_capacity is relative to the current number of elements
|
||||
/// rather than to the current capacity, so calls to Reserve() which are not
|
||||
/// interspersed with addition of new elements may not increase the capacity.
|
||||
///
|
||||
/// \param[in] additional_capacity the number of additional array values
|
||||
/// \return Status
|
||||
Status Reserve(int64_t additional_capacity) {
|
||||
auto current_capacity = capacity();
|
||||
auto min_capacity = length() + additional_capacity;
|
||||
if (min_capacity <= current_capacity) return Status::OK();
|
||||
|
||||
// leave growth factor up to BufferBuilder
|
||||
auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity);
|
||||
return Resize(new_capacity);
|
||||
}
|
||||
|
||||
/// Reset the builder.
|
||||
virtual void Reset();
|
||||
|
||||
/// \brief Append a null value to builder
|
||||
virtual Status AppendNull() = 0;
|
||||
/// \brief Append a number of null values to builder
|
||||
virtual Status AppendNulls(int64_t length) = 0;
|
||||
|
||||
/// \brief Append a non-null value to builder
|
||||
///
|
||||
/// The appended value is an implementation detail, but the corresponding
|
||||
/// memory slot is guaranteed to be initialized.
|
||||
/// This method is useful when appending a null value to a parent nested type.
|
||||
virtual Status AppendEmptyValue() = 0;
|
||||
|
||||
/// \brief Append a number of non-null values to builder
|
||||
///
|
||||
/// The appended values are an implementation detail, but the corresponding
|
||||
/// memory slot is guaranteed to be initialized.
|
||||
/// This method is useful when appending null values to a parent nested type.
|
||||
virtual Status AppendEmptyValues(int64_t length) = 0;
|
||||
|
||||
/// \brief Append a value from a scalar
|
||||
Status AppendScalar(const Scalar& scalar) { return AppendScalar(scalar, 1); }
|
||||
virtual Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
|
||||
virtual Status AppendScalars(const ScalarVector& scalars);
|
||||
|
||||
/// \brief Append a range of values from an array.
|
||||
///
|
||||
/// The given array must be the same type as the builder.
|
||||
virtual Status AppendArraySlice(const ArraySpan& ARROW_ARG_UNUSED(array),
|
||||
int64_t ARROW_ARG_UNUSED(offset),
|
||||
int64_t ARROW_ARG_UNUSED(length)) {
|
||||
return Status::NotImplemented("AppendArraySlice for builder for ", *type());
|
||||
}
|
||||
|
||||
/// \brief Return result of builder as an internal generic ArrayData
|
||||
/// object. Resets builder except for dictionary builder
|
||||
///
|
||||
/// \param[out] out the finalized ArrayData object
|
||||
/// \return Status
|
||||
virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;
|
||||
|
||||
/// \brief Return result of builder as an Array object.
|
||||
///
|
||||
/// The builder is reset except for DictionaryBuilder.
|
||||
///
|
||||
/// \param[out] out the finalized Array object
|
||||
/// \return Status
|
||||
Status Finish(std::shared_ptr<Array>* out);
|
||||
|
||||
/// \brief Return result of builder as an Array object.
|
||||
///
|
||||
/// The builder is reset except for DictionaryBuilder.
|
||||
///
|
||||
/// \return The finalized Array object
|
||||
Result<std::shared_ptr<Array>> Finish();
|
||||
|
||||
/// \brief Return the type of the built Array
|
||||
virtual std::shared_ptr<DataType> type() const = 0;
|
||||
|
||||
protected:
|
||||
/// Append to null bitmap
|
||||
Status AppendToBitmap(bool is_valid);
|
||||
|
||||
/// Vector append. Treat each zero byte as a null. If valid_bytes is null
|
||||
/// assume all of length bits are valid.
|
||||
Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);
|
||||
|
||||
/// Uniform append. Append N times the same validity bit.
|
||||
Status AppendToBitmap(int64_t num_bits, bool value);
|
||||
|
||||
/// Set the next length bits to not null (i.e. valid).
|
||||
Status SetNotNull(int64_t length);
|
||||
|
||||
// Unsafe operations (don't check capacity/don't resize)
|
||||
|
||||
void UnsafeAppendNull() { UnsafeAppendToBitmap(false); }
|
||||
|
||||
// Append to null bitmap, update the length
|
||||
void UnsafeAppendToBitmap(bool is_valid) {
|
||||
null_bitmap_builder_.UnsafeAppend(is_valid);
|
||||
++length_;
|
||||
if (!is_valid) ++null_count_;
|
||||
}
|
||||
|
||||
// Vector append. Treat each zero byte as a nullzero. If valid_bytes is null
|
||||
// assume all of length bits are valid.
|
||||
void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) {
|
||||
if (valid_bytes == NULLPTR) {
|
||||
return UnsafeSetNotNull(length);
|
||||
}
|
||||
null_bitmap_builder_.UnsafeAppend(valid_bytes, length);
|
||||
length_ += length;
|
||||
null_count_ = null_bitmap_builder_.false_count();
|
||||
}
|
||||
|
||||
// Vector append. Copy from a given bitmap. If bitmap is null assume
|
||||
// all of length bits are valid.
|
||||
void UnsafeAppendToBitmap(const uint8_t* bitmap, int64_t offset, int64_t length) {
|
||||
if (bitmap == NULLPTR) {
|
||||
return UnsafeSetNotNull(length);
|
||||
}
|
||||
null_bitmap_builder_.UnsafeAppend(bitmap, offset, length);
|
||||
length_ += length;
|
||||
null_count_ = null_bitmap_builder_.false_count();
|
||||
}
|
||||
|
||||
// Append the same validity value a given number of times.
|
||||
void UnsafeAppendToBitmap(const int64_t num_bits, bool value) {
|
||||
if (value) {
|
||||
UnsafeSetNotNull(num_bits);
|
||||
} else {
|
||||
UnsafeSetNull(num_bits);
|
||||
}
|
||||
}
|
||||
|
||||
void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);
|
||||
|
||||
// Set the next validity bits to not null (i.e. valid).
|
||||
void UnsafeSetNotNull(int64_t length);
|
||||
|
||||
// Set the next validity bits to null (i.e. invalid).
|
||||
void UnsafeSetNull(int64_t length);
|
||||
|
||||
static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer);
|
||||
|
||||
/// \brief Finish to an array of the specified ArrayType
|
||||
template <typename ArrayType>
|
||||
Status FinishTyped(std::shared_ptr<ArrayType>* out) {
|
||||
std::shared_ptr<Array> out_untyped;
|
||||
ARROW_RETURN_NOT_OK(Finish(&out_untyped));
|
||||
*out = std::static_pointer_cast<ArrayType>(std::move(out_untyped));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Check the requested capacity for validity
|
||||
Status CheckCapacity(int64_t new_capacity) {
|
||||
if (ARROW_PREDICT_FALSE(new_capacity < 0)) {
|
||||
return Status::Invalid(
|
||||
"Resize capacity must be positive (requested: ", new_capacity, ")");
|
||||
}
|
||||
|
||||
if (ARROW_PREDICT_FALSE(new_capacity < length_)) {
|
||||
return Status::Invalid("Resize cannot downsize (requested: ", new_capacity,
|
||||
", current length: ", length_, ")");
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Check for array type
|
||||
Status CheckArrayType(const std::shared_ptr<DataType>& expected_type,
|
||||
const Array& array, const char* message);
|
||||
Status CheckArrayType(Type::type expected_type, const Array& array,
|
||||
const char* message);
|
||||
|
||||
MemoryPool* pool_;
|
||||
int64_t alignment_;
|
||||
|
||||
TypedBufferBuilder<bool> null_bitmap_builder_;
|
||||
int64_t null_count_ = 0;
|
||||
|
||||
// Array length, so far. Also, the index of the next element to be added
|
||||
int64_t length_ = 0;
|
||||
int64_t capacity_ = 0;
|
||||
|
||||
// Child value array builders. These are owned by this class
|
||||
std::vector<std::shared_ptr<ArrayBuilder>> children_;
|
||||
|
||||
private:
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
|
||||
};
|
||||
|
||||
/// \brief Construct an empty ArrayBuilder corresponding to the data
|
||||
/// type
|
||||
/// \param[in] pool the MemoryPool to use for allocations
|
||||
/// \param[in] type the data type to create the builder for
|
||||
/// \param[out] out the created ArrayBuilder
|
||||
ARROW_EXPORT
|
||||
Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
|
||||
std::unique_ptr<ArrayBuilder>* out);
|
||||
|
||||
inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilder(
|
||||
const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
|
||||
std::unique_ptr<ArrayBuilder> out;
|
||||
ARROW_RETURN_NOT_OK(MakeBuilder(pool, type, &out));
|
||||
return out;
|
||||
}
|
||||
|
||||
/// \brief Construct an empty ArrayBuilder corresponding to the data
|
||||
/// type, where any top-level or nested dictionary builders return the
|
||||
/// exact index type specified by the type.
|
||||
ARROW_EXPORT
|
||||
Status MakeBuilderExactIndex(MemoryPool* pool, const std::shared_ptr<DataType>& type,
|
||||
std::unique_ptr<ArrayBuilder>* out);
|
||||
|
||||
inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilderExactIndex(
|
||||
const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
|
||||
std::unique_ptr<ArrayBuilder> out;
|
||||
ARROW_RETURN_NOT_OK(MakeBuilderExactIndex(pool, type, &out));
|
||||
return out;
|
||||
}
|
||||
|
||||
/// \brief Construct an empty DictionaryBuilder initialized optionally
|
||||
/// with a preexisting dictionary
|
||||
/// \param[in] pool the MemoryPool to use for allocations
|
||||
/// \param[in] type the dictionary type to create the builder for
|
||||
/// \param[in] dictionary the initial dictionary, if any. May be nullptr
|
||||
/// \param[out] out the created ArrayBuilder
|
||||
ARROW_EXPORT
|
||||
Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
|
||||
const std::shared_ptr<Array>& dictionary,
|
||||
std::unique_ptr<ArrayBuilder>* out);
|
||||
|
||||
inline Result<std::unique_ptr<ArrayBuilder>> MakeDictionaryBuilder(
|
||||
const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
|
||||
MemoryPool* pool = default_memory_pool()) {
|
||||
std::unique_ptr<ArrayBuilder> out;
|
||||
ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, type, dictionary, &out));
|
||||
return out;
|
||||
}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,993 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/array_binary.h"
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/buffer_builder.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/binary_view_util.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup binary-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Binary and String
|
||||
|
||||
template <typename TYPE>
|
||||
class BaseBinaryBuilder
|
||||
: public ArrayBuilder,
|
||||
public internal::ArrayBuilderExtraOps<BaseBinaryBuilder<TYPE>, std::string_view> {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename TypeClass::offset_type;
|
||||
|
||||
explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
offsets_builder_(pool, alignment),
|
||||
value_data_builder_(pool, alignment) {}
|
||||
|
||||
BaseBinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
|
||||
: BaseBinaryBuilder(pool) {}
|
||||
|
||||
Status Append(const uint8_t* value, offset_type length) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
UnsafeAppendNextOffset();
|
||||
// Safety check for UBSAN.
|
||||
if (ARROW_PREDICT_TRUE(length > 0)) {
|
||||
ARROW_RETURN_NOT_OK(ValidateOverflow(length));
|
||||
ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
|
||||
}
|
||||
|
||||
UnsafeAppendToBitmap(true);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Append(const char* value, offset_type length) {
|
||||
return Append(reinterpret_cast<const uint8_t*>(value), length);
|
||||
}
|
||||
|
||||
Status Append(std::string_view value) {
|
||||
return Append(value.data(), static_cast<offset_type>(value.size()));
|
||||
}
|
||||
|
||||
/// Extend the last appended value by appending more data at the end
|
||||
///
|
||||
/// Unlike Append, this does not create a new offset.
|
||||
Status ExtendCurrent(const uint8_t* value, offset_type length) {
|
||||
// Safety check for UBSAN.
|
||||
if (ARROW_PREDICT_TRUE(length > 0)) {
|
||||
ARROW_RETURN_NOT_OK(ValidateOverflow(length));
|
||||
ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status ExtendCurrent(std::string_view value) {
|
||||
return ExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
|
||||
static_cast<offset_type>(value.size()));
|
||||
}
|
||||
|
||||
Status AppendNulls(int64_t length) final {
|
||||
const int64_t num_bytes = value_data_builder_.length();
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
for (int64_t i = 0; i < length; ++i) {
|
||||
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
|
||||
}
|
||||
UnsafeAppendToBitmap(length, false);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendNull() final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
UnsafeAppendNextOffset();
|
||||
UnsafeAppendToBitmap(false);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValue() final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
UnsafeAppendNextOffset();
|
||||
UnsafeAppendToBitmap(true);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
const int64_t num_bytes = value_data_builder_.length();
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
for (int64_t i = 0; i < length; ++i) {
|
||||
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
|
||||
}
|
||||
UnsafeAppendToBitmap(length, true);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append without checking capacity
|
||||
///
|
||||
/// Offsets and data should have been presized using Reserve() and
|
||||
/// ReserveData(), respectively.
|
||||
void UnsafeAppend(const uint8_t* value, offset_type length) {
|
||||
UnsafeAppendNextOffset();
|
||||
value_data_builder_.UnsafeAppend(value, length);
|
||||
UnsafeAppendToBitmap(true);
|
||||
}
|
||||
|
||||
void UnsafeAppend(const char* value, offset_type length) {
|
||||
UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
|
||||
}
|
||||
|
||||
void UnsafeAppend(const std::string& value) {
|
||||
UnsafeAppend(value.c_str(), static_cast<offset_type>(value.size()));
|
||||
}
|
||||
|
||||
void UnsafeAppend(std::string_view value) {
|
||||
UnsafeAppend(value.data(), static_cast<offset_type>(value.size()));
|
||||
}
|
||||
|
||||
/// Like ExtendCurrent, but do not check capacity
|
||||
void UnsafeExtendCurrent(const uint8_t* value, offset_type length) {
|
||||
value_data_builder_.UnsafeAppend(value, length);
|
||||
}
|
||||
|
||||
void UnsafeExtendCurrent(std::string_view value) {
|
||||
UnsafeExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
|
||||
static_cast<offset_type>(value.size()));
|
||||
}
|
||||
|
||||
void UnsafeAppendNull() {
|
||||
const int64_t num_bytes = value_data_builder_.length();
|
||||
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
|
||||
UnsafeAppendToBitmap(false);
|
||||
}
|
||||
|
||||
void UnsafeAppendEmptyValue() {
|
||||
const int64_t num_bytes = value_data_builder_.length();
|
||||
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
|
||||
UnsafeAppendToBitmap(true);
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of strings in one shot.
|
||||
///
|
||||
/// \param[in] values a vector of strings
|
||||
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
|
||||
/// indicates a valid (non-null) value
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<std::string>& values,
|
||||
const uint8_t* valid_bytes = NULLPTR) {
|
||||
std::size_t total_length = std::accumulate(
|
||||
values.begin(), values.end(), 0ULL,
|
||||
[](uint64_t sum, const std::string& str) { return sum + str.size(); });
|
||||
ARROW_RETURN_NOT_OK(Reserve(values.size()));
|
||||
ARROW_RETURN_NOT_OK(ReserveData(total_length));
|
||||
|
||||
if (valid_bytes != NULLPTR) {
|
||||
for (std::size_t i = 0; i < values.size(); ++i) {
|
||||
UnsafeAppendNextOffset();
|
||||
if (valid_bytes[i]) {
|
||||
value_data_builder_.UnsafeAppend(
|
||||
reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (const auto& value : values) {
|
||||
UnsafeAppendNextOffset();
|
||||
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()),
|
||||
value.size());
|
||||
}
|
||||
}
|
||||
|
||||
UnsafeAppendToBitmap(valid_bytes, values.size());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of nul-terminated strings in one shot.
|
||||
/// If one of the values is NULL, it is processed as a null
|
||||
/// value even if the corresponding valid_bytes entry is 1.
|
||||
///
|
||||
/// \param[in] values a contiguous C array of nul-terminated char *
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
|
||||
/// indicates a valid (non-null) value
|
||||
/// \return Status
|
||||
Status AppendValues(const char** values, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR) {
|
||||
std::size_t total_length = 0;
|
||||
std::vector<std::size_t> value_lengths(length);
|
||||
bool have_null_value = false;
|
||||
for (int64_t i = 0; i < length; ++i) {
|
||||
if (values[i] != NULLPTR) {
|
||||
auto value_length = strlen(values[i]);
|
||||
value_lengths[i] = value_length;
|
||||
total_length += value_length;
|
||||
} else {
|
||||
have_null_value = true;
|
||||
}
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
ARROW_RETURN_NOT_OK(ReserveData(total_length));
|
||||
|
||||
if (valid_bytes) {
|
||||
int64_t valid_bytes_offset = 0;
|
||||
for (int64_t i = 0; i < length; ++i) {
|
||||
UnsafeAppendNextOffset();
|
||||
if (valid_bytes[i]) {
|
||||
if (values[i]) {
|
||||
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
|
||||
value_lengths[i]);
|
||||
} else {
|
||||
UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset,
|
||||
i - valid_bytes_offset);
|
||||
UnsafeAppendToBitmap(false);
|
||||
valid_bytes_offset = i + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
|
||||
} else {
|
||||
if (have_null_value) {
|
||||
std::vector<uint8_t> valid_vector(length, 0);
|
||||
for (int64_t i = 0; i < length; ++i) {
|
||||
UnsafeAppendNextOffset();
|
||||
if (values[i]) {
|
||||
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
|
||||
value_lengths[i]);
|
||||
valid_vector[i] = 1;
|
||||
}
|
||||
}
|
||||
UnsafeAppendToBitmap(valid_vector.data(), length);
|
||||
} else {
|
||||
for (int64_t i = 0; i < length; ++i) {
|
||||
UnsafeAppendNextOffset();
|
||||
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
|
||||
value_lengths[i]);
|
||||
}
|
||||
UnsafeAppendToBitmap(NULLPTR, length);
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length) override {
|
||||
auto bitmap = array.GetValues<uint8_t>(0, 0);
|
||||
auto offsets = array.GetValues<offset_type>(1);
|
||||
auto data = array.GetValues<uint8_t>(2, 0);
|
||||
auto total_length = offsets[offset + length] - offsets[offset];
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
ARROW_RETURN_NOT_OK(ReserveData(total_length));
|
||||
for (int64_t i = 0; i < length; i++) {
|
||||
if (!bitmap || bit_util::GetBit(bitmap, array.offset + offset + i)) {
|
||||
const offset_type start = offsets[offset + i];
|
||||
const offset_type end = offsets[offset + i + 1];
|
||||
UnsafeAppend(data + start, end - start);
|
||||
} else {
|
||||
UnsafeAppendNull();
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void Reset() override {
|
||||
ArrayBuilder::Reset();
|
||||
offsets_builder_.Reset();
|
||||
value_data_builder_.Reset();
|
||||
}
|
||||
|
||||
Status ValidateOverflow(int64_t new_bytes) {
|
||||
auto new_size = value_data_builder_.length() + new_bytes;
|
||||
if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
|
||||
return Status::CapacityError("array cannot contain more than ", memory_limit(),
|
||||
" bytes, have ", new_size);
|
||||
} else {
|
||||
return Status::OK();
|
||||
}
|
||||
}
|
||||
|
||||
Status Resize(int64_t capacity) override {
|
||||
ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
|
||||
// One more than requested for offsets
|
||||
ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
|
||||
return ArrayBuilder::Resize(capacity);
|
||||
}
|
||||
|
||||
/// \brief Ensures there is enough allocated capacity to append the indicated
|
||||
/// number of bytes to the value data buffer without additional allocations
|
||||
Status ReserveData(int64_t elements) {
|
||||
ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
|
||||
return value_data_builder_.Reserve(elements);
|
||||
}
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
|
||||
// Write final offset (values length)
|
||||
ARROW_RETURN_NOT_OK(AppendNextOffset());
|
||||
|
||||
// These buffers' padding zeroed by BufferBuilder
|
||||
std::shared_ptr<Buffer> offsets, value_data, null_bitmap;
|
||||
ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
|
||||
ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
|
||||
ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
|
||||
|
||||
*out = ArrayData::Make(type(), length_, {null_bitmap, offsets, value_data},
|
||||
null_count_, 0);
|
||||
Reset();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \return data pointer of the value date builder
|
||||
const uint8_t* value_data() const { return value_data_builder_.data(); }
|
||||
/// \return size of values buffer so far
|
||||
int64_t value_data_length() const { return value_data_builder_.length(); }
|
||||
/// \return capacity of values buffer
|
||||
int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
|
||||
|
||||
/// \return data pointer of the value date builder
|
||||
const offset_type* offsets_data() const { return offsets_builder_.data(); }
|
||||
|
||||
/// Temporary access to a value.
|
||||
///
|
||||
/// This pointer becomes invalid on the next modifying operation.
|
||||
const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
|
||||
const offset_type* offsets = offsets_builder_.data();
|
||||
const auto offset = offsets[i];
|
||||
if (i == (length_ - 1)) {
|
||||
*out_length = static_cast<offset_type>(value_data_builder_.length()) - offset;
|
||||
} else {
|
||||
*out_length = offsets[i + 1] - offset;
|
||||
}
|
||||
return value_data_builder_.data() + offset;
|
||||
}
|
||||
|
||||
offset_type offset(int64_t i) const { return offsets_data()[i]; }
|
||||
|
||||
/// Temporary access to a value.
|
||||
///
|
||||
/// This view becomes invalid on the next modifying operation.
|
||||
std::string_view GetView(int64_t i) const {
|
||||
offset_type value_length;
|
||||
const uint8_t* value_data = GetValue(i, &value_length);
|
||||
return std::string_view(reinterpret_cast<const char*>(value_data), value_length);
|
||||
}
|
||||
|
||||
// Cannot make this a static attribute because of linking issues
|
||||
static constexpr int64_t memory_limit() {
|
||||
return std::numeric_limits<offset_type>::max() - 1;
|
||||
}
|
||||
|
||||
protected:
|
||||
TypedBufferBuilder<offset_type> offsets_builder_;
|
||||
TypedBufferBuilder<uint8_t> value_data_builder_;
|
||||
|
||||
Status AppendNextOffset() {
|
||||
const int64_t num_bytes = value_data_builder_.length();
|
||||
return offsets_builder_.Append(static_cast<offset_type>(num_bytes));
|
||||
}
|
||||
|
||||
void UnsafeAppendNextOffset() {
|
||||
const int64_t num_bytes = value_data_builder_.length();
|
||||
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
|
||||
}
|
||||
};
|
||||
|
||||
/// \class BinaryBuilder
|
||||
/// \brief Builder class for variable-length binary data
|
||||
class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder<BinaryType> {
|
||||
public:
|
||||
using BaseBinaryBuilder::BaseBinaryBuilder;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<BinaryArray>* out) { return FinishTyped(out); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override { return binary(); }
|
||||
};
|
||||
|
||||
/// \class StringBuilder
|
||||
/// \brief Builder class for UTF8 strings
|
||||
class ARROW_EXPORT StringBuilder : public BinaryBuilder {
|
||||
public:
|
||||
using BinaryBuilder::BinaryBuilder;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<StringArray>* out) { return FinishTyped(out); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override { return utf8(); }
|
||||
};
|
||||
|
||||
/// \class LargeBinaryBuilder
|
||||
/// \brief Builder class for large variable-length binary data
|
||||
class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder<LargeBinaryType> {
|
||||
public:
|
||||
using BaseBinaryBuilder::BaseBinaryBuilder;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<LargeBinaryArray>* out) { return FinishTyped(out); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override { return large_binary(); }
|
||||
};
|
||||
|
||||
/// \class LargeStringBuilder
|
||||
/// \brief Builder class for large UTF8 strings
|
||||
class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
|
||||
public:
|
||||
using LargeBinaryBuilder::LargeBinaryBuilder;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<LargeStringArray>* out) { return FinishTyped(out); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override { return large_utf8(); }
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// BinaryViewBuilder, StringViewBuilder
|
||||
//
|
||||
// These builders do not support building raw pointer view arrays.
|
||||
|
||||
namespace internal {
|
||||
|
||||
// We allocate medium-sized memory chunks and accumulate data in those, which
|
||||
// may result in some waste if there are many large-ish strings. If a string
|
||||
// comes along that does not fit into a block, we allocate a new block and
|
||||
// write into that.
|
||||
//
|
||||
// Later we can implement optimizations to continue filling underfull blocks
|
||||
// after encountering a large string that required allocating a new block.
|
||||
class ARROW_EXPORT StringHeapBuilder {
|
||||
public:
|
||||
static constexpr int64_t kDefaultBlocksize = 32 << 10; // 32KB
|
||||
|
||||
StringHeapBuilder(MemoryPool* pool, int64_t alignment)
|
||||
: pool_(pool), alignment_(alignment) {}
|
||||
|
||||
void SetBlockSize(int64_t blocksize) { blocksize_ = blocksize; }
|
||||
|
||||
using c_type = BinaryViewType::c_type;
|
||||
|
||||
template <bool Safe>
|
||||
std::conditional_t<Safe, Result<c_type>, c_type> Append(const uint8_t* value,
|
||||
int64_t length) {
|
||||
if (length <= BinaryViewType::kInlineSize) {
|
||||
return util::ToInlineBinaryView(value, static_cast<int32_t>(length));
|
||||
}
|
||||
|
||||
if constexpr (Safe) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
}
|
||||
|
||||
auto v = util::ToNonInlineBinaryView(value, static_cast<int32_t>(length),
|
||||
static_cast<int32_t>(blocks_.size() - 1),
|
||||
current_offset_);
|
||||
|
||||
memcpy(current_out_buffer_, value, static_cast<size_t>(length));
|
||||
current_out_buffer_ += length;
|
||||
current_remaining_bytes_ -= length;
|
||||
current_offset_ += static_cast<int32_t>(length);
|
||||
return v;
|
||||
}
|
||||
|
||||
static constexpr int64_t ValueSizeLimit() {
|
||||
return std::numeric_limits<int32_t>::max();
|
||||
}
|
||||
|
||||
/// \brief Ensure that the indicated number of bytes can be appended via
|
||||
/// UnsafeAppend operations without the need to allocate more memory
|
||||
Status Reserve(int64_t num_bytes) {
|
||||
if (ARROW_PREDICT_FALSE(num_bytes > ValueSizeLimit())) {
|
||||
return Status::CapacityError(
|
||||
"BinaryView or StringView elements cannot reference "
|
||||
"strings larger than 2GB");
|
||||
}
|
||||
if (num_bytes > current_remaining_bytes_) {
|
||||
ARROW_RETURN_NOT_OK(FinishLastBlock());
|
||||
current_remaining_bytes_ = num_bytes > blocksize_ ? num_bytes : blocksize_;
|
||||
ARROW_ASSIGN_OR_RAISE(
|
||||
std::shared_ptr<ResizableBuffer> new_block,
|
||||
AllocateResizableBuffer(current_remaining_bytes_, alignment_, pool_));
|
||||
current_offset_ = 0;
|
||||
current_out_buffer_ = new_block->mutable_data();
|
||||
blocks_.emplace_back(std::move(new_block));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void Reset() {
|
||||
current_offset_ = 0;
|
||||
current_out_buffer_ = NULLPTR;
|
||||
current_remaining_bytes_ = 0;
|
||||
blocks_.clear();
|
||||
}
|
||||
|
||||
int64_t current_remaining_bytes() const { return current_remaining_bytes_; }
|
||||
|
||||
Result<std::vector<std::shared_ptr<ResizableBuffer>>> Finish() {
|
||||
if (!blocks_.empty()) {
|
||||
ARROW_RETURN_NOT_OK(FinishLastBlock());
|
||||
}
|
||||
current_offset_ = 0;
|
||||
current_out_buffer_ = NULLPTR;
|
||||
current_remaining_bytes_ = 0;
|
||||
return std::move(blocks_);
|
||||
}
|
||||
|
||||
private:
|
||||
/// \brief Trim the unused tail of the most recent block
///
/// Does nothing when the current block has no unused bytes.
Status FinishLastBlock() {
  if (current_remaining_bytes_ <= 0) {
    return Status::OK();
  }
  // Avoid leaking uninitialized bytes from the allocator
  auto& last_block = blocks_.back();
  const int64_t used_bytes = last_block->size() - current_remaining_bytes_;
  ARROW_RETURN_NOT_OK(last_block->Resize(used_bytes, /*shrink_to_fit=*/true));
  last_block->ZeroPadding();
  return Status::OK();
}
|
||||
|
||||
MemoryPool* pool_;
|
||||
int64_t alignment_;
|
||||
int64_t blocksize_ = kDefaultBlocksize;
|
||||
std::vector<std::shared_ptr<ResizableBuffer>> blocks_;
|
||||
|
||||
int32_t current_offset_ = 0;
|
||||
uint8_t* current_out_buffer_ = NULLPTR;
|
||||
int64_t current_remaining_bytes_ = 0;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \brief Builder for BinaryView arrays
///
/// One BinaryViewType::c_type entry per element is appended to
/// `data_builder_`; the bytes of appended values are handed to
/// `data_heap_builder_`, which accumulates out-of-line data in blocks.
class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
 public:
  using TypeClass = BinaryViewType;

  // this constructor provided for MakeBuilder compatibility
  BinaryViewBuilder(const std::shared_ptr<DataType>&, MemoryPool* pool);

  explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(),
                             int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        data_builder_(pool, alignment),
        data_heap_builder_(pool, alignment) {}

  /// Set the size for future preallocated data buffers.
  ///
  /// The default size is 32KB, so after each 32KB of string data appended to the builder
  /// a new data buffer will be allocated. Adjust this to a larger value to decrease the
  /// frequency of allocation, or to a smaller value to lower the overhead of each
  /// allocation.
  void SetBlockSize(int64_t blocksize) { data_heap_builder_.SetBlockSize(blocksize); }

  /// The number of bytes which can be appended to this builder without allocating another
  /// data buffer.
  int64_t current_block_bytes_remaining() const {
    return data_heap_builder_.current_remaining_bytes();
  }

  /// \brief Append a value, growing the builder as needed
  ///
  /// \param[in] value pointer to the first byte of the value
  /// \param[in] length size of the value in bytes
  /// \return an error if reserving a slot or storing the bytes in the data
  /// heap fails; in that case no element is added to the builder
  Status Append(const uint8_t* value, int64_t length) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    // Run the fallible heap append before touching the validity bitmap, so
    // that a failure cannot leave the bitmap one element ahead of the views
    // buffer.
    ARROW_ASSIGN_OR_RAISE(auto v,
                          data_heap_builder_.Append</*Safe=*/true>(value, length));
    UnsafeAppendToBitmap(true);
    data_builder_.UnsafeAppend(v);
    return Status::OK();
  }

  Status Append(const char* value, int64_t length) {
    return Append(reinterpret_cast<const uint8_t*>(value), length);
  }

  Status Append(std::string_view value) {
    return Append(value.data(), static_cast<int64_t>(value.size()));
  }

  /// \brief Append without checking capacity
  ///
  /// Builder should have been presized using Reserve() and ReserveData(),
  /// respectively, and the value must not be larger than 2GB
  void UnsafeAppend(const uint8_t* value, int64_t length) {
    UnsafeAppendToBitmap(true);
    auto v = data_heap_builder_.Append</*Safe=*/false>(value, length);
    data_builder_.UnsafeAppend(v);
  }

  void UnsafeAppend(const char* value, int64_t length) {
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
  }

  void UnsafeAppend(const std::string& value) {
    UnsafeAppend(value.c_str(), static_cast<int64_t>(value.size()));
  }

  void UnsafeAppend(std::string_view value) {
    UnsafeAppend(value.data(), static_cast<int64_t>(value.size()));
  }

  /// \brief Ensures there is enough allocated available capacity in the
  /// out-of-line data heap to append the indicated number of bytes without
  /// additional allocations
  Status ReserveData(int64_t length);

  /// \brief Append several null elements
  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, BinaryViewType::c_type{});
    UnsafeSetNull(length);
    return Status::OK();
  }

  /// \brief Append a single null element
  Status AppendNull() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(false);
    return Status::OK();
  }

  /// \brief Append an empty element (length-0 inline string)
  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(true);
    return Status::OK();
  }

  /// \brief Append several empty elements
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, BinaryViewType::c_type{});
    UnsafeSetNotNull(length);
    return Status::OK();
  }

  /// \brief Append a null element without checking capacity
  void UnsafeAppendNull() {
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(false);
  }

  /// \brief Append an empty element without checking capacity
  void UnsafeAppendEmptyValue() {
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(true);
  }

  /// \brief Append a slice of a BinaryViewArray passed as an ArraySpan. Copies
  /// the underlying out-of-line string memory to avoid memory lifetime issues
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override;

  void Reset() override;

  /// \brief Resize the views buffer and the base builder to `capacity` elements
  ///
  /// The requested capacity is raised to kMinBuilderCapacity when smaller.
  Status Resize(int64_t capacity) override {
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
    capacity = std::max(capacity, kMinBuilderCapacity);
    ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
    return ArrayBuilder::Resize(capacity);
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  std::shared_ptr<DataType> type() const override { return binary_view(); }

 protected:
  // One BinaryViewType::c_type entry per appended element
  TypedBufferBuilder<BinaryViewType::c_type> data_builder_;

  // Accumulates out-of-line data in fixed-size chunks which are then attached
  // to the resulting ArrayData
  internal::StringHeapBuilder data_heap_builder_;
};
|
||||
|
||||
/// \brief BinaryViewBuilder variant that reports utf8_view() as its type
///
/// Inherits every constructor and append method from BinaryViewBuilder.
class ARROW_EXPORT StringViewBuilder : public BinaryViewBuilder {
 public:
  using BinaryViewBuilder::BinaryViewBuilder;

  std::shared_ptr<DataType> type() const override {
    return utf8_view();
  }
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// FixedSizeBinaryBuilder
|
||||
|
||||
/// \brief Builder for arrays whose values all have the same byte width
///
/// Value bytes are stored back to back in `byte_builder_`; every value
/// occupies `byte_width_` bytes.
class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
 public:
  using TypeClass = FixedSizeBinaryType;

  explicit FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
                                  MemoryPool* pool = default_memory_pool(),
                                  int64_t alignment = kDefaultBufferAlignment);

  /// \brief Append one value, reading byte_width() bytes from `value`
  Status Append(const uint8_t* value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(value);
    return Status::OK();
  }

  Status Append(const char* value) {
    return Append(reinterpret_cast<const uint8_t*>(value));
  }

  Status Append(std::string_view view) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(view);
    return Status::OK();
  }

  Status Append(const std::string& s) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(s);
    return Status::OK();
  }

  Status Append(const Buffer& s) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(s);
    return Status::OK();
  }

  Status Append(const std::shared_ptr<Buffer>& s) { return Append(*s); }

  template <size_t NBYTES>
  Status Append(const std::array<uint8_t, NBYTES>& value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(
        std::string_view(reinterpret_cast<const char*>(value.data()), value.size()));
    return Status::OK();
  }

  Status AppendValues(const uint8_t* data, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* validity,
                      int64_t bitmap_offset);

  Status AppendNull() final;
  Status AppendNulls(int64_t length) final;

  Status AppendEmptyValue() final;
  Status AppendEmptyValues(int64_t length) final;

  /// \brief Append `length` values of `array` starting at `offset`
  ///
  /// Values are read from buffer 1 of the span and validity bits from
  /// buffer 0, both adjusted by the span's own offset.
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override {
    return AppendValues(
        array.GetValues<uint8_t>(1, 0) + ((array.offset + offset) * byte_width_), length,
        array.GetValues<uint8_t>(0, 0), array.offset + offset);
  }

  /// \brief Append one value without checking capacity
  void UnsafeAppend(const uint8_t* value) {
    UnsafeAppendToBitmap(true);
    // Nothing to copy for zero-width values.
    if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
      byte_builder_.UnsafeAppend(value, byte_width_);
    }
  }

  void UnsafeAppend(const char* value) {
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value));
  }

  void UnsafeAppend(std::string_view value) {
#ifndef NDEBUG
    // CheckValueSize() takes an int64_t, so cast to that type directly.
    CheckValueSize(static_cast<int64_t>(value.size()));
#endif
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()));
  }

  void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view{s}); }

  void UnsafeAppend(const std::shared_ptr<Buffer>& s) { UnsafeAppend(*s); }

  /// \brief Append a null without checking capacity; its bytes are zeroed
  void UnsafeAppendNull() {
    UnsafeAppendToBitmap(false);
    byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
  }

  /// \brief Check that `new_bytes` more value bytes stay within memory_limit()
  ///
  /// \param[in] new_bytes number of bytes about to be added to the value data
  /// \return CapacityError if the value data would exceed memory_limit()
  Status ValidateOverflow(int64_t new_bytes) const {
    const int64_t current_size = byte_builder_.length();
    // Compare against the remaining headroom instead of computing
    // `current_size + new_bytes` up front: that sum is signed overflow
    // (undefined behaviour) when it does not fit in int64_t.
    if (ARROW_PREDICT_FALSE(new_bytes > memory_limit() - current_size)) {
      constexpr int64_t kInt64Max = std::numeric_limits<int64_t>::max();
      // Report the requested total, saturated at the int64_t maximum.
      const int64_t new_size =
          new_bytes > kInt64Max - current_size ? kInt64Max : current_size + new_bytes;
      return Status::CapacityError("array cannot contain more than ", memory_limit(),
                                   " bytes, have ", new_size);
    }
    return Status::OK();
  }

  /// \brief Ensures there is enough allocated capacity to append the indicated
  /// number of bytes to the value data buffer without additional allocations
  Status ReserveData(int64_t elements) {
    ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
    return byte_builder_.Reserve(elements);
  }

  void Reset() override;
  Status Resize(int64_t capacity) override;
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<FixedSizeBinaryArray>* out) { return FinishTyped(out); }

  /// \return size of values buffer so far
  int64_t value_data_length() const { return byte_builder_.length(); }

  int32_t byte_width() const { return byte_width_; }

  /// Temporary access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  const uint8_t* GetValue(int64_t i) const;

  /// Temporary mutable access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  uint8_t* GetMutableValue(int64_t i) {
    uint8_t* data_ptr = byte_builder_.mutable_data();
    return data_ptr + i * byte_width_;
  }

  /// Temporary read-only access to a value.
  ///
  /// This view becomes invalid on the next modifying operation.
  std::string_view GetView(int64_t i) const;

  /// Advance builder without allocating nor writing any values
  ///
  /// The internal pointer is advanced by `length` values and the same number
  /// of non-null entries are appended to the validity bitmap.
  /// This method assumes that the `length` values were populated directly,
  /// for example using `GetMutableValue`.
  void UnsafeAdvance(int64_t length) {
    byte_builder_.UnsafeAdvance(length * byte_width_);
    UnsafeAppendToBitmap(length, true);
  }

  /// Advance builder without allocating nor writing any values
  ///
  /// The internal pointer is advanced by `length` values and the same number
  /// of validity bits are appended to the validity bitmap.
  /// This method assumes that the `length` values were populated directly,
  /// for example using `GetMutableValue`.
  void UnsafeAdvance(int64_t length, const uint8_t* validity, int64_t valid_bits_offset) {
    byte_builder_.UnsafeAdvance(length * byte_width_);
    UnsafeAppendToBitmap(validity, valid_bits_offset, length);
  }

  /// \brief Upper bound on the value data size checked by ValidateOverflow()
  static constexpr int64_t memory_limit() {
    return std::numeric_limits<int64_t>::max() - 1;
  }

  std::shared_ptr<DataType> type() const override {
    return fixed_size_binary(byte_width_);
  }

 protected:
  // Width in bytes of every value
  int32_t byte_width_;
  // Concatenated value bytes
  BufferBuilder byte_builder_;

  void CheckValueSize(int64_t size);
};
|
||||
|
||||
/// @}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Chunked builders: build a sequence of BinaryArray or StringArray that are
|
||||
// limited to a particular size (to the upper limit of 2GB)
|
||||
|
||||
namespace internal {
|
||||
|
||||
class ARROW_EXPORT ChunkedBinaryBuilder {
|
||||
public:
|
||||
explicit ChunkedBinaryBuilder(int32_t max_chunk_value_length,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
ChunkedBinaryBuilder(int32_t max_chunk_value_length, int32_t max_chunk_length,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
virtual ~ChunkedBinaryBuilder() = default;
|
||||
|
||||
Status Append(const uint8_t* value, int32_t length) {
|
||||
if (ARROW_PREDICT_FALSE(length + builder_->value_data_length() >
|
||||
max_chunk_value_length_)) {
|
||||
if (builder_->value_data_length() == 0) {
|
||||
// The current item is larger than max_chunk_size_;
|
||||
// this chunk will be oversize and hold *only* this item
|
||||
ARROW_RETURN_NOT_OK(builder_->Append(value, length));
|
||||
return NextChunk();
|
||||
}
|
||||
// The current item would cause builder_->value_data_length() to exceed
|
||||
// max_chunk_size_, so finish this chunk and append the current item to the next
|
||||
// chunk
|
||||
ARROW_RETURN_NOT_OK(NextChunk());
|
||||
return Append(value, length);
|
||||
}
|
||||
|
||||
if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
|
||||
// The current item would cause builder_->length() to exceed max_chunk_length_, so
|
||||
// finish this chunk and append the current item to the next chunk
|
||||
ARROW_RETURN_NOT_OK(NextChunk());
|
||||
}
|
||||
|
||||
return builder_->Append(value, length);
|
||||
}
|
||||
|
||||
Status Append(std::string_view value) {
|
||||
return Append(reinterpret_cast<const uint8_t*>(value.data()),
|
||||
static_cast<int32_t>(value.size()));
|
||||
}
|
||||
|
||||
Status AppendNull() {
|
||||
if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
|
||||
ARROW_RETURN_NOT_OK(NextChunk());
|
||||
}
|
||||
return builder_->AppendNull();
|
||||
}
|
||||
|
||||
Status Reserve(int64_t values);
|
||||
|
||||
virtual Status Finish(ArrayVector* out);
|
||||
|
||||
protected:
|
||||
Status NextChunk();
|
||||
|
||||
// maximum total character data size per chunk
|
||||
int64_t max_chunk_value_length_;
|
||||
|
||||
// maximum elements allowed per chunk
|
||||
int64_t max_chunk_length_ = kListMaximumElements;
|
||||
|
||||
// when Reserve() would cause builder_ to exceed its max_chunk_length_,
|
||||
// add to extra_capacity_ instead and wait to reserve until the next chunk
|
||||
int64_t extra_capacity_ = 0;
|
||||
|
||||
std::unique_ptr<BinaryBuilder> builder_;
|
||||
std::vector<std::shared_ptr<Array>> chunks_;
|
||||
};
|
||||
|
||||
/// \brief ChunkedBinaryBuilder variant with its own Finish() override
///
/// Inherits the constructors and append methods of ChunkedBinaryBuilder.
class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder {
 public:
  using ChunkedBinaryBuilder::ChunkedBinaryBuilder;

  Status Finish(ArrayVector* out) override;
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,164 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/array/array_decimal.h"
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/array/builder_binary.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup numeric-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Builder for Decimal32 arrays, layered on FixedSizeBinaryBuilder
class ARROW_EXPORT Decimal32Builder : public FixedSizeBinaryBuilder {
 public:
  using TypeClass = Decimal32Type;
  using ValueType = Decimal32;

  explicit Decimal32Builder(const std::shared_ptr<DataType>& type,
                            MemoryPool* pool = default_memory_pool(),
                            int64_t alignment = kDefaultBufferAlignment);

  // Keep the inherited overloads visible alongside the Decimal32 ones below.
  using FixedSizeBinaryBuilder::Append;
  using FixedSizeBinaryBuilder::AppendValues;
  using FixedSizeBinaryBuilder::Reset;

  /// \brief Append a single Decimal32 value
  Status Append(Decimal32 val);
  /// \brief Append a single Decimal32 value without checking capacity
  void UnsafeAppend(Decimal32 val);
  void UnsafeAppend(std::string_view val);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<Decimal32Array>* out) {
    return FinishTyped(out);
  }

  std::shared_ptr<DataType> type() const override {
    return decimal_type_;
  }

 protected:
  // Type reported by type()
  std::shared_ptr<Decimal32Type> decimal_type_;
};
|
||||
|
||||
/// \brief Builder for Decimal64 arrays, layered on FixedSizeBinaryBuilder
class ARROW_EXPORT Decimal64Builder : public FixedSizeBinaryBuilder {
 public:
  using TypeClass = Decimal64Type;
  using ValueType = Decimal64;

  explicit Decimal64Builder(const std::shared_ptr<DataType>& type,
                            MemoryPool* pool = default_memory_pool(),
                            int64_t alignment = kDefaultBufferAlignment);

  // Keep the inherited overloads visible alongside the Decimal64 ones below.
  using FixedSizeBinaryBuilder::Append;
  using FixedSizeBinaryBuilder::AppendValues;
  using FixedSizeBinaryBuilder::Reset;

  /// \brief Append a single Decimal64 value
  Status Append(Decimal64 val);
  /// \brief Append a single Decimal64 value without checking capacity
  void UnsafeAppend(Decimal64 val);
  void UnsafeAppend(std::string_view val);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<Decimal64Array>* out) {
    return FinishTyped(out);
  }

  std::shared_ptr<DataType> type() const override {
    return decimal_type_;
  }

 protected:
  // Type reported by type()
  std::shared_ptr<Decimal64Type> decimal_type_;
};
|
||||
|
||||
/// \brief Builder for Decimal128 arrays, layered on FixedSizeBinaryBuilder
class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
 public:
  using TypeClass = Decimal128Type;
  using ValueType = Decimal128;

  explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
                             MemoryPool* pool = default_memory_pool(),
                             int64_t alignment = kDefaultBufferAlignment);

  // Keep the inherited overloads visible alongside the Decimal128 ones below.
  using FixedSizeBinaryBuilder::Append;
  using FixedSizeBinaryBuilder::AppendValues;
  using FixedSizeBinaryBuilder::Reset;

  /// \brief Append a single Decimal128 value
  Status Append(Decimal128 val);
  /// \brief Append a single Decimal128 value without checking capacity
  void UnsafeAppend(Decimal128 val);
  void UnsafeAppend(std::string_view val);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<Decimal128Array>* out) {
    return FinishTyped(out);
  }

  std::shared_ptr<DataType> type() const override {
    return decimal_type_;
  }

 protected:
  // Type reported by type()
  std::shared_ptr<Decimal128Type> decimal_type_;
};
|
||||
|
||||
/// \brief Builder for Decimal256 arrays, layered on FixedSizeBinaryBuilder
class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder {
 public:
  using TypeClass = Decimal256Type;
  using ValueType = Decimal256;

  explicit Decimal256Builder(const std::shared_ptr<DataType>& type,
                             MemoryPool* pool = default_memory_pool(),
                             int64_t alignment = kDefaultBufferAlignment);

  // Keep the inherited overloads visible alongside the Decimal256 ones below.
  using FixedSizeBinaryBuilder::Append;
  using FixedSizeBinaryBuilder::AppendValues;
  using FixedSizeBinaryBuilder::Reset;

  /// \brief Append a single Decimal256 value
  Status Append(const Decimal256& val);
  /// \brief Append a single Decimal256 value without checking capacity
  void UnsafeAppend(const Decimal256& val);
  void UnsafeAppend(std::string_view val);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<Decimal256Array>* out) {
    return FinishTyped(out);
  }

  std::shared_ptr<DataType> type() const override {
    return decimal_type_;
  }

 protected:
  // Type reported by type()
  std::shared_ptr<Decimal256Type> decimal_type_;
};
|
||||
|
||||
/// \brief Alias: the unqualified DecimalBuilder name refers to Decimal128Builder
using DecimalBuilder = Decimal128Builder;
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,728 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <type_traits>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/array_binary.h"
|
||||
#include "arrow/array/builder_adaptive.h" // IWYU pragma: export
|
||||
#include "arrow/array/builder_base.h" // IWYU pragma: export
|
||||
#include "arrow/array/builder_primitive.h" // IWYU pragma: export
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/array/util.h"
|
||||
#include "arrow/scalar.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_traits.h"
|
||||
#include "arrow/util/bit_block_counter.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/decimal.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Dictionary builder
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief Maps a value type T to the C++ value type and the physical type
/// used when looking values up in the dictionary memo table
///
/// Primary template: the value type is T::c_type and the physical type is T
/// itself. The second parameter exists only so specializations can be
/// selected with enable_if-style conditions.
template <typename T, typename Enabled = void>
struct DictionaryValue {
  using type = typename T::c_type;
  using PhysicalType = T;
};
|
||||
|
||||
template <typename T>
|
||||
struct DictionaryValue<T, enable_if_base_binary<T>> {
|
||||
using type = std::string_view;
|
||||
using PhysicalType =
|
||||
typename std::conditional<std::is_same<typename T::offset_type, int32_t>::value,
|
||||
BinaryType, LargeBinaryType>::type;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct DictionaryValue<T, enable_if_binary_view_like<T>> {
|
||||
using type = std::string_view;
|
||||
using PhysicalType = BinaryViewType;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct DictionaryValue<T, enable_if_fixed_size_binary<T>> {
|
||||
using type = std::string_view;
|
||||
using PhysicalType = BinaryType;
|
||||
};
|
||||
|
||||
class ARROW_EXPORT DictionaryMemoTable {
|
||||
public:
|
||||
DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr<DataType>& type);
|
||||
DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr<Array>& dictionary);
|
||||
~DictionaryMemoTable();
|
||||
|
||||
Status GetArrayData(int64_t start_offset, std::shared_ptr<ArrayData>* out);
|
||||
|
||||
/// \brief Insert new memo values
|
||||
Status InsertValues(const Array& values);
|
||||
|
||||
int32_t size() const;
|
||||
|
||||
template <typename T>
|
||||
Status GetOrInsert(typename DictionaryValue<T>::type value, int32_t* out) {
|
||||
// We want to keep the DictionaryMemoTable implementation private, also we can't
|
||||
// use extern template classes because of compiler issues (MinGW?). Instead,
|
||||
// we expose explicit function overrides for each supported physical type.
|
||||
const typename DictionaryValue<T>::PhysicalType* physical_type = NULLPTR;
|
||||
return GetOrInsert(physical_type, value, out);
|
||||
}
|
||||
|
||||
private:
|
||||
Status GetOrInsert(const BooleanType*, bool value, int32_t* out);
|
||||
Status GetOrInsert(const Int8Type*, int8_t value, int32_t* out);
|
||||
Status GetOrInsert(const Int16Type*, int16_t value, int32_t* out);
|
||||
Status GetOrInsert(const Int32Type*, int32_t value, int32_t* out);
|
||||
Status GetOrInsert(const Int64Type*, int64_t value, int32_t* out);
|
||||
Status GetOrInsert(const UInt8Type*, uint8_t value, int32_t* out);
|
||||
Status GetOrInsert(const UInt16Type*, uint16_t value, int32_t* out);
|
||||
Status GetOrInsert(const UInt32Type*, uint32_t value, int32_t* out);
|
||||
Status GetOrInsert(const UInt64Type*, uint64_t value, int32_t* out);
|
||||
Status GetOrInsert(const DurationType*, int64_t value, int32_t* out);
|
||||
Status GetOrInsert(const TimestampType*, int64_t value, int32_t* out);
|
||||
Status GetOrInsert(const Date32Type*, int32_t value, int32_t* out);
|
||||
Status GetOrInsert(const Date64Type*, int64_t value, int32_t* out);
|
||||
Status GetOrInsert(const Time32Type*, int32_t value, int32_t* out);
|
||||
Status GetOrInsert(const Time64Type*, int64_t value, int32_t* out);
|
||||
Status GetOrInsert(const MonthDayNanoIntervalType*,
|
||||
MonthDayNanoIntervalType::MonthDayNanos value, int32_t* out);
|
||||
Status GetOrInsert(const DayTimeIntervalType*,
|
||||
DayTimeIntervalType::DayMilliseconds value, int32_t* out);
|
||||
Status GetOrInsert(const MonthIntervalType*, int32_t value, int32_t* out);
|
||||
Status GetOrInsert(const FloatType*, float value, int32_t* out);
|
||||
Status GetOrInsert(const DoubleType*, double value, int32_t* out);
|
||||
|
||||
Status GetOrInsert(const BinaryType*, std::string_view value, int32_t* out);
|
||||
Status GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out);
|
||||
Status GetOrInsert(const BinaryViewType*, std::string_view value, int32_t* out);
|
||||
|
||||
class DictionaryMemoTableImpl;
|
||||
std::unique_ptr<DictionaryMemoTableImpl> impl_;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \addtogroup dictionary-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief Array builder for creating an encoded DictionaryArray from a
|
||||
/// dense array
|
||||
///
|
||||
/// Unlike other builders, dictionary builder does not completely
|
||||
/// reset the state on Finish calls.
|
||||
template <typename BuilderType, typename T>
|
||||
class DictionaryBuilderBase : public ArrayBuilder {
|
||||
public:
|
||||
using TypeClass = DictionaryType;
|
||||
using Value = typename DictionaryValue<T>::type;
|
||||
|
||||
// WARNING: the type given below is the value type, not the DictionaryType.
|
||||
// The DictionaryType is instantiated on the Finish() call.
|
||||
template <typename B = BuilderType, typename T1 = T>
|
||||
DictionaryBuilderBase(uint8_t start_int_size,
|
||||
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value &&
|
||||
!is_fixed_size_binary_type<T1>::value,
|
||||
const std::shared_ptr<DataType>&>
|
||||
value_type,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
|
||||
delta_offset_(0),
|
||||
byte_width_(-1),
|
||||
indices_builder_(start_int_size, pool, alignment),
|
||||
value_type_(value_type) {}
|
||||
|
||||
template <typename T1 = T>
|
||||
explicit DictionaryBuilderBase(
|
||||
enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
|
||||
value_type,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
|
||||
delta_offset_(0),
|
||||
byte_width_(-1),
|
||||
indices_builder_(pool, alignment),
|
||||
value_type_(value_type) {}
|
||||
|
||||
template <typename T1 = T>
|
||||
explicit DictionaryBuilderBase(
|
||||
const std::shared_ptr<DataType>& index_type,
|
||||
enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
|
||||
value_type,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
|
||||
delta_offset_(0),
|
||||
byte_width_(-1),
|
||||
indices_builder_(index_type, pool, alignment),
|
||||
value_type_(value_type) {}
|
||||
|
||||
template <typename B = BuilderType, typename T1 = T>
|
||||
DictionaryBuilderBase(uint8_t start_int_size,
|
||||
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value &&
|
||||
is_fixed_size_binary_type<T1>::value,
|
||||
const std::shared_ptr<DataType>&>
|
||||
value_type,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
|
||||
delta_offset_(0),
|
||||
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
|
||||
indices_builder_(start_int_size, pool, alignment),
|
||||
value_type_(value_type) {}
|
||||
|
||||
template <typename T1 = T>
|
||||
explicit DictionaryBuilderBase(
|
||||
enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
|
||||
delta_offset_(0),
|
||||
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
|
||||
indices_builder_(pool, alignment),
|
||||
value_type_(value_type) {}
|
||||
|
||||
template <typename T1 = T>
|
||||
explicit DictionaryBuilderBase(
|
||||
const std::shared_ptr<DataType>& index_type,
|
||||
enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
|
||||
delta_offset_(0),
|
||||
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
|
||||
indices_builder_(index_type, pool, alignment),
|
||||
value_type_(value_type) {}
|
||||
|
||||
template <typename T1 = T>
|
||||
explicit DictionaryBuilderBase(
|
||||
enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool())
|
||||
: DictionaryBuilderBase<BuilderType, T1>(TypeTraits<T1>::type_singleton(), pool) {}
|
||||
|
||||
// This constructor doesn't check for errors. Use InsertMemoValues instead.
|
||||
explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, dictionary)),
|
||||
delta_offset_(0),
|
||||
byte_width_(-1),
|
||||
indices_builder_(pool, alignment),
|
||||
value_type_(dictionary->type()) {}
|
||||
|
||||
/// \brief Defaulted destructor, overriding the base class destructor.
~DictionaryBuilderBase() override = default;
|
||||
|
||||
/// \brief The current number of entries in the dictionary
|
||||
int64_t dictionary_length() const { return memo_table_->size(); }
|
||||
|
||||
/// \brief The value byte width (for FixedSizeBinaryType)
|
||||
template <typename T1 = T>
|
||||
enable_if_fixed_size_binary<T1, int32_t> byte_width() const {
|
||||
return byte_width_;
|
||||
}
|
||||
|
||||
/// \brief Append a scalar value
|
||||
Status Append(Value value) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
|
||||
int32_t memo_index;
|
||||
ARROW_RETURN_NOT_OK(memo_table_->GetOrInsert<T>(value, &memo_index));
|
||||
ARROW_RETURN_NOT_OK(indices_builder_.Append(memo_index));
|
||||
length_ += 1;
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a fixed-width string (only for FixedSizeBinaryType)
|
||||
template <typename T1 = T>
|
||||
enable_if_fixed_size_binary<T1, Status> Append(const uint8_t* value) {
|
||||
return Append(std::string_view(reinterpret_cast<const char*>(value), byte_width_));
|
||||
}
|
||||
|
||||
/// \brief Append a fixed-width string (only for FixedSizeBinaryType)
|
||||
template <typename T1 = T>
|
||||
enable_if_fixed_size_binary<T1, Status> Append(const char* value) {
|
||||
return Append(std::string_view(value, byte_width_));
|
||||
}
|
||||
|
||||
/// \brief Append a string (only for binary types)
|
||||
template <typename T1 = T>
|
||||
enable_if_binary_like<T1, Status> Append(const uint8_t* value, int32_t length) {
|
||||
return Append(reinterpret_cast<const char*>(value), length);
|
||||
}
|
||||
|
||||
/// \brief Append a string (only for binary types)
|
||||
template <typename T1 = T>
|
||||
enable_if_binary_like<T1, Status> Append(const char* value, int32_t length) {
|
||||
return Append(std::string_view(value, length));
|
||||
}
|
||||
|
||||
/// \brief Append a string (only for string types)
|
||||
template <typename T1 = T>
|
||||
enable_if_string_like<T1, Status> Append(const char* value, int32_t length) {
|
||||
return Append(std::string_view(value, length));
|
||||
}
|
||||
|
||||
/// \brief Append a decimal (only for Decimal32/64/128/256 Type)
|
||||
template <typename T1 = T, typename CType = typename TypeTraits<T1>::CType>
|
||||
enable_if_decimal<T1, Status> Append(const CType& value) {
|
||||
auto bytes = value.ToBytes();
|
||||
return Append(bytes.data(), static_cast<int32_t>(bytes.size()));
|
||||
}
|
||||
|
||||
/// \brief Append a scalar null value
|
||||
Status AppendNull() final {
|
||||
length_ += 1;
|
||||
null_count_ += 1;
|
||||
|
||||
return indices_builder_.AppendNull();
|
||||
}
|
||||
|
||||
/// \brief Append `length` null values
///
/// The nulls are appended to the indices builder first; length_ and
/// null_count_ are only advanced when that succeeds, so the counters cannot
/// run ahead of the stored indices on failure.
///
/// \param length number of nulls to append
/// \return the status of the underlying index append
Status AppendNulls(int64_t length) final {
  ARROW_RETURN_NOT_OK(indices_builder_.AppendNulls(length));
  length_ += length;
  null_count_ += length;
  return Status::OK();
}
|
||||
|
||||
/// \brief Append one empty (non-null) slot to the indices
///
/// length_ is only advanced after the indices builder accepted the slot, so a
/// failed append does not leave the counter ahead of the stored indices.
///
/// \return the status of the underlying index append
Status AppendEmptyValue() final {
  ARROW_RETURN_NOT_OK(indices_builder_.AppendEmptyValue());
  length_ += 1;
  return Status::OK();
}
|
||||
|
||||
/// \brief Append `length` empty (non-null) slots to the indices
///
/// length_ is only advanced after the indices builder accepted the slots, so
/// a failed append does not leave the counter ahead of the stored indices.
///
/// \param length number of empty slots to append
/// \return the status of the underlying index append
Status AppendEmptyValues(int64_t length) final {
  ARROW_RETURN_NOT_OK(indices_builder_.AppendEmptyValues(length));
  length_ += length;
  return Status::OK();
}
|
||||
|
||||
/// \brief Append a dictionary scalar `n_repeats` times
///
/// An invalid scalar is appended as `n_repeats` nulls. Otherwise the scalar's
/// dictionary is viewed as this builder's value array type, space is
/// reserved, and the call is dispatched on the scalar's index type to
/// AppendScalarImpl.
///
/// \param scalar the scalar to append; its type is cast to DictionaryType
/// \param n_repeats how many times to append it
/// \return TypeError when the index type is not one of the 8 integer types
Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override {
  if (!scalar.is_valid) {
    return AppendNulls(n_repeats);
  }

  const auto& dict_type = internal::checked_cast<const DictionaryType&>(*scalar.type);
  const auto& typed_scalar = internal::checked_cast<const DictionaryScalar&>(scalar);
  const auto& dict_values =
      internal::checked_cast<const typename TypeTraits<T>::ArrayType&>(
          *typed_scalar.value.dictionary);
  ARROW_RETURN_NOT_OK(Reserve(n_repeats));

  // The index is only dereferenced inside a matching case.
  const auto& index_holder = typed_scalar.value.index;
  switch (dict_type.index_type()->id()) {
    case Type::UINT8:
      return AppendScalarImpl<UInt8Type>(dict_values, *index_holder, n_repeats);
    case Type::INT8:
      return AppendScalarImpl<Int8Type>(dict_values, *index_holder, n_repeats);
    case Type::UINT16:
      return AppendScalarImpl<UInt16Type>(dict_values, *index_holder, n_repeats);
    case Type::INT16:
      return AppendScalarImpl<Int16Type>(dict_values, *index_holder, n_repeats);
    case Type::UINT32:
      return AppendScalarImpl<UInt32Type>(dict_values, *index_holder, n_repeats);
    case Type::INT32:
      return AppendScalarImpl<Int32Type>(dict_values, *index_holder, n_repeats);
    case Type::UINT64:
      return AppendScalarImpl<UInt64Type>(dict_values, *index_holder, n_repeats);
    case Type::INT64:
      return AppendScalarImpl<Int64Type>(dict_values, *index_holder, n_repeats);
    default:
      break;
  }
  return Status::TypeError("Invalid index type: ", dict_type);
}
|
||||
|
||||
/// \brief Append each scalar of `scalars` once, in order
///
/// Stops at, and returns, the first non-OK status from AppendScalar.
///
/// \param scalars the scalars to append
Status AppendScalars(const ScalarVector& scalars) override {
  for (auto it = scalars.begin(); it != scalars.end(); ++it) {
    ARROW_RETURN_NOT_OK(AppendScalar(**it, /*n_repeats=*/1));
  }
  return Status::OK();
}
|
||||
|
||||
/// \brief Append a slice of a dictionary-encoded array
///
/// The indices of the slice are visited and the dictionary values they refer
/// to are appended to this builder. The call is dispatched on the array's
/// index type to AppendArraySliceImpl.
///
/// \param array dictionary-encoded span; its type is cast to DictionaryType
/// \param offset first row of the slice
/// \param length number of rows in the slice
/// \return TypeError when the index type is not one of the 8 integer types
Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final {
  const auto& dict_type = internal::checked_cast<const DictionaryType&>(*array.type);
  // See if possible to avoid using ToArrayData here
  const typename TypeTraits<T>::ArrayType dict_values(
      array.dictionary().ToArrayData());
  ARROW_RETURN_NOT_OK(Reserve(length));

  switch (dict_type.index_type()->id()) {
    case Type::UINT8:
      return AppendArraySliceImpl<uint8_t>(dict_values, array, offset, length);
    case Type::INT8:
      return AppendArraySliceImpl<int8_t>(dict_values, array, offset, length);
    case Type::UINT16:
      return AppendArraySliceImpl<uint16_t>(dict_values, array, offset, length);
    case Type::INT16:
      return AppendArraySliceImpl<int16_t>(dict_values, array, offset, length);
    case Type::UINT32:
      return AppendArraySliceImpl<uint32_t>(dict_values, array, offset, length);
    case Type::INT32:
      return AppendArraySliceImpl<int32_t>(dict_values, array, offset, length);
    case Type::UINT64:
      return AppendArraySliceImpl<uint64_t>(dict_values, array, offset, length);
    case Type::INT64:
      return AppendArraySliceImpl<int64_t>(dict_values, array, offset, length);
    default:
      break;
  }
  return Status::TypeError("Invalid index type: ", dict_type);
}
|
||||
|
||||
/// \brief Insert values into the dictionary's memo, but do not append any
|
||||
/// indices. Can be used to initialize a new builder with known dictionary
|
||||
/// values
|
||||
/// \param[in] values dictionary values to add to memo. Type must match
|
||||
/// builder type
|
||||
Status InsertMemoValues(const Array& values) {
|
||||
return memo_table_->InsertValues(values);
|
||||
}
|
||||
|
||||
/// \brief Append a whole dense array to the builder
|
||||
template <typename T1 = T>
|
||||
enable_if_t<!is_fixed_size_binary_type<T1>::value, Status> AppendArray(
|
||||
const Array& array) {
|
||||
using ArrayType = typename TypeTraits<T>::ArrayType;
|
||||
|
||||
#ifndef NDEBUG
|
||||
ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
|
||||
value_type_, array, "Wrong value type of array to be appended"));
|
||||
#endif
|
||||
|
||||
const auto& concrete_array = static_cast<const ArrayType&>(array);
|
||||
for (int64_t i = 0; i < array.length(); i++) {
|
||||
if (array.IsNull(i)) {
|
||||
ARROW_RETURN_NOT_OK(AppendNull());
|
||||
} else {
|
||||
ARROW_RETURN_NOT_OK(Append(concrete_array.GetView(i)));
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename T1 = T>
|
||||
enable_if_fixed_size_binary<T1, Status> AppendArray(const Array& array) {
|
||||
#ifndef NDEBUG
|
||||
ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
|
||||
value_type_, array, "Wrong value type of array to be appended"));
|
||||
#endif
|
||||
|
||||
const auto& concrete_array = static_cast<const FixedSizeBinaryArray&>(array);
|
||||
for (int64_t i = 0; i < array.length(); i++) {
|
||||
if (array.IsNull(i)) {
|
||||
ARROW_RETURN_NOT_OK(AppendNull());
|
||||
} else {
|
||||
ARROW_RETURN_NOT_OK(Append(concrete_array.GetValue(i)));
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void Reset() override {
|
||||
// Perform a partial reset. Call ResetFull to also reset the accumulated
|
||||
// dictionary values
|
||||
ArrayBuilder::Reset();
|
||||
indices_builder_.Reset();
|
||||
}
|
||||
|
||||
/// \brief Reset and also clear accumulated dictionary values in memo table
|
||||
void ResetFull() {
|
||||
Reset();
|
||||
memo_table_.reset(new internal::DictionaryMemoTable(pool_, value_type_));
|
||||
}
|
||||
|
||||
/// \brief Resize the indices builder and mirror its capacity
///
/// The requested capacity is validated with CheckCapacity and raised to at
/// least kMinBuilderCapacity before being applied to the indices builder.
///
/// \param capacity requested capacity
Status Resize(int64_t capacity) override {
  ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
  const int64_t target_capacity = std::max(capacity, kMinBuilderCapacity);
  ARROW_RETURN_NOT_OK(indices_builder_.Resize(target_capacity));
  // Report whatever capacity the indices builder actually ended up with.
  capacity_ = indices_builder_.capacity();
  return Status::OK();
}
|
||||
|
||||
/// \brief Return dictionary indices and a delta dictionary since the last
|
||||
/// time that Finish or FinishDelta were called, and reset state of builder
|
||||
/// (except the memo table)
|
||||
Status FinishDelta(std::shared_ptr<Array>* out_indices,
|
||||
std::shared_ptr<Array>* out_delta) {
|
||||
std::shared_ptr<ArrayData> indices_data;
|
||||
std::shared_ptr<ArrayData> delta_data;
|
||||
ARROW_RETURN_NOT_OK(FinishWithDictOffset(delta_offset_, &indices_data, &delta_data));
|
||||
*out_indices = MakeArray(indices_data);
|
||||
*out_delta = MakeArray(delta_data);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
/// \brief Finish into a DictionaryArray by forwarding to FinishTyped
///
/// \param[out] out receives the finished dictionary array
Status Finish(std::shared_ptr<DictionaryArray>* out) {
  return FinishTyped(out);
}
|
||||
|
||||
std::shared_ptr<DataType> type() const override {
|
||||
return ::arrow::dictionary(indices_builder_.type(), value_type_);
|
||||
}
|
||||
|
||||
protected:
|
||||
template <typename c_type>
|
||||
Status AppendArraySliceImpl(const typename TypeTraits<T>::ArrayType& dict,
|
||||
const ArraySpan& array, int64_t offset, int64_t length) {
|
||||
const c_type* values = array.GetValues<c_type>(1) + offset;
|
||||
return VisitBitBlocks(
|
||||
array.buffers[0].data, array.offset + offset, length,
|
||||
[&](const int64_t position) {
|
||||
const int64_t index = static_cast<int64_t>(values[position]);
|
||||
if (dict.IsValid(index)) {
|
||||
return Append(dict.GetView(index));
|
||||
}
|
||||
return AppendNull();
|
||||
},
|
||||
[&]() { return AppendNull(); });
|
||||
}
|
||||
|
||||
template <typename IndexType>
|
||||
Status AppendScalarImpl(const typename TypeTraits<T>::ArrayType& dict,
|
||||
const Scalar& index_scalar, int64_t n_repeats) {
|
||||
using ScalarType = typename TypeTraits<IndexType>::ScalarType;
|
||||
const auto index = internal::checked_cast<const ScalarType&>(index_scalar).value;
|
||||
if (index_scalar.is_valid && dict.IsValid(index)) {
|
||||
const auto& value = dict.GetView(index);
|
||||
for (int64_t i = 0; i < n_repeats; i++) {
|
||||
ARROW_RETURN_NOT_OK(Append(value));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
return AppendNulls(n_repeats);
|
||||
}
|
||||
|
||||
/// \brief Finish into ArrayData carrying the whole dictionary
///
/// Finishes with a dictionary offset of 0, then stamps the resulting indices
/// data with this builder's dictionary type and attaches the dictionary.
///
/// \param[out] out receives the finished array data
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
  std::shared_ptr<ArrayData> dict_data;
  ARROW_RETURN_NOT_OK(FinishWithDictOffset(/*offset=*/0, out, &dict_data));

  // Set type of array data to the right dictionary type
  auto& finished = *out;
  finished->type = type();
  finished->dictionary = dict_data;
  return Status::OK();
}
|
||||
|
||||
/// \brief Finish the indices and extract dictionary data from the memo table
///
/// After a successful call delta_offset_ holds the current memo size and the
/// base builder state has been reset.
///
/// \param dict_offset offset passed to the memo table's GetArrayData
/// \param[out] out_indices receives the finished indices data
/// \param[out] out_dictionary receives the dictionary data
Status FinishWithDictOffset(int64_t dict_offset,
                            std::shared_ptr<ArrayData>* out_indices,
                            std::shared_ptr<ArrayData>* out_dictionary) {
  // Step 1: finalize the indices array.
  ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out_indices));

  // Step 2: generate the dictionary array from the hash table contents and
  // remember how far the memo table had grown.
  ARROW_RETURN_NOT_OK(memo_table_->GetArrayData(dict_offset, out_dictionary));
  delta_offset_ = memo_table_->size();

  // Step 3: update internals for further uses of this DictionaryBuilder.
  ArrayBuilder::Reset();
  return Status::OK();
}
|
||||
|
||||
std::unique_ptr<DictionaryMemoTable> memo_table_;
|
||||
|
||||
// The size of the dictionary memo at last invocation of Finish, to use in
|
||||
// FinishDelta for computing dictionary deltas
|
||||
int32_t delta_offset_;
|
||||
|
||||
// Only used for FixedSizeBinaryType
|
||||
int32_t byte_width_;
|
||||
|
||||
BuilderType indices_builder_;
|
||||
std::shared_ptr<DataType> value_type_;
|
||||
};
|
||||
|
||||
template <typename BuilderType>
|
||||
class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {
|
||||
public:
|
||||
template <typename B = BuilderType>
|
||||
DictionaryBuilderBase(
|
||||
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
|
||||
start_int_size,
|
||||
const std::shared_ptr<DataType>& value_type,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}
|
||||
|
||||
explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& value_type,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), indices_builder_(pool) {}
|
||||
|
||||
explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& index_type,
|
||||
const std::shared_ptr<DataType>& value_type,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), indices_builder_(index_type, pool) {}
|
||||
|
||||
template <typename B = BuilderType>
|
||||
explicit DictionaryBuilderBase(
|
||||
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
|
||||
start_int_size,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}
|
||||
|
||||
explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), indices_builder_(pool) {}
|
||||
|
||||
explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), indices_builder_(pool) {}
|
||||
|
||||
/// \brief Append a scalar null value
|
||||
Status AppendNull() final {
|
||||
length_ += 1;
|
||||
null_count_ += 1;
|
||||
|
||||
return indices_builder_.AppendNull();
|
||||
}
|
||||
|
||||
Status AppendNulls(int64_t length) final {
|
||||
length_ += length;
|
||||
null_count_ += length;
|
||||
|
||||
return indices_builder_.AppendNulls(length);
|
||||
}
|
||||
|
||||
Status AppendEmptyValue() final {
|
||||
length_ += 1;
|
||||
|
||||
return indices_builder_.AppendEmptyValue();
|
||||
}
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
length_ += length;
|
||||
|
||||
return indices_builder_.AppendEmptyValues(length);
|
||||
}
|
||||
|
||||
/// \brief Append a whole dense array to the builder
|
||||
Status AppendArray(const Array& array) {
|
||||
#ifndef NDEBUG
|
||||
ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
|
||||
Type::NA, array, "Wrong value type of array to be appended"));
|
||||
#endif
|
||||
for (int64_t i = 0; i < array.length(); i++) {
|
||||
ARROW_RETURN_NOT_OK(AppendNull());
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Resize(int64_t capacity) override {
|
||||
ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
|
||||
capacity = std::max(capacity, kMinBuilderCapacity);
|
||||
|
||||
ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity));
|
||||
capacity_ = indices_builder_.capacity();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
|
||||
ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out));
|
||||
(*out)->type = dictionary((*out)->type, null());
|
||||
(*out)->dictionary = NullArray(0).data();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override {
|
||||
return ::arrow::dictionary(indices_builder_.type(), null());
|
||||
}
|
||||
|
||||
protected:
|
||||
BuilderType indices_builder_;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \brief A DictionaryArray builder that uses AdaptiveIntBuilder to return the
|
||||
/// smallest index size that can accommodate the dictionary indices
|
||||
template <typename T>
|
||||
class DictionaryBuilder : public internal::DictionaryBuilderBase<AdaptiveIntBuilder, T> {
|
||||
public:
|
||||
using BASE = internal::DictionaryBuilderBase<AdaptiveIntBuilder, T>;
|
||||
using BASE::BASE;
|
||||
|
||||
/// \brief Append dictionary indices directly without modifying memo
|
||||
///
|
||||
/// NOTE: Experimental API
|
||||
Status AppendIndices(const int64_t* values, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR) {
|
||||
int64_t null_count_before = this->indices_builder_.null_count();
|
||||
ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes));
|
||||
this->capacity_ = this->indices_builder_.capacity();
|
||||
this->length_ += length;
|
||||
this->null_count_ += this->indices_builder_.null_count() - null_count_before;
|
||||
return Status::OK();
|
||||
}
|
||||
};
|
||||
|
||||
/// \brief A DictionaryArray builder that always returns int32 dictionary
|
||||
/// indices so that data cast to dictionary form will have a consistent index
|
||||
/// type, e.g. for creating a ChunkedArray
|
||||
template <typename T>
|
||||
class Dictionary32Builder : public internal::DictionaryBuilderBase<Int32Builder, T> {
|
||||
public:
|
||||
using BASE = internal::DictionaryBuilderBase<Int32Builder, T>;
|
||||
using BASE::BASE;
|
||||
|
||||
/// \brief Append dictionary indices directly without modifying memo
|
||||
///
|
||||
/// NOTE: Experimental API
|
||||
Status AppendIndices(const int32_t* values, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR) {
|
||||
int64_t null_count_before = this->indices_builder_.null_count();
|
||||
ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes));
|
||||
this->capacity_ = this->indices_builder_.capacity();
|
||||
this->length_ += length;
|
||||
this->null_count_ += this->indices_builder_.null_count() - null_count_before;
|
||||
return Status::OK();
|
||||
}
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Binary / Unicode builders
|
||||
// (compatibility aliases; those used to be derived classes with additional
|
||||
// Append() overloads, but they have been folded into DictionaryBuilderBase)
|
||||
|
||||
using BinaryDictionaryBuilder = DictionaryBuilder<BinaryType>;
|
||||
using StringDictionaryBuilder = DictionaryBuilder<StringType>;
|
||||
using BinaryDictionary32Builder = Dictionary32Builder<BinaryType>;
|
||||
using StringDictionary32Builder = Dictionary32Builder<StringType>;
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,836 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_nested.h"
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/buffer_builder.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup nested-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// VarLengthListLikeBuilder
|
||||
|
||||
template <typename TYPE>
|
||||
class VarLengthListLikeBuilder : public ArrayBuilder {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename TypeClass::offset_type;
|
||||
|
||||
/// Use this constructor to incrementally build the value array along with offsets and
|
||||
/// null bitmap.
|
||||
VarLengthListLikeBuilder(MemoryPool* pool,
|
||||
const std::shared_ptr<ArrayBuilder>& value_builder,
|
||||
const std::shared_ptr<DataType>& type,
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
offsets_builder_(pool, alignment),
|
||||
value_builder_(value_builder),
|
||||
value_field_(type->field(0)->WithType(NULLPTR)) {}
|
||||
|
||||
VarLengthListLikeBuilder(MemoryPool* pool,
|
||||
const std::shared_ptr<ArrayBuilder>& value_builder,
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: VarLengthListLikeBuilder(pool, value_builder,
|
||||
std::make_shared<TYPE>(value_builder->type()),
|
||||
alignment) {}
|
||||
|
||||
~VarLengthListLikeBuilder() override = default;
|
||||
|
||||
Status Resize(int64_t capacity) override {
|
||||
if (ARROW_PREDICT_FALSE(capacity > maximum_elements())) {
|
||||
return Status::CapacityError(type_name(),
|
||||
" array cannot reserve space for more than ",
|
||||
maximum_elements(), " got ", capacity);
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
|
||||
|
||||
// One more than requested for list offsets
|
||||
const int64_t offsets_capacity =
|
||||
is_list_view(TYPE::type_id) ? capacity : capacity + 1;
|
||||
ARROW_RETURN_NOT_OK(offsets_builder_.Resize(offsets_capacity));
|
||||
return ArrayBuilder::Resize(capacity);
|
||||
}
|
||||
|
||||
void Reset() override {
|
||||
ArrayBuilder::Reset();
|
||||
offsets_builder_.Reset();
|
||||
value_builder_->Reset();
|
||||
}
|
||||
|
||||
/// \brief Start a new variable-length list slot
|
||||
///
|
||||
/// This function should be called before appending elements to the
|
||||
/// value builder. Elements appended to the value builder before this function
|
||||
/// is called for the first time, will not be members of any list value.
|
||||
///
|
||||
/// After this function is called, list_length elements SHOULD be appended to
|
||||
/// the values builder. If this contract is violated, the behavior is defined by
|
||||
/// the concrete builder implementation and SHOULD NOT be relied upon unless
|
||||
/// the caller is specifically building a [Large]List or [Large]ListView array.
|
||||
///
|
||||
/// For [Large]List arrays, the list slot length will be the number of elements
|
||||
/// appended to the values builder before the next call to Append* or Finish. For
|
||||
/// [Large]ListView arrays, the list slot length will be exactly list_length, but if
|
||||
/// Append* is called before at least list_length elements are appended to the values
|
||||
/// builder, the current list slot will share elements with the next list
|
||||
/// slots or an invalid [Large]ListView array will be generated because there
|
||||
/// aren't enough elements in the values builder to fill the list slots.
|
||||
///
|
||||
/// If you're building a [Large]List and don't need to be compatible
|
||||
/// with [Large]ListView, then `BaseListBuilder::Append(bool is_valid)`
|
||||
/// is a simpler API.
|
||||
///
|
||||
/// \pre if is_valid is false, list_length MUST be 0
|
||||
/// \param is_valid Whether the new list slot is valid
|
||||
/// \param list_length The number of elements in the list
|
||||
Status Append(bool is_valid, int64_t list_length) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
assert(is_valid || list_length == 0);
|
||||
UnsafeAppendToBitmap(is_valid);
|
||||
UnsafeAppendDimensions(/*offset=*/value_builder_->length(), /*size=*/list_length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendNull() final {
|
||||
// Append() a null list slot with list_length=0.
|
||||
//
|
||||
// When building [Large]List arrays, elements being appended to the values builder
|
||||
// before the next call to Append* or Finish will extend the list slot length, but
|
||||
// that is totally fine because list arrays admit non-empty null list slots.
|
||||
//
|
||||
// In the case of [Large]ListViews that's not a problem either because the
|
||||
// list slot length remains zero.
|
||||
return Append(false, 0);
|
||||
}
|
||||
|
||||
Status AppendNulls(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
UnsafeAppendToBitmap(length, false);
|
||||
UnsafeAppendEmptyDimensions(/*num_values=*/length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append an empty list slot
|
||||
///
|
||||
/// \post Another call to Append* or Finish should be made before appending to
|
||||
/// the values builder to ensure list slot remains empty
|
||||
Status AppendEmptyValue() final { return Append(true, 0); }
|
||||
|
||||
/// \brief Append an empty list slot
|
||||
///
|
||||
/// \post Another call to Append* or Finish should be made before appending to
|
||||
/// the values builder to ensure the last list slot remains empty
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
UnsafeAppendToBitmap(length, true);
|
||||
UnsafeAppendEmptyDimensions(/*num_values=*/length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Vector append
|
||||
///
|
||||
/// For list-array builders, the sizes are inferred from the offsets.
|
||||
/// BaseListBuilder<T> provides an implementation that doesn't take sizes, but
|
||||
/// this virtual function allows dispatching calls to both list-array and
|
||||
/// list-view-array builders (which need the sizes)
|
||||
///
|
||||
/// \param offsets The offsets of the variable-length lists
|
||||
/// \param sizes The sizes of the variable-length lists
|
||||
/// \param length The number of offsets, sizes, and validity bits to append
|
||||
/// \param valid_bytes If passed, valid_bytes is of equal length to values,
|
||||
/// and any zero byte will be considered as a null for that slot
|
||||
virtual Status AppendValues(const offset_type* offsets, const offset_type* sizes,
|
||||
int64_t length, const uint8_t* valid_bytes) = 0;
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length) override {
|
||||
const offset_type* offsets = array.GetValues<offset_type>(1);
|
||||
[[maybe_unused]] const offset_type* sizes = NULLPTR;
|
||||
if constexpr (is_list_view(TYPE::type_id)) {
|
||||
sizes = array.GetValues<offset_type>(2);
|
||||
}
|
||||
static_assert(internal::may_have_validity_bitmap(TYPE::type_id));
|
||||
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
for (int64_t row = offset; row < offset + length; row++) {
|
||||
const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row);
|
||||
int64_t size = 0;
|
||||
if (is_valid) {
|
||||
if constexpr (is_list_view(TYPE::type_id)) {
|
||||
size = sizes[row];
|
||||
} else {
|
||||
size = offsets[row + 1] - offsets[row];
|
||||
}
|
||||
}
|
||||
UnsafeAppendToBitmap(is_valid);
|
||||
UnsafeAppendDimensions(/*offset=*/value_builder_->length(), size);
|
||||
if (is_valid) {
|
||||
ARROW_RETURN_NOT_OK(
|
||||
value_builder_->AppendArraySlice(array.child_data[0], offsets[row], size));
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status ValidateOverflow(int64_t new_elements) const {
|
||||
auto new_length = value_builder_->length() + new_elements;
|
||||
if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) {
|
||||
return Status::CapacityError(type_name(), " array cannot contain more than ",
|
||||
maximum_elements(), " elements, have ", new_elements);
|
||||
} else {
|
||||
return Status::OK();
|
||||
}
|
||||
}
|
||||
|
||||
ArrayBuilder* value_builder() const { return value_builder_.get(); }
|
||||
|
||||
// Cannot make this a static attribute because of linking issues
|
||||
static constexpr int64_t maximum_elements() {
|
||||
return std::numeric_limits<offset_type>::max() - 1;
|
||||
}
|
||||
|
||||
std::shared_ptr<DataType> type() const override {
|
||||
return std::make_shared<TYPE>(value_field_->WithType(value_builder_->type()));
|
||||
}
|
||||
|
||||
private:
|
||||
static constexpr const char* type_name() {
|
||||
if constexpr (is_list_view(TYPE::type_id)) {
|
||||
return "ListView";
|
||||
} else {
|
||||
return "List";
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
/// \brief Append dimensions for num_values empty list slots.
|
||||
///
|
||||
/// ListViewBuilder overrides this to also append the sizes.
|
||||
virtual void UnsafeAppendEmptyDimensions(int64_t num_values) {
|
||||
const int64_t offset = value_builder_->length();
|
||||
for (int64_t i = 0; i < num_values; ++i) {
|
||||
offsets_builder_.UnsafeAppend(static_cast<offset_type>(offset));
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Append dimensions for a single list slot.
|
||||
///
|
||||
/// ListViewBuilder overrides this to also append the size.
|
||||
virtual void UnsafeAppendDimensions(int64_t offset, int64_t ARROW_ARG_UNUSED(size)) {
|
||||
offsets_builder_.UnsafeAppend(static_cast<offset_type>(offset));
|
||||
}
|
||||
|
||||
TypedBufferBuilder<offset_type> offsets_builder_;
|
||||
std::shared_ptr<ArrayBuilder> value_builder_;
|
||||
std::shared_ptr<Field> value_field_;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// ListBuilder / LargeListBuilder
|
||||
|
||||
template <typename TYPE>
|
||||
class BaseListBuilder : public VarLengthListLikeBuilder<TYPE> {
|
||||
private:
|
||||
using BASE = VarLengthListLikeBuilder<TYPE>;
|
||||
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename BASE::offset_type;
|
||||
|
||||
using BASE::BASE;
|
||||
|
||||
using BASE::Append;
|
||||
|
||||
/// \brief Defaulted destructor, overriding the base class destructor.
~BaseListBuilder() override = default;
|
||||
|
||||
/// \brief Start a new variable-length list slot
|
||||
///
|
||||
/// This function should be called before beginning to append elements to the
|
||||
/// value builder
|
||||
Status Append(bool is_valid = true) {
|
||||
// The value_length parameter to BASE::Append(bool, int64_t) is ignored when
|
||||
// building a list array, so we can pass 0 here.
|
||||
return BASE::Append(is_valid, 0);
|
||||
}
|
||||
|
||||
/// \brief Vector append
|
||||
///
|
||||
/// If passed, valid_bytes is of equal length to values, and any zero byte
|
||||
/// will be considered as a null for that slot
|
||||
Status AppendValues(const offset_type* offsets, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR) {
|
||||
ARROW_RETURN_NOT_OK(this->Reserve(length));
|
||||
this->UnsafeAppendToBitmap(valid_bytes, length);
|
||||
this->offsets_builder_.UnsafeAppend(offsets, length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendValues(const offset_type* offsets, const offset_type* sizes,
|
||||
int64_t length, const uint8_t* valid_bytes) final {
|
||||
// Offsets are assumed to be valid, but the first length-1 sizes have to be
|
||||
// consistent with the offsets to partially rule out the possibility that the
|
||||
// caller is passing sizes that could work if building a list-view, but don't
|
||||
// work on building a list that requires offsets to be non-decreasing.
|
||||
//
|
||||
// CAUTION: the last size element (`sizes[length - 1]`) is not
|
||||
// validated and could be inconsistent with the offsets given in a
|
||||
// subsequent call to AppendValues.
|
||||
#ifndef NDEBUG
|
||||
if (sizes) {
|
||||
for (int64_t i = 0; i < length - 1; ++i) {
|
||||
if (ARROW_PREDICT_FALSE(offsets[i] != offsets[i + 1] - sizes[i])) {
|
||||
if (!valid_bytes || valid_bytes[i]) {
|
||||
return Status::Invalid(
|
||||
"BaseListBuilder: sizes are inconsistent with offsets provided");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return AppendValues(offsets, length, valid_bytes);
|
||||
}
|
||||
|
||||
Status AppendValues(const offset_type* offsets, const offset_type* sizes,
|
||||
int64_t length) {
|
||||
return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR);
|
||||
}
|
||||
|
||||
Status AppendNextOffset() {
|
||||
ARROW_RETURN_NOT_OK(this->ValidateOverflow(0));
|
||||
const int64_t num_values = this->value_builder_->length();
|
||||
return this->offsets_builder_.Append(static_cast<offset_type>(num_values));
|
||||
}
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
|
||||
ARROW_RETURN_NOT_OK(AppendNextOffset());
|
||||
|
||||
// Offset padding zeroed by BufferBuilder
|
||||
std::shared_ptr<Buffer> offsets;
|
||||
std::shared_ptr<Buffer> null_bitmap;
|
||||
ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets));
|
||||
ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&null_bitmap));
|
||||
|
||||
if (this->value_builder_->length() == 0) {
|
||||
// Try to make sure we get a non-null values buffer (ARROW-2744)
|
||||
ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0));
|
||||
}
|
||||
|
||||
std::shared_ptr<ArrayData> items;
|
||||
ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items));
|
||||
|
||||
*out = ArrayData::Make(this->type(), this->length_,
|
||||
{std::move(null_bitmap), std::move(offsets)},
|
||||
{std::move(items)}, this->null_count_);
|
||||
this->Reset();
|
||||
return Status::OK();
|
||||
}
|
||||
};
|
||||
|
||||
/// \class ListBuilder
|
||||
/// \brief Builder class for variable-length list array value types
|
||||
///
|
||||
/// To use this class, you must append values to the child array builder and use
|
||||
/// the Append function to delimit each distinct list value (once the values
|
||||
/// have been appended to the child array) or use the bulk API to append
|
||||
/// a sequence of offsets and null values.
|
||||
///
|
||||
/// A note on types. Per arrow/type.h all types in the c++ implementation are
|
||||
/// logical so even though this class always builds list array, this can
|
||||
/// represent multiple different logical types. If no logical type is provided
|
||||
/// at construction time, the class defaults to List<T> where t is taken from the
|
||||
/// value_builder/values that the object is constructed with.
|
||||
class ARROW_EXPORT ListBuilder : public BaseListBuilder<ListType> {
|
||||
public:
|
||||
using BaseListBuilder::BaseListBuilder;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<ListArray>* out) { return FinishTyped(out); }
|
||||
};
|
||||
|
||||
/// \class LargeListBuilder
|
||||
/// \brief Builder class for large variable-length list array value types
|
||||
///
|
||||
/// Like ListBuilder, but to create large list arrays (with 64-bit offsets).
|
||||
class ARROW_EXPORT LargeListBuilder : public BaseListBuilder<LargeListType> {
|
||||
public:
|
||||
using BaseListBuilder::BaseListBuilder;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<LargeListArray>* out) { return FinishTyped(out); }
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// ListViewBuilder / LargeListViewBuilder
|
||||
|
||||
template <typename TYPE>
|
||||
class BaseListViewBuilder : public VarLengthListLikeBuilder<TYPE> {
|
||||
private:
|
||||
using BASE = VarLengthListLikeBuilder<TYPE>;
|
||||
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename BASE::offset_type;
|
||||
|
||||
using BASE::BASE;
|
||||
|
||||
~BaseListViewBuilder() override = default;
|
||||
|
||||
Status Resize(int64_t capacity) override {
|
||||
ARROW_RETURN_NOT_OK(BASE::Resize(capacity));
|
||||
return sizes_builder_.Resize(capacity);
|
||||
}
|
||||
|
||||
void Reset() override {
|
||||
BASE::Reset();
|
||||
sizes_builder_.Reset();
|
||||
}
|
||||
|
||||
/// \brief Vector append
|
||||
///
|
||||
/// If passed, valid_bytes is of equal length to values, and any zero byte
|
||||
/// will be considered as a null for that slot
|
||||
Status AppendValues(const offset_type* offsets, const offset_type* sizes,
|
||||
int64_t length, const uint8_t* valid_bytes) final {
|
||||
ARROW_RETURN_NOT_OK(this->Reserve(length));
|
||||
this->UnsafeAppendToBitmap(valid_bytes, length);
|
||||
this->offsets_builder_.UnsafeAppend(offsets, length);
|
||||
this->sizes_builder_.UnsafeAppend(sizes, length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendValues(const offset_type* offsets, const offset_type* sizes,
|
||||
int64_t length) {
|
||||
return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR);
|
||||
}
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
|
||||
// Offset and sizes padding zeroed by BufferBuilder
|
||||
std::shared_ptr<Buffer> null_bitmap;
|
||||
std::shared_ptr<Buffer> offsets;
|
||||
std::shared_ptr<Buffer> sizes;
|
||||
ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&null_bitmap));
|
||||
ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets));
|
||||
ARROW_RETURN_NOT_OK(this->sizes_builder_.Finish(&sizes));
|
||||
|
||||
if (this->value_builder_->length() == 0) {
|
||||
// Try to make sure we get a non-null values buffer (ARROW-2744)
|
||||
ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0));
|
||||
}
|
||||
|
||||
std::shared_ptr<ArrayData> items;
|
||||
ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items));
|
||||
|
||||
*out = ArrayData::Make(this->type(), this->length_,
|
||||
{std::move(null_bitmap), std::move(offsets), std::move(sizes)},
|
||||
{std::move(items)}, this->null_count_);
|
||||
this->Reset();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
protected:
|
||||
void UnsafeAppendEmptyDimensions(int64_t num_values) override {
|
||||
for (int64_t i = 0; i < num_values; ++i) {
|
||||
this->offsets_builder_.UnsafeAppend(0);
|
||||
}
|
||||
for (int64_t i = 0; i < num_values; ++i) {
|
||||
this->sizes_builder_.UnsafeAppend(0);
|
||||
}
|
||||
}
|
||||
|
||||
void UnsafeAppendDimensions(int64_t offset, int64_t size) override {
|
||||
this->offsets_builder_.UnsafeAppend(static_cast<offset_type>(offset));
|
||||
this->sizes_builder_.UnsafeAppend(static_cast<offset_type>(size));
|
||||
}
|
||||
|
||||
private:
|
||||
TypedBufferBuilder<offset_type> sizes_builder_;
|
||||
};
|
||||
|
||||
/// \brief Concrete BaseListViewBuilder for ListViewType, finishing into a
/// ListViewArray
class ARROW_EXPORT ListViewBuilder final : public BaseListViewBuilder<ListViewType> {
 public:
  using BaseListViewBuilder::BaseListViewBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish into a ListViewArray; delegates to FinishTyped
  Status Finish(std::shared_ptr<ListViewArray>* out) {
    return this->FinishTyped(out);
  }
};
|
||||
|
||||
/// \brief Concrete BaseListViewBuilder for LargeListViewType, finishing into a
/// LargeListViewArray
class ARROW_EXPORT LargeListViewBuilder final
    : public BaseListViewBuilder<LargeListViewType> {
 public:
  using BaseListViewBuilder::BaseListViewBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish into a LargeListViewArray; delegates to FinishTyped
  Status Finish(std::shared_ptr<LargeListViewArray>* out) {
    return this->FinishTyped(out);
  }
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Map builder
|
||||
|
||||
/// \class MapBuilder
|
||||
/// \brief Builder class for arrays of variable-size maps
|
||||
///
|
||||
/// To use this class, you must use the Append function to delimit each distinct
|
||||
/// map before appending values to the key and item array builders, or use the
|
||||
/// bulk API to append a sequence of offsets and null maps.
|
||||
///
|
||||
/// Key uniqueness and ordering are not validated.
|
||||
class ARROW_EXPORT MapBuilder : public ArrayBuilder {
|
||||
public:
|
||||
/// Use this constructor to define the built array's type explicitly. If key_builder
|
||||
/// or item_builder has indeterminate type, this builder will also.
|
||||
MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
|
||||
const std::shared_ptr<ArrayBuilder>& item_builder,
|
||||
const std::shared_ptr<DataType>& type);
|
||||
|
||||
/// Use this constructor to infer the built array's type. If key_builder or
|
||||
/// item_builder has indeterminate type, this builder will also.
|
||||
MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
|
||||
const std::shared_ptr<ArrayBuilder>& item_builder, bool keys_sorted = false);
|
||||
|
||||
MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& item_builder,
|
||||
const std::shared_ptr<DataType>& type);
|
||||
|
||||
Status Resize(int64_t capacity) override;
|
||||
void Reset() override;
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<MapArray>* out) { return FinishTyped(out); }
|
||||
|
||||
/// \brief Vector append
|
||||
///
|
||||
/// If passed, valid_bytes is of equal length to values, and any zero byte
|
||||
/// will be considered as a null for that slot
|
||||
Status AppendValues(const int32_t* offsets, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR);
|
||||
|
||||
/// \brief Start a new variable-length map slot
|
||||
///
|
||||
/// This function should be called before beginning to append elements to the
|
||||
/// key and item builders
|
||||
Status Append();
|
||||
|
||||
Status AppendNull() final;
|
||||
|
||||
Status AppendNulls(int64_t length) final;
|
||||
|
||||
Status AppendEmptyValue() final;
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final;
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length) override {
|
||||
const auto* offsets = array.GetValues<int32_t>(1);
|
||||
static_assert(internal::may_have_validity_bitmap(MapType::type_id));
|
||||
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
|
||||
for (int64_t row = offset; row < offset + length; row++) {
|
||||
const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row);
|
||||
if (is_valid) {
|
||||
ARROW_RETURN_NOT_OK(Append());
|
||||
const int64_t slot_length = offsets[row + 1] - offsets[row];
|
||||
// Add together the inner StructArray offset to the Map/List offset
|
||||
int64_t key_value_offset = array.child_data[0].offset + offsets[row];
|
||||
ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice(
|
||||
array.child_data[0].child_data[0], key_value_offset, slot_length));
|
||||
ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice(
|
||||
array.child_data[0].child_data[1], key_value_offset, slot_length));
|
||||
} else {
|
||||
ARROW_RETURN_NOT_OK(AppendNull());
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Get builder to append keys.
|
||||
///
|
||||
/// Append a key with this builder should be followed by appending
|
||||
/// an item or null value with item_builder().
|
||||
ArrayBuilder* key_builder() const { return key_builder_.get(); }
|
||||
|
||||
/// \brief Get builder to append items
|
||||
///
|
||||
/// Appending an item with this builder should have been preceded
|
||||
/// by appending a key with key_builder().
|
||||
ArrayBuilder* item_builder() const { return item_builder_.get(); }
|
||||
|
||||
/// \brief Get builder to add Map entries as struct values.
|
||||
///
|
||||
/// This is used instead of key_builder()/item_builder() and allows
|
||||
/// the Map to be built as a list of struct values.
|
||||
ArrayBuilder* value_builder() const { return list_builder_->value_builder(); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override {
|
||||
// Key and Item builder may update types, but they don't contain the field names,
|
||||
// so we need to reconstruct the type. (See ARROW-13735.)
|
||||
return std::make_shared<MapType>(
|
||||
field(entries_name_,
|
||||
struct_({field(key_name_, key_builder_->type(), false),
|
||||
field(item_name_, item_builder_->type(), item_nullable_)}),
|
||||
false),
|
||||
keys_sorted_);
|
||||
}
|
||||
|
||||
Status ValidateOverflow(int64_t new_elements) {
|
||||
return list_builder_->ValidateOverflow(new_elements);
|
||||
}
|
||||
|
||||
protected:
|
||||
inline Status AdjustStructBuilderLength();
|
||||
|
||||
protected:
|
||||
bool keys_sorted_ = false;
|
||||
bool item_nullable_ = false;
|
||||
std::string entries_name_;
|
||||
std::string key_name_;
|
||||
std::string item_name_;
|
||||
std::shared_ptr<ListBuilder> list_builder_;
|
||||
std::shared_ptr<ArrayBuilder> key_builder_;
|
||||
std::shared_ptr<ArrayBuilder> item_builder_;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// FixedSizeList builder
|
||||
|
||||
/// \class FixedSizeListBuilder
|
||||
/// \brief Builder class for fixed-length list array value types
|
||||
class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
|
||||
public:
|
||||
using TypeClass = FixedSizeListType;
|
||||
|
||||
/// Use this constructor to define the built array's type explicitly. If value_builder
|
||||
/// has indeterminate type, this builder will also.
|
||||
FixedSizeListBuilder(MemoryPool* pool,
|
||||
const std::shared_ptr<ArrayBuilder>& value_builder,
|
||||
int32_t list_size);
|
||||
|
||||
/// Use this constructor to infer the built array's type. If value_builder has
|
||||
/// indeterminate type, this builder will also.
|
||||
FixedSizeListBuilder(MemoryPool* pool,
|
||||
const std::shared_ptr<ArrayBuilder>& value_builder,
|
||||
const std::shared_ptr<DataType>& type);
|
||||
|
||||
Status Resize(int64_t capacity) override;
|
||||
void Reset() override;
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<FixedSizeListArray>* out) { return FinishTyped(out); }
|
||||
|
||||
/// \brief Append a valid fixed length list.
|
||||
///
|
||||
/// This function affects only the validity bitmap; the child values must be appended
|
||||
/// using the child array builder.
|
||||
Status Append();
|
||||
|
||||
/// \brief Vector append
|
||||
///
|
||||
/// If passed, valid_bytes will be read and any zero byte
|
||||
/// will cause the corresponding slot to be null
|
||||
///
|
||||
/// This function affects only the validity bitmap; the child values must be appended
|
||||
/// using the child array builder. This includes appending nulls for null lists.
|
||||
/// XXX this restriction is confusing, should this method be omitted?
|
||||
Status AppendValues(int64_t length, const uint8_t* valid_bytes = NULLPTR);
|
||||
|
||||
/// \brief Append a null fixed length list.
|
||||
///
|
||||
/// The child array builder will have the appropriate number of nulls appended
|
||||
/// automatically.
|
||||
Status AppendNull() final;
|
||||
|
||||
/// \brief Append length null fixed length lists.
|
||||
///
|
||||
/// The child array builder will have the appropriate number of nulls appended
|
||||
/// automatically.
|
||||
Status AppendNulls(int64_t length) final;
|
||||
|
||||
Status ValidateOverflow(int64_t new_elements);
|
||||
|
||||
Status AppendEmptyValue() final;
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final;
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final {
|
||||
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
|
||||
for (int64_t row = offset; row < offset + length; row++) {
|
||||
if (!validity || bit_util::GetBit(validity, array.offset + row)) {
|
||||
ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(
|
||||
array.child_data[0], list_size_ * (array.offset + row), list_size_));
|
||||
ARROW_RETURN_NOT_OK(Append());
|
||||
} else {
|
||||
ARROW_RETURN_NOT_OK(AppendNull());
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
ArrayBuilder* value_builder() const { return value_builder_.get(); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override {
|
||||
return fixed_size_list(value_field_->WithType(value_builder_->type()), list_size_);
|
||||
}
|
||||
|
||||
// Cannot make this a static attribute because of linking issues
|
||||
static constexpr int64_t maximum_elements() {
|
||||
return std::numeric_limits<FixedSizeListType::offset_type>::max() - 1;
|
||||
}
|
||||
|
||||
protected:
|
||||
std::shared_ptr<Field> value_field_;
|
||||
const int32_t list_size_;
|
||||
std::shared_ptr<ArrayBuilder> value_builder_;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Struct
|
||||
|
||||
// ---------------------------------------------------------------------------------
|
||||
// StructArray builder
|
||||
/// Append, Resize and Reserve methods are acting on StructBuilder.
|
||||
/// Please make sure all these methods of all child-builders' are consistently
|
||||
/// called to maintain data-structure consistency.
|
||||
class ARROW_EXPORT StructBuilder : public ArrayBuilder {
|
||||
public:
|
||||
/// If any of field_builders has indeterminate type, this builder will also
|
||||
StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
|
||||
std::vector<std::shared_ptr<ArrayBuilder>> field_builders);
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<StructArray>* out) { return FinishTyped(out); }
|
||||
|
||||
/// Null bitmap is of equal length to every child field, and any zero byte
|
||||
/// will be considered as a null for that field, but users must using app-
|
||||
/// end methods or advance methods of the child builders' independently to
|
||||
/// insert data.
|
||||
Status AppendValues(int64_t length, const uint8_t* valid_bytes) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
UnsafeAppendToBitmap(valid_bytes, length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// Append an element to the Struct. All child-builders' Append method must
|
||||
/// be called independently to maintain data-structure consistency.
|
||||
Status Append(bool is_valid = true) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
UnsafeAppendToBitmap(is_valid);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a null value. Automatically appends an empty value to each child
|
||||
/// builder.
|
||||
Status AppendNull() final {
|
||||
for (const auto& field : children_) {
|
||||
ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
|
||||
}
|
||||
return Append(false);
|
||||
}
|
||||
|
||||
/// \brief Append multiple null values. Automatically appends empty values to each
|
||||
/// child builder.
|
||||
Status AppendNulls(int64_t length) final {
|
||||
for (const auto& field : children_) {
|
||||
ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
UnsafeAppendToBitmap(length, false);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValue() final {
|
||||
for (const auto& field : children_) {
|
||||
ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
|
||||
}
|
||||
return Append(true);
|
||||
}
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
for (const auto& field : children_) {
|
||||
ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
UnsafeAppendToBitmap(length, true);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length) override {
|
||||
for (int i = 0; static_cast<size_t>(i) < children_.size(); i++) {
|
||||
ARROW_RETURN_NOT_OK(children_[i]->AppendArraySlice(array.child_data[i],
|
||||
array.offset + offset, length));
|
||||
}
|
||||
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
UnsafeAppendToBitmap(validity, array.offset + offset, length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void Reset() override;
|
||||
|
||||
ArrayBuilder* field_builder(int i) const { return children_[i].get(); }
|
||||
|
||||
int num_fields() const { return static_cast<int>(children_.size()); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override;
|
||||
|
||||
private:
|
||||
std::shared_ptr<DataType> type_;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,689 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_traits.h"
|
||||
#include "arrow/util/float16.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \brief Builder for arrays of NullType
///
/// Only counters are maintained: every appended element, whatever entry point
/// it arrives through, raises both length_ and null_count_.
class ARROW_EXPORT NullBuilder : public ArrayBuilder {
 public:
  /// \brief Construct from a memory pool; the alignment argument is unused
  explicit NullBuilder(MemoryPool* pool = default_memory_pool(),
                       int64_t ARROW_ARG_UNUSED(alignment) = kDefaultBufferAlignment)
      : ArrayBuilder(pool) {}

  /// \brief Construct from a (ignored) type; delegates to the constructor above
  explicit NullBuilder(const std::shared_ptr<DataType>& ARROW_ARG_UNUSED(type),
                       MemoryPool* pool = default_memory_pool(),
                       int64_t alignment = kDefaultBufferAlignment)
      : NullBuilder(pool, alignment) {}

  /// \brief Append the specified number of null elements
  ///
  /// \return Status::Invalid when `length` is negative (a length of zero is
  /// accepted and changes nothing)
  Status AppendNulls(int64_t length) final {
    if (length < 0) {
      return Status::Invalid("length must be positive");
    }
    length_ += length;
    null_count_ += length;
    return Status::OK();
  }

  /// \brief Append a single null element
  Status AppendNull() final { return AppendNulls(1); }

  /// \brief Same as AppendNulls(length)
  Status AppendEmptyValues(int64_t length) final { return AppendNulls(length); }

  /// \brief Same as AppendNulls(1)
  Status AppendEmptyValue() final { return AppendNulls(1); }

  /// \brief Append a single null element
  Status Append(std::nullptr_t) { return AppendNulls(1); }

  /// \brief Append `length` nulls; the source span and offset are not read
  Status AppendArraySlice(const ArraySpan&, int64_t, int64_t length) override {
    return AppendNulls(length);
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  std::shared_ptr<DataType> type() const override { return null(); }

  /// \brief Finish into a NullArray; delegates to FinishTyped
  Status Finish(std::shared_ptr<NullArray>* out) {
    return this->FinishTyped(out);
  }
};
|
||||
|
||||
/// \addtogroup numeric-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// Base class for all Builders that emit an Array of a scalar numerical type.
|
||||
template <typename T>
|
||||
class NumericBuilder
|
||||
: public ArrayBuilder,
|
||||
public internal::ArrayBuilderExtraOps<NumericBuilder<T>, typename T::c_type> {
|
||||
public:
|
||||
using TypeClass = T;
|
||||
using value_type = typename T::c_type;
|
||||
using ArrayType = typename TypeTraits<T>::ArrayType;
|
||||
|
||||
template <typename T1 = T>
|
||||
explicit NumericBuilder(
|
||||
enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
type_(TypeTraits<T>::type_singleton()),
|
||||
data_builder_(pool, alignment) {}
|
||||
|
||||
NumericBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment), type_(type), data_builder_(pool, alignment) {}
|
||||
|
||||
/// Append a single scalar and increase the size if necessary.
|
||||
Status Append(const value_type val) {
|
||||
ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1));
|
||||
UnsafeAppend(val);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
|
||||
/// The memory at the corresponding data slot is set to 0 to prevent
|
||||
/// uninitialized memory access
|
||||
Status AppendNulls(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(length, value_type{}); // zero
|
||||
UnsafeSetNull(length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a single null element
|
||||
Status AppendNull() final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
data_builder_.UnsafeAppend(value_type{}); // zero
|
||||
UnsafeAppendToBitmap(false);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a empty element
|
||||
Status AppendEmptyValue() final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
data_builder_.UnsafeAppend(value_type{}); // zero
|
||||
UnsafeAppendToBitmap(true);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append several empty elements
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(length, value_type{}); // zero
|
||||
UnsafeSetNotNull(length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Read the value stored at `index` (no bounds check is performed)
value_type GetValue(int64_t index) const {
  const auto* values = data_builder_.data();
  return values[index];
}
|
||||
|
||||
/// \brief Get a writable pointer to the value stored at `index` (no bounds
/// check is performed)
value_type* GetMutableValue(int64_t index) {
  value_type& slot = data_builder_.mutable_data()[index];
  return &slot;
}
|
||||
|
||||
/// \brief Reset the data buffer builder, then the ArrayBuilder base state
void Reset() override {
  data_builder_.Reset();
  ArrayBuilder::Reset();
}
|
||||
|
||||
/// \brief Resize the data buffer and the base builder
///
/// `capacity` is validated with CheckCapacity and then raised to
/// kMinBuilderCapacity when it is smaller than that.
Status Resize(int64_t capacity) override {
  ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
  if (capacity < kMinBuilderCapacity) {
    capacity = kMinBuilderCapacity;
  }
  ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
  return ArrayBuilder::Resize(capacity);
}
|
||||
|
||||
/// \brief Read-only element access; same result as GetValue(index)
value_type operator[](int64_t index) const {
  return data_builder_.data()[index];
}
|
||||
|
||||
/// \brief Writable element access (no bounds check is performed)
value_type& operator[](int64_t index) {
  // GetMutableValue() already yields a value_type* into the data buffer.
  return *GetMutableValue(index);
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a contiguous C array of values
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
|
||||
/// indicates a valid (non-null) value
|
||||
/// \return Status
|
||||
Status AppendValues(const value_type* values, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(values, length);
|
||||
// length_ is update by these
|
||||
ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a contiguous C array of values
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] bitmap a validity bitmap to copy (may be null)
|
||||
/// \param[in] bitmap_offset an offset into the validity bitmap
|
||||
/// \return Status
|
||||
Status AppendValues(const value_type* values, int64_t length, const uint8_t* bitmap,
|
||||
int64_t bitmap_offset) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(values, length);
|
||||
// length_ is update by these
|
||||
ArrayBuilder::UnsafeAppendToBitmap(bitmap, bitmap_offset, length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a contiguous C array of values
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
|
||||
/// (0). Equal in length to values
|
||||
/// \return Status
|
||||
Status AppendValues(const value_type* values, int64_t length,
|
||||
const std::vector<bool>& is_valid) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(values, length);
|
||||
// length_ is update by these
|
||||
ArrayBuilder::UnsafeAppendToBitmap(is_valid);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a std::vector of values
|
||||
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
|
||||
/// (0). Equal in length to values
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<value_type>& values,
|
||||
const std::vector<bool>& is_valid) {
|
||||
if (values.empty()) {
|
||||
return Status::OK();
|
||||
}
|
||||
return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a std::vector of values
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<value_type>& values) {
|
||||
if (values.empty()) {
|
||||
return Status::OK();
|
||||
}
|
||||
return AppendValues(values.data(), static_cast<int64_t>(values.size()));
|
||||
}
|
||||
|
||||
/// \brief Assemble the accumulated state into `out` and clear the counters
///
/// The result carries two buffers (validity bitmap, data), both finished with
/// the current length_. Afterwards capacity_, length_ and null_count_ are
/// set back to 0.
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
  ARROW_ASSIGN_OR_RAISE(auto validity_buffer,
                        null_bitmap_builder_.FinishWithLength(length_));
  ARROW_ASSIGN_OR_RAISE(auto values_buffer, data_builder_.FinishWithLength(length_));
  *out = ArrayData::Make(type(), length_, {validity_buffer, values_buffer}, null_count_);
  null_count_ = 0;
  length_ = 0;
  capacity_ = 0;
  return Status::OK();
}
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
/// \brief Finish into the concrete ArrayType; delegates to FinishTyped
Status Finish(std::shared_ptr<ArrayType>* out) {
  return this->FinishTyped(out);
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values_begin InputIterator to the beginning of the values
|
||||
/// \param[in] values_end InputIterator pointing to the end of the values
|
||||
/// \return Status
|
||||
template <typename ValuesIter>
|
||||
Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
|
||||
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(values_begin, values_end);
|
||||
// this updates the length_
|
||||
UnsafeSetNotNull(length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot, with a specified nullmap
|
||||
/// \param[in] values_begin InputIterator to the beginning of the values
|
||||
/// \param[in] values_end InputIterator pointing to the end of the values
|
||||
/// \param[in] valid_begin InputIterator with elements indicating valid(1)
|
||||
/// or null(0) values.
|
||||
/// \return Status
|
||||
template <typename ValuesIter, typename ValidIter>
|
||||
enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
|
||||
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
|
||||
static_assert(!internal::is_null_pointer<ValidIter>::value,
|
||||
"Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
|
||||
"version instead");
|
||||
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(values_begin, values_end);
|
||||
null_bitmap_builder_.UnsafeAppend<true>(
|
||||
length, [&valid_begin]() -> bool { return *valid_begin++; });
|
||||
length_ = null_bitmap_builder_.length();
|
||||
null_count_ = null_bitmap_builder_.false_count();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Same as above, with a pointer type ValidIter
|
||||
template <typename ValuesIter, typename ValidIter>
|
||||
enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
|
||||
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
|
||||
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(values_begin, values_end);
|
||||
// this updates the length_
|
||||
if (valid_begin == NULLPTR) {
|
||||
UnsafeSetNotNull(length);
|
||||
} else {
|
||||
null_bitmap_builder_.UnsafeAppend<true>(
|
||||
length, [&valid_begin]() -> bool { return *valid_begin++; });
|
||||
length_ = null_bitmap_builder_.length();
|
||||
null_count_ = null_bitmap_builder_.false_count();
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                        int64_t length) override {
  // Values come from buffer 1, advanced by the slice offset.
  const value_type* slice_values = array.GetValues<value_type>(1) + offset;
  // Validity comes from buffer 0, requested at absolute offset 0 and then
  // addressed through an explicit offset (array offset + slice offset).
  const uint8_t* validity_bitmap = array.GetValues<uint8_t>(0, 0);
  const int64_t validity_offset = array.offset + offset;
  return AppendValues(slice_values, length, validity_bitmap, validity_offset);
}
|
||||
|
||||
/// Append a single scalar under the assumption that the underlying Buffer is
|
||||
/// large enough.
|
||||
///
|
||||
/// This method does not capacity-check; make sure to call Reserve
|
||||
/// beforehand.
|
||||
void UnsafeAppend(const value_type val) {
|
||||
ArrayBuilder::UnsafeAppendToBitmap(true);
|
||||
data_builder_.UnsafeAppend(val);
|
||||
}
|
||||
|
||||
void UnsafeAppendNull() {
|
||||
ArrayBuilder::UnsafeAppendToBitmap(false);
|
||||
data_builder_.UnsafeAppend(value_type{}); // zero
|
||||
}
|
||||
|
||||
/// Advance builder without allocating nor writing any values
|
||||
///
|
||||
/// The internal pointer is advanced by `length` values and the same number
|
||||
/// of non-null entries are appended to the validity bitmap.
|
||||
/// This method assumes that the `length` values were populated directly,
|
||||
/// for example using `GetMutableValue`.
|
||||
void UnsafeAdvance(int64_t length) {
|
||||
data_builder_.UnsafeAdvance(length);
|
||||
UnsafeAppendToBitmap(length, true);
|
||||
}
|
||||
|
||||
/// Advance builder without allocating nor writing any values
|
||||
///
|
||||
/// The internal pointer is advanced by `length` values and the same number
|
||||
/// of validity bits are appended to the validity bitmap.
|
||||
/// This method assumes that the `length` values were populated directly,
|
||||
/// for example using `GetMutableValue`.
|
||||
void UnsafeAdvance(int64_t length, const uint8_t* validity, int64_t valid_bits_offset) {
|
||||
data_builder_.UnsafeAdvance(length);
|
||||
UnsafeAppendToBitmap(validity, valid_bits_offset, length);
|
||||
}
|
||||
|
||||
std::shared_ptr<DataType> type() const override {
  // Returns the data type held in type_.
  return type_;
}
|
||||
|
||||
protected:
|
||||
std::shared_ptr<DataType> type_;
|
||||
TypedBufferBuilder<value_type> data_builder_;
|
||||
};
|
||||
|
||||
// Builders
|
||||
|
||||
// NumericBuilder aliases for the unsigned integer types.
using UInt8Builder = NumericBuilder<UInt8Type>;
|
||||
using UInt16Builder = NumericBuilder<UInt16Type>;
|
||||
using UInt32Builder = NumericBuilder<UInt32Type>;
|
||||
using UInt64Builder = NumericBuilder<UInt64Type>;
|
||||
|
||||
// NumericBuilder aliases for the signed integer types.
using Int8Builder = NumericBuilder<Int8Type>;
|
||||
using Int16Builder = NumericBuilder<Int16Type>;
|
||||
using Int32Builder = NumericBuilder<Int32Type>;
|
||||
using Int64Builder = NumericBuilder<Int64Type>;
|
||||
|
||||
// NumericBuilder aliases for FloatType and DoubleType. HalfFloatType is not
// aliased here; it has its own builder class (HalfFloatBuilder).
using FloatBuilder = NumericBuilder<FloatType>;
|
||||
using DoubleBuilder = NumericBuilder<DoubleType>;
|
||||
|
||||
/// @}
|
||||
|
||||
/// \addtogroup temporal-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
// Temporal types that are built directly with NumericBuilder.
using Date32Builder = NumericBuilder<Date32Type>;
|
||||
using Date64Builder = NumericBuilder<Date64Type>;
|
||||
using Time32Builder = NumericBuilder<Time32Type>;
|
||||
using Time64Builder = NumericBuilder<Time64Type>;
|
||||
using TimestampBuilder = NumericBuilder<TimestampType>;
|
||||
using MonthIntervalBuilder = NumericBuilder<MonthIntervalType>;
|
||||
using DurationBuilder = NumericBuilder<DurationType>;
|
||||
|
||||
/// @}
|
||||
|
||||
/// \addtogroup numeric-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
class ARROW_EXPORT HalfFloatBuilder : public NumericBuilder<HalfFloatType> {
 public:
  using BaseClass = NumericBuilder<HalfFloatType>;
  using Float16 = arrow::util::Float16;

  // Re-export the base-class members so that the Float16 overloads declared
  // below add to them instead of hiding them.
  using BaseClass::Append;
  using BaseClass::AppendValues;
  using BaseClass::BaseClass;
  using BaseClass::GetValue;
  using BaseClass::UnsafeAppend;

  /// \brief Append a single arrow::util::Float16
  ///
  /// The value is forwarded to the base class as its raw bits.
  Status Append(const Float16 val) {
    const auto raw_bits = val.bits();
    return Append(raw_bits);
  }

  /// \brief Append a single arrow::util::Float16 without checking capacity
  ///
  /// The value is forwarded to the base class as its raw bits.
  void UnsafeAppend(const Float16 val) {
    const auto raw_bits = val.bits();
    UnsafeAppend(raw_bits);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous array of arrow::util::Float16
  /// \param[in] length the number of values to append
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const Float16* values, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR) {
    // View the Float16 array as 16-bit words for the base-class overload.
    const auto* raw_values = reinterpret_cast<const uint16_t*>(values);
    return BaseClass::AppendValues(raw_values, length, valid_bytes);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous array of arrow::util::Float16
  /// \param[in] length the number of values to append
  /// \param[in] bitmap a validity bitmap to copy (may be null)
  /// \param[in] bitmap_offset an offset into the validity bitmap
  /// \return Status
  Status AppendValues(const Float16* values, int64_t length, const uint8_t* bitmap,
                      int64_t bitmap_offset) {
    // View the Float16 array as 16-bit words for the base-class overload.
    const auto* raw_values = reinterpret_cast<const uint16_t*>(values);
    return BaseClass::AppendValues(raw_values, length, bitmap, bitmap_offset);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous array of arrow::util::Float16
  /// \param[in] length the number of values to append
  /// \param[in] is_valid a std::vector<bool> indicating valid (1) or null
  /// (0). Equal in length to values
  /// \return Status
  Status AppendValues(const Float16* values, int64_t length,
                      const std::vector<bool>& is_valid) {
    // View the Float16 array as 16-bit words for the base-class overload.
    const auto* raw_values = reinterpret_cast<const uint16_t*>(values);
    return BaseClass::AppendValues(raw_values, length, is_valid);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a std::vector<arrow::util::Float16>
  /// \param[in] is_valid a std::vector<bool> indicating valid (1) or null
  /// (0). Equal in length to values
  /// \return Status
  Status AppendValues(const std::vector<Float16>& values,
                      const std::vector<bool>& is_valid) {
    const auto num_values = static_cast<int64_t>(values.size());
    return AppendValues(values.data(), num_values, is_valid);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a std::vector<arrow::util::Float16>
  /// \return Status
  Status AppendValues(const std::vector<Float16>& values) {
    const auto num_values = static_cast<int64_t>(values.size());
    return AppendValues(values.data(), num_values);
  }

  /// \brief Append one value many times in one shot
  /// \param[in] length the number of values to append
  /// \param[in] value a arrow::util::Float16
  Status AppendValues(int64_t length, Float16 value) {
    RETURN_NOT_OK(Reserve(length));
    // `length` copies of the value's bits, all marked non-null.
    this->data_builder_.UnsafeAppend(length, value.bits());
    ArrayBuilder::UnsafeSetNotNull(length);
    return Status::OK();
  }

  /// \brief Get the value at a certain index
  /// \param[in] index the zero-based index
  /// @tparam T arrow::util::Float16 or value_type (uint16_t)
  template <typename T = BaseClass::value_type>
  T GetValue(int64_t index) const {
    // Only the raw representation and Float16 are accepted for T.
    static_assert(std::is_same_v<T, BaseClass::value_type> ||
                  std::is_same_v<T, arrow::util::Float16>);
    if constexpr (std::is_same_v<T, arrow::util::Float16>) {
      // Rebuild a Float16 from the stored bits.
      return Float16::FromBits(BaseClass::GetValue(index));
    } else {
      return BaseClass::GetValue(index);
    }
  }
};
|
||||
|
||||
/// @}
|
||||
|
||||
class ARROW_EXPORT BooleanBuilder
|
||||
: public ArrayBuilder,
|
||||
public internal::ArrayBuilderExtraOps<BooleanBuilder, bool> {
|
||||
public:
|
||||
using TypeClass = BooleanType;
|
||||
using value_type = bool;
|
||||
|
||||
explicit BooleanBuilder(MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment);
|
||||
|
||||
BooleanBuilder(const std::shared_ptr<DataType>& type,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment);
|
||||
|
||||
/// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
|
||||
Status AppendNulls(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(length, false);
|
||||
UnsafeSetNull(length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendNull() final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
UnsafeAppendNull();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValue() final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
data_builder_.UnsafeAppend(false);
|
||||
UnsafeSetNotNull(1);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(length, false);
|
||||
UnsafeSetNotNull(length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// Scalar append
|
||||
Status Append(const bool val) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
UnsafeAppend(val);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Append(const uint8_t val) { return Append(val != 0); }
|
||||
|
||||
/// Scalar append, without checking for capacity
|
||||
void UnsafeAppend(const bool val) {
|
||||
data_builder_.UnsafeAppend(val);
|
||||
UnsafeAppendToBitmap(true);
|
||||
}
|
||||
|
||||
void UnsafeAppendNull() {
|
||||
data_builder_.UnsafeAppend(false);
|
||||
UnsafeAppendToBitmap(false);
|
||||
}
|
||||
|
||||
void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); }
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a contiguous array of bytes (non-zero is 1)
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
|
||||
/// indicates a valid (non-null) value
|
||||
/// \return Status
|
||||
Status AppendValues(const uint8_t* values, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a bitmap of values
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] validity a validity bitmap to copy (may be null)
|
||||
/// \param[in] offset an offset into the values and validity bitmaps
|
||||
/// \return Status
|
||||
Status AppendValues(const uint8_t* values, int64_t length, const uint8_t* validity,
|
||||
int64_t offset);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a contiguous C array of values
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
|
||||
/// (0). Equal in length to values
|
||||
/// \return Status
|
||||
Status AppendValues(const uint8_t* values, int64_t length,
|
||||
const std::vector<bool>& is_valid);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a std::vector of bytes
|
||||
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
|
||||
/// (0). Equal in length to values
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<uint8_t>& values,
|
||||
const std::vector<bool>& is_valid);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a std::vector of bytes
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<uint8_t>& values);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values an std::vector<bool> indicating true (1) or false
|
||||
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
|
||||
/// (0). Equal in length to values
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<bool>& values, const std::vector<bool>& is_valid);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values an std::vector<bool> indicating true (1) or false
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<bool>& values);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values_begin InputIterator to the beginning of the values
|
||||
/// \param[in] values_end InputIterator pointing to the end of the values
|
||||
/// or null(0) values
|
||||
/// \return Status
|
||||
template <typename ValuesIter>
|
||||
Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
|
||||
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend<false>(
|
||||
length, [&values_begin]() -> bool { return *values_begin++; });
|
||||
// this updates length_
|
||||
UnsafeSetNotNull(length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot, with a specified nullmap
|
||||
/// \param[in] values_begin InputIterator to the beginning of the values
|
||||
/// \param[in] values_end InputIterator pointing to the end of the values
|
||||
/// \param[in] valid_begin InputIterator with elements indication valid(1)
|
||||
/// or null(0) values
|
||||
/// \return Status
|
||||
template <typename ValuesIter, typename ValidIter>
|
||||
enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
|
||||
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
|
||||
static_assert(!internal::is_null_pointer<ValidIter>::value,
|
||||
"Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
|
||||
"version instead");
|
||||
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
|
||||
data_builder_.UnsafeAppend<false>(
|
||||
length, [&values_begin]() -> bool { return *values_begin++; });
|
||||
null_bitmap_builder_.UnsafeAppend<true>(
|
||||
length, [&valid_begin]() -> bool { return *valid_begin++; });
|
||||
length_ = null_bitmap_builder_.length();
|
||||
null_count_ = null_bitmap_builder_.false_count();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Same as above, for a pointer type ValidIter
|
||||
template <typename ValuesIter, typename ValidIter>
|
||||
enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
|
||||
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
|
||||
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend<false>(
|
||||
length, [&values_begin]() -> bool { return *values_begin++; });
|
||||
|
||||
if (valid_begin == NULLPTR) {
|
||||
UnsafeSetNotNull(length);
|
||||
} else {
|
||||
null_bitmap_builder_.UnsafeAppend<true>(
|
||||
length, [&valid_begin]() -> bool { return *valid_begin++; });
|
||||
}
|
||||
length_ = null_bitmap_builder_.length();
|
||||
null_count_ = null_bitmap_builder_.false_count();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendValues(int64_t length, bool value);
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length) override {
|
||||
return AppendValues(array.GetValues<uint8_t>(1, 0), length,
|
||||
array.GetValues<uint8_t>(0, 0), array.offset + offset);
|
||||
}
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<BooleanArray>* out) { return FinishTyped(out); }
|
||||
|
||||
void Reset() override;
|
||||
Status Resize(int64_t capacity) override;
|
||||
|
||||
std::shared_ptr<DataType> type() const override { return boolean(); }
|
||||
|
||||
protected:
|
||||
TypedBufferBuilder<bool> data_builder_;
|
||||
};
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,303 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array.h"
|
||||
#include "arrow/array/builder_base.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup run-end-encoded-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief An ArrayBuilder that deduplicates repeated values as they are
|
||||
/// appended to the inner-ArrayBuilder and reports the length of the current run
|
||||
/// of identical values.
|
||||
///
|
||||
/// The following sequence of calls
|
||||
///
|
||||
/// Append(2)
|
||||
/// Append(2)
|
||||
/// Append(2)
|
||||
/// Append(7)
|
||||
/// Append(7)
|
||||
/// Append(2)
|
||||
/// FinishInternal()
|
||||
///
|
||||
/// will cause the inner-builder to receive only 3 Append calls
|
||||
///
|
||||
/// Append(2)
|
||||
/// Append(7)
|
||||
/// Append(2)
|
||||
/// FinishInternal()
|
||||
///
|
||||
/// Note that values returned by length(), null_count() and capacity() are
|
||||
/// related to the compressed array built by the inner-ArrayBuilder.
|
||||
class RunCompressorBuilder : public ArrayBuilder {
|
||||
public:
|
||||
RunCompressorBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> inner_builder,
|
||||
std::shared_ptr<DataType> type);
|
||||
|
||||
~RunCompressorBuilder() override;
|
||||
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(RunCompressorBuilder);
|
||||
|
||||
/// \brief Called right before a run is being closed
|
||||
///
|
||||
/// Subclasses can override this function to perform an additional action when
|
||||
/// a run is closed (i.e. run-length is known and value is appended to the
|
||||
/// inner builder).
|
||||
///
|
||||
/// \param value can be NULLPTR if closing a run of NULLs
|
||||
/// \param length the greater than 0 length of the value run being closed
|
||||
virtual Status WillCloseRun(const std::shared_ptr<const Scalar>& value,
|
||||
int64_t length) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Called right before a run of empty values is being closed
|
||||
///
|
||||
/// Subclasses can override this function to perform an additional action when
|
||||
/// a run of empty values is appended (i.e. run-length is known and a single
|
||||
/// empty value is appended to the inner builder).
|
||||
///
|
||||
/// \param length the greater than 0 length of the value run being closed
|
||||
virtual Status WillCloseRunOfEmptyValues(int64_t length) { return Status::OK(); }
|
||||
|
||||
/// \brief Allocate enough memory for a given number of array elements.
|
||||
///
|
||||
/// NOTE: Conservatively resizing a run-length compressed array for a given
|
||||
/// number of logical elements is not possible, since the physical length will
|
||||
/// vary depending on the values to be appended in the future. But we can
|
||||
/// pessimistically assume that each run will contain a single value and
|
||||
/// allocate that number of runs.
|
||||
Status Resize(int64_t capacity) override { return ResizePhysical(capacity); }
|
||||
|
||||
/// \brief Allocate enough memory for a given number of runs.
|
||||
///
|
||||
/// Like Resize on non-encoded builders, it does not account for variable size
|
||||
/// data.
|
||||
Status ResizePhysical(int64_t capacity);
|
||||
|
||||
Status ReservePhysical(int64_t additional_capacity) {
|
||||
return Reserve(additional_capacity);
|
||||
}
|
||||
|
||||
void Reset() override;
|
||||
|
||||
Status AppendNull() final { return AppendNulls(1); }
|
||||
Status AppendNulls(int64_t length) override;
|
||||
|
||||
Status AppendEmptyValue() final { return AppendEmptyValues(1); }
|
||||
Status AppendEmptyValues(int64_t length) override;
|
||||
|
||||
Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override;
|
||||
Status AppendScalars(const ScalarVector& scalars) override;
|
||||
|
||||
// AppendArraySlice() is not implemented.
|
||||
|
||||
/// \brief Append a slice of an array containing values from already
|
||||
/// compressed runs.
|
||||
///
|
||||
/// NOTE: WillCloseRun() is not called as the length of each run cannot be
|
||||
/// determined at this point. Caller should ensure that !has_open_run() by
|
||||
/// calling FinishCurrentRun() before calling this.
|
||||
///
|
||||
/// Pre-condition: !has_open_run()
|
||||
Status AppendRunCompressedArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length);
|
||||
|
||||
/// \brief Forces the closing of the current run if one is currently open.
|
||||
///
|
||||
/// This can be called when one wants to ensure the current run will not be
|
||||
/// extended. This may cause identical values to appear close to each other in
|
||||
/// the underlying array (i.e. two runs that could be a single run) if more
|
||||
/// values are appended after this is called.
|
||||
///
|
||||
/// Finish() and FinishInternal() call this automatically.
|
||||
virtual Status FinishCurrentRun();
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
ArrayBuilder& inner_builder() const { return *inner_builder_; }
|
||||
|
||||
std::shared_ptr<DataType> type() const override { return inner_builder_->type(); }
|
||||
|
||||
bool has_open_run() const { return current_run_length_ > 0; }
|
||||
int64_t open_run_length() const { return current_run_length_; }
|
||||
|
||||
private:
|
||||
inline void UpdateDimensions() {
|
||||
capacity_ = inner_builder_->capacity();
|
||||
length_ = inner_builder_->length();
|
||||
null_count_ = inner_builder_->null_count();
|
||||
}
|
||||
|
||||
private:
|
||||
std::shared_ptr<ArrayBuilder> inner_builder_;
|
||||
std::shared_ptr<const Scalar> current_value_ = NULLPTR;
|
||||
int64_t current_run_length_ = 0;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// RunEndEncoded builder
|
||||
|
||||
/// \brief Run-end encoded array builder.
|
||||
///
|
||||
/// NOTE: the value returned by capacity() is related to the
|
||||
/// compressed array (physical) and not the decoded array (logical) that is
|
||||
/// run-end encoded. null_count() always returns 0. length(), on the other hand,
|
||||
/// returns the logical length of the run-end encoded array.
|
||||
class ARROW_EXPORT RunEndEncodedBuilder : public ArrayBuilder {
 private:
  // An internal::RunCompressorBuilder whose run-closing hooks forward to
  // RunEndEncodedBuilder::CloseRun, so that a run-end is produced every time
  // a value-run is closed.
  class ValueRunBuilder : public internal::RunCompressorBuilder {
   public:
    ValueRunBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& value_builder,
                    const std::shared_ptr<DataType>& value_type,
                    RunEndEncodedBuilder& ree_builder);

    ~ValueRunBuilder() override = default;

    // The run's value is not needed here, only its length.
    Status WillCloseRun(const std::shared_ptr<const Scalar>&, int64_t length) override {
      return ree_builder_.CloseRun(length);
    }

    Status WillCloseRunOfEmptyValues(int64_t length) override {
      return ree_builder_.CloseRun(length);
    }

   private:
    RunEndEncodedBuilder& ree_builder_;
  };

 public:
  RunEndEncodedBuilder(MemoryPool* pool,
                       const std::shared_ptr<ArrayBuilder>& run_end_builder,
                       const std::shared_ptr<ArrayBuilder>& value_builder,
                       std::shared_ptr<DataType> type);

  /// \brief Allocate enough memory for a given number of array elements.
  ///
  /// NOTE: An REE cannot be conservatively resized for a number of logical
  /// elements, because its physical length depends on the values appended in
  /// the future. Pessimistically assuming that every run holds a single
  /// value, this allocates that number of runs.
  Status Resize(int64_t capacity) override {
    return ResizePhysical(capacity);
  }

  /// \brief Allocate enough memory for a given number of runs.
  Status ResizePhysical(int64_t capacity);

  /// \brief Ensure that there is enough space allocated to append the indicated
  /// number of runs without any further reallocation.
  ///
  /// Overallocation is used in order to minimize the impact of incremental
  /// ReservePhysical() calls. Note that additional_capacity is relative to the
  /// current number of elements rather than to the current capacity, so calls
  /// to Reserve() which are not interspersed with addition of new elements may
  /// not increase the capacity.
  ///
  /// \param[in] additional_capacity the number of additional runs
  /// \return Status
  Status ReservePhysical(int64_t additional_capacity) {
    return Reserve(additional_capacity);
  }

  void Reset() override;

  Status AppendNull() final {
    return AppendNulls(1);
  }
  Status AppendNulls(int64_t length) override;

  Status AppendEmptyValue() final {
    return AppendEmptyValues(1);
  }
  Status AppendEmptyValues(int64_t length) override;
  Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override;
  Status AppendScalars(const ScalarVector& scalars) override;
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override;
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Typed overload of Finish; delegates to FinishTyped
  Status Finish(std::shared_ptr<RunEndEncodedArray>* out) {
    return FinishTyped(out);
  }

  /// \brief Forces the closing of the current run if one is currently open.
  ///
  /// Call this to guarantee that the current run will not be extended. If
  /// more values are appended afterwards, identical values may end up next to
  /// each other in the values array (i.e. two runs that could have been a
  /// single run).
  Status FinishCurrentRun();

  std::shared_ptr<DataType> type() const override;

 private:
  /// \brief Update physical capacity and logical length
  ///
  /// \param committed_logical_length number of logical values that have been
  /// committed to the values array
  /// \param open_run_length number of logical values in the currently open run if any
  void UpdateDimensions(int64_t committed_logical_length, int64_t open_run_length) {
    // Capacity is taken from the run-ends builder ...
    capacity_ = run_end_builder().capacity();
    // ... while the length counts committed values plus the open run.
    length_ = committed_logical_length + open_run_length;
    committed_logical_length_ = committed_logical_length;
  }

  // Pre-condition: !value_run_builder_.has_open_run()
  template <typename RunEndCType>
  Status DoAppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length);

  template <typename RunEndCType>
  Status DoAppendRunEnd(int64_t run_end);

  /// \brief Cast run_end to the appropriate type and append it to the
  /// run_ends array.
  Status AppendRunEnd(int64_t run_end);

  /// \brief Close a run by appending a value to the run_ends array and updating
  /// length_ to reflect the new run.
  ///
  /// Pre-condition: run_length > 0.
  [[nodiscard]] Status CloseRun(int64_t run_length);

  ArrayBuilder& run_end_builder();
  ArrayBuilder& value_builder();

 private:
  std::shared_ptr<RunEndEncodedType> type_;
  ValueRunBuilder* value_run_builder_;
  // The length not counting the current open run in the value_run_builder_
  int64_t committed_logical_length_ = 0;
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,66 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Contains declarations of time related Arrow builder types.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/array/builder_primitive.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup temporal-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
// TODO(ARROW-7938): this class is untested
|
||||
|
||||
/// \brief Builder for day-time interval arrays.
///
/// The class adds nothing to NumericBuilder<DayTimeIntervalType> beyond the
/// DayMilliseconds alias and a constructor that defaults the data type to
/// day_time_interval().
class ARROW_EXPORT DayTimeIntervalBuilder : public NumericBuilder<DayTimeIntervalType> {
 public:
  using DayMilliseconds = DayTimeIntervalType::DayMilliseconds;

  /// \brief Construct a builder whose type is day_time_interval().
  ///
  /// \param[in] pool memory pool forwarded to the typed constructor
  /// \param[in] alignment buffer alignment forwarded to the typed constructor
  explicit DayTimeIntervalBuilder(MemoryPool* pool = default_memory_pool(),
                                  int64_t alignment = kDefaultBufferAlignment)
      : DayTimeIntervalBuilder(day_time_interval(), pool, alignment) {}

  /// \brief Construct a builder with an explicitly supplied data type.
  ///
  /// \param[in] type data type handed to NumericBuilder
  /// \param[in] pool memory pool handed to NumericBuilder
  /// \param[in] alignment buffer alignment handed to NumericBuilder
  explicit DayTimeIntervalBuilder(std::shared_ptr<DataType> type,
                                  MemoryPool* pool = default_memory_pool(),
                                  int64_t alignment = kDefaultBufferAlignment)
      // `NumericBuilder` is the injected class name of the base, i.e.
      // NumericBuilder<DayTimeIntervalType>.
      : NumericBuilder(type, pool, alignment) {}
};
|
||||
|
||||
class ARROW_EXPORT MonthDayNanoIntervalBuilder
|
||||
: public NumericBuilder<MonthDayNanoIntervalType> {
|
||||
public:
|
||||
using MonthDayNanos = MonthDayNanoIntervalType::MonthDayNanos;
|
||||
|
||||
explicit MonthDayNanoIntervalBuilder(MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: MonthDayNanoIntervalBuilder(month_day_nano_interval(), pool, alignment) {}
|
||||
|
||||
explicit MonthDayNanoIntervalBuilder(std::shared_ptr<DataType> type,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: NumericBuilder<MonthDayNanoIntervalType>(type, pool, alignment) {}
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,254 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_nested.h"
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/buffer_builder.h"
|
||||
#include "arrow/memory_pool.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup nested-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Base class for union array builds.
|
||||
///
|
||||
/// Note that while we subclass ArrayBuilder, as union types do not have a
|
||||
/// validity bitmap, the bitmap builder member of ArrayBuilder is not used.
|
||||
class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder {
|
||||
public:
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<UnionArray>* out) { return FinishTyped(out); }
|
||||
|
||||
/// \brief Make a new child builder available to the UnionArray
|
||||
///
|
||||
/// \param[in] new_child the child builder
|
||||
/// \param[in] field_name the name of the field in the union array type
|
||||
/// if type inference is used
|
||||
/// \return child index, which is the "type" argument that needs
|
||||
/// to be passed to the "Append" method to add a new element to
|
||||
/// the union array.
|
||||
int8_t AppendChild(const std::shared_ptr<ArrayBuilder>& new_child,
|
||||
const std::string& field_name = "");
|
||||
|
||||
std::shared_ptr<DataType> type() const override;
|
||||
|
||||
int64_t length() const override { return types_builder_.length(); }
|
||||
|
||||
protected:
|
||||
BasicUnionBuilder(MemoryPool* pool, int64_t alignment,
|
||||
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
|
||||
const std::shared_ptr<DataType>& type);
|
||||
|
||||
int8_t NextTypeId();
|
||||
|
||||
std::vector<std::shared_ptr<Field>> child_fields_;
|
||||
std::vector<int8_t> type_codes_;
|
||||
UnionMode::type mode_;
|
||||
|
||||
std::vector<ArrayBuilder*> type_id_to_children_;
|
||||
std::vector<int> type_id_to_child_id_;
|
||||
// for all type_id < dense_type_id_, type_id_to_children_[type_id] != nullptr
|
||||
int8_t dense_type_id_ = 0;
|
||||
TypedBufferBuilder<int8_t> types_builder_;
|
||||
};
|
||||
|
||||
/// \class DenseUnionBuilder
|
||||
///
|
||||
/// This API is EXPERIMENTAL.
|
||||
class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder {
|
||||
public:
|
||||
/// Use this constructor to initialize the UnionBuilder with no child builders,
|
||||
/// allowing type to be inferred. You will need to call AppendChild for each of the
|
||||
/// children builders you want to use.
|
||||
explicit DenseUnionBuilder(MemoryPool* pool,
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: BasicUnionBuilder(pool, alignment, {}, dense_union(FieldVector{})),
|
||||
offsets_builder_(pool, alignment) {}
|
||||
|
||||
/// Use this constructor to specify the type explicitly.
|
||||
/// You can still add child builders to the union after using this constructor
|
||||
DenseUnionBuilder(MemoryPool* pool,
|
||||
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
|
||||
const std::shared_ptr<DataType>& type,
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: BasicUnionBuilder(pool, alignment, children, type),
|
||||
offsets_builder_(pool, alignment) {}
|
||||
|
||||
Status AppendNull() final {
|
||||
const int8_t first_child_code = type_codes_[0];
|
||||
ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
|
||||
ARROW_RETURN_NOT_OK(
|
||||
offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
|
||||
// Append a null arbitrarily to the first child
|
||||
return child_builder->AppendNull();
|
||||
}
|
||||
|
||||
Status AppendNulls(int64_t length) final {
|
||||
const int8_t first_child_code = type_codes_[0];
|
||||
ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
|
||||
ARROW_RETURN_NOT_OK(
|
||||
offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
|
||||
// Append just a single null to the first child
|
||||
return child_builder->AppendNull();
|
||||
}
|
||||
|
||||
Status AppendEmptyValue() final {
|
||||
const int8_t first_child_code = type_codes_[0];
|
||||
ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
|
||||
ARROW_RETURN_NOT_OK(
|
||||
offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
|
||||
// Append an empty value arbitrarily to the first child
|
||||
return child_builder->AppendEmptyValue();
|
||||
}
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
const int8_t first_child_code = type_codes_[0];
|
||||
ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
|
||||
ARROW_RETURN_NOT_OK(
|
||||
offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
|
||||
// Append just a single empty value to the first child
|
||||
return child_builder->AppendEmptyValue();
|
||||
}
|
||||
|
||||
/// \brief Append an element to the UnionArray. This must be followed
|
||||
/// by an append to the appropriate child builder.
|
||||
///
|
||||
/// \param[in] next_type type_id of the child to which the next value will be appended.
|
||||
///
|
||||
/// The corresponding child builder must be appended to independently after this method
|
||||
/// is called.
|
||||
Status Append(int8_t next_type) {
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(next_type));
|
||||
if (type_id_to_children_[next_type]->length() == kListMaximumElements) {
|
||||
return Status::CapacityError(
|
||||
"a dense UnionArray cannot contain more than 2^31 - 1 elements from a single "
|
||||
"child");
|
||||
}
|
||||
auto offset = static_cast<int32_t>(type_id_to_children_[next_type]->length());
|
||||
return offsets_builder_.Append(offset);
|
||||
}
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length) override;
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
private:
|
||||
TypedBufferBuilder<int32_t> offsets_builder_;
|
||||
};
|
||||
|
||||
/// \class SparseUnionBuilder
|
||||
///
|
||||
/// This API is EXPERIMENTAL.
|
||||
class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder {
|
||||
public:
|
||||
/// Use this constructor to initialize the UnionBuilder with no child builders,
|
||||
/// allowing type to be inferred. You will need to call AppendChild for each of the
|
||||
/// children builders you want to use.
|
||||
explicit SparseUnionBuilder(MemoryPool* pool,
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: BasicUnionBuilder(pool, alignment, {}, sparse_union(FieldVector{})) {}
|
||||
|
||||
/// Use this constructor to specify the type explicitly.
|
||||
/// You can still add child builders to the union after using this constructor
|
||||
SparseUnionBuilder(MemoryPool* pool,
|
||||
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
|
||||
const std::shared_ptr<DataType>& type,
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: BasicUnionBuilder(pool, alignment, children, type) {}
|
||||
|
||||
/// \brief Append a null value.
|
||||
///
|
||||
/// A null is appended to the first child, empty values to the other children.
|
||||
Status AppendNull() final {
|
||||
const auto first_child_code = type_codes_[0];
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
|
||||
ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNull());
|
||||
for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
|
||||
ARROW_RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendEmptyValue());
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append multiple null values.
|
||||
///
|
||||
/// Nulls are appended to the first child, empty values to the other children.
|
||||
Status AppendNulls(int64_t length) final {
|
||||
const auto first_child_code = type_codes_[0];
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
|
||||
ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNulls(length));
|
||||
for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
|
||||
ARROW_RETURN_NOT_OK(
|
||||
type_id_to_children_[type_codes_[i]]->AppendEmptyValues(length));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValue() final {
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(type_codes_[0]));
|
||||
for (int8_t code : type_codes_) {
|
||||
ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValue());
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0]));
|
||||
for (int8_t code : type_codes_) {
|
||||
ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValues(length));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append an element to the UnionArray. This must be followed
|
||||
/// by an append to the appropriate child builder.
|
||||
///
|
||||
/// \param[in] next_type type_id of the child to which the next value will be appended.
|
||||
///
|
||||
/// The corresponding child builder must be appended to independently after this method
|
||||
/// is called, and all other child builders must have null or empty value appended.
|
||||
Status Append(int8_t next_type) { return types_builder_.Append(next_type); }
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length) override;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,53 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
/// \brief Concatenate arrays
|
||||
///
|
||||
/// \param[in] arrays a vector of arrays to be concatenated
|
||||
/// \param[in] pool memory to store the result will be allocated from this memory pool
|
||||
/// \param[out] out_suggested_cast if a non-OK Result is returned, the function might set
|
||||
/// out_suggested_cast to a cast suggestion that would allow concatenating the arrays
|
||||
/// without overflow of offsets (e.g. string to large_string)
|
||||
///
|
||||
/// \return the concatenated array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays, MemoryPool* pool,
|
||||
std::shared_ptr<DataType>* out_suggested_cast);
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \brief Concatenate arrays
|
||||
///
|
||||
/// \param[in] arrays a vector of arrays to be concatenated
|
||||
/// \param[in] pool memory to store the result will be allocated from this memory pool
|
||||
/// \return the concatenated array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,750 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <atomic> // IWYU pragma: export
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/statistics.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/span.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace internal {
|
||||
// ----------------------------------------------------------------------
|
||||
// Null handling for types without a validity bitmap and the dictionary type
|
||||
|
||||
ARROW_EXPORT bool IsNullSparseUnion(const ArrayData& data, int64_t i);
|
||||
ARROW_EXPORT bool IsNullDenseUnion(const ArrayData& data, int64_t i);
|
||||
ARROW_EXPORT bool IsNullRunEndEncoded(const ArrayData& data, int64_t i);
|
||||
|
||||
ARROW_EXPORT bool UnionMayHaveLogicalNulls(const ArrayData& data);
|
||||
ARROW_EXPORT bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data);
|
||||
ARROW_EXPORT bool DictionaryMayHaveLogicalNulls(const ArrayData& data);
|
||||
|
||||
} // namespace internal
|
||||
|
||||
// When slicing, we do not know the null count of the sliced range without
|
||||
// doing some computation. To avoid doing this eagerly, we set the null count
|
||||
// to -1 (any negative number will do). When Array::null_count is called the
|
||||
// first time, the null count will be computed. See ARROW-33
|
||||
constexpr int64_t kUnknownNullCount = -1;
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Generic array data container
|
||||
|
||||
/// \class ArrayData
|
||||
/// \brief Mutable container for generic Arrow array data
|
||||
///
|
||||
/// This data structure is a self-contained representation of the memory and
|
||||
/// metadata inside an Arrow array data structure (called vectors in Java). The
|
||||
/// Array class and its concrete subclasses provide strongly-typed accessors
|
||||
/// with support for the visitor pattern and other affordances.
|
||||
///
|
||||
/// This class is designed for easy internal data manipulation, analytical data
|
||||
/// processing, and data transport to and from IPC messages.
|
||||
///
|
||||
/// This class is also useful in an analytics setting where memory may be
|
||||
/// efficiently reused. For example, computing the Abs of a numeric array
|
||||
/// should return null iff the input is null: therefore, an Abs function can
|
||||
/// reuse the validity bitmap (a Buffer) of its input as the validity bitmap
|
||||
/// of its output.
|
||||
///
|
||||
/// This class is meant mostly for immutable data access. Any mutable access
|
||||
/// (either to ArrayData members or to the contents of its Buffers) should take
|
||||
/// into account the fact that ArrayData instances are typically wrapped in a
|
||||
/// shared_ptr and can therefore have multiple owners at any given time.
|
||||
/// Therefore, mutable access is discouraged except when initially populating
|
||||
/// the ArrayData.
|
||||
struct ARROW_EXPORT ArrayData {
|
||||
ArrayData() = default;
|
||||
|
||||
ArrayData(std::shared_ptr<DataType> type, int64_t length,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
|
||||
: type(std::move(type)), length(length), null_count(null_count), offset(offset) {}
|
||||
|
||||
ArrayData(std::shared_ptr<DataType> type, int64_t length,
|
||||
std::vector<std::shared_ptr<Buffer>> buffers,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
|
||||
: ArrayData(std::move(type), length, null_count, offset) {
|
||||
this->buffers = std::move(buffers);
|
||||
#ifndef NDEBUG
|
||||
// in debug mode, call the `device_type` function to trigger
|
||||
// the DCHECKs that validate all the buffers are on the same device
|
||||
ARROW_UNUSED(this->device_type());
|
||||
#endif
|
||||
}
|
||||
|
||||
ArrayData(std::shared_ptr<DataType> type, int64_t length,
|
||||
std::vector<std::shared_ptr<Buffer>> buffers,
|
||||
std::vector<std::shared_ptr<ArrayData>> child_data,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
|
||||
: ArrayData(std::move(type), length, null_count, offset) {
|
||||
this->buffers = std::move(buffers);
|
||||
this->child_data = std::move(child_data);
|
||||
#ifndef NDEBUG
|
||||
// in debug mode, call the `device_type` function to trigger
|
||||
// the DCHECKs that validate all the buffers (including children)
|
||||
// are on the same device
|
||||
ARROW_UNUSED(this->device_type());
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
|
||||
std::vector<std::shared_ptr<Buffer>> buffers,
|
||||
int64_t null_count = kUnknownNullCount,
|
||||
int64_t offset = 0);
|
||||
|
||||
static std::shared_ptr<ArrayData> Make(
|
||||
std::shared_ptr<DataType> type, int64_t length,
|
||||
std::vector<std::shared_ptr<Buffer>> buffers,
|
||||
std::vector<std::shared_ptr<ArrayData>> child_data,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
static std::shared_ptr<ArrayData> Make(
|
||||
std::shared_ptr<DataType> type, int64_t length,
|
||||
std::vector<std::shared_ptr<Buffer>> buffers,
|
||||
std::vector<std::shared_ptr<ArrayData>> child_data,
|
||||
std::shared_ptr<ArrayData> dictionary, int64_t null_count = kUnknownNullCount,
|
||||
int64_t offset = 0);
|
||||
|
||||
static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
|
||||
int64_t null_count = kUnknownNullCount,
|
||||
int64_t offset = 0);
|
||||
|
||||
// Move constructor
|
||||
ArrayData(ArrayData&& other) noexcept
|
||||
: type(std::move(other.type)),
|
||||
length(other.length),
|
||||
null_count(other.null_count.load()),
|
||||
offset(other.offset),
|
||||
buffers(std::move(other.buffers)),
|
||||
child_data(std::move(other.child_data)),
|
||||
dictionary(std::move(other.dictionary)),
|
||||
statistics(std::move(other.statistics)) {}
|
||||
|
||||
// Copy constructor
|
||||
ArrayData(const ArrayData& other) noexcept
|
||||
: type(other.type),
|
||||
length(other.length),
|
||||
null_count(other.null_count.load()),
|
||||
offset(other.offset),
|
||||
buffers(other.buffers),
|
||||
child_data(other.child_data),
|
||||
dictionary(other.dictionary),
|
||||
statistics(other.statistics) {}
|
||||
|
||||
// Move assignment
|
||||
ArrayData& operator=(ArrayData&& other) noexcept {
  // Guard against self-move: move-assigning a std::vector onto itself would
  // leave buffers / child_data in an unspecified state.
  if (this != &other) {
    type = std::move(other.type);
    length = other.length;
    // null_count is an atomic, so it is transferred via an explicit load/store
    // instead of being moved.
    SetNullCount(other.null_count.load());
    offset = other.offset;
    buffers = std::move(other.buffers);
    child_data = std::move(other.child_data);
    dictionary = std::move(other.dictionary);
    statistics = std::move(other.statistics);
  }
  return *this;
}
|
||||
|
||||
// Copy assignment
|
||||
ArrayData& operator=(const ArrayData& other) {
|
||||
type = other.type;
|
||||
length = other.length;
|
||||
SetNullCount(other.null_count);
|
||||
offset = other.offset;
|
||||
buffers = other.buffers;
|
||||
child_data = other.child_data;
|
||||
dictionary = other.dictionary;
|
||||
statistics = other.statistics;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// \brief Return a shallow copy of this ArrayData
|
||||
std::shared_ptr<ArrayData> Copy() const { return std::make_shared<ArrayData>(*this); }
|
||||
|
||||
/// \brief Deep copy this ArrayData to destination memory manager
|
||||
///
|
||||
/// Returns a new ArrayData object with buffers and all child buffers
|
||||
/// copied to the destination memory manager. This includes dictionaries
|
||||
/// if applicable.
|
||||
Result<std::shared_ptr<ArrayData>> CopyTo(
|
||||
const std::shared_ptr<MemoryManager>& to) const;
|
||||
|
||||
/// \brief View or copy this ArrayData to destination memory manager
|
||||
///
|
||||
/// Tries to view the buffer contents on the given memory manager's device
|
||||
/// if possible (to avoid a copy) but falls back to copying if a no-copy view
|
||||
/// isn't supported.
|
||||
Result<std::shared_ptr<ArrayData>> ViewOrCopyTo(
|
||||
const std::shared_ptr<MemoryManager>& to) const;
|
||||
|
||||
/// \brief Return the null-ness of a given array element
|
||||
///
|
||||
/// Calling `IsNull(i)` is the same as `!IsValid(i)`.
|
||||
bool IsNull(int64_t i) const { return !IsValid(i); }
|
||||
|
||||
/// \brief Return the validity of a given array element
|
||||
///
|
||||
/// For most data types, this will simply query the validity bitmap.
|
||||
/// For union and run-end-encoded arrays, the underlying child data is
|
||||
/// queried instead.
|
||||
/// For dictionary arrays, this reflects the validity of the dictionary
|
||||
/// index, but the corresponding dictionary value might still be null.
|
||||
/// For null arrays, this always returns false.
|
||||
bool IsValid(int64_t i) const {
|
||||
if (buffers[0] != NULLPTR) {
|
||||
return bit_util::GetBit(buffers[0]->data(), i + offset);
|
||||
}
|
||||
const auto type = this->type->id();
|
||||
if (type == Type::SPARSE_UNION) {
|
||||
return !internal::IsNullSparseUnion(*this, i);
|
||||
}
|
||||
if (type == Type::DENSE_UNION) {
|
||||
return !internal::IsNullDenseUnion(*this, i);
|
||||
}
|
||||
if (type == Type::RUN_END_ENCODED) {
|
||||
return !internal::IsNullRunEndEncoded(*this, i);
|
||||
}
|
||||
return null_count.load() != length;
|
||||
}
|
||||
|
||||
/// \brief Access a buffer's data as a typed C pointer
|
||||
///
|
||||
/// \param i the buffer index
|
||||
/// \param absolute_offset the offset into the buffer
|
||||
///
|
||||
/// If `absolute_offset` is non-zero, the type `T` must match the
|
||||
/// layout of buffer number `i` for the array's data type; otherwise
|
||||
/// offset computation would be incorrect.
|
||||
///
|
||||
/// If the given buffer is bit-packed (such as a validity bitmap, or
|
||||
/// the data buffer of a boolean array), then `absolute_offset` must be
|
||||
/// zero for correct results, and any bit offset must be applied manually
|
||||
/// by the caller.
|
||||
template <typename T>
|
||||
inline const T* GetValues(int i, int64_t absolute_offset) const {
|
||||
if (buffers[i]) {
|
||||
return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
|
||||
} else {
|
||||
return NULLPTR;
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Access a buffer's data as a typed C pointer
|
||||
///
|
||||
/// \param i the buffer index
|
||||
///
|
||||
/// This method uses the array's offset to index into buffer number `i`.
|
||||
///
|
||||
/// Calling this method on a bit-packed buffer (such as a validity bitmap, or
|
||||
/// the data buffer of a boolean array) will lead to incorrect results.
|
||||
/// You should instead call `GetValues(i, 0)` and apply the bit offset manually.
|
||||
template <typename T>
|
||||
inline const T* GetValues(int i) const {
|
||||
return GetValues<T>(i, offset);
|
||||
}
|
||||
|
||||
/// \brief Access a buffer's data as a typed C pointer
|
||||
///
|
||||
/// \param i the buffer index
|
||||
/// \param absolute_offset the offset into the buffer
|
||||
///
|
||||
/// Like `GetValues(i, absolute_offset)`, but returns nullptr if the given buffer
|
||||
/// is not a CPU buffer.
|
||||
template <typename T>
|
||||
inline const T* GetValuesSafe(int i, int64_t absolute_offset) const {
|
||||
if (buffers[i] && buffers[i]->is_cpu()) {
|
||||
return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
|
||||
} else {
|
||||
return NULLPTR;
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Access a buffer's data as a typed C pointer
|
||||
///
|
||||
/// \param i the buffer index
|
||||
///
|
||||
/// Like `GetValues(i)`, but returns nullptr if the given buffer is not a CPU buffer.
|
||||
template <typename T>
|
||||
inline const T* GetValuesSafe(int i) const {
|
||||
return GetValuesSafe<T>(i, offset);
|
||||
}
|
||||
|
||||
/// \brief Access a buffer's data as a mutable typed C pointer
|
||||
///
|
||||
/// \param i the buffer index
|
||||
/// \param absolute_offset the offset into the buffer
|
||||
///
|
||||
/// Like `GetValues(i, absolute_offset)`, but allows mutating buffer contents.
|
||||
/// This should only be used when initially populating the ArrayData, before
|
||||
/// it is attached to an Array instance.
|
||||
template <typename T>
|
||||
inline T* GetMutableValues(int i, int64_t absolute_offset) {
|
||||
if (buffers[i]) {
|
||||
return reinterpret_cast<T*>(buffers[i]->mutable_data()) + absolute_offset;
|
||||
} else {
|
||||
return NULLPTR;
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Access a buffer's data as a mutable typed C pointer
|
||||
///
|
||||
/// \param i the buffer index
|
||||
///
|
||||
/// Like `GetValues(i)`, but allows mutating buffer contents.
|
||||
/// This should only be used when initially populating the ArrayData, before
|
||||
/// it is attached to an Array instance.
|
||||
template <typename T>
|
||||
inline T* GetMutableValues(int i) {
|
||||
return GetMutableValues<T>(i, offset);
|
||||
}
|
||||
|
||||
/// \brief Construct a zero-copy slice of the data with the given offset and length
|
||||
///
|
||||
/// This method applies the given slice to this ArrayData, taking into account
|
||||
/// its existing offset and length.
|
||||
/// If the given `length` is too large, the slice length is clamped so as not
|
||||
/// to go past the offset end.
|
||||
/// If the given `offset` is too large, or if either `offset` or `length` is negative,
|
||||
/// behavior is undefined.
|
||||
///
|
||||
/// The associated ArrayStatistics is always discarded in a sliced
|
||||
/// ArrayData, even if the slice is trivially equal to the original ArrayData.
|
||||
/// If you want to reuse the statistics from the original ArrayData, you must
|
||||
/// explicitly reattach them.
|
||||
std::shared_ptr<ArrayData> Slice(int64_t offset, int64_t length) const;
|
||||
|
||||
/// \brief Construct a zero-copy slice of the data with the given offset and length
|
||||
///
|
||||
/// Like `Slice(offset, length)`, but returns an error if the requested slice
|
||||
/// falls out of bounds.
|
||||
/// Unlike Slice, `length` isn't clamped to the available buffer size.
|
||||
Result<std::shared_ptr<ArrayData>> SliceSafe(int64_t offset, int64_t length) const;
|
||||
|
||||
/// \brief Set the cached physical null count
|
||||
///
|
||||
/// \param v the number of nulls in the ArrayData
|
||||
///
|
||||
/// This should only be used when initially populating the ArrayData, if
|
||||
/// it is possible to compute the null count without visiting the entire validity
|
||||
/// bitmap. In most cases, relying on `GetNullCount` is sufficient.
|
||||
void SetNullCount(int64_t v) { null_count.store(v); }
|
||||
|
||||
/// \brief Return the physical null count
|
||||
///
|
||||
/// This method returns the number of array elements for which `IsValid` would
|
||||
/// return false.
|
||||
///
|
||||
/// A cached value is returned if already available, otherwise it is first
|
||||
/// computed and stored.
|
||||
/// How it is computed depends on the data type, see `IsValid` for details.
|
||||
///
|
||||
/// Note that this method is typically much faster than calling `IsValid`
|
||||
/// for all elements. Therefore, it helps avoid per-element validity bitmap
|
||||
/// lookups in the common cases where the array contains zero or only nulls.
|
||||
int64_t GetNullCount() const;
|
||||
|
||||
/// \brief Return true if the array may have nulls in its validity bitmap
|
||||
///
|
||||
/// This method returns true if the data has a validity bitmap, and the physical
|
||||
/// null count is either known to be non-zero or not yet known.
|
||||
///
|
||||
/// Unlike `MayHaveLogicalNulls`, this does not check for the presence of nulls
|
||||
/// in child data for data types such as unions and run-end encoded types.
|
||||
///
|
||||
/// \see HasValidityBitmap
|
||||
/// \see MayHaveLogicalNulls
|
||||
bool MayHaveNulls() const {
|
||||
// If an ArrayData is slightly malformed it may have kUnknownNullCount set
|
||||
// but no buffer
// The cached null count alone is therefore not trusted: the result is true
// only when the cached count is non-zero AND buffers[0] (the validity
// bitmap, see HasValidityBitmap) is actually present.
|
||||
return null_count.load() != 0 && buffers[0] != NULLPTR;
|
||||
}
|
||||
|
||||
/// \brief Return true if the array has a validity bitmap
|
||||
bool HasValidityBitmap() const { return buffers[0] != NULLPTR; }
|
||||
|
||||
/// \brief Return true if the array may have logical nulls
|
||||
///
|
||||
/// Unlike `MayHaveNulls`, this method checks for null child values
|
||||
/// for types without a validity bitmap, such as unions and run-end encoded
|
||||
/// types, and for null dictionary values for dictionary types.
|
||||
///
|
||||
/// This implies that `MayHaveLogicalNulls` may return true for arrays that
|
||||
/// don't have a top-level validity bitmap. It is therefore necessary
|
||||
/// to call `HasValidityBitmap` before accessing a top-level validity bitmap.
|
||||
///
|
||||
/// Code that previously used MayHaveNulls and then dealt with the validity
|
||||
/// bitmap directly can be fixed to handle all types correctly without
|
||||
/// performance degradation when handling most types by adopting
|
||||
/// HasValidityBitmap and MayHaveLogicalNulls.
|
||||
///
|
||||
/// Before:
|
||||
///
|
||||
/// uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
|
||||
/// for (int64_t i = 0; i < array.length; ++i) {
|
||||
/// if (validity && !bit_util::GetBit(validity, i)) {
|
||||
/// continue; // skip a NULL
|
||||
/// }
|
||||
/// ...
|
||||
/// }
|
||||
///
|
||||
/// After:
|
||||
///
|
||||
/// bool all_valid = !array.MayHaveLogicalNulls();
|
||||
/// uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR;
|
||||
/// for (int64_t i = 0; i < array.length; ++i) {
|
||||
/// bool is_valid = all_valid ||
|
||||
/// (validity && bit_util::GetBit(validity, i)) ||
|
||||
/// array.IsValid(i);
|
||||
/// if (!is_valid) {
|
||||
/// continue; // skip a NULL
|
||||
/// }
|
||||
/// ...
|
||||
/// }
|
||||
bool MayHaveLogicalNulls() const {
|
||||
// A top-level validity bitmap is present: the cached null count alone
// decides, whatever the data type is.
if (buffers[0] != NULLPTR) {
|
||||
return null_count.load() != 0;
|
||||
}
|
||||
// No validity bitmap from here on. Union, run-end encoded and dictionary
// types are delegated to type-specific helpers in the `internal` namespace.
const auto t = type->id();
|
||||
if (t == Type::SPARSE_UNION || t == Type::DENSE_UNION) {
|
||||
return internal::UnionMayHaveLogicalNulls(*this);
|
||||
}
|
||||
if (t == Type::RUN_END_ENCODED) {
|
||||
return internal::RunEndEncodedMayHaveLogicalNulls(*this);
|
||||
}
|
||||
if (t == Type::DICTIONARY) {
|
||||
return internal::DictionaryMayHaveLogicalNulls(*this);
|
||||
}
|
||||
// Any other type without a validity bitmap: fall back to the cached
// null count.
return null_count.load() != 0;
|
||||
}
|
||||
|
||||
/// \brief Compute the logical null count for arrays of all types
|
||||
///
|
||||
/// If the array has a validity bitmap, this function behaves the same as
|
||||
/// GetNullCount. For arrays that have no validity bitmap but whose values
|
||||
/// may be logically null (such as union arrays and run-end encoded arrays),
|
||||
/// this function recomputes the null count every time it is called.
|
||||
///
|
||||
/// \see GetNullCount
|
||||
int64_t ComputeLogicalNullCount() const;
|
||||
|
||||
/// \brief Return the device_type of the underlying buffers and children
|
||||
///
|
||||
/// If there are no buffers in this ArrayData object, it just returns
|
||||
/// DeviceAllocationType::kCPU as a default. We also assume that all buffers
|
||||
/// should be allocated on the same device type and perform DCHECKs to confirm
|
||||
/// this in debug mode.
|
||||
///
|
||||
/// \return DeviceAllocationType
|
||||
DeviceAllocationType device_type() const;
|
||||
|
||||
// The data type of this ArrayData. Its id() selects the type-specific
// code paths in MayHaveLogicalNulls.
std::shared_ptr<DataType> type;
|
||||
// The length of the array. NOTE(review): presumably counted in logical
// elements, as the documentation example for MayHaveLogicalNulls iterates
// `i < array.length` -- confirm.
int64_t length = 0;
|
||||
// The cached physical null count (written by SetNullCount). It is mutable
// because the const method GetNullCount is documented to compute and store
// the value when it is not yet available.
mutable std::atomic<int64_t> null_count{0};
|
||||
// The logical start point into the physical buffers (in values, not bytes).
|
||||
// Note that, for child data, this must be *added* to the child data's own offset.
|
||||
int64_t offset = 0;
|
||||
// The physical buffers. buffers[0] is treated as the validity bitmap by
// HasValidityBitmap / MayHaveNulls and may be NULLPTR.
std::vector<std::shared_ptr<Buffer>> buffers;
|
||||
// Child ArrayData. NOTE(review): presumably the children of nested types;
// verify against the layout of the data type in use.
std::vector<std::shared_ptr<ArrayData>> child_data;
|
||||
|
||||
// The dictionary for this Array, if any. Only used for dictionary type
|
||||
std::shared_ptr<ArrayData> dictionary;
|
||||
|
||||
// The statistics for this Array.
|
||||
std::shared_ptr<ArrayStatistics> statistics;
|
||||
};
|
||||
|
||||
/// \brief A non-owning Buffer reference
|
||||
struct ARROW_EXPORT BufferSpan {
|
||||
// It is the user of this class's responsibility to ensure that
|
||||
// buffers that were const originally are not written to
|
||||
// accidentally.
|
||||
uint8_t* data = NULLPTR;
|
||||
// Size of the referenced memory in bytes (ArraySpan::GetSpan divides it by
// sizeof(T) to obtain a number of values).
int64_t size = 0;
|
||||
// Pointer back to buffer that owns this memory
// May be NULLPTR when the memory has no owning Buffer; ArraySpan::GetBuffer
// handles that case explicitly.
|
||||
const std::shared_ptr<Buffer>* owner = NULLPTR;
|
||||
|
||||
/// \brief Reinterpret `data` as a read-only pointer to T
///
/// This is a plain reinterpret_cast: no offset is applied and neither the
/// size nor the alignment of the memory is checked.
template <typename T>
|
||||
const T* data_as() const {
|
||||
return reinterpret_cast<const T*>(data);
|
||||
}
|
||||
/// \brief Reinterpret `data` as a writable pointer to T
///
/// Same as data_as(), but without const. See the note at the top of this
/// struct about not writing to buffers that were originally const.
template <typename T>
|
||||
T* mutable_data_as() {
|
||||
return reinterpret_cast<T*>(data);
|
||||
}
|
||||
};
|
||||
|
||||
/// \brief EXPERIMENTAL: A non-owning array data container
|
||||
///
|
||||
/// Unlike ArrayData, this class doesn't own its referenced data type nor data buffers.
|
||||
/// It is cheaply copyable and can therefore be suitable for use cases where
|
||||
/// shared_ptr overhead is not acceptable. However, care should be taken to
|
||||
/// keep alive the referenced objects and memory while the ArraySpan object is in use.
|
||||
/// For this reason, this should not be exposed in most public APIs (apart from
|
||||
/// compute kernel interfaces).
|
||||
struct ARROW_EXPORT ArraySpan {
|
||||
const DataType* type = NULLPTR;
|
||||
int64_t length = 0;
|
||||
mutable int64_t null_count = kUnknownNullCount;
|
||||
int64_t offset = 0;
|
||||
BufferSpan buffers[3];
|
||||
|
||||
ArraySpan() = default;
|
||||
|
||||
/// \brief Construct a span with the given type and length
///
/// All other members keep their in-class defaults: null_count is
/// kUnknownNullCount, offset is 0 and the three BufferSpans are empty.
explicit ArraySpan(const DataType* type, int64_t length) : type(type), length(length) {}
|
||||
|
||||
/// \brief Construct a span from an ArrayData by delegating to SetMembers
///
/// Deliberately not `explicit`, so that an ArrayData converts implicitly
/// (hence the lint suppression).
ArraySpan(const ArrayData& data) { // NOLINT implicit conversion
|
||||
SetMembers(data);
|
||||
}
|
||||
/// \brief Construct a span from a Scalar by delegating to FillFromScalar
explicit ArraySpan(const Scalar& data) { FillFromScalar(data); }
|
||||
|
||||
/// If dictionary-encoded, put dictionary in the first entry
|
||||
std::vector<ArraySpan> child_data;
|
||||
|
||||
/// \brief Populate ArraySpan to look like an array of length 1 pointing at
|
||||
/// the data members of a Scalar value
|
||||
void FillFromScalar(const Scalar& value);
|
||||
|
||||
void SetMembers(const ArrayData& data);
|
||||
|
||||
/// \brief Point buffers[index] at the memory of the given Buffer
///
/// \param index the slot in `buffers` to fill; it is not range-checked here
/// \param buffer the Buffer to reference; it is dereferenced unconditionally,
/// so it must not be null
void SetBuffer(int index, const std::shared_ptr<Buffer>& buffer) {
|
||||
// Constness is cast away here; see the note in BufferSpan about not writing
// to buffers that were originally const.
this->buffers[index].data = const_cast<uint8_t*>(buffer->data());
|
||||
this->buffers[index].size = buffer->size();
|
||||
// This stores the address of the caller's shared_ptr object, not a copy of
// it: that shared_ptr object must stay alive, at the same address, for as
// long as `owner` may be dereferenced (e.g. by GetBuffer).
this->buffers[index].owner = &buffer;
|
||||
}
|
||||
|
||||
/// \brief Return the dictionary, which is stored as the first child
///
/// This returns child_data[0] without checking that child_data is non-empty.
const ArraySpan& dictionary() const { return child_data[0]; }
|
||||
|
||||
/// \brief Return the number of buffers (out of 3) that are used to
|
||||
/// constitute this array
|
||||
int num_buffers() const;
|
||||
|
||||
// Access a buffer's data as a typed C pointer
// (mutable overloads)
//
// The two-argument form advances the pointer by `absolute_offset` elements
// of T (not bytes) from the start of buffers[i]; this->offset is NOT added.
// No bounds or null-pointer check is made.
|
||||
template <typename T>
|
||||
inline T* GetValues(int i, int64_t absolute_offset) {
|
||||
return reinterpret_cast<T*>(buffers[i].data) + absolute_offset;
|
||||
}
|
||||
|
||||
// The one-argument form uses this->offset as the element offset.
template <typename T>
|
||||
inline T* GetValues(int i) {
|
||||
return GetValues<T>(i, this->offset);
|
||||
}
|
||||
|
||||
// Access a buffer's data as a typed C pointer
// (read-only overloads)
//
// The two-argument form advances the pointer by `absolute_offset` elements
// of T (not bytes) from the start of buffers[i]; this->offset is NOT added.
// No bounds or null-pointer check is made.
|
||||
template <typename T>
|
||||
inline const T* GetValues(int i, int64_t absolute_offset) const {
|
||||
return reinterpret_cast<const T*>(buffers[i].data) + absolute_offset;
|
||||
}
|
||||
|
||||
// The one-argument form uses this->offset as the element offset.
template <typename T>
|
||||
inline const T* GetValues(int i) const {
|
||||
return GetValues<T>(i, this->offset);
|
||||
}
|
||||
|
||||
/// \brief Access a buffer's data as a span
|
||||
///
|
||||
/// \param i The buffer index
|
||||
/// \param length The required length (in number of typed values) of the requested span
|
||||
/// \pre i > 0
|
||||
/// \pre length <= the length of the buffer (in number of values) that's expected for
|
||||
/// this array type
|
||||
/// \return A span<const T> of the requested length
|
||||
template <typename T>
|
||||
util::span<const T> GetSpan(int i, int64_t length) const {
|
||||
// Number of whole T values that fit in buffers[i] (its size is in bytes).
const int64_t buffer_length = buffers[i].size / static_cast<int64_t>(sizeof(T));
|
||||
// `length` is the parameter (it shadows the member of the same name), while
// `offset` is this->offset. The preconditions are only checked by assert.
assert(i > 0 && length + offset <= buffer_length);
|
||||
// NOTE(review): presumably silences the unused-variable warning when the
// assert above is compiled out -- confirm against the macro definition.
ARROW_UNUSED(buffer_length);
|
||||
// The span starts this->offset values into the buffer.
return util::span<const T>(buffers[i].data_as<T>() + this->offset, length);
|
||||
}
|
||||
|
||||
/// \brief Access a buffer's data as a span
|
||||
///
|
||||
/// \param i The buffer index
|
||||
/// \param length The required length (in number of typed values) of the requested span
|
||||
/// \pre i > 0
|
||||
/// \pre length <= the length of the buffer (in number of values) that's expected for
|
||||
/// this array type
|
||||
/// \return A span<T> of the requested length
|
||||
template <typename T>
|
||||
util::span<T> GetSpan(int i, int64_t length) {
|
||||
// Number of whole T values that fit in buffers[i] (its size is in bytes).
const int64_t buffer_length = buffers[i].size / static_cast<int64_t>(sizeof(T));
|
||||
// `length` is the parameter (it shadows the member of the same name), while
// `offset` is this->offset. The preconditions are only checked by assert.
assert(i > 0 && length + offset <= buffer_length);
|
||||
// NOTE(review): presumably silences the unused-variable warning when the
// assert above is compiled out -- confirm against the macro definition.
ARROW_UNUSED(buffer_length);
|
||||
// The span starts this->offset values into the buffer.
return util::span<T>(buffers[i].mutable_data_as<T>() + this->offset, length);
|
||||
}
|
||||
|
||||
/// \brief Return true if the element at index i is null; this is exactly the
/// negation of IsValid(i)
inline bool IsNull(int64_t i) const { return !IsValid(i); }
|
||||
|
||||
/// \brief Return true if the element at index i is valid (not null)
///
/// \param i the element index, relative to this span's offset; it is not
/// range-checked
inline bool IsValid(int64_t i) const {
|
||||
if (this->buffers[0].data != NULLPTR) {
|
||||
// A validity bitmap is present: read the bit at the absolute position.
return bit_util::GetBit(this->buffers[0].data, i + this->offset);
|
||||
} else {
|
||||
// No validity bitmap: union and run-end encoded types are resolved by
// dedicated helpers.
const auto type = this->type->id();
|
||||
if (type == Type::SPARSE_UNION) {
|
||||
return !IsNullSparseUnion(i);
|
||||
}
|
||||
if (type == Type::DENSE_UNION) {
|
||||
return !IsNullDenseUnion(i);
|
||||
}
|
||||
if (type == Type::RUN_END_ENCODED) {
|
||||
return !IsNullRunEndEncoded(i);
|
||||
}
|
||||
// Any other type without a bitmap: the answer does not depend on i. Every
// element is reported valid unless null_count equals length.
return this->null_count != this->length;
|
||||
}
|
||||
}
|
||||
|
||||
std::shared_ptr<ArrayData> ToArrayData() const;
|
||||
|
||||
std::shared_ptr<Array> ToArray() const;
|
||||
|
||||
/// \brief Return buffers[index] as a shared_ptr<Buffer>
///
/// \param index the slot in `buffers` to read; it is not range-checked here
/// \return a copy of the owning shared_ptr when the span records an owner,
/// a newly created Buffer over (data, size) when there is memory but no
/// owner, and NULLPTR when the slot is empty
std::shared_ptr<Buffer> GetBuffer(int index) const {
|
||||
const BufferSpan& buf = this->buffers[index];
|
||||
if (buf.owner) {
|
||||
return *buf.owner;
|
||||
} else if (buf.data != NULLPTR) {
|
||||
// Buffer points to some memory without an owning buffer
// NOTE(review): the Buffer created here presumably does not take ownership
// of the memory -- confirm against the Buffer(data, size) constructor.
|
||||
return std::make_shared<Buffer>(buf.data, buf.size);
|
||||
} else {
|
||||
return NULLPTR;
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Re-point this span at the given offset and length, in place
///
/// `offset` replaces the current offset (it is assigned, not added to it).
/// Neither argument is validated. The cached null count is reset as
/// described in the body.
void SetSlice(int64_t offset, int64_t length) {
|
||||
this->offset = offset;
|
||||
this->length = length;
|
||||
if (this->type->id() == Type::NA) {
|
||||
// For the NA type, the null count is set to the new length.
this->null_count = this->length;
|
||||
} else if (this->MayHaveNulls()) {
|
||||
// MayHaveNulls() is evaluated with the null count from before this call.
// The slice may contain fewer nulls, so the count is marked unknown.
this->null_count = kUnknownNullCount;
|
||||
} else {
|
||||
// No bitmap, or a known null count of zero: the slice has no physical nulls.
this->null_count = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Return physical null count, or compute and set it if it's not known
|
||||
int64_t GetNullCount() const;
|
||||
|
||||
/// \brief Return true if the array has a validity bitmap and the physical null
|
||||
/// count is known to be non-zero or not yet known
|
||||
///
|
||||
/// Note that this is not the same as MayHaveLogicalNulls, which also checks
|
||||
/// for the presence of nulls in child data for types like unions and run-end
|
||||
/// encoded types.
|
||||
///
|
||||
/// \see HasValidityBitmap
|
||||
/// \see MayHaveLogicalNulls
|
||||
bool MayHaveNulls() const {
|
||||
// If an ArrayData is slightly malformed it may have kUnknownNullCount set
|
||||
// but no buffer
// The null count alone is therefore not trusted: the result is true only
// when null_count is non-zero AND buffers[0].data (the validity bitmap, see
// HasValidityBitmap) is actually present.
|
||||
return null_count != 0 && buffers[0].data != NULLPTR;
|
||||
}
|
||||
|
||||
/// \brief Return true if the array has a validity bitmap
|
||||
bool HasValidityBitmap() const { return buffers[0].data != NULLPTR; }
|
||||
|
||||
/// \brief Return true if the validity bitmap may have 0's in it, or if the
|
||||
/// child arrays (in the case of types without a validity bitmap) may have
|
||||
/// nulls, or if the dictionary of a dictionary array may have nulls.
|
||||
///
|
||||
/// \see ArrayData::MayHaveLogicalNulls
|
||||
bool MayHaveLogicalNulls() const {
|
||||
// A top-level validity bitmap is present: the null count alone decides,
// whatever the data type is.
if (buffers[0].data != NULLPTR) {
|
||||
return null_count != 0;
|
||||
}
|
||||
// No validity bitmap from here on. Union, run-end encoded and dictionary
// types are delegated to the private helpers of this class.
const auto t = type->id();
|
||||
if (t == Type::SPARSE_UNION || t == Type::DENSE_UNION) {
|
||||
return UnionMayHaveLogicalNulls();
|
||||
}
|
||||
if (t == Type::RUN_END_ENCODED) {
|
||||
return RunEndEncodedMayHaveLogicalNulls();
|
||||
}
|
||||
if (t == Type::DICTIONARY) {
|
||||
return DictionaryMayHaveLogicalNulls();
|
||||
}
|
||||
// Any other type without a validity bitmap: fall back to the null count.
return null_count != 0;
|
||||
}
|
||||
|
||||
/// \brief Compute the logical null count for arrays of all types including
|
||||
/// those that do not have a validity bitmap like union and run-end encoded
|
||||
/// arrays
|
||||
///
|
||||
/// If the array has a validity bitmap, this function behaves the same as
|
||||
/// GetNullCount. For types that have no validity bitmap, this function will
|
||||
/// recompute the logical null count every time it is called.
|
||||
///
|
||||
/// \see GetNullCount
|
||||
int64_t ComputeLogicalNullCount() const;
|
||||
|
||||
/// Some DataTypes (StringView, BinaryView) may have an arbitrary number of variadic
|
||||
/// buffers. Since ArraySpan only has 3 buffers, we pack the variadic buffers into
|
||||
/// buffers[2]; IE buffers[2].data points to the first shared_ptr<Buffer> of the
|
||||
/// variadic set and buffers[2].size is the number of variadic buffers times
|
||||
/// sizeof(shared_ptr<Buffer>).
|
||||
///
|
||||
/// \see HasVariadicBuffers
|
||||
util::span<const std::shared_ptr<Buffer>> GetVariadicBuffers() const;
|
||||
bool HasVariadicBuffers() const;
|
||||
|
||||
private:
|
||||
ARROW_FRIEND_EXPORT friend bool internal::IsNullRunEndEncoded(const ArrayData& data,
|
||||
int64_t i);
|
||||
|
||||
bool IsNullSparseUnion(int64_t i) const;
|
||||
bool IsNullDenseUnion(int64_t i) const;
|
||||
|
||||
/// \brief Return true if the value at logical index i is null
|
||||
///
|
||||
/// This function uses binary-search, so it has a O(log N) cost.
|
||||
/// Iterating over the whole array and calling IsNull is O(N log N), so
|
||||
/// for better performance it is recommended to use a
|
||||
/// ree_util::RunEndEncodedArraySpan to iterate run by run instead.
|
||||
bool IsNullRunEndEncoded(int64_t i) const;
|
||||
|
||||
bool UnionMayHaveLogicalNulls() const;
|
||||
bool RunEndEncodedMayHaveLogicalNulls() const;
|
||||
bool DictionaryMayHaveLogicalNulls() const;
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
void FillZeroLengthArray(const DataType* type, ArraySpan* span);
|
||||
|
||||
/// Construct a zero-copy view of this ArrayData with the given type.
|
||||
///
|
||||
/// This method checks if the types are layout-compatible.
|
||||
/// Nested types are traversed in depth-first order. Data buffers must have
|
||||
/// the same item sizes, even though the logical types may be different.
|
||||
/// An error is returned if the types are not layout-compatible.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<ArrayData>> GetArrayView(const std::shared_ptr<ArrayData>& data,
|
||||
const std::shared_ptr<DataType>& type);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,76 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <iosfwd>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/array_nested.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \brief Compare two arrays, returning an edit script which expresses the difference
|
||||
/// between them
|
||||
///
|
||||
/// An edit script is an array of struct(insert: bool, run_length: int64_t).
|
||||
/// Each element of "insert" determines whether an element was inserted into (true)
|
||||
/// or deleted from (false) base. Each insertion or deletion is followed by a run of
|
||||
/// elements which are unchanged from base to target; the length of this run is stored
|
||||
/// in "run_length". (Note that the edit script begins and ends with a run of shared
|
||||
/// elements but both fields of the struct must have the same length. To accommodate this
|
||||
/// the first element of "insert" should be ignored.)
|
||||
///
|
||||
/// For example for base "hlloo" and target "hello", the edit script would be
|
||||
/// [
|
||||
/// {"insert": false, "run_length": 1}, // leading run of length 1 ("h")
|
||||
/// {"insert": true, "run_length": 3}, // insert("e") then a run of length 3 ("llo")
|
||||
/// {"insert": false, "run_length": 0} // delete("o") then an empty run
|
||||
/// ]
|
||||
///
|
||||
/// Diffing arrays containing nulls is not currently supported.
|
||||
///
|
||||
/// \param[in] base baseline for comparison
|
||||
/// \param[in] target an array of identical type to base whose elements differ from base's
|
||||
/// \param[in] pool memory to store the result will be allocated from this memory pool
|
||||
/// \return an edit script array which can be applied to base to produce target
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief visitor interface for easy traversal of an edit script
|
||||
///
|
||||
/// visitor will be called for each hunk of insertions and deletions.
|
||||
ARROW_EXPORT Status VisitEditScript(
|
||||
const Array& edits,
|
||||
const std::function<Status(int64_t delete_begin, int64_t delete_end,
|
||||
int64_t insert_begin, int64_t insert_end)>& visitor);
|
||||
|
||||
/// \brief return a function which will format an edit script in unified
|
||||
/// diff format to os, given base and target arrays of type `type`
|
||||
ARROW_EXPORT Result<
|
||||
std::function<Status(const Array& edits, const Array& base, const Array& target)>>
|
||||
MakeUnifiedDiffFormatter(const DataType& type, std::ostream* os);
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,167 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <variant>
|
||||
|
||||
#include "arrow/compare.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \class ArrayStatistics
|
||||
/// \brief Statistics for an Array
|
||||
///
|
||||
/// The Apache Arrow format doesn't have statistics, but data sources such
|
||||
/// as Apache Parquet may have statistics. Statistics associated with
|
||||
/// a data source can be read through a unified API via this class.
|
||||
struct ARROW_EXPORT ArrayStatistics {
|
||||
/// \brief The type for maximum and minimum values. If the target
|
||||
/// value exists, one of them is used. `std::nullopt` is used
|
||||
/// otherwise.
|
||||
using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>;
|
||||
using NumericType = std::variant<int64_t, double>;
|
||||
using CountType = NumericType;
|
||||
using SizeType = NumericType;
|
||||
|
||||
/// \brief Map an optional statistics value to the Arrow type representing it
///
/// \param value the statistics value to inspect (for example \ref min or
/// \ref max)
/// \param array_type the Arrow type of the associated array; only consulted
/// when `value` holds a `std::string`
///
/// \return \ref arrow::null if `value` is `std::nullopt`. Otherwise boolean,
/// int64, uint64 or float64 for the matching alternative of \ref ValueType,
/// and for `std::string` either `array_type` (when it is one of the string
/// or binary types listed in the switch below) or \ref arrow::utf8.
static const std::shared_ptr<DataType>& ValueToArrowType(
|
||||
const std::optional<ValueType>& value,
|
||||
const std::shared_ptr<DataType>& array_type) {
|
||||
if (!value.has_value()) {
|
||||
return null();
|
||||
}
|
||||
|
||||
// One overload per alternative of ValueType, dispatched by std::visit below.
struct Visitor {
|
||||
const std::shared_ptr<DataType>& array_type;
|
||||
|
||||
const std::shared_ptr<DataType>& operator()(const bool&) { return boolean(); }
|
||||
const std::shared_ptr<DataType>& operator()(const int64_t&) { return int64(); }
|
||||
const std::shared_ptr<DataType>& operator()(const uint64_t&) { return uint64(); }
|
||||
const std::shared_ptr<DataType>& operator()(const double&) { return float64(); }
|
||||
const std::shared_ptr<DataType>& operator()(const std::string&) {
|
||||
switch (array_type->id()) {
|
||||
case Type::STRING:
|
||||
case Type::BINARY:
|
||||
case Type::FIXED_SIZE_BINARY:
|
||||
case Type::LARGE_STRING:
|
||||
case Type::LARGE_BINARY:
|
||||
case Type::BINARY_VIEW:
|
||||
case Type::STRING_VIEW:
|
||||
// The returned reference is the caller's own `array_type` argument, so
// it is only valid for as long as that shared_ptr object is.
return array_type;
|
||||
default:
|
||||
return utf8();
|
||||
}
|
||||
}
|
||||
} visitor{array_type};
|
||||
return std::visit(visitor, value.value());
|
||||
}
|
||||
|
||||
/// \brief The number of null values, may not be set
|
||||
std::optional<int64_t> null_count = std::nullopt;
|
||||
|
||||
/// \brief The number of distinct values, may not be set
|
||||
/// Note: when set to `int64_t`, it represents `exact_distinct_count`,
|
||||
/// and when set to `double`, it represents `approximate_distinct_count`.
|
||||
std::optional<CountType> distinct_count = std::nullopt;
|
||||
|
||||
/// \brief The maximum length in bytes of the rows in an array; may not be set
|
||||
/// Note: when the type is `int64_t`, it represents `max_byte_width_exact`,
|
||||
/// and when the type is `double`, it represents `max_byte_width_approximate`.
|
||||
std::optional<SizeType> max_byte_width = std::nullopt;
|
||||
|
||||
/// \brief The average size in bytes of a row in an array, may not be set.
|
||||
std::optional<double> average_byte_width = std::nullopt;
|
||||
|
||||
/// \brief Whether the average size in bytes is exact or not.
|
||||
bool is_average_byte_width_exact = false;
|
||||
|
||||
/// \brief The minimum value, may not be set
|
||||
std::optional<ValueType> min = std::nullopt;
|
||||
|
||||
/// \brief Compute Arrow type of the minimum value.
|
||||
///
|
||||
/// If \ref ValueType is `std::string`, `array_type` may be
|
||||
/// used. If `array_type` is a binary-like type such as \ref
|
||||
/// arrow::binary and \ref arrow::large_utf8, `array_type` is
|
||||
/// returned. \ref arrow::utf8 is returned otherwise.
|
||||
///
|
||||
/// If \ref ValueType isn't `std::string`, `array_type` isn't used.
|
||||
///
|
||||
/// \param array_type The Arrow type of the associated array.
|
||||
///
|
||||
/// \return \ref arrow::null if the minimum value is `std::nullopt`,
|
||||
/// Arrow type based on \ref ValueType of the \ref min
|
||||
/// otherwise.
|
||||
const std::shared_ptr<DataType>& MinArrowType(
|
||||
const std::shared_ptr<DataType>& array_type) {
|
||||
return ValueToArrowType(min, array_type);
|
||||
}
|
||||
|
||||
/// \brief Whether the minimum value is exact or not
|
||||
bool is_min_exact = false;
|
||||
|
||||
/// \brief The maximum value, may not be set
|
||||
std::optional<ValueType> max = std::nullopt;
|
||||
|
||||
/// \brief Compute Arrow type of the maximum value.
|
||||
///
|
||||
/// If \ref ValueType is `std::string`, `array_type` may be
|
||||
/// used. If `array_type` is a binary-like type such as \ref
|
||||
/// arrow::binary and \ref arrow::large_utf8, `array_type` is
|
||||
/// returned. \ref arrow::utf8 is returned otherwise.
|
||||
///
|
||||
/// If \ref ValueType isn't `std::string`, `array_type` isn't used.
|
||||
///
|
||||
/// \param array_type The Arrow type of the associated array.
|
||||
///
|
||||
/// \return \ref arrow::null if the maximum value is `std::nullopt`,
|
||||
/// Arrow type based on \ref ValueType of the \ref max
|
||||
/// otherwise.
|
||||
const std::shared_ptr<DataType>& MaxArrowType(
|
||||
const std::shared_ptr<DataType>& array_type) {
|
||||
return ValueToArrowType(max, array_type);
|
||||
}
|
||||
|
||||
/// \brief Whether the maximum value is exact or not
|
||||
bool is_max_exact = false;
|
||||
|
||||
/// \brief Check two \ref arrow::ArrayStatistics for equality
|
||||
///
|
||||
/// \param other The \ref arrow::ArrayStatistics instance to compare against.
|
||||
///
|
||||
/// \param equal_options Options used to compare double values for equality.
|
||||
///
|
||||
/// \return True if the two \ref arrow::ArrayStatistics instances are equal; otherwise,
|
||||
/// false.
|
||||
bool Equals(const ArrayStatistics& other,
|
||||
const EqualOptions& equal_options = EqualOptions::Defaults()) const {
|
||||
return ArrayStatisticsEquals(*this, other, equal_options);
|
||||
}
|
||||
|
||||
/// \brief Check two statistics for equality
|
||||
bool operator==(const ArrayStatistics& other) const { return Equals(other); }
|
||||
|
||||
/// \brief Check two statistics for inequality
|
||||
bool operator!=(const ArrayStatistics& other) const { return !Equals(other); }
|
||||
};
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,96 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/compare.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \defgroup array-factories Array factory functions
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Create a strongly-typed Array instance from generic ArrayData
|
||||
/// \param[in] data the array contents
|
||||
/// \return the resulting Array instance
|
||||
ARROW_EXPORT
|
||||
std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
/// \brief Create a strongly-typed Array instance with all elements null
|
||||
/// \param[in] type the array type
|
||||
/// \param[in] length the array length
|
||||
/// \param[in] pool the memory pool to allocate memory from
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> MakeArrayOfNull(const std::shared_ptr<DataType>& type,
|
||||
int64_t length,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Create an Array instance whose slots are the given scalar
|
||||
/// \param[in] scalar the value with which to fill the array
|
||||
/// \param[in] length the array length
|
||||
/// \param[in] pool the memory pool to allocate memory from
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> MakeArrayFromScalar(
|
||||
const Scalar& scalar, int64_t length, MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Create an empty Array of a given type
|
||||
///
|
||||
/// The output Array will be of the given type.
|
||||
///
|
||||
/// \param[in] type the data type of the empty Array
|
||||
/// \param[in] pool the memory pool to allocate memory from
|
||||
/// \return the resulting Array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> MakeEmptyArray(std::shared_ptr<DataType> type,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// @}
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief Swap endian of each element in a generic ArrayData
|
||||
///
|
||||
/// As dictionaries are often shared between different arrays, dictionaries
|
||||
/// are not swapped by this function and should be handled separately.
|
||||
///
|
||||
/// \param[in] data the array contents
|
||||
/// \param[in] pool the memory pool to allocate memory from
|
||||
/// \return the resulting ArrayData whose elements were swapped
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
|
||||
const std::shared_ptr<ArrayData>& data, MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// Given a number of ArrayVectors, treat each ArrayVector as the
|
||||
/// chunks of a chunked array. Then rechunk each ArrayVector such that
|
||||
/// all ArrayVectors are chunked identically. It is mandatory that
|
||||
/// all ArrayVectors contain the same total number of elements.
|
||||
ARROW_EXPORT
|
||||
std::vector<ArrayVector> RechunkArraysConsistently(const std::vector<ArrayVector>&);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,56 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
// Internal functions implementing Array::Validate() and friends.
|
||||
|
||||
// O(1) array metadata validation
|
||||
|
||||
ARROW_EXPORT
|
||||
Status ValidateArray(const Array& array);
|
||||
|
||||
ARROW_EXPORT
|
||||
Status ValidateArray(const ArrayData& data);
|
||||
|
||||
// O(N) array data validation.
|
||||
// Note that, starting from 7.0.0, "full" routines also validate metadata.
|
||||
// Before, ValidateArray() needed to be called before ValidateArrayFull()
|
||||
// to ensure metadata correctness, otherwise invalid memory accesses
|
||||
// may occur.
|
||||
|
||||
ARROW_EXPORT
|
||||
Status ValidateArrayFull(const Array& array);
|
||||
|
||||
ARROW_EXPORT
|
||||
Status ValidateArrayFull(const ArrayData& data);
|
||||
|
||||
ARROW_EXPORT
|
||||
Status ValidateUTF8(const Array& array);
|
||||
|
||||
ARROW_EXPORT
|
||||
Status ValidateUTF8(const ArrayData& data);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
587
venv/lib/python3.10/site-packages/pyarrow/include/arrow/buffer.h
Normal file
587
venv/lib/python3.10/site-packages/pyarrow/include/arrow/buffer.h
Normal file
@@ -0,0 +1,587 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/device.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/span.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Buffer classes
|
||||
|
||||
/// \class Buffer
|
||||
/// \brief Object containing a pointer to a piece of contiguous memory with a
|
||||
/// particular size.
|
||||
///
|
||||
/// Buffers have two related notions of length: size and capacity. Size is
|
||||
/// the number of bytes that might have valid data. Capacity is the number
|
||||
/// of bytes that were allocated for the buffer in total.
|
||||
///
|
||||
/// The Buffer base class does not own its memory, but subclasses often do.
|
||||
///
|
||||
/// The following invariant is always true: Size <= Capacity
|
||||
class ARROW_EXPORT Buffer {
|
||||
public:
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer);
|
||||
|
||||
/// \brief Construct from buffer and size without copying memory
|
||||
///
|
||||
/// \param[in] data a memory buffer
|
||||
/// \param[in] size buffer size
|
||||
///
|
||||
/// \note The passed memory must be kept alive through some other means
|
||||
Buffer(const uint8_t* data, int64_t size)
|
||||
: is_mutable_(false),
|
||||
is_cpu_(true),
|
||||
data_(data),
|
||||
size_(size),
|
||||
capacity_(size),
|
||||
device_type_(DeviceAllocationType::kCPU) {
|
||||
SetMemoryManager(default_cpu_memory_manager());
|
||||
}
|
||||
|
||||
Buffer(const uint8_t* data, int64_t size, std::shared_ptr<MemoryManager> mm,
|
||||
std::shared_ptr<Buffer> parent = NULLPTR,
|
||||
std::optional<DeviceAllocationType> device_type_override = std::nullopt)
|
||||
: is_mutable_(false),
|
||||
data_(data),
|
||||
size_(size),
|
||||
capacity_(size),
|
||||
parent_(std::move(parent)) {
|
||||
// SetMemoryManager will also set device_type_
|
||||
SetMemoryManager(std::move(mm));
|
||||
// If a device type is specified, use that instead. Example of when this can be
|
||||
// useful: the CudaMemoryManager can set device_type_ to kCUDA, but you can specify
|
||||
// device_type_override=kCUDA_HOST as the device type to override it.
|
||||
if (device_type_override != std::nullopt) {
|
||||
device_type_ = *device_type_override;
|
||||
}
|
||||
}
|
||||
|
||||
Buffer(uintptr_t address, int64_t size, std::shared_ptr<MemoryManager> mm,
|
||||
std::shared_ptr<Buffer> parent = NULLPTR)
|
||||
: Buffer(reinterpret_cast<const uint8_t*>(address), size, std::move(mm),
|
||||
std::move(parent)) {}
|
||||
|
||||
/// \brief Construct from string_view without copying memory
|
||||
///
|
||||
/// \param[in] data a string_view object
|
||||
///
|
||||
/// \note The memory viewed by data must not be deallocated in the lifetime of the
|
||||
/// Buffer; temporary rvalue strings must be stored in an lvalue somewhere
|
||||
explicit Buffer(std::string_view data)
|
||||
: Buffer(reinterpret_cast<const uint8_t*>(data.data()),
|
||||
static_cast<int64_t>(data.size())) {}
|
||||
|
||||
virtual ~Buffer() = default;
|
||||
|
||||
/// An offset into data that is owned by another buffer, but we want to be
|
||||
/// able to retain a valid pointer to it even after other shared_ptr's to the
|
||||
/// parent buffer have been destroyed
|
||||
///
|
||||
/// This method makes no assertions about alignment or padding of the buffer but
|
||||
/// in general we expect buffers to be aligned and padded to 64 bytes. In the future
|
||||
/// we might add utility methods to help determine if a buffer satisfies this contract.
|
||||
Buffer(std::shared_ptr<Buffer> parent, const int64_t offset, const int64_t size)
|
||||
: Buffer(parent->data_ + offset, size) {
|
||||
parent_ = std::move(parent);
|
||||
SetMemoryManager(parent_->memory_manager_);
|
||||
}
|
||||
|
||||
uint8_t operator[](std::size_t i) const { return data_[i]; }
|
||||
|
||||
/// \brief Construct a new std::string with a hexadecimal representation of the buffer.
|
||||
/// \return std::string
|
||||
std::string ToHexString();
|
||||
|
||||
/// Return true if both buffers are the same size and contain the same bytes
|
||||
/// up to the number of compared bytes
|
||||
bool Equals(const Buffer& other, int64_t nbytes) const;
|
||||
|
||||
/// Return true if both buffers are the same size and contain the same bytes
|
||||
bool Equals(const Buffer& other) const;
|
||||
|
||||
/// Copy a section of the buffer into a new Buffer.
|
||||
Result<std::shared_ptr<Buffer>> CopySlice(
|
||||
const int64_t start, const int64_t nbytes,
|
||||
MemoryPool* pool = default_memory_pool()) const;
|
||||
|
||||
/// Zero bytes in padding, i.e. bytes between size_ and capacity_.
|
||||
void ZeroPadding() {
|
||||
#ifndef NDEBUG
|
||||
CheckMutable();
|
||||
#endif
|
||||
// A zero-capacity buffer can have a null data pointer
|
||||
if (capacity_ != 0) {
|
||||
memset(mutable_data() + size_, 0, static_cast<size_t>(capacity_ - size_));
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Construct an immutable buffer that takes ownership of the contents
|
||||
/// of an std::string (without copying it).
|
||||
///
|
||||
/// \param[in] data a string to own
|
||||
/// \return a new Buffer instance
|
||||
static std::shared_ptr<Buffer> FromString(std::string data);
|
||||
|
||||
/// \brief Construct an immutable buffer that takes ownership of the contents
|
||||
/// of an std::vector (without copying it). Only vectors of TrivialType objects
|
||||
/// (integers, floating point numbers, ...) can be wrapped by this function.
|
||||
///
|
||||
/// \param[in] vec a vector to own
|
||||
/// \return a new Buffer instance
|
||||
template <typename T>
|
||||
static std::shared_ptr<Buffer> FromVector(std::vector<T> vec) {
|
||||
static_assert(std::is_trivial_v<T>,
|
||||
"Buffer::FromVector can only wrap vectors of trivial objects");
|
||||
|
||||
if (vec.empty()) {
|
||||
return std::shared_ptr<Buffer>{new Buffer()};
|
||||
}
|
||||
|
||||
auto* data = reinterpret_cast<uint8_t*>(vec.data());
|
||||
auto size_in_bytes = static_cast<int64_t>(vec.size() * sizeof(T));
|
||||
return std::shared_ptr<Buffer>{
|
||||
new Buffer{data, size_in_bytes},
|
||||
// Keep the vector's buffer alive inside the shared_ptr's destructor until after
|
||||
// we have deleted the Buffer. Note we can't use this trick in FromString since
|
||||
// std::string's data is inline for short strings so moving invalidates pointers
|
||||
// into the string's buffer.
|
||||
[vec = std::move(vec)](Buffer* buffer) { delete buffer; }};
|
||||
}
|
||||
|
||||
/// \brief Create buffer referencing typed memory with some length without
|
||||
/// copying
|
||||
/// \param[in] data the typed memory as C array
|
||||
/// \param[in] length the number of values in the array
|
||||
/// \return a new shared_ptr<Buffer>
|
||||
template <typename T, typename SizeType = int64_t>
|
||||
static std::shared_ptr<Buffer> Wrap(const T* data, SizeType length) {
|
||||
return std::make_shared<Buffer>(reinterpret_cast<const uint8_t*>(data),
|
||||
static_cast<int64_t>(sizeof(T) * length));
|
||||
}
|
||||
|
||||
/// \brief Create buffer referencing std::vector with some length without
|
||||
/// copying
|
||||
/// \param[in] data the vector to be referenced. If this vector is changed,
|
||||
/// the buffer may become invalid
|
||||
/// \return a new shared_ptr<Buffer>
|
||||
template <typename T>
|
||||
static std::shared_ptr<Buffer> Wrap(const std::vector<T>& data) {
|
||||
return std::make_shared<Buffer>(reinterpret_cast<const uint8_t*>(data.data()),
|
||||
static_cast<int64_t>(sizeof(T) * data.size()));
|
||||
}
|
||||
|
||||
/// \brief Copy buffer contents into a new std::string
|
||||
/// \return std::string
|
||||
/// \note Can throw std::bad_alloc if buffer is large
|
||||
std::string ToString() const;
|
||||
|
||||
/// \brief View buffer contents as a std::string_view
|
||||
/// \return std::string_view
|
||||
explicit operator std::string_view() const {
|
||||
return {reinterpret_cast<const char*>(data_), static_cast<size_t>(size_)};
|
||||
}
|
||||
|
||||
/// \brief Return a pointer to the buffer's data
|
||||
///
|
||||
/// The buffer has to be a CPU buffer (`is_cpu()` is true).
|
||||
/// Otherwise, an assertion may be thrown or a null pointer may be returned.
|
||||
///
|
||||
/// To get the buffer's data address regardless of its device, call `address()`.
|
||||
const uint8_t* data() const {
|
||||
#ifndef NDEBUG
|
||||
CheckCPU();
|
||||
#endif
|
||||
return ARROW_PREDICT_TRUE(is_cpu_) ? data_ : NULLPTR;
|
||||
}
|
||||
|
||||
/// \brief Return a pointer to the buffer's data cast to a specific type
|
||||
///
|
||||
/// The buffer has to be a CPU buffer (`is_cpu()` is true).
|
||||
/// Otherwise, an assertion may be thrown or a null pointer may be returned.
|
||||
template <typename T>
|
||||
const T* data_as() const {
|
||||
return reinterpret_cast<const T*>(data());
|
||||
}
|
||||
|
||||
/// \brief Return the buffer's data as a span
|
||||
template <typename T>
|
||||
util::span<const T> span_as() const {
|
||||
return util::span(data_as<T>(), static_cast<size_t>(size() / sizeof(T)));
|
||||
}
|
||||
|
||||
/// \brief Return a writable pointer to the buffer's data
|
||||
///
|
||||
/// The buffer has to be a mutable CPU buffer (`is_cpu()` and `is_mutable()`
|
||||
/// are true). Otherwise, an assertion may be thrown or a null pointer may
|
||||
/// be returned.
|
||||
///
|
||||
/// To get the buffer's mutable data address regardless of its device, call
|
||||
/// `mutable_address()`.
|
||||
uint8_t* mutable_data() {
|
||||
#ifndef NDEBUG
|
||||
CheckCPU();
|
||||
CheckMutable();
|
||||
#endif
|
||||
return ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_) ? const_cast<uint8_t*>(data_)
|
||||
: NULLPTR;
|
||||
}
|
||||
|
||||
/// \brief Return a writable pointer to the buffer's data cast to a specific type
|
||||
///
|
||||
/// The buffer has to be a mutable CPU buffer (`is_cpu()` and `is_mutable()`
|
||||
/// are true). Otherwise, an assertion may be thrown or a null pointer may
|
||||
/// be returned.
|
||||
template <typename T>
|
||||
T* mutable_data_as() {
|
||||
return reinterpret_cast<T*>(mutable_data());
|
||||
}
|
||||
|
||||
/// \brief Return the buffer's mutable data as a span
|
||||
template <typename T>
|
||||
util::span<T> mutable_span_as() {
|
||||
return util::span(mutable_data_as<T>(), static_cast<size_t>(size() / sizeof(T)));
|
||||
}
|
||||
|
||||
/// \brief Return the device address of the buffer's data
|
||||
uintptr_t address() const { return reinterpret_cast<uintptr_t>(data_); }
|
||||
|
||||
/// \brief Return a writable device address to the buffer's data
|
||||
///
|
||||
/// The buffer has to be a mutable buffer (`is_mutable()` is true).
|
||||
/// Otherwise, an assertion may be thrown or 0 may be returned.
|
||||
uintptr_t mutable_address() const {
|
||||
#ifndef NDEBUG
|
||||
CheckMutable();
|
||||
#endif
|
||||
return ARROW_PREDICT_TRUE(is_mutable_) ? reinterpret_cast<uintptr_t>(data_) : 0;
|
||||
}
|
||||
|
||||
/// \brief Return the buffer's size in bytes
|
||||
int64_t size() const { return size_; }
|
||||
|
||||
/// \brief Return the buffer's capacity (number of allocated bytes)
|
||||
int64_t capacity() const { return capacity_; }
|
||||
|
||||
/// \brief Whether the buffer is directly CPU-accessible
|
||||
///
|
||||
/// If this function returns true, you can read directly from the buffer's
|
||||
/// `data()` pointer. Otherwise, you'll have to `View()` or `Copy()` it.
|
||||
bool is_cpu() const { return is_cpu_; }
|
||||
|
||||
/// \brief Whether the buffer is mutable
|
||||
///
|
||||
/// If this function returns true, you are allowed to modify buffer contents
|
||||
/// using the pointer returned by `mutable_data()` or `mutable_address()`.
|
||||
bool is_mutable() const { return is_mutable_; }
|
||||
|
||||
const std::shared_ptr<Device>& device() const { return memory_manager_->device(); }
|
||||
|
||||
const std::shared_ptr<MemoryManager>& memory_manager() const { return memory_manager_; }
|
||||
|
||||
DeviceAllocationType device_type() const { return device_type_; }
|
||||
|
||||
std::shared_ptr<Buffer> parent() const { return parent_; }
|
||||
|
||||
/// \brief Get a RandomAccessFile for reading a buffer
|
||||
///
|
||||
/// The returned file object reads from this buffer's underlying memory.
|
||||
static Result<std::shared_ptr<io::RandomAccessFile>> GetReader(std::shared_ptr<Buffer>);
|
||||
|
||||
/// \brief Get an OutputStream for writing to a buffer
|
||||
///
|
||||
/// The buffer must be mutable. The returned stream object writes into the buffer's
|
||||
/// underlying memory (but it won't resize it).
|
||||
static Result<std::shared_ptr<io::OutputStream>> GetWriter(std::shared_ptr<Buffer>);
|
||||
|
||||
/// \brief Copy buffer
|
||||
///
|
||||
/// The buffer contents will be copied into a new buffer allocated by the
|
||||
/// given MemoryManager. This function supports cross-device copies.
|
||||
static Result<std::shared_ptr<Buffer>> Copy(std::shared_ptr<Buffer> source,
|
||||
const std::shared_ptr<MemoryManager>& to);
|
||||
|
||||
/// \brief Copy a non-owned buffer
|
||||
///
|
||||
/// This is useful for cases where the source memory area is externally managed
|
||||
/// (its lifetime not tied to the source Buffer), otherwise please use Copy().
|
||||
static Result<std::unique_ptr<Buffer>> CopyNonOwned(
|
||||
const Buffer& source, const std::shared_ptr<MemoryManager>& to);
|
||||
|
||||
/// \brief View buffer
|
||||
///
|
||||
/// Return a Buffer that reflects this buffer, seen potentially from another
|
||||
/// device, without making an explicit copy of the contents. The underlying
|
||||
/// mechanism is typically implemented by the kernel or device driver, and may
|
||||
/// involve lazy caching of parts of the buffer contents on the destination
|
||||
/// device's memory.
|
||||
///
|
||||
/// If a non-copy view is unsupported for the buffer on the given device,
|
||||
/// nullptr is returned. An error can be returned if some low-level
|
||||
/// operation fails (such as an out-of-memory condition).
|
||||
static Result<std::shared_ptr<Buffer>> View(std::shared_ptr<Buffer> source,
|
||||
const std::shared_ptr<MemoryManager>& to);
|
||||
|
||||
/// \brief View or copy buffer
|
||||
///
|
||||
/// Try to view buffer contents on the given MemoryManager's device, but
|
||||
/// fall back to copying if a no-copy view isn't supported.
|
||||
static Result<std::shared_ptr<Buffer>> ViewOrCopy(
|
||||
std::shared_ptr<Buffer> source, const std::shared_ptr<MemoryManager>& to);
|
||||
|
||||
virtual std::shared_ptr<Device::SyncEvent> device_sync_event() const { return NULLPTR; }
|
||||
|
||||
protected:
|
||||
bool is_mutable_;
|
||||
bool is_cpu_;
|
||||
const uint8_t* data_;
|
||||
int64_t size_;
|
||||
int64_t capacity_;
|
||||
DeviceAllocationType device_type_;
|
||||
|
||||
// null by default, but may be set
|
||||
std::shared_ptr<Buffer> parent_;
|
||||
|
||||
private:
|
||||
// private so that subclasses are forced to call SetMemoryManager()
|
||||
std::shared_ptr<MemoryManager> memory_manager_;
|
||||
|
||||
protected:
|
||||
Buffer();
|
||||
|
||||
void CheckMutable() const;
|
||||
void CheckCPU() const;
|
||||
|
||||
void SetMemoryManager(std::shared_ptr<MemoryManager> mm) {
|
||||
memory_manager_ = std::move(mm);
|
||||
is_cpu_ = memory_manager_->is_cpu();
|
||||
device_type_ = memory_manager_->device()->device_type();
|
||||
}
|
||||
};
|
||||
|
||||
/// \defgroup buffer-slicing-functions Functions for slicing buffers
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Construct a view on a buffer at the given offset and length.
|
||||
///
|
||||
/// This function cannot fail and does not check for errors (except in debug builds)
|
||||
static inline std::shared_ptr<Buffer> SliceBuffer(std::shared_ptr<Buffer> buffer,
|
||||
const int64_t offset,
|
||||
const int64_t length) {
|
||||
return std::make_shared<Buffer>(std::move(buffer), offset, length);
|
||||
}
|
||||
|
||||
/// \brief Construct a view on a buffer at the given offset, up to the buffer's end.
|
||||
///
|
||||
/// This function cannot fail and does not check for errors (except in debug builds)
|
||||
static inline std::shared_ptr<Buffer> SliceBuffer(std::shared_ptr<Buffer> buffer,
|
||||
const int64_t offset) {
|
||||
int64_t length = buffer->size() - offset;
|
||||
return SliceBuffer(std::move(buffer), offset, length);
|
||||
}
|
||||
|
||||
/// \brief Input-checking version of SliceBuffer
|
||||
///
|
||||
/// An Invalid Status is returned if the requested slice falls out of bounds.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> SliceBufferSafe(std::shared_ptr<Buffer> buffer,
|
||||
int64_t offset);
|
||||
/// \brief Input-checking version of SliceBuffer
|
||||
///
|
||||
/// An Invalid Status is returned if the requested slice falls out of bounds.
|
||||
/// Note that unlike SliceBuffer, `length` isn't clamped to the available buffer size.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> SliceBufferSafe(std::shared_ptr<Buffer> buffer,
|
||||
int64_t offset, int64_t length);
|
||||
|
||||
/// \brief Like SliceBuffer, but construct a mutable buffer slice.
|
||||
///
|
||||
/// If the parent buffer is not mutable, behavior is undefined (it may abort
|
||||
/// in debug builds).
|
||||
ARROW_EXPORT
|
||||
std::shared_ptr<Buffer> SliceMutableBuffer(std::shared_ptr<Buffer> buffer,
|
||||
const int64_t offset, const int64_t length);
|
||||
|
||||
/// \brief Like SliceBuffer, but construct a mutable buffer slice.
|
||||
///
|
||||
/// If the parent buffer is not mutable, behavior is undefined (it may abort
|
||||
/// in debug builds).
|
||||
static inline std::shared_ptr<Buffer> SliceMutableBuffer(std::shared_ptr<Buffer> buffer,
|
||||
const int64_t offset) {
|
||||
int64_t length = buffer->size() - offset;
|
||||
return SliceMutableBuffer(std::move(buffer), offset, length);
|
||||
}
|
||||
|
||||
/// \brief Input-checking version of SliceMutableBuffer
|
||||
///
|
||||
/// An Invalid Status is returned if the requested slice falls out of bounds.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> SliceMutableBufferSafe(std::shared_ptr<Buffer> buffer,
|
||||
int64_t offset);
|
||||
/// \brief Input-checking version of SliceMutableBuffer
|
||||
///
|
||||
/// An Invalid Status is returned if the requested slice falls out of bounds.
|
||||
/// Note that unlike SliceBuffer, `length` isn't clamped to the available buffer size.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> SliceMutableBufferSafe(std::shared_ptr<Buffer> buffer,
|
||||
int64_t offset, int64_t length);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \class MutableBuffer
|
||||
/// \brief A Buffer whose contents can be mutated. May or may not own its data.
|
||||
class ARROW_EXPORT MutableBuffer : public Buffer {
|
||||
public:
|
||||
MutableBuffer(uint8_t* data, const int64_t size) : Buffer(data, size) {
|
||||
is_mutable_ = true;
|
||||
}
|
||||
|
||||
MutableBuffer(uint8_t* data, const int64_t size, std::shared_ptr<MemoryManager> mm)
|
||||
: Buffer(data, size, std::move(mm)) {
|
||||
is_mutable_ = true;
|
||||
}
|
||||
|
||||
MutableBuffer(const std::shared_ptr<Buffer>& parent, const int64_t offset,
|
||||
const int64_t size);
|
||||
|
||||
/// \brief Create buffer referencing typed memory with some length
|
||||
/// \param[in] data the typed memory as C array
|
||||
/// \param[in] length the number of values in the array
|
||||
/// \return a new shared_ptr<Buffer>
|
||||
template <typename T, typename SizeType = int64_t>
|
||||
static std::shared_ptr<Buffer> Wrap(T* data, SizeType length) {
|
||||
return std::make_shared<MutableBuffer>(reinterpret_cast<uint8_t*>(data),
|
||||
static_cast<int64_t>(sizeof(T) * length));
|
||||
}
|
||||
|
||||
protected:
|
||||
MutableBuffer() : Buffer(NULLPTR, 0) {}
|
||||
};
|
||||
|
||||
/// \class ResizableBuffer
|
||||
/// \brief A mutable buffer that can be resized
|
||||
class ARROW_EXPORT ResizableBuffer : public MutableBuffer {
|
||||
public:
|
||||
/// Change buffer reported size to indicated size, allocating memory if
|
||||
/// necessary. This will ensure that the capacity of the buffer is a multiple
|
||||
/// of 64 bytes as defined in Layout.md.
|
||||
/// Consider using ZeroPadding afterwards, to conform to the Arrow layout
|
||||
/// specification.
|
||||
///
|
||||
/// @param new_size The new size for the buffer.
|
||||
/// @param shrink_to_fit Whether to shrink the capacity if new size < current size
|
||||
virtual Status Resize(const int64_t new_size, bool shrink_to_fit) = 0;
|
||||
Status Resize(const int64_t new_size) {
|
||||
return Resize(new_size, /*shrink_to_fit=*/true);
|
||||
}
|
||||
|
||||
/// Ensure that buffer has enough memory allocated to fit the indicated
|
||||
/// capacity (and meets the 64 byte padding requirement in Layout.md).
|
||||
/// It does not change buffer's reported size and doesn't zero the padding.
|
||||
virtual Status Reserve(const int64_t new_capacity) = 0;
|
||||
|
||||
template <class T>
|
||||
Status TypedResize(const int64_t new_nb_elements, bool shrink_to_fit = true) {
|
||||
return Resize(sizeof(T) * new_nb_elements, shrink_to_fit);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
Status TypedReserve(const int64_t new_nb_elements) {
|
||||
return Reserve(sizeof(T) * new_nb_elements);
|
||||
}
|
||||
|
||||
protected:
|
||||
ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) {}
|
||||
ResizableBuffer(uint8_t* data, int64_t size, std::shared_ptr<MemoryManager> mm)
|
||||
: MutableBuffer(data, size, std::move(mm)) {}
|
||||
};
|
||||
|
||||
/// \defgroup buffer-allocation-functions Functions for allocating buffers
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Allocate a fixed size mutable buffer from a memory pool, zero its padding.
|
||||
///
|
||||
/// \param[in] size size of buffer to allocate
|
||||
/// \param[in] pool a memory pool
|
||||
ARROW_EXPORT
|
||||
Result<std::unique_ptr<Buffer>> AllocateBuffer(const int64_t size,
|
||||
MemoryPool* pool = NULLPTR);
|
||||
ARROW_EXPORT
|
||||
Result<std::unique_ptr<Buffer>> AllocateBuffer(const int64_t size, int64_t alignment,
|
||||
MemoryPool* pool = NULLPTR);
|
||||
|
||||
/// \brief Allocate a resizable buffer from a memory pool, zero its padding.
|
||||
///
|
||||
/// \param[in] size size of buffer to allocate
|
||||
/// \param[in] pool a memory pool
|
||||
ARROW_EXPORT
|
||||
Result<std::unique_ptr<ResizableBuffer>> AllocateResizableBuffer(
|
||||
const int64_t size, MemoryPool* pool = NULLPTR);
|
||||
ARROW_EXPORT
|
||||
Result<std::unique_ptr<ResizableBuffer>> AllocateResizableBuffer(
|
||||
const int64_t size, const int64_t alignment, MemoryPool* pool = NULLPTR);
|
||||
|
||||
/// \brief Allocate a bitmap buffer from a memory pool.
/// No guarantee is provided on the values of its bits.
|
||||
///
|
||||
/// \param[in] length size in bits of bitmap to allocate
|
||||
/// \param[in] pool memory pool to allocate memory from
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> AllocateBitmap(int64_t length,
|
||||
MemoryPool* pool = NULLPTR);
|
||||
|
||||
/// \brief Allocate a zero-initialized bitmap buffer from a memory pool
|
||||
///
|
||||
/// \param[in] length size in bits of bitmap to allocate
|
||||
/// \param[in] pool memory pool to allocate memory from
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> AllocateEmptyBitmap(int64_t length,
|
||||
MemoryPool* pool = NULLPTR);
|
||||
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> AllocateEmptyBitmap(int64_t length, int64_t alignment,
|
||||
MemoryPool* pool = NULLPTR);
|
||||
|
||||
/// \brief Concatenate multiple buffers into a single buffer
|
||||
///
|
||||
/// \param[in] buffers to be concatenated
|
||||
/// \param[in] pool memory pool to allocate the new buffer from
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> ConcatenateBuffers(const BufferVector& buffers,
|
||||
MemoryPool* pool = NULLPTR);
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,488 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/bitmap_generate.h"
|
||||
#include "arrow/util/bitmap_ops.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/ubsan.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Buffer builder classes
|
||||
|
||||
/// \class BufferBuilder
|
||||
/// \brief A class for incrementally building a contiguous chunk of in-memory
|
||||
/// data
|
||||
class ARROW_EXPORT BufferBuilder {
|
||||
public:
|
||||
explicit BufferBuilder(MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: pool_(pool),
|
||||
data_(/*ensure never null to make ubsan happy and avoid check penalties below*/
|
||||
util::MakeNonNull<uint8_t>()),
|
||||
capacity_(0),
|
||||
size_(0),
|
||||
alignment_(alignment) {}
|
||||
|
||||
/// \brief Constructs new Builder that will start using
|
||||
/// the provided buffer until Finish/Reset are called.
|
||||
/// The buffer is not resized.
|
||||
explicit BufferBuilder(std::shared_ptr<ResizableBuffer> buffer,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: buffer_(std::move(buffer)),
|
||||
pool_(pool),
|
||||
data_(buffer_->mutable_data()),
|
||||
capacity_(buffer_->capacity()),
|
||||
size_(buffer_->size()),
|
||||
alignment_(alignment) {}
|
||||
|
||||
/// \brief Resize the buffer to the nearest multiple of 64 bytes
|
||||
///
|
||||
/// \param new_capacity the new capacity of the builder. Will be
|
||||
/// rounded up to a multiple of 64 bytes for padding
|
||||
/// \param shrink_to_fit if new capacity is smaller than the existing,
|
||||
/// reallocate internal buffer. Set to false to avoid reallocations when
|
||||
/// shrinking the builder.
|
||||
/// \return Status
|
||||
Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
  if (buffer_ != NULLPTR) {
    // Already backed by a buffer: let it grow or shrink in place.
    ARROW_RETURN_NOT_OK(buffer_->Resize(new_capacity, shrink_to_fit));
  } else {
    // First allocation for this builder.
    ARROW_ASSIGN_OR_RAISE(buffer_,
                          AllocateResizableBuffer(new_capacity, alignment_, pool_));
  }
  // Refresh the cached pointer and capacity from the (possibly new) buffer.
  data_ = buffer_->mutable_data();
  capacity_ = buffer_->capacity();
  return Status::OK();
}
|
||||
|
||||
/// \brief Ensure that builder can accommodate the additional number of bytes
|
||||
/// without the need to perform allocations
|
||||
///
|
||||
/// \param[in] additional_bytes number of additional bytes to make space for
|
||||
/// \return Status
|
||||
Status Reserve(const int64_t additional_bytes) {
  const auto required = size_ + additional_bytes;
  if (required > capacity_) {
    // Grow without ever shrinking the existing allocation.
    return Resize(GrowByFactor(capacity_, required), false);
  }
  // Enough room already: nothing to do.
  return Status::OK();
}
|
||||
|
||||
/// \brief Return a capacity expanded by the desired growth factor
///
/// \param[in] current_capacity the capacity currently allocated
/// \param[in] new_capacity the minimum capacity that is required
/// \return the larger of `new_capacity` and twice `current_capacity`
/// (saturated at INT64_MAX)
static int64_t GrowByFactor(int64_t current_capacity, int64_t new_capacity) {
  // Doubling capacity except for large Reserve requests. 2x growth strategy
  // (versus 1.5x) seems to have slightly better performance when using
  // jemalloc, but significantly better performance when using the system
  // allocator. See ARROW-6450 for further discussion
  //
  // Signed overflow is undefined behavior, so saturate rather than let the
  // doubling wrap around for huge capacities.
  const int64_t doubled =
      current_capacity > INT64_MAX / 2 ? INT64_MAX : current_capacity * 2;
  return std::max(new_capacity, doubled);
}
|
||||
|
||||
/// \brief Append the given data to the buffer
|
||||
///
|
||||
/// The buffer is automatically expanded if necessary.
|
||||
Status Append(const void* data, const int64_t length) {
  const auto required = size_ + length;
  if (ARROW_PREDICT_FALSE(required > capacity_)) {
    // Not enough room: grow (never shrink) before copying.
    ARROW_RETURN_NOT_OK(Resize(GrowByFactor(capacity_, required), false));
  }
  UnsafeAppend(data, length);
  return Status::OK();
}
|
||||
|
||||
/// \brief Append the given data to the buffer
|
||||
///
|
||||
/// The buffer is automatically expanded if necessary.
|
||||
Status Append(std::string_view v) {
  // Make the size_t -> int64_t conversion explicit, as UnsafeAppend() does.
  return Append(v.data(), static_cast<int64_t>(v.size()));
}
|
||||
|
||||
/// \brief Append copies of a value to the buffer
|
||||
///
|
||||
/// The buffer is automatically expanded if necessary.
|
||||
Status Append(const int64_t num_copies, uint8_t value) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(num_copies));
|
||||
UnsafeAppend(num_copies, value);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Advance pointer and zero out memory
|
||||
Status Advance(const int64_t length) { return Append(length, 0); }
|
||||
|
||||
// Advance pointer, but don't allocate or zero memory
|
||||
void UnsafeAdvance(const int64_t length) { size_ += length; }
|
||||
|
||||
// Unsafe methods don't check existing size
|
||||
void UnsafeAppend(const void* data, const int64_t length) {
|
||||
memcpy(data_ + size_, data, static_cast<size_t>(length));
|
||||
size_ += length;
|
||||
}
|
||||
|
||||
// Append the bytes viewed by `v`; no capacity check is performed
void UnsafeAppend(std::string_view v) {
  const auto num_bytes = static_cast<int64_t>(v.size());
  UnsafeAppend(v.data(), num_bytes);
}
|
||||
|
||||
// Fill the next `num_copies` bytes with `value` and advance past them; no
// capacity check is performed
void UnsafeAppend(const int64_t num_copies, uint8_t value) {
  uint8_t* destination = data_ + size_;
  memset(destination, value, static_cast<size_t>(num_copies));
  size_ += num_copies;
}
|
||||
|
||||
/// \brief Return result of builder as a Buffer object.
|
||||
///
|
||||
/// The builder is reset and can be reused afterwards.
|
||||
///
|
||||
/// \param[out] out the finalized Buffer object
|
||||
/// \param shrink_to_fit if the buffer size is smaller than its capacity,
|
||||
/// reallocate to fit more tightly in memory. Set to false to avoid
|
||||
/// a reallocation, at the expense of potentially more memory consumption.
|
||||
/// \return Status
|
||||
Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
|
||||
ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit));
|
||||
if (size_ != 0) buffer_->ZeroPadding();
|
||||
*out = buffer_;
|
||||
if (*out == NULLPTR) {
|
||||
ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(0, alignment_, pool_));
|
||||
}
|
||||
Reset();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Same as Finish(out, shrink_to_fit), but the finalized buffer is
/// returned as a Result instead of through an out-parameter
Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
  std::shared_ptr<Buffer> finished;
  ARROW_RETURN_NOT_OK(Finish(&finished, shrink_to_fit));
  return finished;
}
|
||||
|
||||
/// \brief Like Finish, but override the final buffer size
|
||||
///
|
||||
/// This is useful after writing data directly into the builder memory
|
||||
/// without calling the Append methods (basically, when using BufferBuilder
|
||||
/// mostly for memory allocation).
|
||||
Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
|
||||
bool shrink_to_fit = true) {
|
||||
size_ = final_length;
|
||||
return Finish(shrink_to_fit);
|
||||
}
|
||||
|
||||
/// \brief Drop the builder's reference to its buffer and set size and
/// capacity back to zero
void Reset() {
  buffer_ = NULLPTR;
  size_ = 0;
  capacity_ = 0;
}
|
||||
|
||||
/// \brief Set size to a smaller value without modifying builder
|
||||
/// contents. For reusable BufferBuilder classes
|
||||
/// \param[in] position must be non-negative and less than or equal
|
||||
/// to the current length()
|
||||
void Rewind(int64_t position) { size_ = position; }
|
||||
|
||||
int64_t capacity() const { return capacity_; }
|
||||
int64_t length() const { return size_; }
|
||||
const uint8_t* data() const { return data_; }
|
||||
uint8_t* mutable_data() { return data_; }
|
||||
template <typename T>
|
||||
const T* data_as() const {
|
||||
return reinterpret_cast<const T*>(data_);
|
||||
}
|
||||
template <typename T>
|
||||
T* mutable_data_as() {
|
||||
return reinterpret_cast<T*>(data_);
|
||||
}
|
||||
|
||||
private:
|
||||
std::shared_ptr<ResizableBuffer> buffer_;
|
||||
MemoryPool* pool_;
|
||||
uint8_t* data_;
|
||||
int64_t capacity_;
|
||||
int64_t size_;
|
||||
int64_t alignment_;
|
||||
};
|
||||
|
||||
template <typename T, typename Enable = void>
|
||||
class TypedBufferBuilder;
|
||||
|
||||
/// \brief A BufferBuilder for building a buffer of arithmetic elements
|
||||
template <typename T>
|
||||
class TypedBufferBuilder<
|
||||
T, typename std::enable_if<std::is_arithmetic<T>::value ||
|
||||
std::is_standard_layout<T>::value>::type> {
|
||||
public:
|
||||
explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: bytes_builder_(pool, alignment) {}
|
||||
|
||||
explicit TypedBufferBuilder(std::shared_ptr<ResizableBuffer> buffer,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: bytes_builder_(std::move(buffer), pool) {}
|
||||
|
||||
explicit TypedBufferBuilder(BufferBuilder builder)
|
||||
: bytes_builder_(std::move(builder)) {}
|
||||
|
||||
BufferBuilder* bytes_builder() { return &bytes_builder_; }
|
||||
|
||||
Status Append(T value) {
|
||||
return bytes_builder_.Append(reinterpret_cast<uint8_t*>(&value), sizeof(T));
|
||||
}
|
||||
|
||||
Status Append(const T* values, int64_t num_elements) {
|
||||
return bytes_builder_.Append(reinterpret_cast<const uint8_t*>(values),
|
||||
num_elements * sizeof(T));
|
||||
}
|
||||
|
||||
Status Append(const int64_t num_copies, T value) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(num_copies + length()));
|
||||
UnsafeAppend(num_copies, value);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void UnsafeAppend(T value) {
|
||||
bytes_builder_.UnsafeAppend(reinterpret_cast<uint8_t*>(&value), sizeof(T));
|
||||
}
|
||||
|
||||
void UnsafeAppend(const T* values, int64_t num_elements) {
|
||||
bytes_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values),
|
||||
num_elements * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename Iter>
|
||||
void UnsafeAppend(Iter values_begin, Iter values_end) {
|
||||
auto num_elements = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
auto data = mutable_data() + length();
|
||||
bytes_builder_.UnsafeAdvance(num_elements * sizeof(T));
|
||||
std::copy(values_begin, values_end, data);
|
||||
}
|
||||
|
||||
void UnsafeAppend(const int64_t num_copies, T value) {
|
||||
auto data = mutable_data() + length();
|
||||
bytes_builder_.UnsafeAdvance(num_copies * sizeof(T));
|
||||
std::fill(data, data + num_copies, value);
|
||||
}
|
||||
|
||||
Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
|
||||
return bytes_builder_.Resize(new_capacity * sizeof(T), shrink_to_fit);
|
||||
}
|
||||
|
||||
Status Reserve(const int64_t additional_elements) {
|
||||
return bytes_builder_.Reserve(additional_elements * sizeof(T));
|
||||
}
|
||||
|
||||
Status Advance(const int64_t length) {
|
||||
return bytes_builder_.Advance(length * sizeof(T));
|
||||
}
|
||||
|
||||
void UnsafeAdvance(const int64_t length) {
|
||||
bytes_builder_.UnsafeAdvance(length * sizeof(T));
|
||||
}
|
||||
|
||||
Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
|
||||
return bytes_builder_.Finish(out, shrink_to_fit);
|
||||
}
|
||||
|
||||
Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
|
||||
std::shared_ptr<Buffer> out;
|
||||
ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
|
||||
return out;
|
||||
}
|
||||
|
||||
/// \brief Like Finish, but override the final buffer size
|
||||
///
|
||||
/// This is useful after writing data directly into the builder memory
|
||||
/// without calling the Append methods (basically, when using TypedBufferBuilder
|
||||
/// only for memory allocation).
|
||||
Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
|
||||
bool shrink_to_fit = true) {
|
||||
return bytes_builder_.FinishWithLength(final_length * sizeof(T), shrink_to_fit);
|
||||
}
|
||||
|
||||
void Reset() { bytes_builder_.Reset(); }
|
||||
|
||||
int64_t length() const { return bytes_builder_.length() / sizeof(T); }
|
||||
int64_t capacity() const { return bytes_builder_.capacity() / sizeof(T); }
|
||||
const T* data() const { return reinterpret_cast<const T*>(bytes_builder_.data()); }
|
||||
T* mutable_data() { return reinterpret_cast<T*>(bytes_builder_.mutable_data()); }
|
||||
|
||||
private:
|
||||
BufferBuilder bytes_builder_;
|
||||
};
|
||||
|
||||
/// \brief A BufferBuilder for building a buffer containing a bitmap
|
||||
template <>
|
||||
class TypedBufferBuilder<bool> {
|
||||
public:
|
||||
explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: bytes_builder_(pool, alignment) {}
|
||||
|
||||
explicit TypedBufferBuilder(BufferBuilder builder)
|
||||
: bytes_builder_(std::move(builder)) {}
|
||||
|
||||
BufferBuilder* bytes_builder() { return &bytes_builder_; }
|
||||
|
||||
Status Append(bool value) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
UnsafeAppend(value);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Append(const uint8_t* valid_bytes, int64_t num_elements) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(num_elements));
|
||||
UnsafeAppend(valid_bytes, num_elements);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Append(const int64_t num_copies, bool value) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(num_copies));
|
||||
UnsafeAppend(num_copies, value);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void UnsafeAppend(bool value) {
|
||||
bit_util::SetBitTo(mutable_data(), bit_length_, value);
|
||||
if (!value) {
|
||||
++false_count_;
|
||||
}
|
||||
++bit_length_;
|
||||
}
|
||||
|
||||
/// \brief Append bits from an array of bytes (one value per byte)
|
||||
void UnsafeAppend(const uint8_t* bytes, int64_t num_elements) {
|
||||
if (num_elements == 0) return;
|
||||
int64_t i = 0;
|
||||
internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] {
|
||||
bool value = bytes[i++];
|
||||
false_count_ += !value;
|
||||
return value;
|
||||
});
|
||||
bit_length_ += num_elements;
|
||||
}
|
||||
|
||||
/// \brief Append bits from a packed bitmap
|
||||
void UnsafeAppend(const uint8_t* bitmap, int64_t offset, int64_t num_elements) {
|
||||
if (num_elements == 0) return;
|
||||
internal::CopyBitmap(bitmap, offset, num_elements, mutable_data(), bit_length_);
|
||||
false_count_ += num_elements - internal::CountSetBits(bitmap, offset, num_elements);
|
||||
bit_length_ += num_elements;
|
||||
}
|
||||
|
||||
void UnsafeAppend(const int64_t num_copies, bool value) {
|
||||
bit_util::SetBitsTo(mutable_data(), bit_length_, num_copies, value);
|
||||
false_count_ += num_copies * !value;
|
||||
bit_length_ += num_copies;
|
||||
}
|
||||
|
||||
template <bool count_falses, typename Generator>
|
||||
void UnsafeAppend(const int64_t num_elements, Generator&& gen) {
|
||||
if (num_elements == 0) return;
|
||||
|
||||
if (count_falses) {
|
||||
internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] {
|
||||
bool value = gen();
|
||||
false_count_ += !value;
|
||||
return value;
|
||||
});
|
||||
} else {
|
||||
internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements,
|
||||
std::forward<Generator>(gen));
|
||||
}
|
||||
bit_length_ += num_elements;
|
||||
}
|
||||
|
||||
Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
|
||||
const int64_t old_byte_capacity = bytes_builder_.capacity();
|
||||
ARROW_RETURN_NOT_OK(
|
||||
bytes_builder_.Resize(bit_util::BytesForBits(new_capacity), shrink_to_fit));
|
||||
// Resize() may have chosen a larger capacity (e.g. for padding),
|
||||
// so ask it again before calling memset().
|
||||
const int64_t new_byte_capacity = bytes_builder_.capacity();
|
||||
if (new_byte_capacity > old_byte_capacity) {
|
||||
// The additional buffer space is 0-initialized for convenience,
|
||||
// so that other methods can simply bump the length.
|
||||
memset(mutable_data() + old_byte_capacity, 0,
|
||||
static_cast<size_t>(new_byte_capacity - old_byte_capacity));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Reserve(const int64_t additional_elements) {
|
||||
return Resize(
|
||||
BufferBuilder::GrowByFactor(bit_length_, bit_length_ + additional_elements),
|
||||
false);
|
||||
}
|
||||
|
||||
Status Advance(const int64_t length) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
bit_length_ += length;
|
||||
false_count_ += length;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
|
||||
// set bytes_builder_.size_ == byte size of data
|
||||
bytes_builder_.UnsafeAdvance(bit_util::BytesForBits(bit_length_) -
|
||||
bytes_builder_.length());
|
||||
bit_length_ = false_count_ = 0;
|
||||
return bytes_builder_.Finish(out, shrink_to_fit);
|
||||
}
|
||||
|
||||
Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
|
||||
std::shared_ptr<Buffer> out;
|
||||
ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
|
||||
return out;
|
||||
}
|
||||
|
||||
/// \brief Like Finish, but override the final buffer size
|
||||
///
|
||||
/// This is useful after writing data directly into the builder memory
|
||||
/// without calling the Append methods (basically, when using TypedBufferBuilder
|
||||
/// only for memory allocation).
|
||||
Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
|
||||
bool shrink_to_fit = true) {
|
||||
const auto final_byte_length = bit_util::BytesForBits(final_length);
|
||||
bytes_builder_.UnsafeAdvance(final_byte_length - bytes_builder_.length());
|
||||
bit_length_ = false_count_ = 0;
|
||||
return bytes_builder_.FinishWithLength(final_byte_length, shrink_to_fit);
|
||||
}
|
||||
|
||||
void Reset() {
|
||||
bytes_builder_.Reset();
|
||||
bit_length_ = false_count_ = 0;
|
||||
}
|
||||
|
||||
int64_t length() const { return bit_length_; }
|
||||
int64_t capacity() const { return bytes_builder_.capacity() * 8; }
|
||||
const uint8_t* data() const { return bytes_builder_.data(); }
|
||||
uint8_t* mutable_data() { return bytes_builder_.mutable_data(); }
|
||||
int64_t false_count() const { return false_count_; }
|
||||
|
||||
private:
|
||||
BufferBuilder bytes_builder_;
|
||||
int64_t bit_length_ = 0;
|
||||
int64_t false_count_ = 0;
|
||||
};
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,33 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/array/builder_adaptive.h" // IWYU pragma: keep
|
||||
#include "arrow/array/builder_base.h" // IWYU pragma: keep
|
||||
#include "arrow/array/builder_binary.h" // IWYU pragma: keep
|
||||
#include "arrow/array/builder_decimal.h" // IWYU pragma: keep
|
||||
#include "arrow/array/builder_dict.h" // IWYU pragma: keep
|
||||
#include "arrow/array/builder_nested.h" // IWYU pragma: keep
|
||||
#include "arrow/array/builder_primitive.h" // IWYU pragma: keep
|
||||
#include "arrow/array/builder_run_end.h" // IWYU pragma: keep
|
||||
#include "arrow/array/builder_time.h" // IWYU pragma: keep
|
||||
#include "arrow/array/builder_union.h" // IWYU pragma: keep
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
460
venv/lib/python3.10/site-packages/pyarrow/include/arrow/c/abi.h
Normal file
460
venv/lib/python3.10/site-packages/pyarrow/include/arrow/c/abi.h
Normal file
@@ -0,0 +1,460 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
/// \file abi.h Arrow C Data Interface
|
||||
///
|
||||
/// The Arrow C Data interface defines a very small, stable set
|
||||
/// of C definitions which can be easily copied into any project's
|
||||
/// source code and vendored to be used for columnar data interchange
|
||||
/// in the Arrow format. For non-C/C++ languages and runtimes,
|
||||
/// it should be almost as easy to translate the C definitions into
|
||||
/// the corresponding C FFI declarations.
|
||||
///
|
||||
/// Applications and libraries can therefore work with Arrow memory
|
||||
/// without necessarily using the Arrow libraries or reinventing
|
||||
/// the wheel. Developers can choose between tight integration
|
||||
/// with the Arrow software project or minimal integration with
|
||||
/// the Arrow format only.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
// Spec and documentation: https://arrow.apache.org/docs/format/CDataInterface.html
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifndef ARROW_C_DATA_INTERFACE
|
||||
# define ARROW_C_DATA_INTERFACE
|
||||
|
||||
# define ARROW_FLAG_DICTIONARY_ORDERED 1
|
||||
# define ARROW_FLAG_NULLABLE 2
|
||||
# define ARROW_FLAG_MAP_KEYS_SORTED 4
|
||||
|
||||
struct ArrowSchema {
  // Description of the array type
  const char* format;
  const char* name;
  const char* metadata;
  int64_t flags;
  int64_t n_children;
  struct ArrowSchema** children;
  struct ArrowSchema* dictionary;

  // Callback used to release this structure
  void (*release)(struct ArrowSchema*);
  // Opaque data specific to the producer
  void* private_data;
};
|
||||
|
||||
struct ArrowArray {
  // Description of the array data
  int64_t length;
  int64_t null_count;
  int64_t offset;
  int64_t n_buffers;
  int64_t n_children;
  const void** buffers;
  struct ArrowArray** children;
  struct ArrowArray* dictionary;

  // Callback used to release this structure
  void (*release)(struct ArrowArray*);
  // Opaque data specific to the producer
  void* private_data;
};
|
||||
|
||||
# define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT "ARROW:average_byte_width:exact"
|
||||
# define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE \
|
||||
"ARROW:average_byte_width:approximate"
|
||||
# define ARROW_STATISTICS_KEY_DISTINCT_COUNT_EXACT "ARROW:distinct_count:exact"
|
||||
# define ARROW_STATISTICS_KEY_DISTINCT_COUNT_APPROXIMATE \
|
||||
"ARROW:distinct_count:approximate"
|
||||
# define ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_EXACT "ARROW:max_byte_width:exact"
|
||||
# define ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_APPROXIMATE \
|
||||
"ARROW:max_byte_width:approximate"
|
||||
# define ARROW_STATISTICS_KEY_MAX_VALUE_EXACT "ARROW:max_value:exact"
|
||||
# define ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE "ARROW:max_value:approximate"
|
||||
# define ARROW_STATISTICS_KEY_MIN_VALUE_EXACT "ARROW:min_value:exact"
|
||||
# define ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE "ARROW:min_value:approximate"
|
||||
# define ARROW_STATISTICS_KEY_NULL_COUNT_EXACT "ARROW:null_count:exact"
|
||||
# define ARROW_STATISTICS_KEY_NULL_COUNT_APPROXIMATE "ARROW:null_count:approximate"
|
||||
# define ARROW_STATISTICS_KEY_ROW_COUNT_EXACT "ARROW:row_count:exact"
|
||||
# define ARROW_STATISTICS_KEY_ROW_COUNT_APPROXIMATE "ARROW:row_count:approximate"
|
||||
|
||||
#endif // ARROW_C_DATA_INTERFACE
|
||||
|
||||
#ifndef ARROW_C_DEVICE_DATA_INTERFACE
|
||||
# define ARROW_C_DEVICE_DATA_INTERFACE
|
||||
|
||||
// Spec and Documentation: https://arrow.apache.org/docs/format/CDeviceDataInterface.html
|
||||
|
||||
// DeviceType for the allocated memory
|
||||
typedef int32_t ArrowDeviceType;
|
||||
|
||||
// CPU device, same as using ArrowArray directly
|
||||
# define ARROW_DEVICE_CPU 1
|
||||
// CUDA GPU Device
|
||||
# define ARROW_DEVICE_CUDA 2
|
||||
// Pinned CUDA CPU memory by cudaMallocHost
|
||||
# define ARROW_DEVICE_CUDA_HOST 3
|
||||
// OpenCL Device
|
||||
# define ARROW_DEVICE_OPENCL 4
|
||||
// Vulkan buffer for next-gen graphics
|
||||
# define ARROW_DEVICE_VULKAN 7
|
||||
// Metal for Apple GPU
|
||||
# define ARROW_DEVICE_METAL 8
|
||||
// Verilog simulator buffer
|
||||
# define ARROW_DEVICE_VPI 9
|
||||
// ROCm GPUs for AMD GPUs
|
||||
# define ARROW_DEVICE_ROCM 10
|
||||
// Pinned ROCm CPU memory allocated by hipMallocHost
|
||||
# define ARROW_DEVICE_ROCM_HOST 11
|
||||
// Reserved for extension
|
||||
# define ARROW_DEVICE_EXT_DEV 12
|
||||
// CUDA managed/unified memory allocated by cudaMallocManaged
|
||||
# define ARROW_DEVICE_CUDA_MANAGED 13
|
||||
// unified shared memory allocated on a oneAPI non-partitioned device.
|
||||
# define ARROW_DEVICE_ONEAPI 14
|
||||
// GPU support for next-gen WebGPU standard
|
||||
# define ARROW_DEVICE_WEBGPU 15
|
||||
// Qualcomm Hexagon DSP
|
||||
# define ARROW_DEVICE_HEXAGON 16
|
||||
|
||||
struct ArrowDeviceArray {
|
||||
// the Allocated Array
|
||||
//
|
||||
// the buffers in the array (along with the buffers of any
|
||||
// children) are what is allocated on the device.
|
||||
struct ArrowArray array;
|
||||
// The device id to identify a specific device
|
||||
int64_t device_id;
|
||||
// The type of device which can access this memory.
|
||||
ArrowDeviceType device_type;
|
||||
// An event-like object to synchronize on if needed.
|
||||
void* sync_event;
|
||||
// Reserved bytes for future expansion.
|
||||
int64_t reserved[3];
|
||||
};
|
||||
|
||||
#endif // ARROW_C_DEVICE_DATA_INTERFACE
|
||||
|
||||
#ifndef ARROW_C_STREAM_INTERFACE
|
||||
# define ARROW_C_STREAM_INTERFACE
|
||||
|
||||
struct ArrowArrayStream {
  // Callback to get the stream type
  // (it will be the same for all arrays in the stream).
  //
  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
  //
  // On success, the ArrowSchema must be released independently from the stream.
  int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out);

  // Callback to get the next array
  // (if there is no error and the array is released, the stream has ended).
  //
  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
  //
  // On success, the ArrowArray must be released independently from the stream.
  int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out);

  // Callback to get optional detailed error information.
  // This must only be called if the last stream operation failed
  // with a non-0 return code.
  //
  // Return value: pointer to a null-terminated character array describing
  // the last error, or NULL if no description is available.
  //
  // The returned pointer is only valid until the next operation on this stream
  // (including release).
  const char* (*get_last_error)(struct ArrowArrayStream*);

  // Release callback: release the stream's own resources.
  // Note that arrays returned by `get_next` must be individually released.
  void (*release)(struct ArrowArrayStream*);

  // Opaque data specific to the producer
  void* private_data;
};
|
||||
|
||||
#endif // ARROW_C_STREAM_INTERFACE
|
||||
|
||||
#ifndef ARROW_C_DEVICE_STREAM_INTERFACE
|
||||
# define ARROW_C_DEVICE_STREAM_INTERFACE
|
||||
|
||||
// Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
|
||||
//
|
||||
// This stream is intended to provide a stream of data on a single
|
||||
// device, if a producer wants data to be produced on multiple devices
|
||||
// then multiple streams should be provided. One per device.
|
||||
struct ArrowDeviceArrayStream {
|
||||
// The device that this stream produces data on.
|
||||
ArrowDeviceType device_type;
|
||||
|
||||
// Callback to get the stream schema
|
||||
// (will be the same for all arrays in the stream).
|
||||
//
|
||||
// Return value 0 if successful, an `errno`-compatible error code otherwise.
|
||||
//
|
||||
// If successful, the ArrowSchema must be released independently from the stream.
|
||||
// The schema should be accessible via CPU memory.
|
||||
int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);
|
||||
|
||||
// Callback to get the next array
|
||||
// (if no error and the array is released, the stream has ended)
|
||||
//
|
||||
// Return value: 0 if successful, an `errno`-compatible error code otherwise.
|
||||
//
|
||||
// If successful, the ArrowDeviceArray must be released independently from the stream.
|
||||
int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);
|
||||
|
||||
// Callback to get optional detailed error information.
|
||||
// This must only be called if the last stream operation failed
|
||||
// with a non-0 return code.
|
||||
//
|
||||
// Return value: pointer to a null-terminated character array describing
|
||||
// the last error, or NULL if no description is available.
|
||||
//
|
||||
// The returned pointer is only valid until the next operation on this stream
|
||||
// (including release).
|
||||
const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);
|
||||
|
||||
// Release callback: release the stream's own resources.
|
||||
// Note that arrays returned by `get_next` must be individually released.
|
||||
void (*release)(struct ArrowDeviceArrayStream* self);
|
||||
|
||||
// Opaque producer-specific data
|
||||
void* private_data;
|
||||
};
|
||||
|
||||
#endif // ARROW_C_DEVICE_STREAM_INTERFACE
|
||||
|
||||
#ifndef ARROW_C_ASYNC_STREAM_INTERFACE
|
||||
# define ARROW_C_ASYNC_STREAM_INTERFACE
|
||||
|
||||
// EXPERIMENTAL: ArrowAsyncTask represents available data from a producer that was
// passed to an invocation of `on_next_task` on the ArrowAsyncDeviceStreamHandler.
//
// The reason for this Task approach, instead of the Async interface returning
// the Array directly, is to allow for more complex thread handling and to reduce
// context switching and data transfers between CPU cores (e.g. from one L1/L2
// cache to another) if desired.
//
// For example, the `on_next_task` callback can be called when data is ready, while
// the producer puts potential "decoding" logic in the `ArrowAsyncTask` object. This
// allows the producer to manage the I/O on one thread which calls `on_next_task`,
// and the consumer can determine when the decoding (producer logic in the
// `extract_data` callback of the task) occurs and on which thread, to avoid a CPU
// core transfer (data staying in the L2 cache).
struct ArrowAsyncTask {
  // This callback should populate the ArrowDeviceArray associated with this task.
  // The order of ArrowAsyncTasks provided by the producer enables a consumer to
  // ensure the order of data to process.
  //
  // This function is expected to be synchronous, but should not perform any blocking
  // I/O. Ideally it should be as cheap as possible so as to not tie up the consumer
  // thread unnecessarily.
  //
  // Returns: 0 if successful, errno-compatible error otherwise.
  //
  // If a non-0 value is returned then it should be followed by a call to `on_error`
  // on the appropriate ArrowAsyncDeviceStreamHandler. This is because it's highly
  // likely that whatever is calling this function may be entirely disconnected from
  // the current control flow. Indicating an error here with a non-zero return allows
  // the current flow to be aware of the error occurring, while still allowing any
  // logging or error handling to be centralized in the `on_error` callback of
  // the original Async handler.
  //
  // Rather than a release callback, any required cleanup should be performed as part
  // of the invocation of `extract_data`. Ownership of the Array is passed to the
  // consumer calling this, and so it must be released separately.
  //
  // It is only valid to call this method exactly once.
  int (*extract_data)(struct ArrowAsyncTask* self, struct ArrowDeviceArray* out);

  // Opaque data specific to the task
  void* private_data;
};
|
||||
|
||||
// EXPERIMENTAL: ArrowAsyncProducer represents a 1-to-1 relationship between an async
|
||||
// producer and consumer. This object allows the consumer to perform backpressure and flow
|
||||
// control on the asynchronous stream processing. This object must be owned by the
|
||||
// producer who creates it, and thus is responsible for cleaning it up.
|
||||
struct ArrowAsyncProducer {
|
||||
// The device type that this stream produces data on.
|
||||
ArrowDeviceType device_type;
|
||||
|
||||
// A consumer must call this function to start receiving on_next_task calls.
|
||||
//
|
||||
// It *must* be valid to call this synchronously from within `on_next_task` or
|
||||
// `on_schema`, but this function *must not* immediately call `on_next_task` so as
|
||||
// to avoid recursion and reentrant callbacks.
|
||||
//
|
||||
// After cancel has been called, additional calls to this function must be NOPs,
|
||||
// but allowed. While not cancelled, calling this function must register the
|
||||
// given number of additional arrays/batches to be produced with the producer.
|
||||
// The producer should only call `on_next_task` at most the registered number
|
||||
// of arrays before propagating backpressure.
|
||||
//
|
||||
// Any error encountered by calling request must be propagated by calling the `on_error`
|
||||
// callback of the ArrowAsyncDeviceStreamHandler.
|
||||
//
|
||||
// While not cancelled, any subsequent calls to `on_next_task`, `on_error` or
|
||||
// `release` should be scheduled by the producer to be called later.
|
||||
//
|
||||
// It is invalid for a consumer to call this with a value of n <= 0, producers should
|
||||
// error if given such a value.
|
||||
void (*request)(struct ArrowAsyncProducer* self, int64_t n);
|
||||
|
||||
// This cancel callback signals a producer that it must eventually stop making calls
|
||||
// to on_next_task. It must be idempotent and thread-safe. After calling cancel once,
|
||||
// subsequent calls must be NOPs. This must not call any consumer-side handlers other
|
||||
// than `on_error`.
|
||||
//
|
||||
// It is not required that calling cancel affect the producer immediately, only that it
|
||||
// must eventually stop calling on_next_task and subsequently call release on the
|
||||
// async handler. As such, a consumer must be prepared to receive one or more calls to
|
||||
// `on_next_task` even after calling cancel if there are still requested arrays pending.
|
||||
//
|
||||
// Successful cancellation should *not* result in the producer calling `on_error`, it
|
||||
// should finish out any remaining tasks and eventually call `release`.
|
||||
//
|
||||
// Any error encountered during handling a call to cancel must be reported via the
|
||||
// on_error callback on the async stream handler.
|
||||
void (*cancel)(struct ArrowAsyncProducer* self);
|
||||
|
||||
// Any additional metadata tied to a specific stream of data. This must either be NULL
|
||||
// or a valid pointer to metadata which is encoded in the same way schema metadata
|
||||
// would be. Non-null metadata must be valid for the lifetime of this object. As an
|
||||
// example a producer could use this to provide the total number of rows and/or batches
|
||||
// in the stream if known.
|
||||
const char* additional_metadata;
|
||||
|
||||
// producer-specific opaque data.
|
||||
void* private_data;
|
||||
};
|
||||
|
||||
// EXPERIMENTAL: Similar to ArrowDeviceArrayStream, except designed for an asynchronous
|
||||
// style of interaction. While ArrowDeviceArrayStream provides producer
|
||||
// defined callbacks, this is intended to be created by the consumer instead.
|
||||
// The consumer passes this handler to the producer, which in turn uses the
|
||||
// callbacks to inform the consumer of events in the stream.
|
||||
struct ArrowAsyncDeviceStreamHandler {
  // Schema callback. Ownership of the schema is handed to the handler, which
  // must release or move the passed-in `stream_schema`; the producer keeps
  // ownership of the top-level object itself.
  //
  // Unless an error occurs first (see on_error), this must be the first callback
  // a producer invokes, and it must be invoked exactly once. The producer should
  // therefore provide a valid ArrowAsyncProducer instance so the consumer can
  // control the flow; see the documentation on ArrowAsyncProducer for how it
  // works. The ArrowAsyncProducer is owned by the producer that calls this
  // function, so that producer is responsible for cleaning it up when it calls
  // the release callback of this handler.
  //
  // Any additional metadata tied to this stream is provided as a non-null value
  // in the `additional_metadata` field of the ArrowAsyncProducer, and remains
  // valid at least until the release callback is called.
  //
  // Return value: 0 if successful, `errno`-compatible error otherwise.
  //
  // A producer that receives a non-zero return here should stop producing and
  // eventually call release instead.
  int (*on_schema)(struct ArrowAsyncDeviceStreamHandler* self,
                   struct ArrowSchema* stream_schema);

  // Data callback. Invoked when data is available, with an ArrowAsyncTask struct
  // signifying it. End of stream is signalled by passing NULL for `task` instead
  // of a valid pointer. The task object is only valid for the duration of this
  // call; a consumer that wants to use it after returning must copy or move its
  // contents into a new ArrowAsyncTask object.
  //
  // The `request` callback of a provided ArrowAsyncProducer must be called in
  // order to start receiving calls to this handler.
  //
  // `metadata` may be null, or may be used by a producer to pass arbitrary extra
  // information to the consumer (such as total number of rows, context info, or
  // otherwise). It should use the same encoding as the metadata within the
  // ArrowSchema struct itself, defined in the spec at
  // https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.metadata
  //
  // Non-null metadata only needs to exist for the lifetime of this call; a
  // consumer that wants it to live longer must copy it.
  //
  // A producer *must not* call this concurrently from multiple different threads.
  //
  // Because cancel is not guaranteed to take effect immediately, a consumer must
  // be prepared to receive one or more calls to this callback even after calling
  // cancel on the corresponding ArrowAsyncProducer.
  //
  // Return value: 0 if successful, `errno`-compatible error otherwise.
  //
  // A non-zero return tells the producer that an error occurred and that it
  // should stop propagating data. After such a return, the only remaining
  // interaction with this object is for the producer to call the `release`
  // callback.
  int (*on_next_task)(struct ArrowAsyncDeviceStreamHandler* self,
                      struct ArrowAsyncTask* task, const char* metadata);

  // Error callback. The producer should call release after this returns to clean
  // up any resources. `code` may be any error code the producer wants, but should
  // be errno-compatible for consistency.
  //
  // Non-null `message` and `metadata` only last as long as this function call;
  // the consumer must copy the data if it needs to outlive the call.
  //
  // Error metadata should be encoded as with metadata in ArrowSchema, defined in
  // the spec at
  // https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.metadata
  //
  // A producer may call this with or without a preceding call to
  // ArrowAsyncProducer.request.
  //
  // This callback must not call any methods of an ArrowAsyncProducer object.
  void (*on_error)(struct ArrowAsyncDeviceStreamHandler* self, int code,
                   const char* message, const char* metadata);

  // Release callback, freeing any resources held for the handler. A producer
  // should always call it once it is done utilizing a handler, and no callbacks
  // should be called after it.
  //
  // A producer may call this with or without a preceding call to
  // ArrowAsyncProducer.request.
  //
  // The release callback must not call any methods of an ArrowAsyncProducer
  // object.
  void (*release)(struct ArrowAsyncDeviceStreamHandler* self);

  // MUST be populated by the producer BEFORE it calls any callback other than
  // release. This is the connection between a handler and its producer, and it
  // must exist until the release callback is called.
  struct ArrowAsyncProducer* producer;

  // Opaque handler-specific data.
  void* private_data;
};
|
||||
|
||||
#endif // ARROW_C_ASYNC_STREAM_INTERFACE
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,489 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "arrow/c/abi.h"
|
||||
#include "arrow/device.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/async_generator_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \defgroup c-data-interface Functions for working with the C data interface.
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Export C++ DataType using the C data interface format.
|
||||
///
|
||||
/// The root type is considered to have empty name and metadata.
|
||||
/// If you want the root type to have a name and/or metadata, pass
|
||||
/// a Field instead.
|
||||
///
|
||||
/// \param[in] type DataType object to export
|
||||
/// \param[out] out C struct where to export the datatype
|
||||
ARROW_EXPORT
|
||||
Status ExportType(const DataType& type, struct ArrowSchema* out);
|
||||
|
||||
/// \brief Export C++ Field using the C data interface format.
|
||||
///
|
||||
/// \param[in] field Field object to export
|
||||
/// \param[out] out C struct where to export the field
|
||||
ARROW_EXPORT
|
||||
Status ExportField(const Field& field, struct ArrowSchema* out);
|
||||
|
||||
/// \brief Export C++ Schema using the C data interface format.
|
||||
///
|
||||
/// \param[in] schema Schema object to export
|
||||
/// \param[out] out C struct where to export the schema
|
||||
ARROW_EXPORT
|
||||
Status ExportSchema(const Schema& schema, struct ArrowSchema* out);
|
||||
|
||||
/// \brief Export C++ Array using the C data interface format.
|
||||
///
|
||||
/// The resulting ArrowArray struct keeps the array data and buffers alive
|
||||
/// until its release callback is called by the consumer.
|
||||
///
|
||||
/// \param[in] array Array object to export
|
||||
/// \param[out] out C struct where to export the array
|
||||
/// \param[out] out_schema optional C struct where to export the array type
|
||||
ARROW_EXPORT
|
||||
Status ExportArray(const Array& array, struct ArrowArray* out,
|
||||
struct ArrowSchema* out_schema = NULLPTR);
|
||||
|
||||
/// \brief Export C++ RecordBatch using the C data interface format.
|
||||
///
|
||||
/// The record batch is exported as if it were a struct array.
|
||||
/// The resulting ArrowArray struct keeps the record batch data and buffers alive
|
||||
/// until its release callback is called by the consumer.
|
||||
///
|
||||
/// \param[in] batch Record batch to export
|
||||
/// \param[out] out C struct where to export the record batch
|
||||
/// \param[out] out_schema optional C struct where to export the record batch schema
|
||||
ARROW_EXPORT
|
||||
Status ExportRecordBatch(const RecordBatch& batch, struct ArrowArray* out,
|
||||
struct ArrowSchema* out_schema = NULLPTR);
|
||||
|
||||
/// \brief Import C++ DataType from the C data interface.
|
||||
///
|
||||
/// The given ArrowSchema struct is released (as per the C data interface
|
||||
/// specification), even if this function fails.
|
||||
///
|
||||
/// \param[in,out] schema C data interface struct representing the data type
|
||||
/// \return Imported type object
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<DataType>> ImportType(struct ArrowSchema* schema);
|
||||
|
||||
/// \brief Import C++ Field from the C data interface.
|
||||
///
|
||||
/// The given ArrowSchema struct is released (as per the C data interface
|
||||
/// specification), even if this function fails.
|
||||
///
|
||||
/// \param[in,out] schema C data interface struct representing the field
|
||||
/// \return Imported field object
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Field>> ImportField(struct ArrowSchema* schema);
|
||||
|
||||
/// \brief Import C++ Schema from the C data interface.
|
||||
///
|
||||
/// The given ArrowSchema struct is released (as per the C data interface
|
||||
/// specification), even if this function fails.
|
||||
///
|
||||
/// \param[in,out] schema C data interface struct representing the schema
|
||||
/// \return Imported schema object
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Schema>> ImportSchema(struct ArrowSchema* schema);
|
||||
|
||||
/// \brief Import C++ array from the C data interface.
|
||||
///
|
||||
/// The ArrowArray struct has its contents moved (as per the C data interface
|
||||
/// specification) to a private object held alive by the resulting array.
|
||||
///
|
||||
/// \param[in,out] array C data interface struct holding the array data
|
||||
/// \param[in] type type of the imported array
|
||||
/// \return Imported array object
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> ImportArray(struct ArrowArray* array,
|
||||
std::shared_ptr<DataType> type);
|
||||
|
||||
/// \brief Import C++ array and its type from the C data interface.
|
||||
///
|
||||
/// The ArrowArray struct has its contents moved (as per the C data interface
|
||||
/// specification) to a private object held alive by the resulting array.
|
||||
/// The ArrowSchema struct is released, even if this function fails.
|
||||
///
|
||||
/// \param[in,out] array C data interface struct holding the array data
|
||||
/// \param[in,out] type C data interface struct holding the array type
|
||||
/// \return Imported array object
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> ImportArray(struct ArrowArray* array,
|
||||
struct ArrowSchema* type);
|
||||
|
||||
/// \brief Import C++ record batch from the C data interface.
|
||||
///
|
||||
/// The ArrowArray struct has its contents moved (as per the C data interface
|
||||
/// specification) to a private object held alive by the resulting record batch.
|
||||
///
|
||||
/// \param[in,out] array C data interface struct holding the record batch data
|
||||
/// \param[in] schema schema of the imported record batch
|
||||
/// \return Imported record batch object
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<RecordBatch>> ImportRecordBatch(struct ArrowArray* array,
|
||||
std::shared_ptr<Schema> schema);
|
||||
|
||||
/// \brief Import C++ record batch and its schema from the C data interface.
|
||||
///
|
||||
/// The type represented by the ArrowSchema struct must be a struct type array.
|
||||
/// The ArrowArray struct has its contents moved (as per the C data interface
|
||||
/// specification) to a private object held alive by the resulting record batch.
|
||||
/// The ArrowSchema struct is released, even if this function fails.
|
||||
///
|
||||
/// \param[in,out] array C data interface struct holding the record batch data
|
||||
/// \param[in,out] schema C data interface struct holding the record batch schema
|
||||
/// \return Imported record batch object
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<RecordBatch>> ImportRecordBatch(struct ArrowArray* array,
|
||||
struct ArrowSchema* schema);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \defgroup c-data-device-interface Functions for working with the C data device
|
||||
/// interface.
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief EXPERIMENTAL: Export C++ Array as an ArrowDeviceArray.
|
||||
///
|
||||
/// The resulting ArrowDeviceArray struct keeps the array data and buffers alive
|
||||
/// until its release callback is called by the consumer. All buffers in
|
||||
/// the provided array MUST have the same device_type, otherwise an error
|
||||
/// will be returned.
|
||||
///
|
||||
/// If sync is non-null, get_event will be called on it in order to
|
||||
/// potentially provide an event for consumers to synchronize on.
|
||||
///
|
||||
/// \param[in] array Array object to export
|
||||
/// \param[in] sync shared_ptr to object derived from Device::SyncEvent or null
|
||||
/// \param[out] out C struct to export the array to
|
||||
/// \param[out] out_schema optional C struct to export the array type to
|
||||
ARROW_EXPORT
|
||||
Status ExportDeviceArray(const Array& array, std::shared_ptr<Device::SyncEvent> sync,
|
||||
struct ArrowDeviceArray* out,
|
||||
struct ArrowSchema* out_schema = NULLPTR);
|
||||
|
||||
/// \brief EXPERIMENTAL: Export C++ RecordBatch as an ArrowDeviceArray.
|
||||
///
|
||||
/// The record batch is exported as if it were a struct array.
|
||||
/// The resulting ArrowDeviceArray struct keeps the record batch data and buffers alive
|
||||
/// until its release callback is called by the consumer.
|
||||
///
|
||||
/// All buffers of all columns in the record batch must have the same device_type
|
||||
/// otherwise an error will be returned. If columns are on different devices,
|
||||
/// they should be exported using different ArrowDeviceArray instances.
|
||||
///
|
||||
/// If sync is non-null, get_event will be called on it in order to
|
||||
/// potentially provide an event for consumers to synchronize on.
|
||||
///
|
||||
/// \param[in] batch Record batch to export
|
||||
/// \param[in] sync shared_ptr to object derived from Device::SyncEvent or null
|
||||
/// \param[out] out C struct where to export the record batch
|
||||
/// \param[out] out_schema optional C struct where to export the record batch schema
|
||||
ARROW_EXPORT
|
||||
Status ExportDeviceRecordBatch(const RecordBatch& batch,
|
||||
std::shared_ptr<Device::SyncEvent> sync,
|
||||
struct ArrowDeviceArray* out,
|
||||
struct ArrowSchema* out_schema = NULLPTR);
|
||||
|
||||
using DeviceMemoryMapper =
|
||||
std::function<Result<std::shared_ptr<MemoryManager>>(ArrowDeviceType, int64_t)>;
|
||||
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<MemoryManager>> DefaultDeviceMemoryMapper(
|
||||
ArrowDeviceType device_type, int64_t device_id);
|
||||
|
||||
/// \brief EXPERIMENTAL: Import C++ device array from the C data interface.
|
||||
///
|
||||
/// The ArrowArray struct has its contents moved (as per the C data interface
|
||||
/// specification) to a private object held alive by the resulting array. The
|
||||
/// buffers of the Array are located on the device indicated by the device_type.
|
||||
///
|
||||
/// \param[in,out] array C data interface struct holding the array data
|
||||
/// \param[in] type type of the imported array
|
||||
/// \param[in] mapper A function to map device + id to memory manager. If not
|
||||
/// specified, defaults to map "cpu" to the built-in default memory manager.
|
||||
/// \return Imported array object
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> ImportDeviceArray(
|
||||
struct ArrowDeviceArray* array, std::shared_ptr<DataType> type,
|
||||
const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper);
|
||||
|
||||
/// \brief EXPERIMENTAL: Import C++ device array and its type from the C data interface.
|
||||
///
|
||||
/// The ArrowArray struct has its contents moved (as per the C data interface
|
||||
/// specification) to a private object held alive by the resulting array.
|
||||
/// The ArrowSchema struct is released, even if this function fails. The
|
||||
/// buffers of the Array are located on the device indicated by the device_type.
|
||||
///
|
||||
/// \param[in,out] array C data interface struct holding the array data
|
||||
/// \param[in,out] type C data interface struct holding the array type
|
||||
/// \param[in] mapper A function to map device + id to memory manager. If not
|
||||
/// specified, defaults to map "cpu" to the built-in default memory manager.
|
||||
/// \return Imported array object
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> ImportDeviceArray(
|
||||
struct ArrowDeviceArray* array, struct ArrowSchema* type,
|
||||
const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper);
|
||||
|
||||
/// \brief EXPERIMENTAL: Import C++ record batch with buffers on a device from the C data
|
||||
/// interface.
|
||||
///
|
||||
/// The ArrowArray struct has its contents moved (as per the C data interface
|
||||
/// specification) to a private object held alive by the resulting record batch.
|
||||
/// The buffers of all columns of the record batch are located on the device
|
||||
/// indicated by the device type.
|
||||
///
|
||||
/// \param[in,out] array C data interface struct holding the record batch data
|
||||
/// \param[in] schema schema of the imported record batch
|
||||
/// \param[in] mapper A function to map device + id to memory manager. If not
|
||||
/// specified, defaults to map "cpu" to the built-in default memory manager.
|
||||
/// \return Imported record batch object
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<RecordBatch>> ImportDeviceRecordBatch(
|
||||
struct ArrowDeviceArray* array, std::shared_ptr<Schema> schema,
|
||||
const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper);
|
||||
|
||||
/// \brief EXPERIMENTAL: Import C++ record batch with buffers on a device and its schema
|
||||
/// from the C data interface.
|
||||
///
|
||||
/// The type represented by the ArrowSchema struct must be a struct type array.
|
||||
/// The ArrowArray struct has its contents moved (as per the C data interface
|
||||
/// specification) to a private object held alive by the resulting record batch.
|
||||
/// The ArrowSchema struct is released, even if this function fails. The buffers
|
||||
/// of all columns of the record batch are located on the device indicated by the
|
||||
/// device type.
|
||||
///
|
||||
/// \param[in,out] array C data interface struct holding the record batch data
|
||||
/// \param[in,out] schema C data interface struct holding the record batch schema
|
||||
/// \param[in] mapper A function to map device + id to memory manager. If not
|
||||
/// specified, defaults to map "cpu" to the built-in default memory manager.
|
||||
/// \return Imported record batch object
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<RecordBatch>> ImportDeviceRecordBatch(
|
||||
struct ArrowDeviceArray* array, struct ArrowSchema* schema,
|
||||
const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \defgroup c-stream-interface Functions for working with the C stream interface.
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Export C++ RecordBatchReader using the C stream interface.
|
||||
///
|
||||
/// The resulting ArrowArrayStream struct keeps the record batch reader alive
|
||||
/// until its release callback is called by the consumer.
|
||||
///
|
||||
/// \param[in] reader RecordBatchReader object to export
|
||||
/// \param[out] out C struct where to export the stream
|
||||
ARROW_EXPORT
|
||||
Status ExportRecordBatchReader(std::shared_ptr<RecordBatchReader> reader,
|
||||
struct ArrowArrayStream* out);
|
||||
|
||||
/// \brief Export C++ ChunkedArray using the C stream interface.
|
||||
///
|
||||
/// The resulting ArrowArrayStream struct keeps the chunked array data and buffers alive
|
||||
/// until its release callback is called by the consumer.
|
||||
///
|
||||
/// \param[in] chunked_array ChunkedArray object to export
|
||||
/// \param[out] out C struct where to export the stream
|
||||
ARROW_EXPORT
|
||||
Status ExportChunkedArray(std::shared_ptr<ChunkedArray> chunked_array,
|
||||
struct ArrowArrayStream* out);
|
||||
|
||||
/// \brief Export C++ RecordBatchReader using the C device stream interface
|
||||
///
|
||||
/// The resulting ArrowDeviceArrayStream struct keeps the record batch reader
|
||||
/// alive until its release callback is called by the consumer. The device
|
||||
/// type is determined by calling device_type() on the RecordBatchReader.
|
||||
///
|
||||
/// \param[in] reader RecordBatchReader object to export
|
||||
/// \param[out] out C struct to export the stream to
|
||||
ARROW_EXPORT
|
||||
Status ExportDeviceRecordBatchReader(std::shared_ptr<RecordBatchReader> reader,
|
||||
struct ArrowDeviceArrayStream* out);
|
||||
|
||||
/// \brief Export C++ ChunkedArray using the C device stream interface.
|
||||
///
|
||||
/// The resulting ArrowDeviceArrayStream keeps the chunked array data and buffers
|
||||
/// alive until its release callback is called by the consumer.
|
||||
///
|
||||
/// \param[in] chunked_array ChunkedArray object to export
|
||||
/// \param[in] device_type the device type the data is located on
|
||||
/// \param[out] out C struct to export the stream to
|
||||
ARROW_EXPORT
|
||||
Status ExportDeviceChunkedArray(std::shared_ptr<ChunkedArray> chunked_array,
|
||||
DeviceAllocationType device_type,
|
||||
struct ArrowDeviceArrayStream* out);
|
||||
|
||||
/// \brief Import C++ RecordBatchReader from the C stream interface.
|
||||
///
|
||||
/// The ArrowArrayStream struct has its contents moved to a private object
|
||||
/// held alive by the resulting record batch reader.
|
||||
///
|
||||
/// \param[in,out] stream C stream interface struct
|
||||
/// \return Imported RecordBatchReader object
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<RecordBatchReader>> ImportRecordBatchReader(
|
||||
struct ArrowArrayStream* stream);
|
||||
|
||||
/// \brief Import C++ ChunkedArray from the C stream interface
|
||||
///
|
||||
/// The ArrowArrayStream struct has its contents moved to a private object,
|
||||
/// is consumed in its entirety, and released before returning all chunks
|
||||
/// as a ChunkedArray.
|
||||
///
|
||||
/// \param[in,out] stream C stream interface struct
|
||||
/// \return Imported ChunkedArray object
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<ChunkedArray>> ImportChunkedArray(struct ArrowArrayStream* stream);
|
||||
|
||||
/// \brief Import C++ RecordBatchReader from the C device stream interface
|
||||
///
|
||||
/// The ArrowDeviceArrayStream struct has its contents moved to a private object
|
||||
/// held alive by the resulting record batch reader.
|
||||
///
|
||||
/// \note If there was a required sync event, sync events are accessible by individual
|
||||
/// buffers of columns. We are not yet bubbling the sync events from the buffers up to
|
||||
/// the `GetSyncEvent` method of an imported RecordBatch. This will be added in a future
|
||||
/// update.
|
||||
///
|
||||
/// \param[in,out] stream C device stream interface struct
|
||||
/// \param[in] mapper mapping from device type and ID to memory manager
|
||||
/// \return Imported RecordBatchReader object
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<RecordBatchReader>> ImportDeviceRecordBatchReader(
|
||||
struct ArrowDeviceArrayStream* stream,
|
||||
const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper);
|
||||
|
||||
/// \brief Import C++ ChunkedArray from the C device stream interface
|
||||
///
|
||||
/// The ArrowDeviceArrayStream struct has its contents moved to a private object,
|
||||
/// is consumed in its entirety, and released before returning all chunks as a
|
||||
/// ChunkedArray.
|
||||
///
|
||||
/// \note Any chunks that require synchronization for their device memory will have
|
||||
/// the SyncEvent objects available by checking the individual buffers of each chunk.
|
||||
/// These SyncEvents should be checked before accessing the data in those buffers.
|
||||
///
|
||||
/// \param[in,out] stream C device stream interface struct
|
||||
/// \param[in] mapper mapping from device type and ID to memory manager
|
||||
/// \return Imported ChunkedArray object
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<ChunkedArray>> ImportDeviceChunkedArray(
|
||||
struct ArrowDeviceArrayStream* stream,
|
||||
const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \defgroup c-async-stream-interface Functions for working with the async C data
|
||||
/// interface.
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief EXPERIMENTAL: AsyncErrorDetail is a StatusDetail that contains an error code
|
||||
/// and message from an asynchronous operation.
|
||||
class AsyncErrorDetail : public StatusDetail {
|
||||
public:
|
||||
AsyncErrorDetail(int code, std::string message, std::string metadata)
|
||||
: code_(code), message_(std::move(message)), metadata_(std::move(metadata)) {}
|
||||
const char* type_id() const override { return "AsyncErrorDetail"; }
|
||||
// ToString just returns the error message that was returned with the error
|
||||
std::string ToString() const override { return message_; }
|
||||
// code is an errno-compatible error code
|
||||
int code() const { return code_; }
|
||||
// returns any metadata that was returned with the error, likely in a
|
||||
// key-value format similar to ArrowSchema metadata
|
||||
const std::string& ErrorMetadataString() const { return metadata_; }
|
||||
std::shared_ptr<KeyValueMetadata> ErrorMetadata() const;
|
||||
|
||||
private:
|
||||
int code_{0};
|
||||
std::string message_;
|
||||
std::string metadata_;
|
||||
};
|
||||
|
||||
struct AsyncRecordBatchGenerator {
|
||||
std::shared_ptr<Schema> schema;
|
||||
DeviceAllocationType device_type;
|
||||
AsyncGenerator<RecordBatchWithMetadata> generator;
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
class Executor;
|
||||
}
|
||||
|
||||
/// \brief EXPERIMENTAL: Create an AsyncRecordBatchGenerator and populate a corresponding
|
||||
/// handler to pass to a producer
|
||||
///
|
||||
/// The ArrowAsyncDeviceStreamHandler struct is intended to have its callbacks populated
|
||||
/// and then be passed to a producer to call the appropriate callbacks when data is ready.
|
||||
/// This inverts the traditional flow of control, and so we construct a corresponding
|
||||
/// AsyncRecordBatchGenerator to provide an interface for the consumer to retrieve data as
|
||||
/// it is pushed to the handler.
|
||||
///
|
||||
/// \param[in,out] handler C struct to be populated
|
||||
/// \param[in] executor the executor to use for waiting and populating record batches
|
||||
/// \param[in] queue_size initial number of record batches to request for queueing
|
||||
/// \param[in] mapper mapping from device type and ID to memory manager
|
||||
/// \return Future that resolves to either an error or AsyncRecordBatchGenerator once a
|
||||
/// schema is available or an error is received.
|
||||
ARROW_EXPORT
|
||||
Future<AsyncRecordBatchGenerator> CreateAsyncDeviceStreamHandler(
|
||||
struct ArrowAsyncDeviceStreamHandler* handler, internal::Executor* executor,
|
||||
uint64_t queue_size = 5, DeviceMemoryMapper mapper = DefaultDeviceMemoryMapper);
|
||||
|
||||
/// \brief EXPERIMENTAL: Export an AsyncGenerator of record batches using a provided
|
||||
/// handler
|
||||
///
|
||||
/// This function calls the callbacks on the consumer-provided async handler as record
|
||||
/// batches become available from the AsyncGenerator which is provided. It will first call
|
||||
/// on_schema using the provided schema, and then serially visit each record batch from
|
||||
/// the generator, calling the on_next_task callback. If an error occurs, on_error will be
|
||||
/// called appropriately.
|
||||
///
|
||||
/// \param[in] schema the schema of the stream being exported
|
||||
/// \param[in] generator a generator that asynchronously produces record batches
|
||||
/// \param[in] device_type the device type that the record batches will be located on
|
||||
/// \param[in] handler the handler whose callbacks to utilize as data is available
|
||||
/// \return Future that will resolve once the generator is exhausted or an error occurs
|
||||
ARROW_EXPORT
|
||||
Future<> ExportAsyncRecordBatchReader(
|
||||
std::shared_ptr<Schema> schema,
|
||||
AsyncGenerator<std::shared_ptr<RecordBatch>> generator,
|
||||
DeviceAllocationType device_type, struct ArrowAsyncDeviceStreamHandler* handler);
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,57 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/c/dlpack_abi.h"
|
||||
|
||||
namespace arrow::dlpack {
|
||||
|
||||
/// \brief Export Arrow array as DLPack tensor.
|
||||
///
|
||||
/// DLMangedTensor is produced as defined by the DLPack protocol,
|
||||
/// see https://dmlc.github.io/dlpack/latest/.
|
||||
///
|
||||
/// Data types for which the protocol is supported are
|
||||
/// integer and floating-point data types.
|
||||
///
|
||||
/// DLPack protocol only supports arrays with one contiguous
|
||||
/// memory region which means Arrow Arrays with validity buffers
|
||||
/// are not supported.
|
||||
///
|
||||
/// \param[in] arr Arrow array
|
||||
/// \return DLManagedTensor struct
|
||||
ARROW_EXPORT
|
||||
Result<DLManagedTensor*> ExportArray(const std::shared_ptr<Array>& arr);
|
||||
|
||||
ARROW_EXPORT
|
||||
Result<DLManagedTensor*> ExportTensor(const std::shared_ptr<Tensor>& t);
|
||||
|
||||
/// \brief Get DLDevice with enumerator specifying the
|
||||
/// type of the device data is stored on and index of the
|
||||
/// device which is 0 by default for CPU.
|
||||
///
|
||||
/// \param[in] arr Arrow array
|
||||
/// \return DLDevice struct
|
||||
ARROW_EXPORT
|
||||
Result<DLDevice> ExportDevice(const std::shared_ptr<Array>& arr);
|
||||
|
||||
ARROW_EXPORT
|
||||
Result<DLDevice> ExportDevice(const std::shared_ptr<Tensor>& t);
|
||||
|
||||
} // namespace arrow::dlpack
|
||||
@@ -0,0 +1,321 @@
|
||||
// Taken from:
|
||||
// https://github.com/dmlc/dlpack/blob/ca4d00ad3e2e0f410eeab3264d21b8a39397f362/include/dlpack/dlpack.h
|
||||
/*!
|
||||
* Copyright (c) 2017 by Contributors
|
||||
* \file dlpack.h
|
||||
* \brief The common header of DLPack.
|
||||
*/
|
||||
#ifndef DLPACK_DLPACK_H_
|
||||
#define DLPACK_DLPACK_H_
|
||||
|
||||
/**
 * \brief Compatibility with C++
 *
 * Expands to `extern "C"` when compiled as C++ and to nothing when compiled as C.
 */
#ifdef __cplusplus
#define DLPACK_EXTERN_C extern "C"
#else
#define DLPACK_EXTERN_C
#endif
|
||||
|
||||
/*! \brief The current major version of dlpack */
#define DLPACK_MAJOR_VERSION 1

/*! \brief The current minor version of dlpack */
#define DLPACK_MINOR_VERSION 0
|
||||
|
||||
/*!
 * \brief DLPACK_DLL prefix for windows
 *
 * On Windows this selects dllexport when DLPACK_EXPORTS is defined and dllimport
 * otherwise; on every other platform it expands to nothing.
 */
#ifdef _WIN32
#ifdef DLPACK_EXPORTS
#define DLPACK_DLL __declspec(dllexport)
#else
#define DLPACK_DLL __declspec(dllimport)
#endif
#else
#define DLPACK_DLL
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*!
 * \brief The DLPack version.
 *
 * A change in major version indicates that we have changed the
 * data layout of the ABI - DLManagedTensorVersioned.
 *
 * A change in minor version indicates that we have added new
 * code, such as a new device type, but the ABI is kept the same.
 *
 * If an obtained DLPack tensor has a major version that disagrees
 * with the version number specified in this header file
 * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter
 * (and it is safe to do so). It is not safe to access any other fields
 * as the memory layout will have changed.
 *
 * In the case of a minor version mismatch, the tensor can be safely used as
 * long as the consumer knows how to interpret all fields. Minor version
 * updates indicate the addition of enumeration values.
 */
typedef struct {
  /*! \brief DLPack major version. */
  uint32_t major;
  /*! \brief DLPack minor version. */
  uint32_t minor;
} DLPackVersion;
|
||||
|
||||
/*!
 * \brief The device type in DLDevice.
 *
 * In C++ the underlying type is fixed to int32_t; in C the plain enum is used.
 */
#ifdef __cplusplus
typedef enum : int32_t {
#else
typedef enum {
#endif
  /*! \brief CPU device */
  kDLCPU = 1,
  /*! \brief CUDA GPU device */
  kDLCUDA = 2,
  /*!
   * \brief Pinned CUDA CPU memory by cudaMallocHost
   */
  kDLCUDAHost = 3,
  /*! \brief OpenCL devices. */
  kDLOpenCL = 4,
  /*! \brief Vulkan buffer for next generation graphics. */
  kDLVulkan = 7,
  /*! \brief Metal for Apple GPU. */
  kDLMetal = 8,
  /*! \brief Verilog simulator buffer */
  kDLVPI = 9,
  /*! \brief ROCm GPUs for AMD GPUs */
  kDLROCM = 10,
  /*!
   * \brief Pinned ROCm CPU memory allocated by hipMallocHost
   */
  kDLROCMHost = 11,
  /*!
   * \brief Reserved extension device type,
   * used to quickly test extension devices.
   * The semantics can differ depending on the implementation.
   */
  kDLExtDev = 12,
  /*!
   * \brief CUDA managed/unified memory allocated by cudaMallocManaged
   */
  kDLCUDAManaged = 13,
  /*!
   * \brief Unified shared memory allocated on a oneAPI non-partitioned
   * device. Call to oneAPI runtime is required to determine the device
   * type, the USM allocation type and the sycl context it is bound to.
   *
   */
  kDLOneAPI = 14,
  /*! \brief GPU support for next generation WebGPU standard. */
  kDLWebGPU = 15,
  /*! \brief Qualcomm Hexagon DSP */
  kDLHexagon = 16,
} DLDeviceType;
|
||||
|
||||
/*!
|
||||
* \brief A Device for Tensor and operator.
|
||||
*/
|
||||
typedef struct {
|
||||
/*! \brief The device type used in the device. */
|
||||
DLDeviceType device_type;
|
||||
/*!
|
||||
* \brief The device index.
|
||||
* For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
|
||||
*/
|
||||
int32_t device_id;
|
||||
} DLDevice;
|
||||
|
||||
/*!
 * \brief The type code options DLDataType.
 */
typedef enum {
  /*! \brief signed integer */
  kDLInt = 0U,
  /*! \brief unsigned integer */
  kDLUInt = 1U,
  /*! \brief IEEE floating point */
  kDLFloat = 2U,
  /*!
   * \brief Opaque handle type, reserved for testing purposes.
   * Frameworks need to agree on the handle data type for the exchange to be
   * well-defined.
   */
  kDLOpaqueHandle = 3U,
  /*! \brief bfloat16 */
  kDLBfloat = 4U,
  /*!
   * \brief complex number
   * (C/C++/Python layout: compact struct per complex number)
   */
  kDLComplex = 5U,
  /*! \brief boolean */
  kDLBool = 6U,
} DLDataTypeCode;
|
||||
|
||||
/*!
 * \brief The data type the tensor can hold. The data type is assumed to follow the
 * native endian-ness. An explicit error message should be raised when attempting to
 * export an array with non-native endianness
 *
 *  Examples
 *   - float: type_code = 2, bits = 32, lanes = 1
 *   - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
 *   - int8: type_code = 0, bits = 8, lanes = 1
 *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
 *   - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention,
 *     the underlying storage size of bool is 8 bits)
 */
typedef struct {
  /*!
   * \brief Type code of base types.
   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
   * footprint, but the value should be one of DLDataTypeCode enum values.
   * */
  uint8_t code;
  /*!
   * \brief Number of bits, common choices are 8, 16, 32.
   */
  uint8_t bits;
  /*! \brief Number of lanes in the type, used for vector types. */
  uint16_t lanes;
} DLDataType;
|
||||
|
||||
/*!
|
||||
* \brief Plain C Tensor object, does not manage memory.
|
||||
*/
|
||||
typedef struct {
|
||||
/*!
|
||||
* \brief The data pointer points to the allocated data. This will be CUDA
|
||||
* device pointer or cl_mem handle in OpenCL. It may be opaque on some device
|
||||
* types. This pointer is always aligned to 256 bytes as in CUDA. The
|
||||
* `byte_offset` field should be used to point to the beginning of the data.
|
||||
*
|
||||
* Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow,
|
||||
* TVM, perhaps others) do not adhere to this 256 byte aligment requirement
|
||||
* on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
|
||||
* (after which this note will be updated); at the moment it is recommended
|
||||
* to not rely on the data pointer being correctly aligned.
|
||||
*
|
||||
* For given DLTensor, the size of memory required to store the contents of
|
||||
* data is calculated as follows:
|
||||
*
|
||||
* \code{.c}
|
||||
* static inline size_t GetDataSize(const DLTensor* t) {
|
||||
* size_t size = 1;
|
||||
* for (tvm_index_t i = 0; i < t->ndim; ++i) {
|
||||
* size *= t->shape[i];
|
||||
* }
|
||||
* size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
|
||||
* return size;
|
||||
* }
|
||||
* \endcode
|
||||
*/
|
||||
void* data;
|
||||
/*! \brief The device of the tensor */
|
||||
DLDevice device;
|
||||
/*! \brief Number of dimensions */
|
||||
int32_t ndim;
|
||||
/*! \brief The data type of the pointer*/
|
||||
DLDataType dtype;
|
||||
/*! \brief The shape of the tensor */
|
||||
int64_t* shape;
|
||||
/*!
|
||||
* \brief strides of the tensor (in number of elements, not bytes)
|
||||
* can be NULL, indicating tensor is compact and row-majored.
|
||||
*/
|
||||
int64_t* strides;
|
||||
/*! \brief The offset in bytes to the beginning pointer to data */
|
||||
uint64_t byte_offset;
|
||||
} DLTensor;
|
||||
|
||||
/*!
|
||||
* \brief C Tensor object, manage memory of DLTensor. This data structure is
|
||||
* intended to facilitate the borrowing of DLTensor by another framework. It is
|
||||
* not meant to transfer the tensor. When the borrowing framework doesn't need
|
||||
* the tensor, it should call the deleter to notify the host that the resource
|
||||
* is no longer needed.
|
||||
*
|
||||
* \note This data structure is used as Legacy DLManagedTensor
|
||||
* in DLPack exchange and is deprecated after DLPack v0.8
|
||||
* Use DLManagedTensorVersioned instead.
|
||||
* This data structure may get renamed or deleted in future versions.
|
||||
*
|
||||
* \sa DLManagedTensorVersioned
|
||||
*/
|
||||
typedef struct DLManagedTensor {
|
||||
/*! \brief DLTensor which is being memory managed */
|
||||
DLTensor dl_tensor;
|
||||
/*! \brief the context of the original host framework of DLManagedTensor in
|
||||
* which DLManagedTensor is used in the framework. It can also be NULL.
|
||||
*/
|
||||
void* manager_ctx;
|
||||
/*!
|
||||
* \brief Destructor - this should be called
|
||||
* to destruct the manager_ctx which backs the DLManagedTensor. It can be
|
||||
* NULL if there is no way for the caller to provide a reasonable destructor.
|
||||
* The destructors deletes the argument self as well.
|
||||
*/
|
||||
void (*deleter)(struct DLManagedTensor* self);
|
||||
} DLManagedTensor;
|
||||
|
||||
// bit masks used in the DLManagedTensorVersioned
|
||||
|
||||
/*! \brief bit mask to indicate that the tensor is read only. */
#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
|
||||
|
||||
/*!
|
||||
* \brief A versioned and managed C Tensor object, manage memory of DLTensor.
|
||||
*
|
||||
* This data structure is intended to facilitate the borrowing of DLTensor by
|
||||
* another framework. It is not meant to transfer the tensor. When the borrowing
|
||||
* framework doesn't need the tensor, it should call the deleter to notify the
|
||||
* host that the resource is no longer needed.
|
||||
*
|
||||
* \note This is the current standard DLPack exchange data structure.
|
||||
*/
|
||||
struct DLManagedTensorVersioned {
|
||||
/*!
|
||||
* \brief The API and ABI version of the current managed Tensor
|
||||
*/
|
||||
DLPackVersion version;
|
||||
/*!
|
||||
* \brief the context of the original host framework.
|
||||
*
|
||||
* Stores DLManagedTensorVersioned is used in the
|
||||
* framework. It can also be NULL.
|
||||
*/
|
||||
void* manager_ctx;
|
||||
/*!
|
||||
* \brief Destructor.
|
||||
*
|
||||
* This should be called to destruct manager_ctx which holds the
|
||||
* DLManagedTensorVersioned. It can be NULL if there is no way for the caller to provide
|
||||
* a reasonable destructor. The destructors deletes the argument self as well.
|
||||
*/
|
||||
void (*deleter)(struct DLManagedTensorVersioned* self);
|
||||
/*!
|
||||
* \brief Additional bitmask flags information about the tensor.
|
||||
*
|
||||
* By default the flags should be set to 0.
|
||||
*
|
||||
* \note Future ABI changes should keep everything until this field
|
||||
* stable, to ensure that deleter can be correctly called.
|
||||
*
|
||||
* \sa DLPACK_FLAG_BITMASK_READ_ONLY
|
||||
*/
|
||||
uint64_t flags;
|
||||
/*! \brief DLTensor which is being memory managed */
|
||||
DLTensor dl_tensor;
|
||||
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // DLPACK_EXTERN_C
|
||||
#endif
|
||||
#endif // DLPACK_DLPACK_H_
|
||||
@@ -0,0 +1,178 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "arrow/c/abi.h"
|
||||
|
||||
/// Abort the process with a diagnostic when `condition` is false.
///
/// Unlike assert(), this check is not disabled by NDEBUG. On failure it prints
/// the file name, line number and `msg` to stderr and then calls abort().
/// `condition` is evaluated exactly once.
#define ARROW_C_ASSERT(condition, msg)                          \
  do {                                                          \
    if (!(condition)) {                                         \
      fprintf(stderr, "%s:%d:: %s", __FILE__, __LINE__, (msg)); \
      abort();                                                  \
    }                                                           \
  } while (0)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/// Query whether the C schema is released
|
||||
inline int ArrowSchemaIsReleased(const struct ArrowSchema* schema) {
|
||||
return schema->release == NULL;
|
||||
}
|
||||
|
||||
/// Mark the C schema released (for use in release callbacks)
|
||||
inline void ArrowSchemaMarkReleased(struct ArrowSchema* schema) {
|
||||
schema->release = NULL;
|
||||
}
|
||||
|
||||
/// Move the C schema from `src` to `dest`
|
||||
///
|
||||
/// Note `dest` must *not* point to a valid schema already, otherwise there
|
||||
/// will be a memory leak.
|
||||
inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dest) {
|
||||
assert(dest != src);
|
||||
assert(!ArrowSchemaIsReleased(src));
|
||||
memcpy(dest, src, sizeof(struct ArrowSchema));
|
||||
ArrowSchemaMarkReleased(src);
|
||||
}
|
||||
|
||||
/// Release the C schema, if necessary, by calling its release callback
|
||||
inline void ArrowSchemaRelease(struct ArrowSchema* schema) {
|
||||
if (!ArrowSchemaIsReleased(schema)) {
|
||||
schema->release(schema);
|
||||
ARROW_C_ASSERT(ArrowSchemaIsReleased(schema),
|
||||
"ArrowSchemaRelease did not cleanup release callback");
|
||||
}
|
||||
}
|
||||
|
||||
/// Query whether the C array is released
|
||||
inline int ArrowArrayIsReleased(const struct ArrowArray* array) {
|
||||
return array->release == NULL;
|
||||
}
|
||||
|
||||
inline int ArrowDeviceArrayIsReleased(const struct ArrowDeviceArray* array) {
|
||||
return ArrowArrayIsReleased(&array->array);
|
||||
}
|
||||
|
||||
/// Mark the C array released (for use in release callbacks)
|
||||
inline void ArrowArrayMarkReleased(struct ArrowArray* array) { array->release = NULL; }
|
||||
|
||||
inline void ArrowDeviceArrayMarkReleased(struct ArrowDeviceArray* array) {
|
||||
ArrowArrayMarkReleased(&array->array);
|
||||
}
|
||||
|
||||
/// Move the C array from `src` to `dest`
|
||||
///
|
||||
/// Note `dest` must *not* point to a valid array already, otherwise there
|
||||
/// will be a memory leak.
|
||||
inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dest) {
|
||||
assert(dest != src);
|
||||
assert(!ArrowArrayIsReleased(src));
|
||||
memcpy(dest, src, sizeof(struct ArrowArray));
|
||||
ArrowArrayMarkReleased(src);
|
||||
}
|
||||
|
||||
inline void ArrowDeviceArrayMove(struct ArrowDeviceArray* src,
|
||||
struct ArrowDeviceArray* dest) {
|
||||
assert(dest != src);
|
||||
assert(!ArrowDeviceArrayIsReleased(src));
|
||||
memcpy(dest, src, sizeof(struct ArrowDeviceArray));
|
||||
ArrowDeviceArrayMarkReleased(src);
|
||||
}
|
||||
|
||||
/// Release the C array, if necessary, by calling its release callback
|
||||
inline void ArrowArrayRelease(struct ArrowArray* array) {
|
||||
if (!ArrowArrayIsReleased(array)) {
|
||||
array->release(array);
|
||||
ARROW_C_ASSERT(ArrowArrayIsReleased(array),
|
||||
"ArrowArrayRelease did not cleanup release callback");
|
||||
}
|
||||
}
|
||||
|
||||
inline void ArrowDeviceArrayRelease(struct ArrowDeviceArray* array) {
|
||||
if (!ArrowDeviceArrayIsReleased(array)) {
|
||||
array->array.release(&array->array);
|
||||
ARROW_C_ASSERT(ArrowDeviceArrayIsReleased(array),
|
||||
"ArrowDeviceArrayRelease did not cleanup release callback");
|
||||
}
|
||||
}
|
||||
|
||||
/// Query whether the C array stream is released
|
||||
inline int ArrowArrayStreamIsReleased(const struct ArrowArrayStream* stream) {
|
||||
return stream->release == NULL;
|
||||
}
|
||||
|
||||
inline int ArrowDeviceArrayStreamIsReleased(const struct ArrowDeviceArrayStream* stream) {
|
||||
return stream->release == NULL;
|
||||
}
|
||||
|
||||
/// Mark the C array stream released (for use in release callbacks)
|
||||
inline void ArrowArrayStreamMarkReleased(struct ArrowArrayStream* stream) {
|
||||
stream->release = NULL;
|
||||
}
|
||||
|
||||
inline void ArrowDeviceArrayStreamMarkReleased(struct ArrowDeviceArrayStream* stream) {
|
||||
stream->release = NULL;
|
||||
}
|
||||
|
||||
/// Move the C array stream from `src` to `dest`
|
||||
///
|
||||
/// Note `dest` must *not* point to a valid stream already, otherwise there
|
||||
/// will be a memory leak.
|
||||
inline void ArrowArrayStreamMove(struct ArrowArrayStream* src,
|
||||
struct ArrowArrayStream* dest) {
|
||||
assert(dest != src);
|
||||
assert(!ArrowArrayStreamIsReleased(src));
|
||||
memcpy(dest, src, sizeof(struct ArrowArrayStream));
|
||||
ArrowArrayStreamMarkReleased(src);
|
||||
}
|
||||
|
||||
inline void ArrowDeviceArrayStreamMove(struct ArrowDeviceArrayStream* src,
|
||||
struct ArrowDeviceArrayStream* dest) {
|
||||
assert(dest != src);
|
||||
assert(!ArrowDeviceArrayStreamIsReleased(src));
|
||||
memcpy(dest, src, sizeof(struct ArrowDeviceArrayStream));
|
||||
ArrowDeviceArrayStreamMarkReleased(src);
|
||||
}
|
||||
|
||||
/// Release the C array stream, if necessary, by calling its release callback
|
||||
inline void ArrowArrayStreamRelease(struct ArrowArrayStream* stream) {
|
||||
if (!ArrowArrayStreamIsReleased(stream)) {
|
||||
stream->release(stream);
|
||||
ARROW_C_ASSERT(ArrowArrayStreamIsReleased(stream),
|
||||
"ArrowArrayStreamRelease did not cleanup release callback");
|
||||
}
|
||||
}
|
||||
|
||||
inline void ArrowDeviceArrayStreamRelease(struct ArrowDeviceArrayStream* stream) {
|
||||
if (!ArrowDeviceArrayStreamIsReleased(stream)) {
|
||||
stream->release(stream);
|
||||
ARROW_C_ASSERT(ArrowDeviceArrayStreamIsReleased(stream),
|
||||
"ArrowDeviceArrayStreamRelease did not cleanup release callback");
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,294 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/span.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class ChunkResolver;
|
||||
|
||||
template <typename IndexType>
|
||||
struct ARROW_EXPORT TypedChunkLocation {
|
||||
/// \brief Index of the chunk in the array of chunks
|
||||
///
|
||||
/// The value is always in the range `[0, chunks.size()]`. `chunks.size()` is used
|
||||
/// to represent out-of-bounds locations.
|
||||
IndexType chunk_index = 0;
|
||||
|
||||
/// \brief Index of the value in the chunk
|
||||
///
|
||||
/// The value is UNDEFINED if `chunk_index >= chunks.size()`
|
||||
IndexType index_in_chunk = 0;
|
||||
|
||||
TypedChunkLocation() = default;
|
||||
|
||||
TypedChunkLocation(IndexType chunk_index, IndexType index_in_chunk)
|
||||
: chunk_index(chunk_index), index_in_chunk(index_in_chunk) {
|
||||
static_assert(sizeof(TypedChunkLocation<IndexType>) == 2 * sizeof(IndexType));
|
||||
static_assert(alignof(TypedChunkLocation<IndexType>) == alignof(IndexType));
|
||||
}
|
||||
|
||||
bool operator==(TypedChunkLocation other) const {
|
||||
return chunk_index == other.chunk_index && index_in_chunk == other.index_in_chunk;
|
||||
}
|
||||
};
|
||||
|
||||
/// \brief Chunk location whose two indices are stored as signed 64-bit integers.
using ChunkLocation = TypedChunkLocation<int64_t>;
|
||||
|
||||
/// \brief An utility that incrementally resolves logical indices into
|
||||
/// physical indices in a chunked array.
|
||||
class ARROW_EXPORT ChunkResolver {
|
||||
private:
|
||||
/// \brief Array containing `chunks.size() + 1` offsets.
|
||||
///
|
||||
/// `offsets_[i]` is the starting logical index of chunk `i`. `offsets_[0]` is always 0
|
||||
/// and `offsets_[chunks.size()]` is the logical length of the chunked array.
|
||||
std::vector<int64_t> offsets_;
|
||||
|
||||
/// \brief Cache of the index of the last resolved chunk.
|
||||
///
|
||||
/// \invariant `cached_chunk_ in [0, chunks.size()]`
|
||||
mutable std::atomic<int32_t> cached_chunk_;
|
||||
|
||||
public:
|
||||
explicit ChunkResolver(const ArrayVector& chunks) noexcept;
|
||||
explicit ChunkResolver(util::span<const Array* const> chunks) noexcept;
|
||||
explicit ChunkResolver(const RecordBatchVector& batches) noexcept;
|
||||
|
||||
/// \brief Construct a ChunkResolver from a vector of chunks.size() + 1 offsets.
|
||||
///
|
||||
/// The first offset must be 0 and the last offset must be the logical length of the
|
||||
/// chunked array. Each offset before the last represents the starting logical index of
|
||||
/// the corresponding chunk.
|
||||
explicit ChunkResolver(std::vector<int64_t> offsets) noexcept
|
||||
: offsets_(std::move(offsets)), cached_chunk_(0) {
|
||||
#ifndef NDEBUG
|
||||
assert(offsets_.size() >= 1);
|
||||
assert(offsets_[0] == 0);
|
||||
for (size_t i = 1; i < offsets_.size(); i++) {
|
||||
assert(offsets_[i] >= offsets_[i - 1]);
|
||||
}
|
||||
assert(offsets_.size() - 1 <=
|
||||
static_cast<size_t>(std::numeric_limits<int32_t>::max()));
|
||||
#endif
|
||||
}
|
||||
|
||||
ChunkResolver(ChunkResolver&& other) noexcept;
|
||||
ChunkResolver& operator=(ChunkResolver&& other) noexcept;
|
||||
|
||||
ChunkResolver(const ChunkResolver& other) noexcept;
|
||||
ChunkResolver& operator=(const ChunkResolver& other) noexcept;
|
||||
|
||||
int64_t logical_array_length() const { return offsets_.back(); }
|
||||
int32_t num_chunks() const { return static_cast<int32_t>(offsets_.size() - 1); }
|
||||
|
||||
int64_t chunk_length(int64_t chunk_index) const {
|
||||
return offsets_[chunk_index + 1] - offsets_[chunk_index];
|
||||
}
|
||||
|
||||
/// \brief Resolve a logical index to a ChunkLocation.
|
||||
///
|
||||
/// The returned ChunkLocation contains the chunk index and the within-chunk index
|
||||
/// equivalent to the logical index.
|
||||
///
|
||||
/// \pre `index >= 0`
|
||||
/// \post `location.chunk_index` in `[0, chunks.size()]`
|
||||
/// \param index The logical index to resolve
|
||||
/// \return ChunkLocation with a valid chunk_index if index is within
|
||||
/// bounds, or with `chunk_index == chunks.size()` if logical index is
|
||||
/// `>= chunked_array.length()`.
|
||||
inline ChunkLocation Resolve(int64_t index) const {
|
||||
const auto cached_chunk = cached_chunk_.load(std::memory_order_relaxed);
|
||||
const auto chunk_index =
|
||||
ResolveChunkIndex</*StoreCachedChunk=*/true>(index, cached_chunk);
|
||||
return ChunkLocation{chunk_index, index - offsets_[chunk_index]};
|
||||
}
|
||||
|
||||
/// \brief Resolve a logical index to a ChunkLocation.
|
||||
///
|
||||
/// The returned ChunkLocation contains the chunk index and the within-chunk index
|
||||
/// equivalent to the logical index.
|
||||
///
|
||||
/// \pre `index >= 0`
|
||||
/// \post `location.chunk_index` in `[0, chunks.size()]`
|
||||
/// \param index The logical index to resolve
|
||||
/// \param hint ChunkLocation{} or the last ChunkLocation returned by
|
||||
/// this ChunkResolver.
|
||||
/// \return ChunkLocation with a valid chunk_index if index is within
|
||||
/// bounds, or with `chunk_index == chunks.size()` if logical index is
|
||||
/// `>= chunked_array.length()`.
|
||||
inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint) const {
|
||||
assert(hint.chunk_index < static_cast<uint32_t>(offsets_.size()));
|
||||
const auto chunk_index = ResolveChunkIndex</*StoreCachedChunk=*/false>(
|
||||
index, static_cast<int32_t>(hint.chunk_index));
|
||||
return ChunkLocation{chunk_index, index - offsets_[chunk_index]};
|
||||
}
|
||||
|
||||
/// \brief Resolve `n_indices` logical indices to chunk indices.
|
||||
///
|
||||
/// \pre 0 <= logical_index_vec[i] < logical_array_length()
|
||||
/// (for well-defined and valid chunk index results)
|
||||
/// \pre out_chunk_location_vec has space for `n_indices` locations
|
||||
/// \pre chunk_hint in [0, chunks.size()]
|
||||
/// \post out_chunk_location_vec[i].chunk_index in [0, chunks.size()] for i in [0, n)
|
||||
/// \post if logical_index_vec[i] >= chunked_array.length(), then
|
||||
/// out_chunk_location_vec[i].chunk_index == chunks.size()
|
||||
/// and out_chunk_location_vec[i].index_in_chunk is UNDEFINED (can be
|
||||
/// out-of-bounds)
|
||||
/// \post if logical_index_vec[i] < 0, then both values in out_chunk_index_vec[i]
|
||||
/// are UNDEFINED
|
||||
///
|
||||
/// \param n_indices The number of logical indices to resolve
|
||||
/// \param logical_index_vec The logical indices to resolve
|
||||
/// \param out_chunk_location_vec The output array where the locations will be written
|
||||
/// \param chunk_hint 0 or the last chunk_index produced by ResolveMany
|
||||
/// \return false iff chunks.size() > std::numeric_limits<IndexType>::max()
|
||||
template <typename IndexType>
|
||||
[[nodiscard]] bool ResolveMany(int64_t n_indices, const IndexType* logical_index_vec,
|
||||
TypedChunkLocation<IndexType>* out_chunk_location_vec,
|
||||
IndexType chunk_hint = 0) const {
|
||||
if constexpr (sizeof(IndexType) < sizeof(uint32_t)) {
|
||||
// The max value returned by Bisect is `offsets.size() - 1` (= chunks.size()).
|
||||
constexpr int64_t kMaxIndexTypeValue = std::numeric_limits<IndexType>::max();
|
||||
// A ChunkedArray with enough empty chunks can make the index of a chunk
|
||||
// exceed the logical index and thus the maximum value of IndexType.
|
||||
const bool chunk_index_fits_on_type = num_chunks() <= kMaxIndexTypeValue;
|
||||
if (ARROW_PREDICT_FALSE(!chunk_index_fits_on_type)) {
|
||||
return false;
|
||||
}
|
||||
// Since an index-in-chunk cannot possibly exceed the logical index being
|
||||
// queried, we don't have to worry about these values not fitting on IndexType.
|
||||
}
|
||||
if constexpr (std::is_signed_v<IndexType>) {
|
||||
// We interpret signed integers as unsigned and avoid having to generate double
|
||||
// the amount of binary code to handle each integer width.
|
||||
//
|
||||
// Negative logical indices can become large values when cast to unsigned, and
|
||||
// they are gracefully handled by ResolveManyImpl, but both the chunk index
|
||||
// and the index in chunk values will be undefined in these cases. This
|
||||
// happend because int8_t(-1) == uint8_t(255) and 255 could be a valid
|
||||
// logical index in the chunked array.
|
||||
using U = std::make_unsigned_t<IndexType>;
|
||||
ResolveManyImpl(n_indices, reinterpret_cast<const U*>(logical_index_vec),
|
||||
reinterpret_cast<TypedChunkLocation<U>*>(out_chunk_location_vec),
|
||||
static_cast<int32_t>(chunk_hint));
|
||||
} else {
|
||||
static_assert(std::is_unsigned_v<IndexType>);
|
||||
ResolveManyImpl(n_indices, logical_index_vec, out_chunk_location_vec,
|
||||
static_cast<int32_t>(chunk_hint));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
template <bool StoreCachedChunk>
|
||||
inline int64_t ResolveChunkIndex(int64_t index, int32_t cached_chunk) const {
|
||||
// It is common for algorithms sequentially processing arrays to make consecutive
|
||||
// accesses at a relatively small distance from each other, hence often falling in the
|
||||
// same chunk.
|
||||
//
|
||||
// This is guaranteed when merging (assuming each side of the merge uses its
|
||||
// own resolver), and is the most common case in recursive invocations of
|
||||
// partitioning.
|
||||
const auto num_offsets = static_cast<uint32_t>(offsets_.size());
|
||||
const int64_t* offsets = offsets_.data();
|
||||
if (ARROW_PREDICT_TRUE(index >= offsets[cached_chunk]) &&
|
||||
(static_cast<uint32_t>(cached_chunk + 1) == num_offsets ||
|
||||
index < offsets[cached_chunk + 1])) {
|
||||
return cached_chunk;
|
||||
}
|
||||
// lo < hi is guaranteed by `num_offsets = chunks.size() + 1`
|
||||
const auto chunk_index = Bisect(index, offsets, /*lo=*/0, /*hi=*/num_offsets);
|
||||
if constexpr (StoreCachedChunk) {
|
||||
assert(static_cast<uint32_t>(chunk_index) < static_cast<uint32_t>(offsets_.size()));
|
||||
cached_chunk_.store(chunk_index, std::memory_order_relaxed);
|
||||
}
|
||||
return chunk_index;
|
||||
}
|
||||
|
||||
/// \pre all the pre-conditions of ChunkResolver::ResolveMany()
|
||||
/// \pre num_offsets - 1 <= std::numeric_limits<IndexType>::max()
|
||||
void ResolveManyImpl(int64_t, const uint8_t*, TypedChunkLocation<uint8_t>*,
|
||||
int32_t) const;
|
||||
void ResolveManyImpl(int64_t, const uint16_t*, TypedChunkLocation<uint16_t>*,
|
||||
int32_t) const;
|
||||
void ResolveManyImpl(int64_t, const uint32_t*, TypedChunkLocation<uint32_t>*,
|
||||
int32_t) const;
|
||||
void ResolveManyImpl(int64_t, const uint64_t*, TypedChunkLocation<uint64_t>*,
|
||||
int32_t) const;
|
||||
|
||||
public:
|
||||
/// \brief Find the index of the chunk that contains the logical index.
|
||||
///
|
||||
/// Any non-negative index is accepted. When `hi=num_offsets`, the largest
|
||||
/// possible return value is `num_offsets-1` which is equal to
|
||||
/// `chunks.size()`. Which is returned when the logical index is greater or
|
||||
/// equal the logical length of the chunked array.
|
||||
///
|
||||
/// \pre index >= 0 (otherwise, when index is negative, hi-1 is returned)
|
||||
/// \pre lo < hi
|
||||
/// \pre lo >= 0 && hi <= offsets_.size()
|
||||
static inline int32_t Bisect(int64_t index, const int64_t* offsets, int32_t lo,
|
||||
int32_t hi) {
|
||||
return Bisect(static_cast<uint64_t>(index),
|
||||
reinterpret_cast<const uint64_t*>(offsets), static_cast<uint32_t>(lo),
|
||||
static_cast<uint32_t>(hi));
|
||||
}
|
||||
|
||||
/// \brief Binary search for the last position in `[lo, hi)` whose offset is
/// less than or equal to `index`.
///
/// \param index the (unsigned) logical index to look up
/// \param offsets array of offsets; entries `[lo, hi)` are inspected
/// \param lo first candidate position (inclusive)
/// \param hi end of the candidate range (exclusive)
/// \return `lo` if `index` is smaller than `offsets[lo + 1]` (or the range
///         holds a single candidate), otherwise the greatest position `p` in
///         `[lo, hi)` with `offsets[p] <= index`
///
/// \pre lo < hi
static inline int32_t Bisect(uint64_t index, const uint64_t* offsets, uint32_t lo,
                             uint32_t hi) {
  // Similar to std::upper_bound(), but slightly different as our offsets
  // array always starts with 0.
  //
  // Check the documented precondition directly. Testing `hi - lo > 1` (as was
  // done previously) wrongly rejected the valid single-candidate range
  // `hi == lo + 1` and failed to detect `lo > hi`, because the unsigned
  // subtraction wraps around to a large value.
  assert(lo < hi && "lo < hi is a precondition of Bisect");
  uint32_t n = hi - lo;
  // The first iteration does not need to check for n > 1: with n == 1 the
  // midpoint is `lo` itself and the loop exits after one pass returning `lo`.
  do {
    const uint32_t m = n >> 1;
    const uint32_t mid = lo + m;
    if (index >= offsets[mid]) {
      // The answer lies in [mid, lo + n): keep the upper part.
      lo = mid;
      n -= m;
    } else {
      // The answer lies in [lo, mid): keep the lower part.
      n = m;
    }
  } while (n > 1);
  return lo;
}
|
||||
};
|
||||
|
||||
// Explicitly instantiate template base struct, for DLL linking on Windows
|
||||
template struct TypedChunkLocation<int32_t>;
|
||||
template struct TypedChunkLocation<int16_t>;
|
||||
template struct TypedChunkLocation<int8_t>;
|
||||
template struct TypedChunkLocation<uint8_t>;
|
||||
template struct TypedChunkLocation<uint16_t>;
|
||||
template struct TypedChunkLocation<uint32_t>;
|
||||
template struct TypedChunkLocation<int64_t>;
|
||||
template struct TypedChunkLocation<uint64_t>;
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,283 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/chunk_resolver.h"
|
||||
#include "arrow/compare.h"
|
||||
#include "arrow/device_allocation_type_set.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Array;
|
||||
class DataType;
|
||||
class MemoryPool;
|
||||
namespace stl {
|
||||
template <typename T, typename V>
|
||||
class ChunkedArrayIterator;
|
||||
} // namespace stl
|
||||
|
||||
/// \class ChunkedArray
|
||||
/// \brief A data structure managing a list of primitive Arrow arrays logically
|
||||
/// as one large array
|
||||
///
|
||||
/// Data chunking is treated throughout this project largely as an
|
||||
/// implementation detail for performance and memory use optimization.
|
||||
/// ChunkedArray allows Array objects to be collected and interpreted
|
||||
/// as a single logical array without requiring an expensive concatenation
|
||||
/// step.
|
||||
///
|
||||
/// In some cases, data produced by a function may exceed the capacity of an
|
||||
/// Array (like BinaryArray or StringArray) and so returning multiple Arrays is
|
||||
/// the only possibility. In these cases, we recommend returning a ChunkedArray
|
||||
/// instead of vector of Arrays or some alternative.
|
||||
///
|
||||
/// When data is processed in parallel, it may not be practical or possible to
|
||||
/// create large contiguous memory allocations and write output into them. With
|
||||
/// some data types, like binary and string types, it is not possible at all to
|
||||
/// produce non-chunked array outputs without requiring a concatenation step at
|
||||
/// the end of processing.
|
||||
///
|
||||
/// Application developers may tune chunk sizes based on analysis of
|
||||
/// performance profiles but many developer-users will not need to be
|
||||
/// especially concerned with the chunking details.
|
||||
///
|
||||
/// Preserving the chunk layout/sizes in processing steps is generally not
|
||||
/// considered to be a contract in APIs. A function may decide to alter the
|
||||
/// chunking of its result. Similarly, APIs accepting multiple ChunkedArray
|
||||
/// inputs should not expect the chunk layout to be the same in each input.
|
||||
class ARROW_EXPORT ChunkedArray {
|
||||
public:
|
||||
ChunkedArray(ChunkedArray&&) = default;
|
||||
ChunkedArray& operator=(ChunkedArray&&) = default;
|
||||
|
||||
/// \brief Construct a chunked array from a single Array
|
||||
explicit ChunkedArray(std::shared_ptr<Array> chunk)
|
||||
: ChunkedArray(ArrayVector{std::move(chunk)}) {}
|
||||
|
||||
/// \brief Construct a chunked array from a vector of arrays and an optional data type
|
||||
///
|
||||
/// The vector elements must have the same data type.
|
||||
/// If the data type is passed explicitly, the vector may be empty.
|
||||
/// If the data type is omitted, the vector must be non-empty.
|
||||
explicit ChunkedArray(ArrayVector chunks, std::shared_ptr<DataType> type = NULLPTR);
|
||||
|
||||
// \brief Constructor with basic input validation.
|
||||
static Result<std::shared_ptr<ChunkedArray>> Make(
|
||||
ArrayVector chunks, std::shared_ptr<DataType> type = NULLPTR);
|
||||
|
||||
/// \brief Create an empty ChunkedArray of a given type
|
||||
///
|
||||
/// The output ChunkedArray will have one chunk with an empty
|
||||
/// array of the given type.
|
||||
///
|
||||
/// \param[in] type the data type of the empty ChunkedArray
|
||||
/// \param[in] pool the memory pool to allocate memory from
|
||||
/// \return the resulting ChunkedArray
|
||||
static Result<std::shared_ptr<ChunkedArray>> MakeEmpty(
|
||||
std::shared_ptr<DataType> type, MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \return the total length of the chunked array; computed on construction
|
||||
int64_t length() const { return length_; }
|
||||
|
||||
/// \return the total number of nulls among all chunks
|
||||
int64_t null_count() const { return null_count_; }
|
||||
|
||||
/// \return the total number of chunks in the chunked array
|
||||
int num_chunks() const { return static_cast<int>(chunks_.size()); }
|
||||
|
||||
/// \return chunk a particular chunk from the chunked array
|
||||
const std::shared_ptr<Array>& chunk(int i) const { return chunks_[i]; }
|
||||
|
||||
/// \return an ArrayVector of chunks
|
||||
const ArrayVector& chunks() const { return chunks_; }
|
||||
|
||||
/// \return The set of device allocation types used by the chunks in this
|
||||
/// chunked array.
|
||||
DeviceAllocationTypeSet device_types() const;
|
||||
|
||||
/// \return true if all chunks are allocated on CPU-accessible memory.
|
||||
bool is_cpu() const { return device_types().is_cpu_only(); }
|
||||
|
||||
/// \brief Construct a zero-copy slice of the chunked array with the
|
||||
/// indicated offset and length
|
||||
///
|
||||
/// \param[in] offset the position of the first element in the constructed
|
||||
/// slice
|
||||
/// \param[in] length the length of the slice. If there are not enough
|
||||
/// elements in the chunked array, the length will be adjusted accordingly
|
||||
///
|
||||
/// \return a new object wrapped in std::shared_ptr<ChunkedArray>
|
||||
std::shared_ptr<ChunkedArray> Slice(int64_t offset, int64_t length) const;
|
||||
|
||||
/// \brief Slice from offset until end of the chunked array
|
||||
std::shared_ptr<ChunkedArray> Slice(int64_t offset) const;
|
||||
|
||||
/// \brief Flatten this chunked array as a vector of chunked arrays, one
|
||||
/// for each struct field
|
||||
///
|
||||
/// \param[in] pool The pool for buffer allocations, if any
|
||||
Result<std::vector<std::shared_ptr<ChunkedArray>>> Flatten(
|
||||
MemoryPool* pool = default_memory_pool()) const;
|
||||
|
||||
/// Construct a zero-copy view of this chunked array with the given
|
||||
/// type. Calls Array::View on each constituent chunk. Always succeeds if
|
||||
/// there are zero chunks
|
||||
Result<std::shared_ptr<ChunkedArray>> View(const std::shared_ptr<DataType>& type) const;
|
||||
|
||||
/// \brief Return the type of the chunked array
|
||||
const std::shared_ptr<DataType>& type() const { return type_; }
|
||||
|
||||
/// \brief Return a Scalar containing the value of this array at index
|
||||
Result<std::shared_ptr<Scalar>> GetScalar(int64_t index) const;
|
||||
|
||||
/// \brief Determine if two chunked arrays are equal.
|
||||
///
|
||||
/// Two chunked arrays can be equal only if they have equal datatypes.
|
||||
/// However, they may be equal even if they have different chunkings.
|
||||
bool Equals(const ChunkedArray& other,
|
||||
const EqualOptions& opts = EqualOptions::Defaults()) const;
|
||||
/// \brief Determine if two chunked arrays are equal.
|
||||
bool Equals(const std::shared_ptr<ChunkedArray>& other,
|
||||
const EqualOptions& opts = EqualOptions::Defaults()) const;
|
||||
/// \brief Determine if two chunked arrays approximately equal
|
||||
bool ApproxEquals(const ChunkedArray& other,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
|
||||
/// \return PrettyPrint representation suitable for debugging
|
||||
std::string ToString() const;
|
||||
|
||||
/// \brief Perform cheap validation checks to determine obvious inconsistencies
|
||||
/// within the chunk array's internal data.
|
||||
///
|
||||
/// This is O(k*m) where k is the number of array descendents,
|
||||
/// and m is the number of chunks.
|
||||
///
|
||||
/// \return Status
|
||||
Status Validate() const;
|
||||
|
||||
/// \brief Perform extensive validation checks to determine inconsistencies
|
||||
/// within the chunk array's internal data.
|
||||
///
|
||||
/// This is O(k*n) where k is the number of array descendents,
|
||||
/// and n is the length in elements.
|
||||
///
|
||||
/// \return Status
|
||||
Status ValidateFull() const;
|
||||
|
||||
protected:
|
||||
ArrayVector chunks_;
|
||||
std::shared_ptr<DataType> type_;
|
||||
int64_t length_;
|
||||
int64_t null_count_;
|
||||
|
||||
private:
|
||||
template <typename T, typename V>
|
||||
friend class ::arrow::stl::ChunkedArrayIterator;
|
||||
ChunkResolver chunk_resolver_;
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray);
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief EXPERIMENTAL: Utility for incremental iteration over contiguous
|
||||
/// pieces of potentially differently-chunked ChunkedArray objects
|
||||
class ARROW_EXPORT MultipleChunkIterator {
|
||||
public:
|
||||
MultipleChunkIterator(const ChunkedArray& left, const ChunkedArray& right)
|
||||
: left_(left),
|
||||
right_(right),
|
||||
pos_(0),
|
||||
length_(left.length()),
|
||||
chunk_idx_left_(0),
|
||||
chunk_idx_right_(0),
|
||||
chunk_pos_left_(0),
|
||||
chunk_pos_right_(0) {}
|
||||
|
||||
bool Next(std::shared_ptr<Array>* next_left, std::shared_ptr<Array>* next_right);
|
||||
|
||||
int64_t position() const { return pos_; }
|
||||
|
||||
private:
|
||||
const ChunkedArray& left_;
|
||||
const ChunkedArray& right_;
|
||||
|
||||
// The amount of the entire ChunkedArray consumed
|
||||
int64_t pos_;
|
||||
|
||||
// Length of the chunked array(s)
|
||||
int64_t length_;
|
||||
|
||||
// Current left chunk
|
||||
int chunk_idx_left_;
|
||||
|
||||
// Current right chunk
|
||||
int chunk_idx_right_;
|
||||
|
||||
// Offset into the current left chunk
|
||||
int64_t chunk_pos_left_;
|
||||
|
||||
// Offset into the current right chunk
|
||||
int64_t chunk_pos_right_;
|
||||
};
|
||||
|
||||
/// \brief Evaluate binary function on two ChunkedArray objects having possibly
|
||||
/// different chunk layouts. The passed binary function / functor should have
|
||||
/// the following signature.
|
||||
///
|
||||
/// Status(const Array&, const Array&, int64_t)
|
||||
///
|
||||
/// The third argument is the absolute position relative to the start of each
|
||||
/// ChunkedArray. The function is executed against each contiguous pair of
|
||||
/// array segments, slicing if necessary.
|
||||
///
|
||||
/// For example, if two arrays have chunk sizes
|
||||
///
|
||||
/// left: [10, 10, 20]
|
||||
/// right: [15, 10, 15]
|
||||
///
|
||||
/// Then the following invocations take place (pseudocode)
|
||||
///
|
||||
/// func(left.chunk[0][0:10], right.chunk[0][0:10], 0)
|
||||
/// func(left.chunk[1][0:5], right.chunk[0][10:15], 10)
|
||||
/// func(left.chunk[1][5:10], right.chunk[1][0:5], 15)
|
||||
/// func(left.chunk[2][0:5], right.chunk[1][5:10], 20)
|
||||
/// func(left.chunk[2][5:20], right.chunk[2][:], 25)
|
||||
template <typename Action>
|
||||
Status ApplyBinaryChunked(const ChunkedArray& left, const ChunkedArray& right,
|
||||
Action&& action) {
|
||||
MultipleChunkIterator iterator(left, right);
|
||||
std::shared_ptr<Array> left_piece, right_piece;
|
||||
while (iterator.Next(&left_piece, &right_piece)) {
|
||||
ARROW_RETURN_NOT_OK(action(*left_piece, *right_piece, iterator.position()));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,213 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Functions for comparing Arrow data structures
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <iosfwd>
|
||||
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
struct ArrayStatistics;
|
||||
class Array;
|
||||
class DataType;
|
||||
class Tensor;
|
||||
class SparseTensor;
|
||||
struct Scalar;
|
||||
|
||||
/// Default absolute tolerance used to initialize EqualOptions::atol().
static constexpr double kDefaultAbsoluteTolerance = 1e-5;
|
||||
|
||||
/// A container of options for equality comparisons
|
||||
class EqualOptions {
|
||||
public:
|
||||
/// Whether or not NaNs are considered equal.
|
||||
bool nans_equal() const { return nans_equal_; }
|
||||
|
||||
/// Return a new EqualOptions object with the "nans_equal" property changed.
|
||||
EqualOptions nans_equal(bool v) const {
|
||||
auto res = EqualOptions(*this);
|
||||
res.nans_equal_ = v;
|
||||
return res;
|
||||
}
|
||||
|
||||
/// Whether or not zeros with differing signs are considered equal.
|
||||
bool signed_zeros_equal() const { return signed_zeros_equal_; }
|
||||
|
||||
/// Return a new EqualOptions object with the "signed_zeros_equal" property changed.
|
||||
EqualOptions signed_zeros_equal(bool v) const {
|
||||
auto res = EqualOptions(*this);
|
||||
res.signed_zeros_equal_ = v;
|
||||
return res;
|
||||
}
|
||||
|
||||
/// Whether the "atol" property is used in the comparison.
|
||||
///
|
||||
/// This option only affects the Equals methods
|
||||
/// and has no effect on ApproxEquals methods.
|
||||
bool use_atol() const { return use_atol_; }
|
||||
|
||||
/// Return a new EqualOptions object with the "use_atol" property changed.
|
||||
EqualOptions use_atol(bool v) const {
|
||||
auto res = EqualOptions(*this);
|
||||
res.use_atol_ = v;
|
||||
return res;
|
||||
}
|
||||
|
||||
/// The absolute tolerance for approximate comparisons of floating-point values.
|
||||
/// Note that this option is ignored if "use_atol" is set to false.
|
||||
double atol() const { return atol_; }
|
||||
|
||||
/// Return a new EqualOptions object with the "atol" property changed.
|
||||
EqualOptions atol(double v) const {
|
||||
auto res = EqualOptions(*this);
|
||||
res.atol_ = v;
|
||||
return res;
|
||||
}
|
||||
|
||||
/// Whether the \ref arrow::Schema property is used in the comparison.
|
||||
///
|
||||
/// This option only affects the Equals methods
|
||||
/// and has no effect on ApproxEquals methods.
|
||||
bool use_schema() const { return use_schema_; }
|
||||
|
||||
/// Return a new EqualOptions object with the "use_schema_" property changed.
|
||||
///
|
||||
/// Setting this option is false making the value of \ref EqualOptions::use_metadata
|
||||
/// is ignored.
|
||||
EqualOptions use_schema(bool v) const {
|
||||
auto res = EqualOptions(*this);
|
||||
res.use_schema_ = v;
|
||||
return res;
|
||||
}
|
||||
|
||||
/// Whether the "metadata" in \ref arrow::Schema is used in the comparison.
|
||||
///
|
||||
/// This option only affects the Equals methods
|
||||
/// and has no effect on the ApproxEquals methods.
|
||||
///
|
||||
/// Note: This option is only considered when \ref arrow::EqualOptions::use_schema is
|
||||
/// set to true.
|
||||
bool use_metadata() const { return use_metadata_; }
|
||||
|
||||
/// Return a new EqualOptions object with the "use_metadata" property changed.
|
||||
EqualOptions use_metadata(bool v) const {
|
||||
auto res = EqualOptions(*this);
|
||||
res.use_metadata_ = v;
|
||||
return res;
|
||||
}
|
||||
|
||||
/// The ostream to which a diff will be formatted if arrays disagree.
|
||||
/// If this is null (the default) no diff will be formatted.
|
||||
std::ostream* diff_sink() const { return diff_sink_; }
|
||||
|
||||
/// Return a new EqualOptions object with the "diff_sink" property changed.
|
||||
/// This option will be ignored if diff formatting of the types of compared arrays is
|
||||
/// not supported.
|
||||
EqualOptions diff_sink(std::ostream* diff_sink) const {
|
||||
auto res = EqualOptions(*this);
|
||||
res.diff_sink_ = diff_sink;
|
||||
return res;
|
||||
}
|
||||
|
||||
static EqualOptions Defaults() { return {}; }
|
||||
|
||||
protected:
|
||||
double atol_ = kDefaultAbsoluteTolerance;
|
||||
bool nans_equal_ = false;
|
||||
bool signed_zeros_equal_ = true;
|
||||
bool use_atol_ = false;
|
||||
bool use_schema_ = true;
|
||||
bool use_metadata_ = false;
|
||||
|
||||
std::ostream* diff_sink_ = NULLPTR;
|
||||
};
|
||||
|
||||
/// Returns true if the arrays are exactly equal
|
||||
///
|
||||
/// Note that arrow::ArrayStatistics is not included in the comparison.
|
||||
ARROW_EXPORT bool ArrayEquals(const Array& left, const Array& right,
|
||||
const EqualOptions& = EqualOptions::Defaults());
|
||||
|
||||
/// Returns true if the arrays are approximately equal. For non-floating point
|
||||
/// types, this is equivalent to ArrayEquals(left, right)
|
||||
///
|
||||
/// Note that arrow::ArrayStatistics is not included in the comparison.
|
||||
ARROW_EXPORT bool ArrayApproxEquals(const Array& left, const Array& right,
|
||||
const EqualOptions& = EqualOptions::Defaults());
|
||||
|
||||
/// Returns true if indicated equal-length segment of arrays are exactly equal
|
||||
///
|
||||
/// Note that arrow::ArrayStatistics is not included in the comparison.
|
||||
ARROW_EXPORT bool ArrayRangeEquals(const Array& left, const Array& right,
|
||||
int64_t start_idx, int64_t end_idx,
|
||||
int64_t other_start_idx,
|
||||
const EqualOptions& = EqualOptions::Defaults());
|
||||
|
||||
/// Returns true if indicated equal-length segment of arrays are approximately equal
|
||||
///
|
||||
/// Note that arrow::ArrayStatistics is not included in the comparison.
|
||||
ARROW_EXPORT bool ArrayRangeApproxEquals(const Array& left, const Array& right,
|
||||
int64_t start_idx, int64_t end_idx,
|
||||
int64_t other_start_idx,
|
||||
const EqualOptions& = EqualOptions::Defaults());
|
||||
|
||||
ARROW_EXPORT bool TensorEquals(const Tensor& left, const Tensor& right,
|
||||
const EqualOptions& = EqualOptions::Defaults());
|
||||
|
||||
/// EXPERIMENTAL: Returns true if the given sparse tensors are exactly equal
|
||||
ARROW_EXPORT bool SparseTensorEquals(const SparseTensor& left, const SparseTensor& right,
|
||||
const EqualOptions& = EqualOptions::Defaults());
|
||||
|
||||
/// Returns true if the type metadata are exactly equal
|
||||
/// \param[in] left a DataType
|
||||
/// \param[in] right a DataType
|
||||
/// \param[in] check_metadata whether to compare KeyValueMetadata for child
|
||||
/// fields
|
||||
ARROW_EXPORT bool TypeEquals(const DataType& left, const DataType& right,
|
||||
bool check_metadata = true);
|
||||
|
||||
/// \brief Check two \ref arrow::ArrayStatistics for equality
|
||||
/// \param[in] left an \ref arrow::ArrayStatistics
|
||||
/// \param[in] right an \ref arrow::ArrayStatistics
|
||||
/// \param[in] options Options used to compare double values for equality.
|
||||
/// \return True if the two \ref arrow::ArrayStatistics instances are equal; otherwise,
|
||||
/// false.
|
||||
ARROW_EXPORT bool ArrayStatisticsEquals(
|
||||
const ArrayStatistics& left, const ArrayStatistics& right,
|
||||
const EqualOptions& options = EqualOptions::Defaults());
|
||||
|
||||
/// Returns true if scalars are equal
|
||||
/// \param[in] left a Scalar
|
||||
/// \param[in] right a Scalar
|
||||
/// \param[in] options comparison options
|
||||
ARROW_EXPORT bool ScalarEquals(const Scalar& left, const Scalar& right,
|
||||
const EqualOptions& options = EqualOptions::Defaults());
|
||||
|
||||
/// Returns true if scalars are approximately equal
|
||||
/// \param[in] left a Scalar
|
||||
/// \param[in] right a Scalar
|
||||
/// \param[in] options comparison options
|
||||
ARROW_EXPORT bool ScalarApproxEquals(
|
||||
const Scalar& left, const Scalar& right,
|
||||
const EqualOptions& options = EqualOptions::Defaults());
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,54 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// NOTE: API is EXPERIMENTAL and will change without going through a
|
||||
// deprecation cycle
|
||||
|
||||
#pragma once
|
||||
|
||||
/// \defgroup compute-functions Abstract compute function API
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup compute-concrete-options Concrete option classes for compute functions
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
#include "arrow/compute/api_aggregate.h" // IWYU pragma: export
|
||||
#include "arrow/compute/api_scalar.h" // IWYU pragma: export
|
||||
#include "arrow/compute/api_vector.h" // IWYU pragma: export
|
||||
#include "arrow/compute/cast.h" // IWYU pragma: export
|
||||
#include "arrow/compute/function.h" // IWYU pragma: export
|
||||
#include "arrow/compute/function_options.h" // IWYU pragma: export
|
||||
#include "arrow/compute/initialize.h" // IWYU pragma: export
|
||||
#include "arrow/compute/kernel.h" // IWYU pragma: export
|
||||
#include "arrow/compute/registry.h" // IWYU pragma: export
|
||||
#include "arrow/datum.h" // IWYU pragma: export
|
||||
|
||||
#include "arrow/compute/expression.h" // IWYU pragma: export
|
||||
|
||||
/// \defgroup execnode-row Utilities for working with data in a row-major format
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
#include "arrow/compute/row/grouper.h" // IWYU pragma: export
|
||||
|
||||
/// \defgroup acero-internals Acero internals, useful for those extending Acero
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
#include "arrow/compute/exec.h" // IWYU pragma: export
|
||||
@@ -0,0 +1,596 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Eager evaluation convenience APIs for invoking common functions, including
|
||||
// necessary memory allocations
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/function_options.h"
|
||||
#include "arrow/datum.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Array;
|
||||
|
||||
namespace compute {
|
||||
|
||||
class ExecContext;
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Aggregate functions
|
||||
|
||||
/// \addtogroup compute-concrete-options
|
||||
/// @{
|
||||
|
||||
/// \brief Control general scalar aggregate kernel behavior
|
||||
///
|
||||
/// By default, null values are ignored (skip_nulls = true).
|
||||
class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1);
|
||||
static constexpr const char kTypeName[] = "ScalarAggregateOptions";
|
||||
static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; }
|
||||
|
||||
/// If true (the default), null values are ignored. Otherwise, if any value is null,
|
||||
/// emit null.
|
||||
bool skip_nulls;
|
||||
/// If less than this many non-null values are observed, emit null.
|
||||
uint32_t min_count;
|
||||
};
|
||||
|
||||
/// \brief Control count aggregate kernel behavior.
|
||||
///
|
||||
/// By default, only non-null values are counted.
|
||||
class ARROW_EXPORT CountOptions : public FunctionOptions {
|
||||
public:
|
||||
enum CountMode {
|
||||
/// Count only non-null values.
|
||||
ONLY_VALID = 0,
|
||||
/// Count only null values.
|
||||
ONLY_NULL,
|
||||
/// Count both non-null and null values.
|
||||
ALL,
|
||||
};
|
||||
explicit CountOptions(CountMode mode = CountMode::ONLY_VALID);
|
||||
static constexpr const char kTypeName[] = "CountOptions";
|
||||
static CountOptions Defaults() { return CountOptions{}; }
|
||||
|
||||
CountMode mode;
|
||||
};
|
||||
|
||||
/// \brief Control Mode kernel behavior
|
||||
///
|
||||
/// Returns top-n common values and counts.
|
||||
/// By default, returns the most common value and count.
|
||||
class ARROW_EXPORT ModeOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit ModeOptions(int64_t n = 1, bool skip_nulls = true, uint32_t min_count = 0);
|
||||
static constexpr const char kTypeName[] = "ModeOptions";
|
||||
static ModeOptions Defaults() { return ModeOptions{}; }
|
||||
|
||||
int64_t n = 1;
|
||||
/// If true (the default), null values are ignored. Otherwise, if any value is null,
|
||||
/// emit null.
|
||||
bool skip_nulls;
|
||||
/// If less than this many non-null values are observed, emit null.
|
||||
uint32_t min_count;
|
||||
};
|
||||
|
||||
/// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel
|
||||
///
|
||||
/// The divisor used in calculations is N - ddof, where N is the number of elements.
|
||||
/// By default, ddof is zero, and population variance or stddev is returned.
|
||||
class ARROW_EXPORT VarianceOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit VarianceOptions(int ddof = 0, bool skip_nulls = true, uint32_t min_count = 0);
|
||||
static constexpr const char kTypeName[] = "VarianceOptions";
|
||||
static VarianceOptions Defaults() { return VarianceOptions{}; }
|
||||
|
||||
int ddof = 0;
|
||||
/// If true (the default), null values are ignored. Otherwise, if any value is null,
|
||||
/// emit null.
|
||||
bool skip_nulls;
|
||||
/// If less than this many non-null values are observed, emit null.
|
||||
uint32_t min_count;
|
||||
};
|
||||
|
||||
/// \brief Control Skew and Kurtosis kernel behavior
|
||||
class ARROW_EXPORT SkewOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit SkewOptions(bool skip_nulls = true, bool biased = true,
|
||||
uint32_t min_count = 0);
|
||||
static constexpr const char kTypeName[] = "SkewOptions";
|
||||
static SkewOptions Defaults() { return SkewOptions{}; }
|
||||
|
||||
/// If true (the default), null values are ignored. Otherwise, if any value is null,
|
||||
/// emit null.
|
||||
bool skip_nulls;
|
||||
/// If true (the default), the calculated value is biased. If false, the calculated
|
||||
/// value includes a correction factor to reduce bias, making it more accurate for
|
||||
/// small sample sizes.
|
||||
bool biased;
|
||||
/// If less than this many non-null values are observed, emit null.
|
||||
uint32_t min_count;
|
||||
};
|
||||
|
||||
/// \brief Control Quantile kernel behavior
|
||||
///
|
||||
/// By default, returns the median value.
|
||||
class ARROW_EXPORT QuantileOptions : public FunctionOptions {
|
||||
public:
|
||||
/// Interpolation method to use when quantile lies between two data points
|
||||
enum Interpolation {
|
||||
LINEAR = 0,
|
||||
LOWER,
|
||||
HIGHER,
|
||||
NEAREST,
|
||||
MIDPOINT,
|
||||
};
|
||||
|
||||
explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR,
|
||||
bool skip_nulls = true, uint32_t min_count = 0);
|
||||
|
||||
explicit QuantileOptions(std::vector<double> q,
|
||||
enum Interpolation interpolation = LINEAR,
|
||||
bool skip_nulls = true, uint32_t min_count = 0);
|
||||
|
||||
static constexpr const char kTypeName[] = "QuantileOptions";
|
||||
static QuantileOptions Defaults() { return QuantileOptions{}; }
|
||||
|
||||
/// probability level of quantile must be between 0 and 1 inclusive
|
||||
std::vector<double> q;
|
||||
enum Interpolation interpolation;
|
||||
/// If true (the default), null values are ignored. Otherwise, if any value is null,
|
||||
/// emit null.
|
||||
bool skip_nulls;
|
||||
/// If less than this many non-null values are observed, emit null.
|
||||
uint32_t min_count;
|
||||
};
|
||||
|
||||
/// \brief Control TDigest approximate quantile kernel behavior
|
||||
///
|
||||
/// By default, returns the median value.
|
||||
class ARROW_EXPORT TDigestOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit TDigestOptions(double q = 0.5, uint32_t delta = 100,
|
||||
uint32_t buffer_size = 500, bool skip_nulls = true,
|
||||
uint32_t min_count = 0);
|
||||
explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100,
|
||||
uint32_t buffer_size = 500, bool skip_nulls = true,
|
||||
uint32_t min_count = 0);
|
||||
static constexpr const char kTypeName[] = "TDigestOptions";
|
||||
static TDigestOptions Defaults() { return TDigestOptions{}; }
|
||||
|
||||
/// probability level of quantile must be between 0 and 1 inclusive
|
||||
std::vector<double> q;
|
||||
/// compression parameter, default 100
|
||||
uint32_t delta;
|
||||
/// input buffer size, default 500
|
||||
uint32_t buffer_size;
|
||||
/// If true (the default), null values are ignored. Otherwise, if any value is null,
|
||||
/// emit null.
|
||||
bool skip_nulls;
|
||||
/// If less than this many non-null values are observed, emit null.
|
||||
uint32_t min_count;
|
||||
};
|
||||
|
||||
/// \brief Control Pivot kernel behavior
|
||||
///
|
||||
/// These options apply to the "pivot_wider" and "hash_pivot_wider" functions.
|
||||
///
|
||||
/// Constraints:
|
||||
/// - The corresponding `Aggregate::target` must have two FieldRef elements;
|
||||
/// the first one points to the pivot key column, the second points to the
|
||||
/// pivoted data column.
|
||||
/// - The pivot key column can be string, binary or integer; its values will be
|
||||
/// matched against `key_names` in order to dispatch the pivoted data into
|
||||
/// the output. If the pivot key column is not string-like, the `key_names`
|
||||
/// will be cast to the pivot key type.
|
||||
///
|
||||
/// "pivot_wider" example
|
||||
/// ---------------------
|
||||
///
|
||||
/// Assuming the following two input columns with types utf8 and int16 (respectively):
|
||||
/// ```
|
||||
/// width | 11
|
||||
/// height | 13
|
||||
/// ```
|
||||
/// and the options `PivotWiderOptions(.key_names = {"height", "width"})`
|
||||
///
|
||||
/// then the output will be a scalar with the type
|
||||
/// `struct{"height": int16, "width": int16}`
|
||||
/// and the value `{"height": 13, "width": 11}`.
|
||||
///
|
||||
/// "hash_pivot_wider" example
|
||||
/// --------------------------
|
||||
///
|
||||
/// Assuming the following input with schema
|
||||
/// `{"group": int32, "key": utf8, "value": int16}`:
|
||||
/// ```
|
||||
/// group | key | value
|
||||
/// -----------------------------
|
||||
/// 1 | height | 11
|
||||
/// 1 | width | 12
|
||||
/// 2 | width | 13
|
||||
/// 3 | height | 14
|
||||
/// 3 | depth | 15
|
||||
/// ```
|
||||
/// and the following settings:
|
||||
/// - a hash grouping key "group"
|
||||
/// - Aggregate(
|
||||
/// .function = "hash_pivot_wider",
|
||||
/// .options = PivotWiderOptions(.key_names = {"height", "width"}),
|
||||
/// .target = {"key", "value"},
|
||||
/// .name = {"properties"})
|
||||
///
|
||||
/// then the output will have the schema
|
||||
/// `{"group": int32, "properties": struct{"height": int16, "width": int16}}`
|
||||
/// and the following value:
|
||||
/// ```
|
||||
/// group | properties
|
||||
/// | height | width
|
||||
/// -----------------------------
|
||||
/// 1 | 11 | 12
|
||||
/// 2 | null | 13
|
||||
/// 3 | 14 | null
|
||||
/// ```
|
||||
class ARROW_EXPORT PivotWiderOptions : public FunctionOptions {
|
||||
public:
|
||||
/// Configure the behavior of pivot keys not in `key_names`
|
||||
enum UnexpectedKeyBehavior {
|
||||
/// Unexpected pivot keys are ignored silently
|
||||
kIgnore,
|
||||
/// Unexpected pivot keys return a KeyError
|
||||
kRaise
|
||||
};
|
||||
|
||||
explicit PivotWiderOptions(std::vector<std::string> key_names,
|
||||
UnexpectedKeyBehavior unexpected_key_behavior = kIgnore);
|
||||
// Default constructor for serialization
|
||||
PivotWiderOptions();
|
||||
static constexpr const char kTypeName[] = "PivotWiderOptions";
|
||||
static PivotWiderOptions Defaults() { return PivotWiderOptions{}; }
|
||||
|
||||
/// The values expected in the pivot key column
|
||||
std::vector<std::string> key_names;
|
||||
/// The behavior when pivot keys not in `key_names` are encountered
|
||||
UnexpectedKeyBehavior unexpected_key_behavior = kIgnore;
|
||||
};
|
||||
|
||||
/// \brief Control Index kernel behavior
|
||||
class ARROW_EXPORT IndexOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit IndexOptions(std::shared_ptr<Scalar> value);
|
||||
// Default constructor for serialization
|
||||
IndexOptions();
|
||||
static constexpr const char kTypeName[] = "IndexOptions";
|
||||
|
||||
std::shared_ptr<Scalar> value;
|
||||
};
|
||||
|
||||
/// \brief Configure a grouped aggregation
|
||||
struct ARROW_EXPORT Aggregate {
|
||||
Aggregate() = default;
|
||||
|
||||
Aggregate(std::string function, std::shared_ptr<FunctionOptions> options,
|
||||
std::vector<FieldRef> target, std::string name = "")
|
||||
: function(std::move(function)),
|
||||
options(std::move(options)),
|
||||
target(std::move(target)),
|
||||
name(std::move(name)) {}
|
||||
|
||||
Aggregate(std::string function, std::shared_ptr<FunctionOptions> options,
|
||||
FieldRef target, std::string name = "")
|
||||
: Aggregate(std::move(function), std::move(options),
|
||||
std::vector<FieldRef>{std::move(target)}, std::move(name)) {}
|
||||
|
||||
Aggregate(std::string function, FieldRef target, std::string name)
|
||||
: Aggregate(std::move(function), /*options=*/NULLPTR,
|
||||
std::vector<FieldRef>{std::move(target)}, std::move(name)) {}
|
||||
|
||||
Aggregate(std::string function, std::string name)
|
||||
: Aggregate(std::move(function), /*options=*/NULLPTR,
|
||||
/*target=*/std::vector<FieldRef>{}, std::move(name)) {}
|
||||
|
||||
/// the name of the aggregation function
|
||||
std::string function;
|
||||
|
||||
/// options for the aggregation function
|
||||
std::shared_ptr<FunctionOptions> options;
|
||||
|
||||
/// zero or more fields to which aggregations will be applied
|
||||
std::vector<FieldRef> target;
|
||||
|
||||
/// optional output field name for aggregations
|
||||
std::string name;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
/// \brief Count values in an array.
|
||||
///
|
||||
/// \param[in] options counting options, see CountOptions for more information
|
||||
/// \param[in] datum to count
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return out resulting datum
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Count(const Datum& datum,
|
||||
const CountOptions& options = CountOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Compute the mean of a numeric array.
|
||||
///
|
||||
/// \param[in] value datum to compute the mean, expecting Array
|
||||
/// \param[in] options see ScalarAggregateOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return datum of the computed mean as a DoubleScalar
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Mean(
|
||||
const Datum& value,
|
||||
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Compute the product of values of a numeric array.
|
||||
///
|
||||
/// \param[in] value datum to compute product of, expecting Array or ChunkedArray
|
||||
/// \param[in] options see ScalarAggregateOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return datum of the computed sum as a Scalar
|
||||
///
|
||||
/// \since 6.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Product(
|
||||
const Datum& value,
|
||||
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Sum values of a numeric array.
|
||||
///
|
||||
/// \param[in] value datum to sum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see ScalarAggregateOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return datum of the computed sum as a Scalar
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Sum(
|
||||
const Datum& value,
|
||||
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the first value of an array
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see ScalarAggregateOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return datum of the computed first as Scalar
|
||||
///
|
||||
/// \since 13.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> First(
|
||||
const Datum& value,
|
||||
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the last value of an array
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see ScalarAggregateOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return datum of the computed last as a Scalar
|
||||
///
|
||||
/// \since 13.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Last(
|
||||
const Datum& value,
|
||||
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the min / max of a numeric array
|
||||
///
|
||||
/// This function returns both the min and max as a struct scalar, with type
|
||||
/// struct<min: T, max: T>, where T is the input type
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see ScalarAggregateOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return resulting datum as a struct<min: T, max: T> scalar
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> MinMax(
|
||||
const Datum& value,
|
||||
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Test whether any element in a boolean array evaluates to true.
|
||||
///
|
||||
/// This function returns true if any of the elements in the array evaluates
|
||||
/// to true and false otherwise. Null values are ignored by default.
|
||||
/// If null values are taken into account by setting ScalarAggregateOptions
|
||||
/// parameter skip_nulls = false then Kleene logic is used.
|
||||
/// See KleeneOr for more details on Kleene logic.
|
||||
///
|
||||
/// \param[in] value input datum, expecting a boolean array
|
||||
/// \param[in] options see ScalarAggregateOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return resulting datum as a BooleanScalar
|
||||
///
|
||||
/// \since 3.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Any(
|
||||
const Datum& value,
|
||||
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Test whether all elements in a boolean array evaluate to true.
|
||||
///
|
||||
/// This function returns true if all of the elements in the array evaluate
|
||||
/// to true and false otherwise. Null values are ignored by default.
|
||||
/// If null values are taken into account by setting ScalarAggregateOptions
|
||||
/// parameter skip_nulls = false then Kleene logic is used.
|
||||
/// See KleeneAnd for more details on Kleene logic.
|
||||
///
|
||||
/// \param[in] value input datum, expecting a boolean array
|
||||
/// \param[in] options see ScalarAggregateOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return resulting datum as a BooleanScalar
|
||||
|
||||
/// \since 3.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> All(
|
||||
const Datum& value,
|
||||
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the modal (most common) value of a numeric array
|
||||
///
|
||||
/// This function returns top-n most common values and number of times they occur as
|
||||
/// an array of `struct<mode: T, count: int64>`, where T is the input type.
|
||||
/// Values with larger counts are returned before smaller ones.
|
||||
/// If there are more than one values with same count, smaller value is returned first.
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see ModeOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return resulting datum as an array of struct<mode: T, count: int64>
|
||||
///
|
||||
/// \since 2.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Mode(const Datum& value,
|
||||
const ModeOptions& options = ModeOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the standard deviation of a numeric array
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see VarianceOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return datum of the computed standard deviation as a DoubleScalar
|
||||
///
|
||||
/// \since 2.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Stddev(const Datum& value,
|
||||
const VarianceOptions& options = VarianceOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the variance of a numeric array
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see VarianceOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return datum of the computed variance as a DoubleScalar
|
||||
///
|
||||
/// \since 2.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Variance(const Datum& value,
|
||||
const VarianceOptions& options = VarianceOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the skewness of a numeric array
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see SkewOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return datum of the computed skewness as a DoubleScalar
|
||||
///
|
||||
/// \since 20.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Skew(const Datum& value,
|
||||
const SkewOptions& options = SkewOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the kurtosis of a numeric array
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see SkewOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return datum of the computed kurtosis as a DoubleScalar
|
||||
///
|
||||
/// \since 20.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Kurtosis(const Datum& value,
|
||||
const SkewOptions& options = SkewOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the quantiles of a numeric array
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see QuantileOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return resulting datum as an array
|
||||
///
|
||||
/// \since 4.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Quantile(const Datum& value,
|
||||
const QuantileOptions& options = QuantileOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the approximate quantiles of a numeric array with T-Digest algorithm
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see TDigestOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return resulting datum as an array
|
||||
///
|
||||
/// \since 4.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> TDigest(const Datum& value,
|
||||
const TDigestOptions& options = TDigestOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Find the first index of a value in an array.
|
||||
///
|
||||
/// \param[in] value The array to search.
|
||||
/// \param[in] options The array to search for. See IndexOptions.
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return out a Scalar containing the index (or -1 if not found).
|
||||
///
|
||||
/// \since 5.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Index(const Datum& value, const IndexOptions& options,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,834 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/compute/function_options.h"
|
||||
#include "arrow/compute/ordering.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace compute {
|
||||
|
||||
class ExecContext;
|
||||
|
||||
/// \addtogroup compute-concrete-options
|
||||
/// @{
|
||||
|
||||
/// \brief Options selecting how null slots of a selection mask are treated
class ARROW_EXPORT FilterOptions : public FunctionOptions {
 public:
  /// Action taken when a slot of the selection mask is null
  enum NullSelectionBehavior {
    /// Remove the corresponding filtered value from the output.
    DROP,
    /// Make the corresponding filtered value null in the output.
    EMIT_NULL,
  };

  explicit FilterOptions(NullSelectionBehavior null_selection = DROP);
  static constexpr const char kTypeName[] = "FilterOptions";
  static FilterOptions Defaults() { return FilterOptions(); }

  /// Selected null-handling behavior; DROP by default
  NullSelectionBehavior null_selection_behavior = DROP;
};
|
||||
|
||||
/// \brief Options carrying the `boundscheck` flag
class ARROW_EXPORT TakeOptions : public FunctionOptions {
 public:
  explicit TakeOptions(bool boundscheck = true);
  static constexpr const char kTypeName[] = "TakeOptions";
  /// Options with `boundscheck` set to true
  static TakeOptions BoundsCheck() { return TakeOptions(true); }
  /// Options with `boundscheck` set to false
  static TakeOptions NoBoundsCheck() { return TakeOptions(false); }
  /// Same as BoundsCheck()
  static TakeOptions Defaults() { return BoundsCheck(); }

  /// Bounds-checking flag; true by default
  bool boundscheck = true;
};
|
||||
|
||||
/// \brief Options for the dictionary encode function
|
||||
class ARROW_EXPORT DictionaryEncodeOptions : public FunctionOptions {
|
||||
public:
|
||||
/// Configure how null values will be encoded
|
||||
enum NullEncodingBehavior {
|
||||
/// The null value will be added to the dictionary with a proper index.
|
||||
ENCODE,
|
||||
/// The null value will be masked in the indices array.
|
||||
MASK
|
||||
};
|
||||
|
||||
explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK);
|
||||
static constexpr const char kTypeName[] = "DictionaryEncodeOptions";
|
||||
static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); }
|
||||
|
||||
NullEncodingBehavior null_encoding_behavior = MASK;
|
||||
};
|
||||
|
||||
/// \brief Options for the run-end encode function
|
||||
class ARROW_EXPORT RunEndEncodeOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit RunEndEncodeOptions(std::shared_ptr<DataType> run_end_type = int32());
|
||||
static constexpr const char kTypeName[] = "RunEndEncodeOptions";
|
||||
static RunEndEncodeOptions Defaults() { return RunEndEncodeOptions(); }
|
||||
|
||||
std::shared_ptr<DataType> run_end_type;
|
||||
};
|
||||
|
||||
/// \brief Sort order and null placement for sorting a single array
class ARROW_EXPORT ArraySortOptions : public FunctionOptions {
 public:
  explicit ArraySortOptions(SortOrder order = SortOrder::Ascending,
                            NullPlacement null_placement = NullPlacement::AtEnd);
  static constexpr const char kTypeName[] = "ArraySortOptions";
  static ArraySortOptions Defaults() { return ArraySortOptions(); }

  /// Order in which values are sorted
  SortOrder order;
  /// Placement of nulls and NaNs: at the start or at the end
  NullPlacement null_placement;
};
|
||||
|
||||
/// \brief Sort keys and null placement for sorting by one or more keys
class ARROW_EXPORT SortOptions : public FunctionOptions {
 public:
  explicit SortOptions(std::vector<SortKey> sort_keys = {},
                       NullPlacement null_placement = NullPlacement::AtEnd);
  explicit SortOptions(const Ordering& ordering);
  static constexpr const char kTypeName[] = "SortOptions";
  static SortOptions Defaults() { return SortOptions(); }
  /// Convenience conversion that creates an Ordering from these SortOptions
  ///
  /// Note: both classes carry exactly the same information. SortOptions should
  /// only be used in a "function options" context, whereas Ordering is used
  /// more generally.
  ///
  /// The rvalue overload moves `sort_keys` into the result; the const lvalue
  /// overload copies them.
  Ordering AsOrdering() && { return Ordering(std::move(sort_keys), null_placement); }
  Ordering AsOrdering() const& { return Ordering(sort_keys, null_placement); }

  /// Column key(s) to order by, and how to order by each of them
  std::vector<SortKey> sort_keys;
  /// Placement of nulls and NaNs: at the start or at the end
  NullPlacement null_placement;
};
|
||||
|
||||
/// \brief SelectK options
|
||||
class ARROW_EXPORT SelectKOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit SelectKOptions(int64_t k = -1, std::vector<SortKey> sort_keys = {});
|
||||
static constexpr const char kTypeName[] = "SelectKOptions";
|
||||
static SelectKOptions Defaults() { return SelectKOptions(); }
|
||||
|
||||
static SelectKOptions TopKDefault(int64_t k, std::vector<std::string> key_names = {}) {
|
||||
std::vector<SortKey> keys;
|
||||
for (const auto& name : key_names) {
|
||||
keys.emplace_back(SortKey(name, SortOrder::Descending));
|
||||
}
|
||||
if (key_names.empty()) {
|
||||
keys.emplace_back(SortKey("not-used", SortOrder::Descending));
|
||||
}
|
||||
return SelectKOptions{k, keys};
|
||||
}
|
||||
static SelectKOptions BottomKDefault(int64_t k,
|
||||
std::vector<std::string> key_names = {}) {
|
||||
std::vector<SortKey> keys;
|
||||
for (const auto& name : key_names) {
|
||||
keys.emplace_back(SortKey(name, SortOrder::Ascending));
|
||||
}
|
||||
if (key_names.empty()) {
|
||||
keys.emplace_back(SortKey("not-used", SortOrder::Ascending));
|
||||
}
|
||||
return SelectKOptions{k, keys};
|
||||
}
|
||||
|
||||
/// The number of `k` elements to keep.
|
||||
int64_t k;
|
||||
/// Column key(s) to order by and how to order by these sort keys.
|
||||
std::vector<SortKey> sort_keys;
|
||||
};
|
||||
|
||||
/// \brief Rank options
|
||||
class ARROW_EXPORT RankOptions : public FunctionOptions {
|
||||
public:
|
||||
/// Configure how ties between equal values are handled
|
||||
enum Tiebreaker {
|
||||
/// Ties get the smallest possible rank in sorted order.
|
||||
Min,
|
||||
/// Ties get the largest possible rank in sorted order.
|
||||
Max,
|
||||
/// Ranks are assigned in order of when ties appear in the input.
|
||||
/// This ensures the ranks are a stable permutation of the input.
|
||||
First,
|
||||
/// The ranks span a dense [1, M] interval where M is the number
|
||||
/// of distinct values in the input.
|
||||
Dense
|
||||
};
|
||||
|
||||
explicit RankOptions(std::vector<SortKey> sort_keys = {},
|
||||
NullPlacement null_placement = NullPlacement::AtEnd,
|
||||
Tiebreaker tiebreaker = RankOptions::First);
|
||||
/// Convenience constructor for array inputs
|
||||
explicit RankOptions(SortOrder order,
|
||||
NullPlacement null_placement = NullPlacement::AtEnd,
|
||||
Tiebreaker tiebreaker = RankOptions::First)
|
||||
: RankOptions({SortKey("", order)}, null_placement, tiebreaker) {}
|
||||
|
||||
static constexpr const char kTypeName[] = "RankOptions";
|
||||
static RankOptions Defaults() { return RankOptions(); }
|
||||
|
||||
/// Column key(s) to order by and how to order by these sort keys.
|
||||
std::vector<SortKey> sort_keys;
|
||||
/// Whether nulls and NaNs are placed at the start or at the end
|
||||
NullPlacement null_placement;
|
||||
/// Tiebreaker for dealing with equal values in ranks
|
||||
Tiebreaker tiebreaker;
|
||||
};
|
||||
|
||||
/// \brief Quantile rank options
|
||||
class ARROW_EXPORT RankQuantileOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit RankQuantileOptions(std::vector<SortKey> sort_keys = {},
|
||||
NullPlacement null_placement = NullPlacement::AtEnd);
|
||||
/// Convenience constructor for array inputs
|
||||
explicit RankQuantileOptions(SortOrder order,
|
||||
NullPlacement null_placement = NullPlacement::AtEnd)
|
||||
: RankQuantileOptions({SortKey("", order)}, null_placement) {}
|
||||
|
||||
static constexpr const char kTypeName[] = "RankQuantileOptions";
|
||||
static RankQuantileOptions Defaults() { return RankQuantileOptions(); }
|
||||
|
||||
/// Column key(s) to order by and how to order by these sort keys.
|
||||
std::vector<SortKey> sort_keys;
|
||||
/// Whether nulls and NaNs are placed at the start or at the end
|
||||
NullPlacement null_placement;
|
||||
};
|
||||
|
||||
/// \brief Partitioning options for NthToIndices
|
||||
class ARROW_EXPORT PartitionNthOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit PartitionNthOptions(int64_t pivot,
|
||||
NullPlacement null_placement = NullPlacement::AtEnd);
|
||||
PartitionNthOptions() : PartitionNthOptions(0) {}
|
||||
static constexpr const char kTypeName[] = "PartitionNthOptions";
|
||||
|
||||
/// The index into the equivalent sorted array of the partition pivot element.
|
||||
int64_t pivot;
|
||||
/// Whether nulls and NaNs are partitioned at the start or at the end
|
||||
NullPlacement null_placement;
|
||||
};
|
||||
|
||||
/// \brief Lower and upper quantile limits used for winsorizing
class ARROW_EXPORT WinsorizeOptions : public FunctionOptions {
 public:
  WinsorizeOptions(double lower_limit, double upper_limit);
  /// Default constructor: delegates with limits 0 and 1
  WinsorizeOptions() : WinsorizeOptions(0, 1) {}
  static constexpr const char kTypeName[] = "WinsorizeOptions";

  /// The quantile below which all values are replaced with the quantile's value.
  ///
  /// For example, if lower_limit = 0.05, then all values below the 5% percentile
  /// are replaced with the 5% percentile value.
  double lower_limit;

  /// The quantile above which all values are replaced with the quantile's value.
  ///
  /// For example, if upper_limit = 0.95, then all values above the 95% percentile
  /// are replaced with the 95% percentile value.
  double upper_limit;
};
|
||||
|
||||
/// \brief Options for cumulative functions
|
||||
/// \note Also aliased as CumulativeSumOptions for backward compatibility
|
||||
class ARROW_EXPORT CumulativeOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit CumulativeOptions(bool skip_nulls = false);
|
||||
explicit CumulativeOptions(double start, bool skip_nulls = false);
|
||||
explicit CumulativeOptions(std::shared_ptr<Scalar> start, bool skip_nulls = false);
|
||||
static constexpr const char kTypeName[] = "CumulativeOptions";
|
||||
static CumulativeOptions Defaults() { return CumulativeOptions(); }
|
||||
|
||||
/// Optional starting value for cumulative operation computation, default depends on the
|
||||
/// operation and input type.
|
||||
/// - sum: 0
|
||||
/// - prod: 1
|
||||
/// - min: maximum of the input type
|
||||
/// - max: minimum of the input type
|
||||
/// - mean: start is ignored because it has no meaning for mean
|
||||
std::optional<std::shared_ptr<Scalar>> start;
|
||||
|
||||
/// If true, nulls in the input are ignored and produce a corresponding null output.
|
||||
/// When false, the first null encountered is propagated through the remaining output.
|
||||
bool skip_nulls = false;
|
||||
};
|
||||
using CumulativeSumOptions = CumulativeOptions; // For backward compatibility
|
||||
|
||||
/// \brief Options for pairwise functions
|
||||
class ARROW_EXPORT PairwiseOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit PairwiseOptions(int64_t periods = 1);
|
||||
static constexpr const char kTypeName[] = "PairwiseOptions";
|
||||
static PairwiseOptions Defaults() { return PairwiseOptions(); }
|
||||
|
||||
/// Periods to shift for applying the binary operation, accepts negative values.
|
||||
int64_t periods = 1;
|
||||
};
|
||||
|
||||
/// \brief Options for list_flatten function
|
||||
class ARROW_EXPORT ListFlattenOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit ListFlattenOptions(bool recursive = false);
|
||||
static constexpr const char kTypeName[] = "ListFlattenOptions";
|
||||
static ListFlattenOptions Defaults() { return ListFlattenOptions(); }
|
||||
|
||||
/// \brief If true, the list is flattened recursively until a non-list
|
||||
/// array is formed.
|
||||
bool recursive = false;
|
||||
};
|
||||
|
||||
/// \brief Options for inverse_permutation function
///
/// Configures the largest accepted input index and the type of the output.
|
||||
class ARROW_EXPORT InversePermutationOptions : public FunctionOptions {
|
||||
public:
|
||||
/// \brief Construct options; `max_index` defaults to -1 and `output_type`
/// defaults to null (see the members below for what those defaults mean).
explicit InversePermutationOptions(int64_t max_index = -1,
|
||||
std::shared_ptr<DataType> output_type = NULLPTR);
|
||||
/// Type name string identifying this options class.
static constexpr const char kTypeName[] = "InversePermutationOptions";
|
||||
/// \brief Return options built from the constructor's default arguments.
static InversePermutationOptions Defaults() { return InversePermutationOptions(); }
|
||||
|
||||
/// \brief The max value in the input indices to allow. The length of the function's
|
||||
/// output will be this value plus 1. If negative, this value will be set to the length
|
||||
/// of the input indices minus 1 and the length of the function's output will be the
|
||||
/// length of the input indices.
|
||||
int64_t max_index = -1;
|
||||
/// \brief The type of the output inverse permutation. If null, the output will be of
|
||||
/// the same type as the input indices, otherwise it must be a signed integer type. An
|
||||
/// invalid error will be reported if this type is not able to store the length of the
|
||||
/// input indices.
|
||||
std::shared_ptr<DataType> output_type = NULLPTR;
|
||||
};
|
||||
|
||||
/// \brief Options for scatter function
///
/// Configures the largest accepted index, which also fixes the output length.
|
||||
class ARROW_EXPORT ScatterOptions : public FunctionOptions {
|
||||
public:
|
||||
/// \brief Construct options; the `max_index` argument defaults to -1.
explicit ScatterOptions(int64_t max_index = -1);
|
||||
/// Type name string identifying this options class.
static constexpr const char kTypeName[] = "ScatterOptions";
|
||||
/// \brief Return options built from the constructor's default arguments.
static ScatterOptions Defaults() { return ScatterOptions(); }
|
||||
|
||||
/// \brief The max value in the input indices to allow. The length of the function's
|
||||
/// output will be this value plus 1. If negative, this value will be set to the length
|
||||
/// of the input indices minus 1 and the length of the function's output will be the
|
||||
/// length of the input indices.
|
||||
int64_t max_index = -1;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
/// \brief Filter with a boolean selection filter
|
||||
///
|
||||
/// The output will be populated with values from the input at positions
|
||||
/// where the selection filter is not 0. Nulls in the filter will be handled
|
||||
/// based on options.null_selection_behavior.
|
||||
///
|
||||
/// For example given values = ["a", "b", "c", null, "e", "f"] and
|
||||
/// filter = [0, 1, 1, 0, null, 1], the output will be
|
||||
/// (null_selection_behavior == DROP) = ["b", "c", "f"]
|
||||
/// (null_selection_behavior == EMIT_NULL) = ["b", "c", null, "f"]
|
||||
///
|
||||
/// \param[in] values array to filter
|
||||
/// \param[in] filter indicates which values are selected (kept) in the output
|
||||
/// \param[in] options configures null_selection_behavior
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting datum
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Filter(const Datum& values, const Datum& filter,
|
||||
const FilterOptions& options = FilterOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
namespace internal {
|
||||
|
||||
// These internal functions are implemented in kernels/vector_selection.cc
|
||||
|
||||
/// \brief Return the number of selected indices in the boolean filter
|
||||
///
|
||||
/// \param filter a plain or run-end encoded boolean array with or without nulls
|
||||
/// \param null_selection how to handle nulls in the filter
|
||||
ARROW_EXPORT
|
||||
int64_t GetFilterOutputSize(const ArraySpan& filter,
|
||||
FilterOptions::NullSelectionBehavior null_selection);
|
||||
|
||||
/// \brief Compute uint64 selection indices for use with Take given a boolean
|
||||
/// filter
|
||||
///
|
||||
/// \param filter a plain or run-end encoded boolean array with or without nulls
|
||||
/// \param null_selection how to handle nulls in the filter
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<ArrayData>> GetTakeIndices(
|
||||
const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection,
|
||||
MemoryPool* memory_pool = default_memory_pool());
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \brief ReplaceWithMask replaces each value in the array corresponding
|
||||
/// to a true value in the mask with the next element from `replacements`.
|
||||
///
|
||||
/// \param[in] values Array input to replace
|
||||
/// \param[in] mask Array or Scalar of Boolean mask values
|
||||
/// \param[in] replacements The replacement values to draw from. There must
|
||||
/// be as many replacement values as true values in the mask.
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
///
|
||||
/// \return the resulting datum
|
||||
///
|
||||
/// \since 5.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> ReplaceWithMask(const Datum& values, const Datum& mask,
|
||||
const Datum& replacements, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief FillNullForward fill null values in forward direction
|
||||
///
|
||||
/// The output array will be of the same type as the input values
|
||||
/// array, with replaced null values in forward direction.
|
||||
///
|
||||
/// For example given values = ["a", "b", "c", null, null, "f"],
|
||||
/// the output will be = ["a", "b", "c", "c", "c", "f"]
|
||||
///
|
||||
/// \param[in] values datum whose null values are filled forward
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting datum
|
||||
ARROW_EXPORT
|
||||
Result<Datum> FillNullForward(const Datum& values, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief FillNullBackward fill null values in backward direction
|
||||
///
|
||||
/// The output array will be of the same type as the input values
|
||||
/// array, with replaced null values in backward direction.
|
||||
///
|
||||
/// For example given values = ["a", "b", "c", null, null, "f"],
|
||||
/// the output will be = ["a", "b", "c", "f", "f", "f"]
|
||||
///
|
||||
/// \param[in] values datum whose null values are filled backward
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting datum
|
||||
ARROW_EXPORT
|
||||
Result<Datum> FillNullBackward(const Datum& values, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Take from an array of values at indices in another array
|
||||
///
|
||||
/// The output array will be of the same type as the input values
|
||||
/// array, with elements taken from the values array at the given
|
||||
/// indices. If an index is null then the taken element will be null.
|
||||
///
|
||||
/// For example given values = ["a", "b", "c", null, "e", "f"] and
|
||||
/// indices = [2, 1, null, 3], the output will be
|
||||
/// = [values[2], values[1], null, values[3]]
|
||||
/// = ["c", "b", null, null]
|
||||
///
|
||||
/// \param[in] values datum from which to take
|
||||
/// \param[in] indices which values to take
|
||||
/// \param[in] options options
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting datum
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Take(const Datum& values, const Datum& indices,
|
||||
const TakeOptions& options = TakeOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Take with Array inputs and output
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> Take(const Array& values, const Array& indices,
|
||||
const TakeOptions& options = TakeOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Drop Null from an array of values
|
||||
///
|
||||
/// The output array will be of the same type as the input values
|
||||
/// array, with elements taken from the values array without nulls.
|
||||
///
|
||||
/// For example given values = ["a", "b", "c", null, "e", "f"],
|
||||
/// the output will be = ["a", "b", "c", "e", "f"]
|
||||
///
|
||||
/// \param[in] values datum from which to take
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting datum
|
||||
ARROW_EXPORT
|
||||
Result<Datum> DropNull(const Datum& values, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief DropNull with Array inputs and output
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> DropNull(const Array& values, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return indices that partition an array around n-th sorted element.
|
||||
///
|
||||
/// Find index of n-th(0 based) smallest value and perform indirect
|
||||
/// partition of an array around that element. Output indices[0 ~ n-1]
|
||||
/// holds values no greater than n-th element, and indices[n+1 ~ end]
|
||||
/// holds values no less than n-th element. Elements in each partition
|
||||
/// are not sorted. Nulls will be partitioned to the end of the output.
|
||||
/// Output is not guaranteed to be stable.
|
||||
///
|
||||
/// \param[in] values array to be partitioned
|
||||
/// \param[in] n pivot array around sorted n-th element
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return offsets indices that would partition an array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> NthToIndices(const Array& values, int64_t n,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return indices that partition an array around n-th sorted element.
|
||||
///
|
||||
/// This overload takes a PartitionNthOptions specifying the pivot index
|
||||
/// and the null handling.
|
||||
///
|
||||
/// \param[in] values array to be partitioned
|
||||
/// \param[in] options options including pivot index and null handling
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return offsets indices that would partition an array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> NthToIndices(const Array& values,
|
||||
const PartitionNthOptions& options,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return indices that would select the first `k` elements.
|
||||
///
|
||||
/// Perform an indirect sort of the datum, keeping only the first `k` elements. The output
|
||||
/// array will contain indices such that the item indicated by the k-th index will be in
|
||||
/// the position it would be if the datum were sorted by `options.sort_keys`. However,
|
||||
/// indices of null values will not be part of the output. The sort is not guaranteed to
|
||||
/// be stable.
|
||||
///
|
||||
/// \param[in] datum datum to select from
|
||||
/// \param[in] options options, including the sort keys
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return indices of the selected elements
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> SelectKUnstable(const Datum& datum,
|
||||
const SelectKOptions& options,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return the indices that would sort an array.
|
||||
///
|
||||
/// Perform an indirect sort of array. The output array will contain
|
||||
/// indices that would sort an array, which would be the same length
|
||||
/// as input. Nulls will be stably partitioned to the end of the output
|
||||
/// regardless of order.
|
||||
///
|
||||
/// For example given array = [null, 1, 3.3, null, 2, 5.3] and order
|
||||
/// = SortOrder::Descending, the output will be [5, 2, 4, 1, 0,
|
||||
/// 3].
|
||||
///
|
||||
/// \param[in] array array to sort
|
||||
/// \param[in] order ascending or descending
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return offsets indices that would sort an array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> SortIndices(const Array& array,
|
||||
SortOrder order = SortOrder::Ascending,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return the indices that would sort an array.
|
||||
///
|
||||
/// This overload takes an ArraySortOptions specifying the sort order
|
||||
/// and the null handling.
|
||||
///
|
||||
/// \param[in] array array to sort
|
||||
/// \param[in] options options including sort order and null handling
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return offsets indices that would sort an array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> SortIndices(const Array& array,
|
||||
const ArraySortOptions& options,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return the indices that would sort a chunked array.
|
||||
///
|
||||
/// Perform an indirect sort of chunked array. The output array will
|
||||
/// contain indices that would sort a chunked array, which would be
|
||||
/// the same length as input. Nulls will be stably partitioned to the
|
||||
/// end of the output regardless of order.
|
||||
///
|
||||
/// For example given chunked_array = [[null, 1], [3.3], [null, 2,
|
||||
/// 5.3]] and order = SortOrder::Descending, the output will be [5, 2,
|
||||
/// 4, 1, 0, 3].
|
||||
///
|
||||
/// \param[in] chunked_array chunked array to sort
|
||||
/// \param[in] order ascending or descending
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return offsets indices that would sort an array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
|
||||
SortOrder order = SortOrder::Ascending,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return the indices that would sort a chunked array.
|
||||
///
|
||||
/// This overload takes an ArraySortOptions specifying the sort order
|
||||
/// and the null handling.
|
||||
///
|
||||
/// \param[in] chunked_array chunked array to sort
|
||||
/// \param[in] options options including sort order and null handling
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return offsets indices that would sort an array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
|
||||
const ArraySortOptions& options,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return the indices that would sort an input in the
|
||||
/// specified order. Input is one of array, chunked array, record batch
|
||||
/// or table.
|
||||
///
|
||||
/// Perform an indirect sort of input. The output array will contain
|
||||
/// indices that would sort an input, which would be the same length
|
||||
/// as input. Nulls will be stably partitioned to the start or to the end
|
||||
/// of the output depending on SortOptions::null_placement.
|
||||
///
|
||||
/// For example given input (table) = {
|
||||
/// "column1": [[null, 1], [ 3, null, 2, 1]],
|
||||
/// "column2": [[ 5], [3, null, null, 5, 5]],
|
||||
/// } and options = {
|
||||
/// {"column1", SortOrder::Ascending},
|
||||
/// {"column2", SortOrder::Descending},
|
||||
/// }, the output will be [5, 1, 4, 2, 0, 3].
|
||||
///
|
||||
/// \param[in] datum array, chunked array, record batch or table to sort
|
||||
/// \param[in] options options
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return offsets indices that would sort a table
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> SortIndices(const Datum& datum, const SortOptions& options,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Compute unique elements from an array-like object
|
||||
///
|
||||
/// Note if a null occurs in the input it will NOT be included in the output.
|
||||
///
|
||||
/// \param[in] datum array-like input
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return result as Array
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> Unique(const Datum& datum, ExecContext* ctx = NULLPTR);
|
||||
|
||||
// Constants for accessing the output of ValueCounts
|
||||
ARROW_EXPORT extern const char kValuesFieldName[];
|
||||
ARROW_EXPORT extern const char kCountsFieldName[];
|
||||
ARROW_EXPORT extern const int32_t kValuesFieldIndex;
|
||||
ARROW_EXPORT extern const int32_t kCountsFieldIndex;
|
||||
|
||||
/// \brief Return counts of unique elements from an array-like object.
|
||||
///
|
||||
/// Note that the counts do not include counts for nulls in the array. These can be
|
||||
/// obtained separately from metadata.
|
||||
///
|
||||
/// For floating point arrays there is no attempt to normalize -0.0, 0.0 and NaN values
|
||||
/// which can lead to unexpected results if the input Array has these values.
|
||||
///
|
||||
/// \param[in] value array-like input
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return counts An array of <input type "Values", int64_t "Counts"> structs.
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<StructArray>> ValueCounts(const Datum& value,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Dictionary-encode values in an array-like object
|
||||
///
|
||||
/// Any nulls encountered in the dictionary will be handled according to the
|
||||
/// specified null encoding behavior.
|
||||
///
|
||||
/// For example, given values ["a", "b", null, "a", null] the output will be
|
||||
/// (null_encoding == ENCODE) Indices: [0, 1, 2, 0, 2] / Dict: ["a", "b", null]
|
||||
/// (null_encoding == MASK) Indices: [0, 1, null, 0, null] / Dict: ["a", "b"]
|
||||
///
|
||||
/// If the input is already dictionary encoded this function is a no-op unless
|
||||
/// it needs to modify the null_encoding (TODO)
|
||||
///
|
||||
/// \param[in] data array-like input
|
||||
/// \param[in] options configures null encoding behavior
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return result with same shape and type as input
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> DictionaryEncode(
|
||||
const Datum& data,
|
||||
const DictionaryEncodeOptions& options = DictionaryEncodeOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Run-end-encode values in an array-like object
|
||||
///
|
||||
/// The returned run-end encoded type uses the same value type of the input and
|
||||
/// run-end type defined in the options.
|
||||
///
|
||||
/// \param[in] value array-like input
|
||||
/// \param[in] options configures encoding behavior
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return result with same shape but run-end encoded
|
||||
///
|
||||
/// \since 12.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> RunEndEncode(
|
||||
const Datum& value,
|
||||
const RunEndEncodeOptions& options = RunEndEncodeOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Decode a Run-End Encoded array to a plain array
|
||||
///
|
||||
/// The output data type is the same as the values array type of run-end encoded
|
||||
/// input.
|
||||
///
|
||||
/// \param[in] value run-end-encoded input
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return plain array resulting from decoding the run-end encoded input
|
||||
///
|
||||
/// \since 12.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> RunEndDecode(const Datum& value, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Compute the cumulative sum of an array-like object
|
||||
///
|
||||
/// \param[in] values array-like input
|
||||
/// \param[in] options configures cumulative sum behavior
|
||||
/// \param[in] check_overflow whether to check for overflow, if true, return Invalid
|
||||
/// status on overflow, otherwise wrap around on overflow
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
ARROW_EXPORT
|
||||
Result<Datum> CumulativeSum(
|
||||
const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
|
||||
bool check_overflow = false, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Compute the cumulative product of an array-like object
|
||||
///
|
||||
/// \param[in] values array-like input
|
||||
/// \param[in] options configures cumulative prod behavior
|
||||
/// \param[in] check_overflow whether to check for overflow, if true, return Invalid
|
||||
/// status on overflow, otherwise wrap around on overflow
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
ARROW_EXPORT
|
||||
Result<Datum> CumulativeProd(
|
||||
const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
|
||||
bool check_overflow = false, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Compute the cumulative max of an array-like object
|
||||
///
|
||||
/// \param[in] values array-like input
|
||||
/// \param[in] options configures cumulative max behavior
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
ARROW_EXPORT
|
||||
Result<Datum> CumulativeMax(
|
||||
const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Compute the cumulative min of an array-like object
|
||||
///
|
||||
/// \param[in] values array-like input
|
||||
/// \param[in] options configures cumulative min behavior
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
ARROW_EXPORT
|
||||
Result<Datum> CumulativeMin(
|
||||
const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Compute the cumulative mean of an array-like object
|
||||
///
|
||||
/// \param[in] values array-like input
|
||||
/// \param[in] options configures cumulative mean behavior, `start` is ignored
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
ARROW_EXPORT
|
||||
Result<Datum> CumulativeMean(
|
||||
const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return the first order difference of an array.
|
||||
///
|
||||
/// Computes the first order difference of an array, i.e.
|
||||
/// output[i] = input[i] - input[i - p] if i >= p
|
||||
/// output[i] = null otherwise
|
||||
/// where p is the period. For example, with p = 1,
|
||||
/// Diff([1, 4, 9, 10, 15]) = [null, 3, 5, 1, 5].
|
||||
/// With p = 2,
|
||||
/// Diff([1, 4, 9, 10, 15]) = [null, null, 8, 6, 6]
|
||||
/// p can also be negative, in which case the diff is computed in
|
||||
/// the opposite direction.
|
||||
/// \param[in] array array input
|
||||
/// \param[in] options options, specifying the period (see PairwiseOptions::periods)
|
||||
/// \param[in] check_overflow whether to return error on overflow
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return result as array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> PairwiseDiff(const Array& array,
|
||||
const PairwiseOptions& options,
|
||||
bool check_overflow = false,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return the inverse permutation of the given indices.
|
||||
///
|
||||
/// For indices[i] = x, inverse_permutation[x] = i. And inverse_permutation[x] = null if x
|
||||
/// does not appear in the input indices. Indices must be in the range of [0, max_index],
|
||||
/// or null, which will be ignored. If multiple indices point to the same value, the last
|
||||
/// one is used.
|
||||
///
|
||||
/// For example, with
|
||||
/// indices = [null, 0, null, 2, 4, 1, 1]
|
||||
/// the inverse permutation is
|
||||
/// [1, 6, 3, null, 4, null, null]
|
||||
/// if max_index = 6.
|
||||
///
|
||||
/// \param[in] indices array-like indices
|
||||
/// \param[in] options configures the max index and the output type
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting inverse permutation
|
||||
///
|
||||
/// \since 20.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> InversePermutation(
|
||||
const Datum& indices,
|
||||
const InversePermutationOptions& options = InversePermutationOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Scatter the values into specified positions according to the indices.
|
||||
///
|
||||
/// For indices[i] = x, output[x] = values[i]. And output[x] = null if x does not appear
|
||||
/// in the input indices. Indices must be in the range of [0, max_index], or null, in
|
||||
/// which case the corresponding value will be ignored. If multiple indices point to the
|
||||
/// same value, the last one is used.
|
||||
///
|
||||
/// For example, with
|
||||
/// values = [a, b, c, d, e, f, g]
|
||||
/// indices = [null, 0, null, 2, 4, 1, 1]
|
||||
/// the output is
|
||||
/// [b, g, d, null, e, null, null]
|
||||
/// if max_index = 6.
|
||||
///
|
||||
/// \param[in] values datum to scatter
|
||||
/// \param[in] indices array-like indices
|
||||
/// \param[in] options configures the max index to scatter to
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting datum
|
||||
///
|
||||
/// \since 20.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Scatter(const Datum& values, const Datum& indices,
|
||||
const ScatterOptions& options = ScatterOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,134 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/function.h"
|
||||
#include "arrow/compute/function_options.h"
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Array;
|
||||
|
||||
namespace compute {
|
||||
|
||||
class ExecContext;
|
||||
|
||||
/// \addtogroup compute-concrete-options
|
||||
/// @{
|
||||
|
||||
/// \brief Options controlling the behavior of the Cast functions.
///
/// Holds the target type plus a set of `allow_*` flags. `Safe()` and `Unsafe()`
/// build instances through `CastOptions(true)` and `CastOptions(false)`.
class ARROW_EXPORT CastOptions : public FunctionOptions {
|
||||
public:
|
||||
/// \brief Construct options; the `safe` argument defaults to true.
/// NOTE(review): the constructor is defined out of line -- presumably it
/// derives the `allow_*` flags below from `safe`; confirm in the source file.
explicit CastOptions(bool safe = true);
|
||||
|
||||
/// Type name string identifying this options class.
static constexpr const char kTypeName[] = "CastOptions";
|
||||
/// \brief Return `CastOptions(true)` with `to_type` set to the given type.
static CastOptions Safe(TypeHolder to_type = {}) {
|
||||
CastOptions safe(true);
|
||||
safe.to_type = std::move(to_type);
|
||||
return safe;
|
||||
}
|
||||
|
||||
/// \brief Return `CastOptions(false)` with `to_type` set to the given type.
static CastOptions Unsafe(TypeHolder to_type = {}) {
|
||||
CastOptions unsafe(false);
|
||||
unsafe.to_type = std::move(to_type);
|
||||
return unsafe;
|
||||
}
|
||||
|
||||
// Type being cast to. May be passed separately to the eager function
|
||||
// compute::Cast
|
||||
TypeHolder to_type;
|
||||
|
||||
// Per-category safety switches. They carry no in-class initializers, so
// they are presumably set by the constructor (defined elsewhere) -- TODO confirm.
bool allow_int_overflow;
|
||||
bool allow_time_truncate;
|
||||
bool allow_time_overflow;
|
||||
bool allow_decimal_truncate;
|
||||
bool allow_float_truncate;
|
||||
// Indicate if conversions from Binary/FixedSizeBinary to string must
|
||||
// validate the utf8 payload.
|
||||
bool allow_invalid_utf8;
|
||||
|
||||
/// true if the safety options all match CastOptions::Safe
|
||||
///
|
||||
/// Note, if this returns false it does not mean is_unsafe will return true
|
||||
bool is_safe() const;
|
||||
/// true if the safety options all match CastOptions::Unsafe
|
||||
///
|
||||
/// Note, if this returns false it does not mean is_safe will return true
|
||||
bool is_unsafe() const;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
/// \brief Return true if a cast function is defined
|
||||
ARROW_EXPORT
|
||||
bool CanCast(const DataType& from_type, const DataType& to_type);
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Convenience invocation APIs for a number of kernels
|
||||
|
||||
/// \brief Cast from one array type to another
|
||||
/// \param[in] value array to cast
|
||||
/// \param[in] to_type type to cast to
|
||||
/// \param[in] options casting options
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting array
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> Cast(const Array& value, const TypeHolder& to_type,
|
||||
const CastOptions& options = CastOptions::Safe(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Cast a datum to the type given in the options
|
||||
/// \param[in] value datum to cast
|
||||
/// \param[in] options casting options. The "to_type" field must be populated
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting datum
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Cast(const Datum& value, const CastOptions& options,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Cast from one value to another
|
||||
/// \param[in] value datum to cast
|
||||
/// \param[in] to_type type to cast to
|
||||
/// \param[in] options casting options
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting datum
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Cast(const Datum& value, const TypeHolder& to_type,
|
||||
const CastOptions& options = CastOptions::Safe(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,489 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// NOTE: API is EXPERIMENTAL and will change without going through a
|
||||
// deprecation cycle
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/compute/expression.h"
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
#include "arrow/datum.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace compute {
|
||||
|
||||
// It seems like 64K might be a good default chunksize to use for execution
// based on the experience of other query processing systems. The current
// default is not to chunk contiguous arrays, though, but this may change in
// the future once parallel execution is implemented.
//
// Note that the value is UINT16_MAX (65535), i.e. one less than 64Ki.
// Declared `inline` so that every translation unit including this header
// shares a single definition instead of getting its own internal-linkage copy.
inline constexpr int64_t kDefaultExecChunksize = UINT16_MAX;
|
||||
|
||||
/// \brief Context for expression-global variables and options used by
|
||||
/// function evaluation
|
||||
class ARROW_EXPORT ExecContext {
|
||||
public:
|
||||
// If no function registry passed, the default is used.
|
||||
explicit ExecContext(MemoryPool* pool = default_memory_pool(),
|
||||
::arrow::internal::Executor* executor = NULLPTR,
|
||||
FunctionRegistry* func_registry = NULLPTR);
|
||||
|
||||
/// \brief The MemoryPool used for allocations, default is
|
||||
/// default_memory_pool().
|
||||
MemoryPool* memory_pool() const { return pool_; }
|
||||
|
||||
const ::arrow::internal::CpuInfo* cpu_info() const;
|
||||
|
||||
/// \brief An Executor which may be used to parallelize execution.
|
||||
::arrow::internal::Executor* executor() const { return executor_; }
|
||||
|
||||
/// \brief The FunctionRegistry for looking up functions by name and
|
||||
/// selecting kernels for execution. Defaults to the library-global function
|
||||
/// registry provided by GetFunctionRegistry.
|
||||
FunctionRegistry* func_registry() const { return func_registry_; }
|
||||
|
||||
// \brief Set maximum length unit of work for kernel execution. Larger
|
||||
// contiguous array inputs will be split into smaller chunks, and, if
|
||||
// possible and enabled, processed in parallel. The default chunksize is
|
||||
// INT64_MAX, so contiguous arrays are not split.
|
||||
void set_exec_chunksize(int64_t chunksize) { exec_chunksize_ = chunksize; }
|
||||
|
||||
// \brief Maximum length for ExecBatch data chunks processed by
|
||||
// kernels. Contiguous array inputs with longer length will be split into
|
||||
// smaller chunks.
|
||||
int64_t exec_chunksize() const { return exec_chunksize_; }
|
||||
|
||||
/// \brief Set whether to use multiple threads for function execution. This
|
||||
/// is not yet used.
|
||||
void set_use_threads(bool use_threads = true) { use_threads_ = use_threads; }
|
||||
|
||||
/// \brief If true, then utilize multiple threads where relevant for function
|
||||
/// execution. This is not yet used.
|
||||
bool use_threads() const { return use_threads_; }
|
||||
|
||||
// Set the preallocation strategy for kernel execution as it relates to
|
||||
// chunked execution. For chunked execution, whether via ChunkedArray inputs
|
||||
// or splitting larger Array arguments into smaller pieces, contiguous
|
||||
// allocation (if permitted by the kernel) will allocate one large array to
|
||||
// write output into yielding it to the caller at the end. If this option is
|
||||
// set to off, then preallocations will be performed independently for each
|
||||
// chunk of execution
|
||||
//
|
||||
// TODO: At some point we might want the limit the size of contiguous
|
||||
// preallocations. For example, even if the exec_chunksize is 64K or less, we
|
||||
// might limit contiguous allocations to 1M records, say.
|
||||
void set_preallocate_contiguous(bool preallocate) {
|
||||
preallocate_contiguous_ = preallocate;
|
||||
}
|
||||
|
||||
/// \brief If contiguous preallocations should be used when doing chunked
|
||||
/// execution as specified by exec_chunksize(). See
|
||||
/// set_preallocate_contiguous() for more information.
|
||||
bool preallocate_contiguous() const { return preallocate_contiguous_; }
|
||||
|
||||
private:
|
||||
MemoryPool* pool_;
|
||||
::arrow::internal::Executor* executor_;
|
||||
FunctionRegistry* func_registry_;
|
||||
int64_t exec_chunksize_ = std::numeric_limits<int64_t>::max();
|
||||
bool preallocate_contiguous_ = true;
|
||||
bool use_threads_ = true;
|
||||
};
|
||||
|
||||
// TODO: Consider standardizing on uint16 selection vectors and only use them
|
||||
// when we can ensure that each value is 64K length or smaller
|
||||
|
||||
/// \brief Container for an array of value selection indices that were
|
||||
/// materialized from a filter.
|
||||
///
|
||||
/// Columnar query engines (see e.g. [1]) have found that rather than
|
||||
/// materializing filtered data, the filter can instead be converted to an
|
||||
/// array of the "on" indices and then "fusing" these indices in operator
|
||||
/// implementations. This is especially relevant for aggregations but also
|
||||
/// applies to scalar operations.
|
||||
///
|
||||
/// We are not yet using this so this is mostly a placeholder for now.
|
||||
///
|
||||
/// [1]: http://cidrdb.org/cidr2005/papers/P19.pdf
|
||||
class ARROW_EXPORT SelectionVector {
|
||||
public:
|
||||
explicit SelectionVector(std::shared_ptr<ArrayData> data);
|
||||
|
||||
explicit SelectionVector(const Array& arr);
|
||||
|
||||
/// \brief Create SelectionVector from boolean mask
|
||||
static Result<std::shared_ptr<SelectionVector>> FromMask(const BooleanArray& arr);
|
||||
|
||||
const int32_t* indices() const { return indices_; }
|
||||
int32_t length() const;
|
||||
|
||||
private:
|
||||
std::shared_ptr<ArrayData> data_;
|
||||
const int32_t* indices_;
|
||||
};
|
||||
|
||||
/// An index to represent that a batch does not belong to an ordered stream
|
||||
constexpr int64_t kUnsequencedIndex{-1};
|
||||
|
||||
/// \brief A unit of work for kernel execution. It contains a collection of
|
||||
/// Array and Scalar values and an optional SelectionVector indicating that
|
||||
/// there is an unmaterialized filter that either must be materialized, or (if
|
||||
/// the kernel supports it) pushed down into the kernel implementation.
|
||||
///
|
||||
/// ExecBatch is semantically similar to RecordBatch in that in a SQL context
|
||||
/// it represents a collection of records, but constant "columns" are
|
||||
/// represented by Scalar values rather than having to be converted into arrays
|
||||
/// with repeated values.
|
||||
///
|
||||
/// TODO: Datum uses arrow/util/variant.h which may be a bit heavier-weight
|
||||
/// than is desirable for this class. Microbenchmarks would help determine for
|
||||
/// sure. See ARROW-8928.
|
||||
|
||||
/// \addtogroup acero-internals
|
||||
/// @{
|
||||
|
||||
struct ARROW_EXPORT ExecBatch {
|
||||
ExecBatch() = default;
|
||||
ExecBatch(std::vector<Datum> values, int64_t length)
|
||||
: values(std::move(values)), length(length) {}
|
||||
|
||||
explicit ExecBatch(const RecordBatch& batch);
|
||||
|
||||
/// \brief Infer the ExecBatch length from values.
|
||||
static Result<int64_t> InferLength(const std::vector<Datum>& values);
|
||||
|
||||
/// Creates an ExecBatch with length-validation.
|
||||
///
|
||||
/// If any value is given, then all values must have a common length. If the given
|
||||
/// length is negative, then the length of the ExecBatch is set to this common length,
|
||||
/// or to 1 if no values are given. Otherwise, the given length must equal the common
|
||||
/// length, if any value is given.
|
||||
static Result<ExecBatch> Make(std::vector<Datum> values, int64_t length = -1);
|
||||
|
||||
Result<std::shared_ptr<RecordBatch>> ToRecordBatch(
|
||||
std::shared_ptr<Schema> schema, MemoryPool* pool = default_memory_pool()) const;
|
||||
|
||||
/// The values representing positional arguments to be passed to a kernel's
|
||||
/// exec function for processing.
|
||||
std::vector<Datum> values;
|
||||
|
||||
/// A deferred filter represented as an array of indices into the values.
|
||||
///
|
||||
/// For example, the filter [true, true, false, true] would be represented as
|
||||
/// the selection vector [0, 1, 3]. When the selection vector is set,
|
||||
/// ExecBatch::length is equal to the length of this array.
|
||||
std::shared_ptr<SelectionVector> selection_vector;
|
||||
|
||||
/// A predicate Expression guaranteed to evaluate to true for all rows in this batch.
|
||||
Expression guarantee = literal(true);
|
||||
|
||||
/// The semantic length of the ExecBatch. When the values are all scalars,
|
||||
/// the length should be set to 1 for non-aggregate kernels, otherwise the
|
||||
/// length is taken from the array values, except when there is a selection
|
||||
/// vector. When there is a selection vector set, the length of the batch is
|
||||
/// the length of the selection. Aggregate kernels can have an ExecBatch
|
||||
/// formed by projecting just the partition columns from a batch in which
|
||||
/// case, it would have scalar rows with length greater than 1.
|
||||
///
|
||||
/// If the array values are of length 0 then the length is 0 regardless of
|
||||
/// whether any values are Scalar.
|
||||
int64_t length = 0;
|
||||
|
||||
/// \brief index of this batch in a sorted stream of batches
|
||||
///
|
||||
/// This index must be strictly monotonic starting at 0 without gaps or
|
||||
/// it can be set to kUnsequencedIndex if there is no meaningful order
|
||||
int64_t index = kUnsequencedIndex;
|
||||
|
||||
/// \brief The sum of bytes in each buffer referenced by the batch
|
||||
///
|
||||
/// Note: Scalars are not counted
|
||||
/// Note: Some values may referenced only part of a buffer, for
|
||||
/// example, an array with an offset. The actual data
|
||||
/// visible to this batch will be smaller than the total
|
||||
/// buffer size in this case.
|
||||
int64_t TotalBufferSize() const;
|
||||
|
||||
/// \brief Return the value at the i-th index
|
||||
template <typename index_type>
|
||||
inline const Datum& operator[](index_type i) const {
|
||||
return values[i];
|
||||
}
|
||||
|
||||
bool Equals(const ExecBatch& other) const;
|
||||
|
||||
/// \brief A convenience for the number of values / arguments.
|
||||
int num_values() const { return static_cast<int>(values.size()); }
|
||||
|
||||
ExecBatch Slice(int64_t offset, int64_t length) const;
|
||||
|
||||
Result<ExecBatch> SelectValues(const std::vector<int>& ids) const;
|
||||
|
||||
/// \brief A convenience for returning the types from the batch.
|
||||
std::vector<TypeHolder> GetTypes() const {
|
||||
std::vector<TypeHolder> result;
|
||||
for (const auto& value : this->values) {
|
||||
result.emplace_back(value.type());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string ToString() const;
|
||||
};
|
||||
|
||||
inline bool operator==(const ExecBatch& l, const ExecBatch& r) { return l.Equals(r); }
|
||||
inline bool operator!=(const ExecBatch& l, const ExecBatch& r) { return !l.Equals(r); }
|
||||
|
||||
ARROW_EXPORT void PrintTo(const ExecBatch&, std::ostream*);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \defgroup compute-internals Utilities for calling functions, useful for those
|
||||
/// extending the function registry
|
||||
///
|
||||
/// @{
|
||||
|
||||
struct ExecValue {
|
||||
ArraySpan array = {};
|
||||
const Scalar* scalar = NULLPTR;
|
||||
|
||||
ExecValue(const Scalar* scalar) // NOLINT implicit conversion
|
||||
: scalar(scalar) {}
|
||||
|
||||
ExecValue(ArraySpan array) // NOLINT implicit conversion
|
||||
: array(std::move(array)) {}
|
||||
|
||||
ExecValue(const ArrayData& array) { // NOLINT implicit conversion
|
||||
this->array.SetMembers(array);
|
||||
}
|
||||
|
||||
ExecValue() = default;
|
||||
ExecValue(const ExecValue& other) = default;
|
||||
ExecValue& operator=(const ExecValue& other) = default;
|
||||
ExecValue(ExecValue&& other) = default;
|
||||
ExecValue& operator=(ExecValue&& other) = default;
|
||||
|
||||
int64_t length() const { return this->is_array() ? this->array.length : 1; }
|
||||
|
||||
bool is_array() const { return this->scalar == NULLPTR; }
|
||||
bool is_scalar() const { return !this->is_array(); }
|
||||
|
||||
void SetArray(const ArrayData& array) {
|
||||
this->array.SetMembers(array);
|
||||
this->scalar = NULLPTR;
|
||||
}
|
||||
|
||||
void SetScalar(const Scalar* scalar) { this->scalar = scalar; }
|
||||
|
||||
template <typename ExactType>
|
||||
const ExactType& scalar_as() const {
|
||||
return ::arrow::internal::checked_cast<const ExactType&>(*this->scalar);
|
||||
}
|
||||
|
||||
/// XXX: here temporarily for compatibility with datum, see
|
||||
/// e.g. MakeStructExec in scalar_nested.cc
|
||||
int64_t null_count() const {
|
||||
if (this->is_array()) {
|
||||
return this->array.GetNullCount();
|
||||
} else {
|
||||
return this->scalar->is_valid ? 0 : 1;
|
||||
}
|
||||
}
|
||||
|
||||
const DataType* type() const {
|
||||
if (this->is_array()) {
|
||||
return array.type;
|
||||
} else {
|
||||
return scalar->type.get();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct ARROW_EXPORT ExecResult {
|
||||
// The default value of the variant is ArraySpan
|
||||
std::variant<ArraySpan, std::shared_ptr<ArrayData>> value;
|
||||
|
||||
int64_t length() const {
|
||||
if (this->is_array_span()) {
|
||||
return this->array_span()->length;
|
||||
} else {
|
||||
return this->array_data()->length;
|
||||
}
|
||||
}
|
||||
|
||||
const DataType* type() const {
|
||||
if (this->is_array_span()) {
|
||||
return this->array_span()->type;
|
||||
} else {
|
||||
return this->array_data()->type.get();
|
||||
}
|
||||
}
|
||||
|
||||
const ArraySpan* array_span() const { return &std::get<ArraySpan>(this->value); }
|
||||
ArraySpan* array_span_mutable() { return &std::get<ArraySpan>(this->value); }
|
||||
|
||||
bool is_array_span() const { return this->value.index() == 0; }
|
||||
|
||||
const std::shared_ptr<ArrayData>& array_data() const {
|
||||
return std::get<std::shared_ptr<ArrayData>>(this->value);
|
||||
}
|
||||
ArrayData* array_data_mutable() {
|
||||
return std::get<std::shared_ptr<ArrayData>>(this->value).get();
|
||||
}
|
||||
|
||||
bool is_array_data() const { return this->value.index() == 1; }
|
||||
};
|
||||
|
||||
/// \brief A "lightweight" column batch object which contains no
|
||||
/// std::shared_ptr objects and does not have any memory ownership
|
||||
/// semantics. Can represent a view onto an "owning" ExecBatch.
|
||||
struct ARROW_EXPORT ExecSpan {
|
||||
ExecSpan() = default;
|
||||
ExecSpan(const ExecSpan& other) = default;
|
||||
ExecSpan& operator=(const ExecSpan& other) = default;
|
||||
ExecSpan(ExecSpan&& other) = default;
|
||||
ExecSpan& operator=(ExecSpan&& other) = default;
|
||||
|
||||
explicit ExecSpan(std::vector<ExecValue> values, int64_t length)
|
||||
: length(length), values(std::move(values)) {}
|
||||
|
||||
explicit ExecSpan(const ExecBatch& batch) {
|
||||
this->length = batch.length;
|
||||
this->values.resize(batch.values.size());
|
||||
for (size_t i = 0; i < batch.values.size(); ++i) {
|
||||
const Datum& in_value = batch[i];
|
||||
ExecValue* out_value = &this->values[i];
|
||||
if (in_value.is_array()) {
|
||||
out_value->SetArray(*in_value.array());
|
||||
} else {
|
||||
out_value->SetScalar(in_value.scalar().get());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Return the value at the i-th index
|
||||
template <typename index_type>
|
||||
inline const ExecValue& operator[](index_type i) const {
|
||||
return values[i];
|
||||
}
|
||||
|
||||
/// \brief A convenience for the number of values / arguments.
|
||||
int num_values() const { return static_cast<int>(values.size()); }
|
||||
|
||||
std::vector<TypeHolder> GetTypes() const {
|
||||
std::vector<TypeHolder> result;
|
||||
for (const auto& value : this->values) {
|
||||
result.emplace_back(value.type());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
ExecBatch ToExecBatch() const {
|
||||
ExecBatch result;
|
||||
result.length = this->length;
|
||||
for (const ExecValue& value : this->values) {
|
||||
if (value.is_array()) {
|
||||
result.values.push_back(value.array.ToArrayData());
|
||||
} else {
|
||||
result.values.push_back(value.scalar->GetSharedPtr());
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
int64_t length = 0;
|
||||
std::vector<ExecValue> values;
|
||||
};
|
||||
|
||||
/// \defgroup compute-call-function One-shot calls to compute functions
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief One-shot invoker for all types of functions.
|
||||
///
|
||||
/// Does kernel dispatch, argument checking, iteration of ChunkedArray inputs,
|
||||
/// and wrapping of outputs.
|
||||
ARROW_EXPORT
|
||||
Result<Datum> CallFunction(const std::string& func_name, const std::vector<Datum>& args,
|
||||
const FunctionOptions* options, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Variant of CallFunction which uses a function's default options.
|
||||
///
|
||||
/// NB: Some functions require FunctionOptions be provided.
|
||||
ARROW_EXPORT
|
||||
Result<Datum> CallFunction(const std::string& func_name, const std::vector<Datum>& args,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief One-shot invoker for all types of functions.
|
||||
///
|
||||
/// Does kernel dispatch, argument checking, iteration of ChunkedArray inputs,
|
||||
/// and wrapping of outputs.
|
||||
ARROW_EXPORT
|
||||
Result<Datum> CallFunction(const std::string& func_name, const ExecBatch& batch,
|
||||
const FunctionOptions* options, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Variant of CallFunction which uses a function's default options.
|
||||
///
|
||||
/// NB: Some functions require FunctionOptions be provided.
|
||||
ARROW_EXPORT
|
||||
Result<Datum> CallFunction(const std::string& func_name, const ExecBatch& batch,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \defgroup compute-function-executor One-shot calls to obtain function executors
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief One-shot executor provider for all types of functions.
|
||||
///
|
||||
/// This function creates and initializes a `FunctionExecutor` appropriate
|
||||
/// for the given function name, input types and function options.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<FunctionExecutor>> GetFunctionExecutor(
|
||||
const std::string& func_name, std::vector<TypeHolder> in_types,
|
||||
const FunctionOptions* options = NULLPTR, FunctionRegistry* func_registry = NULLPTR);
|
||||
|
||||
/// \brief One-shot executor provider for all types of functions.
|
||||
///
|
||||
/// This function creates and initializes a `FunctionExecutor` appropriate
|
||||
/// for the given function name, input types (taken from the Datum arguments)
|
||||
/// and function options.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<FunctionExecutor>> GetFunctionExecutor(
|
||||
const std::string& func_name, const std::vector<Datum>& args,
|
||||
const FunctionOptions* options = NULLPTR, FunctionRegistry* func_registry = NULLPTR);
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,295 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <variant>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
#include "arrow/datum.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/small_vector.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace compute {
|
||||
|
||||
/// \defgroup expression-core Expressions to describe data transformations
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// An unbound expression which maps a single Datum to another Datum.
|
||||
/// An expression is one of
|
||||
/// - A literal Datum.
|
||||
/// - A reference to a single (potentially nested) field of the input Datum.
|
||||
/// - A call to a compute function, with arguments specified by other Expressions.
|
||||
class ARROW_EXPORT Expression {
|
||||
public:
|
||||
struct Call {
|
||||
std::string function_name;
|
||||
std::vector<Expression> arguments;
|
||||
std::shared_ptr<FunctionOptions> options;
|
||||
// Cached hash value
|
||||
size_t hash;
|
||||
|
||||
// post-Bind properties:
|
||||
std::shared_ptr<Function> function;
|
||||
const Kernel* kernel = NULLPTR;
|
||||
std::shared_ptr<KernelState> kernel_state;
|
||||
TypeHolder type;
|
||||
|
||||
void ComputeHash();
|
||||
};
|
||||
|
||||
std::string ToString() const;
|
||||
bool Equals(const Expression& other) const;
|
||||
size_t hash() const;
|
||||
struct Hash {
|
||||
size_t operator()(const Expression& expr) const { return expr.hash(); }
|
||||
};
|
||||
|
||||
/// Bind this expression to the given input type, looking up Kernels and field types.
|
||||
/// Some expression simplification may be performed and implicit casts will be inserted.
|
||||
/// Any state necessary for execution will be initialized and returned.
|
||||
Result<Expression> Bind(const TypeHolder& in, ExecContext* = NULLPTR) const;
|
||||
Result<Expression> Bind(const Schema& in_schema, ExecContext* = NULLPTR) const;
|
||||
|
||||
// XXX someday
|
||||
// Clone all KernelState in this bound expression. If any function referenced by this
|
||||
// expression has mutable KernelState, it is not safe to execute or apply simplification
|
||||
// passes to it (or copies of it!) from multiple threads. Cloning state produces new
|
||||
// KernelStates where necessary to ensure that Expressions may be manipulated safely
|
||||
// on multiple threads.
|
||||
// Result<ExpressionState> CloneState() const;
|
||||
// Status SetState(ExpressionState);
|
||||
|
||||
/// Return true if all an expression's field references have explicit types
|
||||
/// and all of its functions' kernels are looked up.
|
||||
bool IsBound() const;
|
||||
|
||||
/// Return true if this expression is composed only of Scalar literals, field
|
||||
/// references, and calls to ScalarFunctions.
|
||||
bool IsScalarExpression() const;
|
||||
|
||||
/// Return true if this expression is literal and entirely null.
|
||||
bool IsNullLiteral() const;
|
||||
|
||||
/// Return true if this expression could evaluate to true. Will return true for any
|
||||
/// unbound or non-boolean Expressions. IsSatisfiable does not (currently) do any
|
||||
/// canonicalization or simplification of the expression, so even Expressions
|
||||
/// which are unsatisfiable may spuriously return `true` here. This function is
|
||||
/// intended for use in predicate pushdown where a filter expression is simplified
|
||||
/// by a guarantee, so it assumes that trying to simplify again would be redundant.
|
||||
bool IsSatisfiable() const;
|
||||
|
||||
// XXX someday
|
||||
// Result<PipelineGraph> GetPipelines();
|
||||
|
||||
bool is_valid() const { return impl_ != NULLPTR; }
|
||||
|
||||
/// Access a Call or return nullptr if this expression is not a call
|
||||
const Call* call() const;
|
||||
/// Access a Datum or return nullptr if this expression is not a literal
|
||||
const Datum* literal() const;
|
||||
/// Access a FieldRef or return nullptr if this expression is not a field_ref
|
||||
const FieldRef* field_ref() const;
|
||||
|
||||
/// The type to which this expression will evaluate
|
||||
const DataType* type() const;
|
||||
// XXX someday
|
||||
// NullGeneralization::type nullable() const;
|
||||
|
||||
struct Parameter {
|
||||
FieldRef ref;
|
||||
|
||||
// post-bind properties
|
||||
TypeHolder type;
|
||||
::arrow::internal::SmallVector<int, 2> indices;
|
||||
};
|
||||
const Parameter* parameter() const;
|
||||
|
||||
Expression() = default;
|
||||
explicit Expression(Call call);
|
||||
explicit Expression(Datum literal);
|
||||
explicit Expression(Parameter parameter);
|
||||
|
||||
static bool Identical(const Expression& l, const Expression& r);
|
||||
|
||||
private:
|
||||
using Impl = std::variant<Datum, Parameter, Call>;
|
||||
std::shared_ptr<Impl> impl_;
|
||||
};
|
||||
|
||||
inline bool operator==(const Expression& l, const Expression& r) { return l.Equals(r); }
|
||||
inline bool operator!=(const Expression& l, const Expression& r) { return !l.Equals(r); }
|
||||
|
||||
ARROW_EXPORT void PrintTo(const Expression&, std::ostream*);
|
||||
|
||||
// Factories
|
||||
|
||||
ARROW_EXPORT
|
||||
Expression literal(Datum lit);
|
||||
|
||||
template <typename Arg>
|
||||
Expression literal(Arg&& arg) {
|
||||
return literal(Datum(std::forward<Arg>(arg)));
|
||||
}
|
||||
|
||||
ARROW_EXPORT
|
||||
Expression field_ref(FieldRef ref);
|
||||
|
||||
ARROW_EXPORT
|
||||
Expression call(std::string function, std::vector<Expression> arguments,
|
||||
std::shared_ptr<FunctionOptions> options = NULLPTR);
|
||||
|
||||
template <typename Options, typename = typename std::enable_if<
|
||||
std::is_base_of<FunctionOptions, Options>::value>::type>
|
||||
Expression call(std::string function, std::vector<Expression> arguments,
|
||||
Options options) {
|
||||
return call(std::move(function), std::move(arguments),
|
||||
std::make_shared<Options>(std::move(options)));
|
||||
}
|
||||
|
||||
/// Assemble a list of all fields referenced by an Expression at any depth.
|
||||
ARROW_EXPORT
|
||||
std::vector<FieldRef> FieldsInExpression(const Expression&);
|
||||
|
||||
/// Check if the expression references any fields.
|
||||
ARROW_EXPORT
|
||||
bool ExpressionHasFieldRefs(const Expression&);
|
||||
|
||||
struct ARROW_EXPORT KnownFieldValues;
|
||||
|
||||
/// Assemble a mapping from field references to known values. This derives known values
|
||||
/// from "equal" and "is_null" Expressions referencing a field and a literal.
|
||||
ARROW_EXPORT
|
||||
Result<KnownFieldValues> ExtractKnownFieldValues(
|
||||
const Expression& guaranteed_true_predicate);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \defgroup expression-passes Functions for modification of Expressions
|
||||
///
|
||||
/// @{
|
||||
///
|
||||
/// These transform bound expressions. Some transforms utilize a guarantee, which is
|
||||
/// provided as an Expression which is guaranteed to evaluate to true. The
|
||||
/// guaranteed_true_predicate need not be bound, but canonicalization is currently
|
||||
/// deferred to producers of guarantees. For example in order to be recognized as a
|
||||
/// guarantee on a field value, an Expression must be a call to "equal" with field_ref LHS
|
||||
/// and literal RHS. Flipping the arguments, "is_in" with a one-long value_set, ... or
|
||||
/// other semantically identical Expressions will not be recognized.
|
||||
|
||||
/// Weak canonicalization which establishes guarantees for subsequent passes. Even
|
||||
/// equivalent Expressions may result in different canonicalized expressions.
|
||||
/// TODO this could be a strong canonicalization
|
||||
ARROW_EXPORT
|
||||
Result<Expression> Canonicalize(Expression, ExecContext* = NULLPTR);
|
||||
|
||||
/// Simplify Expressions based on literal arguments (for example, add(null, x) will always
|
||||
/// be null so replace the call with a null literal). Includes early evaluation of all
|
||||
/// calls whose arguments are entirely literal.
|
||||
ARROW_EXPORT
|
||||
Result<Expression> FoldConstants(Expression);
|
||||
|
||||
/// Simplify Expressions by replacing the fields they reference with known values.
|
||||
ARROW_EXPORT
|
||||
Result<Expression> ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values,
|
||||
Expression);
|
||||
|
||||
/// Simplify an expression by replacing subexpressions based on a guarantee:
|
||||
/// a boolean expression which is guaranteed to evaluate to `true`. For example, this is
|
||||
/// used to remove redundant function calls from a filter expression or to replace a
|
||||
/// reference to a constant-value field with a literal.
|
||||
ARROW_EXPORT
|
||||
Result<Expression> SimplifyWithGuarantee(Expression,
|
||||
const Expression& guaranteed_true_predicate);
|
||||
|
||||
/// Replace all named field refs (e.g. "x" or "x.y") with field paths (e.g. [0] or [1,3])
|
||||
///
|
||||
/// This isn't usually needed and does not offer any simplification by itself. However,
|
||||
/// it can be useful to normalize an expression to paths to make it simpler to work with.
|
||||
ARROW_EXPORT Result<Expression> RemoveNamedRefs(Expression expression);
|
||||
|
||||
/// @}
|
||||
|
||||
// Execution
|
||||
|
||||
/// Create an ExecBatch suitable for passing to ExecuteScalarExpression() from a
|
||||
/// RecordBatch which may have missing or incorrectly ordered columns.
|
||||
/// Missing fields will be replaced with null scalars.
|
||||
ARROW_EXPORT Result<ExecBatch> MakeExecBatch(const Schema& full_schema,
|
||||
const Datum& partial,
|
||||
Expression guarantee = literal(true));
|
||||
|
||||
/// Execute a scalar expression against the provided state and input ExecBatch. This
|
||||
/// expression must be bound.
|
||||
ARROW_EXPORT
|
||||
Result<Datum> ExecuteScalarExpression(const Expression&, const ExecBatch& input,
|
||||
ExecContext* = NULLPTR);
|
||||
|
||||
/// Convenience function for invoking against a RecordBatch
|
||||
ARROW_EXPORT
|
||||
Result<Datum> ExecuteScalarExpression(const Expression&, const Schema& full_schema,
|
||||
const Datum& partial_input, ExecContext* = NULLPTR);
|
||||
|
||||
// Serialization
|
||||
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> Serialize(const Expression&);
|
||||
|
||||
ARROW_EXPORT
|
||||
Result<Expression> Deserialize(std::shared_ptr<Buffer>);
|
||||
|
||||
/// \defgroup expression-convenience Helpers for convenient expression creation
|
||||
///
|
||||
/// @{
|
||||
|
||||
ARROW_EXPORT Expression project(std::vector<Expression> values,
|
||||
std::vector<std::string> names);
|
||||
|
||||
ARROW_EXPORT Expression equal(Expression lhs, Expression rhs);
|
||||
|
||||
ARROW_EXPORT Expression not_equal(Expression lhs, Expression rhs);
|
||||
|
||||
ARROW_EXPORT Expression less(Expression lhs, Expression rhs);
|
||||
|
||||
ARROW_EXPORT Expression less_equal(Expression lhs, Expression rhs);
|
||||
|
||||
ARROW_EXPORT Expression greater(Expression lhs, Expression rhs);
|
||||
|
||||
ARROW_EXPORT Expression greater_equal(Expression lhs, Expression rhs);
|
||||
|
||||
ARROW_EXPORT Expression is_null(Expression lhs, bool nan_is_null = false);
|
||||
|
||||
ARROW_EXPORT Expression is_valid(Expression lhs);
|
||||
|
||||
ARROW_EXPORT Expression and_(Expression lhs, Expression rhs);
|
||||
ARROW_EXPORT Expression and_(const std::vector<Expression>&);
|
||||
ARROW_EXPORT Expression or_(Expression lhs, Expression rhs);
|
||||
ARROW_EXPORT Expression or_(const std::vector<Expression>&);
|
||||
ARROW_EXPORT Expression not_(Expression operand);
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,410 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// NOTE: API is EXPERIMENTAL and will change without going through a
|
||||
// deprecation cycle.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/kernel.h"
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
#include "arrow/datum.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/compare.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace compute {
|
||||
|
||||
/// \addtogroup compute-functions
|
||||
/// @{
|
||||
|
||||
/// \brief Describes how many arguments a function requires.
///
/// Naming conventions taken from https://en.wikipedia.org/wiki/Arity.
struct ARROW_EXPORT Arity {
  /// \brief Arity of a function that takes no arguments
  static Arity Nullary() { return Arity{0, false}; }

  /// \brief Arity of a function that takes exactly 1 argument
  static Arity Unary() { return Arity{1, false}; }

  /// \brief Arity of a function that takes exactly 2 arguments
  static Arity Binary() { return Arity{2, false}; }

  /// \brief Arity of a function that takes exactly 3 arguments
  static Arity Ternary() { return Arity{3, false}; }

  /// \brief Arity of a function that takes a variable number of arguments
  ///
  /// \param[in] min_args the fewest arguments that must be supplied when the
  /// function is invoked
  static Arity VarArgs(int min_args = 0) { return Arity{min_args, true}; }

  // NOTE: Cython requires the 0-argument form (default constructor), hence
  // the defaults on both parameters.
  explicit Arity(int num_args = 0, bool is_varargs = false)
      : num_args{num_args}, is_varargs{is_varargs} {}

  /// Required argument count; for varargs functions this is the minimum.
  int num_args;

  /// When true, num_args is a lower bound rather than an exact count.
  bool is_varargs = false;
};
|
||||
|
||||
/// \brief Human-readable documentation attached to a compute function.
struct ARROW_EXPORT FunctionDoc {
  /// \brief A one-line summary of the function, using a verb.
  ///
  /// For example, "Add two numeric arrays or scalars".
  std::string summary;

  /// \brief A detailed description of the function, meant to follow the summary.
  std::string description;

  /// \brief Symbolic names (identifiers) for the function arguments.
  ///
  /// Some bindings may use this to generate nicer function signatures.
  std::vector<std::string> arg_names;

  // TODO add argument descriptions?

  /// \brief Name of the options class, if any.
  std::string options_class;

  /// \brief Whether options are required for function execution
  ///
  /// If false, then either the function does not have an options class
  /// or there is a usable default options value.
  ///
  /// Initialized in-class so that a default-constructed FunctionDoc never
  /// carries an indeterminate flag (the defaulted constructor below would
  /// otherwise leave this bool uninitialized).
  bool options_required = false;

  /// \brief Construct an empty doc: empty strings, no argument names, and
  /// options not required.
  FunctionDoc() = default;

  /// \brief Construct a doc from its parts.
  ///
  /// \param[in] summary one-line summary
  /// \param[in] description detailed description
  /// \param[in] arg_names symbolic names of the function arguments
  /// \param[in] options_class name of the options class, empty if none
  /// \param[in] options_required whether options must be supplied
  FunctionDoc(std::string summary, std::string description,
              std::vector<std::string> arg_names, std::string options_class = "",
              bool options_required = false)
      : summary(std::move(summary)),
        description(std::move(description)),
        arg_names(std::move(arg_names)),
        options_class(std::move(options_class)),
        options_required(options_required) {}

  /// \brief Return a reference to a shared FunctionDoc instance (defined
  /// out of line; by its name, an empty doc).
  static const FunctionDoc& Empty();
};
|
||||
|
||||
/// \brief An executor of a function with a preconfigured kernel
|
||||
class ARROW_EXPORT FunctionExecutor {
|
||||
public:
|
||||
virtual ~FunctionExecutor() = default;
|
||||
/// \brief Initialize or re-initialize the preconfigured kernel
|
||||
///
|
||||
/// This method may be called zero or more times. Depending on how
|
||||
/// the FunctionExecutor was obtained, it may already have been initialized.
|
||||
virtual Status Init(const FunctionOptions* options = NULLPTR,
|
||||
ExecContext* exec_ctx = NULLPTR) = 0;
|
||||
/// \brief Execute the preconfigured kernel with arguments that must fit it
|
||||
///
|
||||
/// The method requires the arguments be castable to the preconfigured types.
|
||||
///
|
||||
/// \param[in] args Arguments to execute the function on
|
||||
/// \param[in] length Length of arguments batch or -1 to default it. If the
|
||||
/// function has no parameters, this determines the batch length, defaulting
|
||||
/// to 0. Otherwise, if the function is scalar, this must equal the argument
|
||||
/// batch's inferred length or be -1 to default to it. This is ignored for
|
||||
/// vector functions.
|
||||
virtual Result<Datum> Execute(const std::vector<Datum>& args, int64_t length = -1) = 0;
|
||||
};
|
||||
|
||||
/// \brief Base class for compute functions. Function implementations contain a
|
||||
/// collection of "kernels" which are implementations of the function for
|
||||
/// specific argument types. Selecting a viable kernel for executing a function
|
||||
/// is referred to as "dispatching".
|
||||
class ARROW_EXPORT Function {
|
||||
public:
|
||||
/// \brief The kind of function, which indicates in what contexts it is
|
||||
/// valid for use.
|
||||
enum Kind {
|
||||
/// A function that performs scalar data operations on whole arrays of
|
||||
/// data. Can generally process Array or Scalar values. The size of the
|
||||
/// output will be the same as the size (or broadcasted size, in the case
|
||||
/// of mixing Array and Scalar inputs) of the input.
|
||||
SCALAR,
|
||||
|
||||
/// A function with array input and output whose behavior depends on the
|
||||
/// values of the entire arrays passed, rather than the value of each scalar
|
||||
/// value.
|
||||
VECTOR,
|
||||
|
||||
/// A function that computes scalar summary statistics from array input.
|
||||
SCALAR_AGGREGATE,
|
||||
|
||||
/// A function that computes grouped summary statistics from array input
|
||||
/// and an array of group identifiers.
|
||||
HASH_AGGREGATE,
|
||||
|
||||
/// A function that dispatches to other functions and does not contain its
|
||||
/// own kernels.
|
||||
META
|
||||
};
|
||||
|
||||
virtual ~Function() = default;
|
||||
|
||||
/// \brief The name of the kernel. The registry enforces uniqueness of names.
|
||||
const std::string& name() const { return name_; }
|
||||
|
||||
/// \brief The kind of kernel, which indicates in what contexts it is valid
|
||||
/// for use.
|
||||
Function::Kind kind() const { return kind_; }
|
||||
|
||||
/// \brief Contains the number of arguments the function requires, or if the
|
||||
/// function accepts variable numbers of arguments.
|
||||
const Arity& arity() const { return arity_; }
|
||||
|
||||
/// \brief Return the function documentation
|
||||
const FunctionDoc& doc() const { return doc_; }
|
||||
|
||||
/// \brief Returns the number of registered kernels for this function.
|
||||
virtual int num_kernels() const = 0;
|
||||
|
||||
/// \brief Return a kernel that can execute the function given the exact
|
||||
/// argument types (without implicit type casts).
|
||||
///
|
||||
/// NB: This function is overridden in CastFunction.
|
||||
virtual Result<const Kernel*> DispatchExact(const std::vector<TypeHolder>& types) const;
|
||||
|
||||
/// \brief Return a best-match kernel that can execute the function given the argument
|
||||
/// types, after implicit casts are applied.
|
||||
///
|
||||
/// \param[in,out] values Argument types. An element may be modified to
|
||||
/// indicate that the returned kernel only approximately matches the input
|
||||
/// value descriptors; callers are responsible for casting inputs to the type
|
||||
/// required by the kernel.
|
||||
virtual Result<const Kernel*> DispatchBest(std::vector<TypeHolder>* values) const;
|
||||
|
||||
/// \brief Get a function executor with a best-matching kernel
|
||||
///
|
||||
/// The returned executor will by default work with the default FunctionOptions
|
||||
/// and KernelContext. If you want to change that, call `FunctionExecutor::Init`.
|
||||
virtual Result<std::shared_ptr<FunctionExecutor>> GetBestExecutor(
|
||||
std::vector<TypeHolder> inputs) const;
|
||||
|
||||
/// \brief Execute the function eagerly with the passed input arguments with
|
||||
/// kernel dispatch, batch iteration, and memory allocation details taken
|
||||
/// care of.
|
||||
///
|
||||
/// If the `options` pointer is null, then `default_options()` will be used.
|
||||
///
|
||||
/// This function can be overridden in subclasses.
|
||||
virtual Result<Datum> Execute(const std::vector<Datum>& args,
|
||||
const FunctionOptions* options, ExecContext* ctx) const;
|
||||
|
||||
virtual Result<Datum> Execute(const ExecBatch& batch, const FunctionOptions* options,
|
||||
ExecContext* ctx) const;
|
||||
|
||||
/// \brief Returns the default options for this function.
|
||||
///
|
||||
/// Whatever option semantics a Function has, implementations must guarantee
|
||||
/// that default_options() is valid to pass to Execute as options.
|
||||
const FunctionOptions* default_options() const { return default_options_; }
|
||||
|
||||
virtual Status Validate() const;
|
||||
|
||||
/// \brief Returns the pure property for this function.
|
||||
///
|
||||
/// Impure functions are those that may return different results for the same
|
||||
/// input arguments. For example, a function that returns a random number is
|
||||
/// not pure. An expression containing only pure functions can be simplified by
|
||||
/// pre-evaluating any sub-expressions that have constant arguments.
|
||||
virtual bool is_pure() const { return true; }
|
||||
|
||||
protected:
|
||||
Function(std::string name, Function::Kind kind, const Arity& arity, FunctionDoc doc,
|
||||
const FunctionOptions* default_options)
|
||||
: name_(std::move(name)),
|
||||
kind_(kind),
|
||||
arity_(arity),
|
||||
doc_(std::move(doc)),
|
||||
default_options_(default_options) {}
|
||||
|
||||
Status CheckArity(size_t num_args) const;
|
||||
|
||||
std::string name_;
|
||||
Function::Kind kind_;
|
||||
Arity arity_;
|
||||
const FunctionDoc doc_;
|
||||
const FunctionOptions* default_options_ = NULLPTR;
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
template <typename KernelType>
|
||||
class FunctionImpl : public Function {
|
||||
public:
|
||||
/// \brief Return pointers to current-available kernels for inspection
|
||||
std::vector<const KernelType*> kernels() const {
|
||||
std::vector<const KernelType*> result;
|
||||
for (const auto& kernel : kernels_) {
|
||||
result.push_back(&kernel);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
int num_kernels() const override { return static_cast<int>(kernels_.size()); }
|
||||
|
||||
protected:
|
||||
FunctionImpl(std::string name, Function::Kind kind, const Arity& arity, FunctionDoc doc,
|
||||
const FunctionOptions* default_options)
|
||||
: Function(std::move(name), kind, arity, std::move(doc), default_options) {}
|
||||
|
||||
std::vector<KernelType> kernels_;
|
||||
};
|
||||
|
||||
/// \brief Look up a kernel in a function. If no Kernel is found, nullptr is returned.
|
||||
ARROW_EXPORT
|
||||
const Kernel* DispatchExactImpl(const Function* func, const std::vector<TypeHolder>&);
|
||||
|
||||
/// \brief Return an error message if no Kernel is found.
|
||||
ARROW_EXPORT
|
||||
Status NoMatchingKernel(const Function* func, const std::vector<TypeHolder>&);
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// \brief A function that executes elementwise operations on arrays or
|
||||
/// scalars, and therefore whose results generally do not depend on the order
|
||||
/// of the values in the arguments. Accepts and returns arrays that are all of
|
||||
/// the same size. These functions roughly correspond to the functions used in
|
||||
/// SQL expressions.
|
||||
class ARROW_EXPORT ScalarFunction : public detail::FunctionImpl<ScalarKernel> {
|
||||
public:
|
||||
using KernelType = ScalarKernel;
|
||||
|
||||
ScalarFunction(std::string name, const Arity& arity, FunctionDoc doc,
|
||||
const FunctionOptions* default_options = NULLPTR, bool is_pure = true)
|
||||
: detail::FunctionImpl<ScalarKernel>(std::move(name), Function::SCALAR, arity,
|
||||
std::move(doc), default_options),
|
||||
is_pure_(is_pure) {}
|
||||
|
||||
/// \brief Add a kernel with given input/output types, no required state
|
||||
/// initialization, preallocation for fixed-width types, and default null
|
||||
/// handling (intersect validity bitmaps of inputs).
|
||||
Status AddKernel(std::vector<InputType> in_types, OutputType out_type,
|
||||
ArrayKernelExec exec, KernelInit init = NULLPTR,
|
||||
std::shared_ptr<MatchConstraint> constraint = NULLPTR);
|
||||
|
||||
/// \brief Add a kernel (function implementation). Returns error if the
|
||||
/// kernel's signature does not match the function's arity.
|
||||
Status AddKernel(ScalarKernel kernel);
|
||||
|
||||
/// \brief Returns the pure property for this function.
|
||||
bool is_pure() const override { return is_pure_; }
|
||||
|
||||
private:
|
||||
const bool is_pure_;
|
||||
};
|
||||
|
||||
/// \brief A function that executes general array operations that may yield
|
||||
/// outputs of different sizes or have results that depend on the whole array
|
||||
/// contents. These functions roughly correspond to the functions found in
|
||||
/// non-SQL array languages like APL and its derivatives.
|
||||
class ARROW_EXPORT VectorFunction : public detail::FunctionImpl<VectorKernel> {
|
||||
public:
|
||||
using KernelType = VectorKernel;
|
||||
|
||||
VectorFunction(std::string name, const Arity& arity, FunctionDoc doc,
|
||||
const FunctionOptions* default_options = NULLPTR)
|
||||
: detail::FunctionImpl<VectorKernel>(std::move(name), Function::VECTOR, arity,
|
||||
std::move(doc), default_options) {}
|
||||
|
||||
/// \brief Add a simple kernel with given input/output types, no required
|
||||
/// state initialization, no data preallocation, and no preallocation of the
|
||||
/// validity bitmap.
|
||||
Status AddKernel(std::vector<InputType> in_types, OutputType out_type,
|
||||
ArrayKernelExec exec, KernelInit init = NULLPTR);
|
||||
|
||||
/// \brief Add a kernel (function implementation). Returns error if the
|
||||
/// kernel's signature does not match the function's arity.
|
||||
Status AddKernel(VectorKernel kernel);
|
||||
};
|
||||
|
||||
class ARROW_EXPORT ScalarAggregateFunction
|
||||
: public detail::FunctionImpl<ScalarAggregateKernel> {
|
||||
public:
|
||||
using KernelType = ScalarAggregateKernel;
|
||||
|
||||
ScalarAggregateFunction(std::string name, const Arity& arity, FunctionDoc doc,
|
||||
const FunctionOptions* default_options = NULLPTR)
|
||||
: detail::FunctionImpl<ScalarAggregateKernel>(std::move(name),
|
||||
Function::SCALAR_AGGREGATE, arity,
|
||||
std::move(doc), default_options) {}
|
||||
|
||||
/// \brief Add a kernel (function implementation). Returns error if the
|
||||
/// kernel's signature does not match the function's arity.
|
||||
Status AddKernel(ScalarAggregateKernel kernel);
|
||||
};
|
||||
|
||||
class ARROW_EXPORT HashAggregateFunction
|
||||
: public detail::FunctionImpl<HashAggregateKernel> {
|
||||
public:
|
||||
using KernelType = HashAggregateKernel;
|
||||
|
||||
HashAggregateFunction(std::string name, const Arity& arity, FunctionDoc doc,
|
||||
const FunctionOptions* default_options = NULLPTR)
|
||||
: detail::FunctionImpl<HashAggregateKernel>(std::move(name),
|
||||
Function::HASH_AGGREGATE, arity,
|
||||
std::move(doc), default_options) {}
|
||||
|
||||
/// \brief Add a kernel (function implementation). Returns error if the
|
||||
/// kernel's signature does not match the function's arity.
|
||||
Status AddKernel(HashAggregateKernel kernel);
|
||||
};
|
||||
|
||||
/// \brief A function that dispatches to other functions. Must implement
|
||||
/// MetaFunction::ExecuteImpl.
|
||||
///
|
||||
/// For Array, ChunkedArray, and Scalar Datum kinds, may rely on the execution
|
||||
/// of concrete Function types, but must handle other Datum kinds on its own.
|
||||
class ARROW_EXPORT MetaFunction : public Function {
|
||||
public:
|
||||
int num_kernels() const override { return 0; }
|
||||
|
||||
Result<Datum> Execute(const std::vector<Datum>& args, const FunctionOptions* options,
|
||||
ExecContext* ctx) const override;
|
||||
|
||||
Result<Datum> Execute(const ExecBatch& batch, const FunctionOptions* options,
|
||||
ExecContext* ctx) const override;
|
||||
|
||||
protected:
|
||||
virtual Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
|
||||
const FunctionOptions* options,
|
||||
ExecContext* ctx) const = 0;
|
||||
|
||||
MetaFunction(std::string name, const Arity& arity, FunctionDoc doc,
|
||||
const FunctionOptions* default_options = NULLPTR)
|
||||
: Function(std::move(name), Function::META, arity, std::move(doc),
|
||||
default_options) {}
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,81 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// NOTE: API is EXPERIMENTAL and will change without going through a
|
||||
// deprecation cycle.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace compute {
|
||||
|
||||
/// \addtogroup compute-functions
|
||||
/// @{
|
||||
|
||||
/// \brief Extension point for defining options outside libarrow (but
|
||||
/// still within this project).
|
||||
class ARROW_EXPORT FunctionOptionsType {
|
||||
public:
|
||||
virtual ~FunctionOptionsType() = default;
|
||||
|
||||
virtual const char* type_name() const = 0;
|
||||
virtual std::string Stringify(const FunctionOptions&) const = 0;
|
||||
virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0;
|
||||
virtual Result<std::shared_ptr<Buffer>> Serialize(const FunctionOptions&) const;
|
||||
virtual Result<std::unique_ptr<FunctionOptions>> Deserialize(
|
||||
const Buffer& buffer) const;
|
||||
virtual std::unique_ptr<FunctionOptions> Copy(const FunctionOptions&) const = 0;
|
||||
};
|
||||
|
||||
/// \brief Base class for specifying options configuring a function's behavior,
|
||||
/// such as error handling.
|
||||
class ARROW_EXPORT FunctionOptions : public util::EqualityComparable<FunctionOptions> {
|
||||
public:
|
||||
virtual ~FunctionOptions() = default;
|
||||
|
||||
const FunctionOptionsType* options_type() const { return options_type_; }
|
||||
const char* type_name() const { return options_type()->type_name(); }
|
||||
|
||||
bool Equals(const FunctionOptions& other) const;
|
||||
std::string ToString() const;
|
||||
std::unique_ptr<FunctionOptions> Copy() const;
|
||||
/// \brief Serialize an options struct to a buffer.
|
||||
Result<std::shared_ptr<Buffer>> Serialize() const;
|
||||
/// \brief Deserialize an options struct from a buffer.
|
||||
/// Note: this will only look for `type_name` in the default FunctionRegistry;
|
||||
/// to use a custom FunctionRegistry, look up the FunctionOptionsType, then
|
||||
/// call FunctionOptionsType::Deserialize().
|
||||
static Result<std::unique_ptr<FunctionOptions>> Deserialize(
|
||||
const std::string& type_name, const Buffer& buffer);
|
||||
|
||||
protected:
|
||||
explicit FunctionOptions(const FunctionOptionsType* type) : options_type_(type) {}
|
||||
const FunctionOptionsType* options_type_;
|
||||
};
|
||||
|
||||
/// \brief Write a FunctionOptions object to an output stream.
/// NOTE(review): the name matches the hook GoogleTest looks up for printing
/// values -- presumably that is its purpose; confirm.
ARROW_EXPORT void PrintTo(const FunctionOptions&, std::ostream*);
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,32 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/compute/visibility.h"
|
||||
#include "arrow/status.h"
|
||||
|
||||
namespace arrow::compute {
|
||||
|
||||
/// \brief Initialize the compute module.
|
||||
///
|
||||
/// Register the compute kernel functions to be available on the
|
||||
/// global FunctionRegistry.
|
||||
/// This function will only be available if ARROW_COMPUTE is enabled.
|
||||
ARROW_COMPUTE_EXPORT Status Initialize();
|
||||
|
||||
} // namespace arrow::compute
|
||||
@@ -0,0 +1,772 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// NOTE: API is EXPERIMENTAL and will change without going through a
|
||||
// deprecation cycle
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/compute/exec.h"
|
||||
#include "arrow/datum.h"
|
||||
#include "arrow/device_allocation_type_set.h"
|
||||
#include "arrow/memory_pool.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
// macOS defines PREALLOCATE as a preprocessor macro in the header sys/vnode.h.
|
||||
// No other BSD seems to do so. The name is used as an identifier in MemAllocation enum.
|
||||
#if defined(__APPLE__) && defined(PREALLOCATE)
|
||||
# undef PREALLOCATE
|
||||
#endif
|
||||
|
||||
namespace arrow {
|
||||
namespace compute {
|
||||
|
||||
class FunctionOptions;
|
||||
|
||||
/// \brief Base class for opaque kernel-specific state. For example, if there
|
||||
/// is some kind of initialization required.
|
||||
struct ARROW_EXPORT KernelState {
|
||||
virtual ~KernelState() = default;
|
||||
};
|
||||
|
||||
/// \brief Context/state for the execution of a particular kernel.
|
||||
class ARROW_EXPORT KernelContext {
|
||||
public:
|
||||
// Can pass optional backreference; not used consistently for the
|
||||
// moment but will be made so in the future
|
||||
explicit KernelContext(ExecContext* exec_ctx, const Kernel* kernel = NULLPTR)
|
||||
: exec_ctx_(exec_ctx), kernel_(kernel) {}
|
||||
|
||||
/// \brief Allocate buffer from the context's memory pool. The contents are
|
||||
/// not initialized.
|
||||
Result<std::shared_ptr<ResizableBuffer>> Allocate(int64_t nbytes);
|
||||
|
||||
/// \brief Allocate buffer for bitmap from the context's memory pool. Like
|
||||
/// Allocate, the contents of the buffer are not initialized but the last
|
||||
/// byte is preemptively zeroed to help avoid ASAN or valgrind issues.
|
||||
Result<std::shared_ptr<ResizableBuffer>> AllocateBitmap(int64_t num_bits);
|
||||
|
||||
/// \brief Assign the active KernelState to be utilized for each stage of
|
||||
/// kernel execution. Ownership and memory lifetime of the KernelState must
|
||||
/// be minded separately.
|
||||
void SetState(KernelState* state) { state_ = state; }
|
||||
|
||||
// Set kernel that is being invoked since some kernel
|
||||
// implementations will examine the kernel state.
|
||||
void SetKernel(const Kernel* kernel) { kernel_ = kernel; }
|
||||
|
||||
KernelState* state() { return state_; }
|
||||
|
||||
/// \brief Configuration related to function execution that is to be shared
|
||||
/// across multiple kernels.
|
||||
ExecContext* exec_context() { return exec_ctx_; }
|
||||
|
||||
/// \brief The memory pool to use for allocations. For now, it uses the
|
||||
/// MemoryPool contained in the ExecContext used to create the KernelContext.
|
||||
MemoryPool* memory_pool() { return exec_ctx_->memory_pool(); }
|
||||
|
||||
const Kernel* kernel() const { return kernel_; }
|
||||
|
||||
private:
|
||||
ExecContext* exec_ctx_;
|
||||
KernelState* state_ = NULLPTR;
|
||||
const Kernel* kernel_ = NULLPTR;
|
||||
};
|
||||
|
||||
/// \brief An type-checking interface to permit customizable validation rules
|
||||
/// for use with InputType and KernelSignature. This is for scenarios where the
|
||||
/// acceptance is not an exact type instance, such as a TIMESTAMP type for a
|
||||
/// specific TimeUnit, but permitting any time zone.
|
||||
struct ARROW_EXPORT TypeMatcher {
|
||||
virtual ~TypeMatcher() = default;
|
||||
|
||||
/// \brief Return true if this matcher accepts the data type.
|
||||
virtual bool Matches(const DataType& type) const = 0;
|
||||
|
||||
/// \brief A human-interpretable string representation of what the type
|
||||
/// matcher checks for, usable when printing KernelSignature or formatting
|
||||
/// error messages.
|
||||
virtual std::string ToString() const = 0;
|
||||
|
||||
/// \brief Return true if this TypeMatcher contains the same matching rule as
|
||||
/// the other. Currently depends on RTTI.
|
||||
virtual bool Equals(const TypeMatcher& other) const = 0;
|
||||
};
|
||||
|
||||
namespace match {
|
||||
|
||||
/// \brief Match any DataType instance having the same DataType::id.
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> SameTypeId(Type::type type_id);
|
||||
|
||||
/// \brief Match any TimestampType instance having the same unit, but the time
|
||||
/// zones can be different.
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> TimestampTypeUnit(TimeUnit::type unit);
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> Time32TypeUnit(TimeUnit::type unit);
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> Time64TypeUnit(TimeUnit::type unit);
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> DurationTypeUnit(TimeUnit::type unit);
|
||||
|
||||
// \brief Match any integer type
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> Integer();
|
||||
|
||||
// Match types using 32-bit varbinary representation
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> BinaryLike();
|
||||
|
||||
// Match types using 64-bit varbinary representation
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> LargeBinaryLike();
|
||||
|
||||
// Match any fixed binary type
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> FixedSizeBinaryLike();
|
||||
|
||||
// \brief Match any primitive type (boolean or any type representable as a C
|
||||
// Type)
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> Primitive();
|
||||
|
||||
// \brief Match any integer type that can be used as run-end in run-end encoded
|
||||
// arrays
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndInteger();
|
||||
|
||||
/// \brief Match run-end encoded types that use any valid run-end type and
|
||||
/// encode specific value types
|
||||
///
|
||||
/// @param[in] value_type_matcher a matcher that is applied to the values field
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(
|
||||
std::shared_ptr<TypeMatcher> value_type_matcher);
|
||||
|
||||
/// \brief Match run-end encoded types that use any valid run-end type and
|
||||
/// encode specific value types
|
||||
///
|
||||
/// @param[in] value_type_id a type id that the type of the values field should match
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(Type::type value_type_id);
|
||||
|
||||
/// \brief Match run-end encoded types that encode specific run-end and value types
|
||||
///
|
||||
/// @param[in] run_end_type_matcher a matcher that is applied to the run_ends field
|
||||
/// @param[in] value_type_matcher a matcher that is applied to the values field
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(
|
||||
std::shared_ptr<TypeMatcher> run_end_type_matcher,
|
||||
std::shared_ptr<TypeMatcher> value_type_matcher);
|
||||
|
||||
} // namespace match
|
||||
|
||||
/// \brief An object used for type-checking arguments to be passed to a kernel
|
||||
/// and stored in a KernelSignature. The type-checking rule can be supplied
|
||||
/// either with an exact DataType instance or a custom TypeMatcher.
|
||||
class ARROW_EXPORT InputType {
|
||||
public:
|
||||
/// \brief The kind of type-checking rule that the InputType contains.
|
||||
enum Kind {
|
||||
/// \brief Accept any value type.
|
||||
ANY_TYPE,
|
||||
|
||||
/// \brief A fixed arrow::DataType and will only exact match having this
|
||||
/// exact type (e.g. same TimestampType unit, same decimal scale and
|
||||
/// precision, or same nested child types).
|
||||
EXACT_TYPE,
|
||||
|
||||
/// \brief Uses a TypeMatcher implementation to check the type.
|
||||
USE_TYPE_MATCHER
|
||||
};
|
||||
|
||||
/// \brief Accept any value type
|
||||
InputType() : kind_(ANY_TYPE) {}
|
||||
|
||||
/// \brief Accept an exact value type.
|
||||
InputType(std::shared_ptr<DataType> type) // NOLINT implicit construction
|
||||
: kind_(EXACT_TYPE), type_(std::move(type)) {}
|
||||
|
||||
/// \brief Use the passed TypeMatcher to type check.
|
||||
InputType(std::shared_ptr<TypeMatcher> type_matcher) // NOLINT implicit construction
|
||||
: kind_(USE_TYPE_MATCHER), type_matcher_(std::move(type_matcher)) {}
|
||||
|
||||
/// \brief Match any type with the given Type::type. Uses a TypeMatcher for
|
||||
/// its implementation.
|
||||
InputType(Type::type type_id) // NOLINT implicit construction
|
||||
: InputType(match::SameTypeId(type_id)) {}
|
||||
|
||||
InputType(const InputType& other) { CopyInto(other); }
|
||||
|
||||
void operator=(const InputType& other) { CopyInto(other); }
|
||||
|
||||
InputType(InputType&& other) { MoveInto(std::forward<InputType>(other)); }
|
||||
|
||||
void operator=(InputType&& other) { MoveInto(std::forward<InputType>(other)); }
|
||||
|
||||
// \brief Match any input (array, scalar of any type)
|
||||
static InputType Any() { return InputType(); }
|
||||
|
||||
/// \brief Return true if this input type matches the same type cases as the
|
||||
/// other.
|
||||
bool Equals(const InputType& other) const;
|
||||
|
||||
bool operator==(const InputType& other) const { return this->Equals(other); }
|
||||
|
||||
bool operator!=(const InputType& other) const { return !(*this == other); }
|
||||
|
||||
/// \brief Return hash code.
|
||||
size_t Hash() const;
|
||||
|
||||
/// \brief Render a human-readable string representation.
|
||||
std::string ToString() const;
|
||||
|
||||
/// \brief Return true if the Datum matches this argument kind in
|
||||
/// type (and only allows scalar or array-like Datums).
|
||||
bool Matches(const Datum& value) const;
|
||||
|
||||
/// \brief Return true if the type matches this InputType
|
||||
bool Matches(const DataType& type) const;
|
||||
|
||||
/// \brief The type matching rule that this InputType uses.
|
||||
Kind kind() const { return kind_; }
|
||||
|
||||
/// \brief For InputType::EXACT_TYPE kind, the exact type that this InputType
|
||||
/// must match. Otherwise this function should not be used and will assert in
|
||||
/// debug builds.
|
||||
const std::shared_ptr<DataType>& type() const;
|
||||
|
||||
/// \brief For InputType::USE_TYPE_MATCHER, the TypeMatcher to be used for
|
||||
/// checking the type of a value. Otherwise this function should not be used
|
||||
/// and will assert in debug builds.
|
||||
const TypeMatcher& type_matcher() const;
|
||||
|
||||
private:
|
||||
void CopyInto(const InputType& other) {
|
||||
this->kind_ = other.kind_;
|
||||
this->type_ = other.type_;
|
||||
this->type_matcher_ = other.type_matcher_;
|
||||
}
|
||||
|
||||
void MoveInto(InputType&& other) {
|
||||
this->kind_ = other.kind_;
|
||||
this->type_ = std::move(other.type_);
|
||||
this->type_matcher_ = std::move(other.type_matcher_);
|
||||
}
|
||||
|
||||
Kind kind_;
|
||||
|
||||
// For EXACT_TYPE Kind
|
||||
std::shared_ptr<DataType> type_;
|
||||
|
||||
// For USE_TYPE_MATCHER Kind
|
||||
std::shared_ptr<TypeMatcher> type_matcher_;
|
||||
};
|
||||
|
||||
/// \brief Container to capture both exact and input-dependent output types.
|
||||
class ARROW_EXPORT OutputType {
|
||||
public:
|
||||
/// \brief An enum indicating whether the value type is an invariant fixed
|
||||
/// value or one that's computed by a kernel-defined resolver function.
|
||||
enum ResolveKind { FIXED, COMPUTED };
|
||||
|
||||
/// Type resolution function. Given input types, return output type. This
|
||||
/// function MAY may use the kernel state to decide the output type based on
|
||||
/// the FunctionOptions.
|
||||
///
|
||||
/// This function SHOULD _not_ be used to check for arity, that is to be
|
||||
/// performed one or more layers above.
|
||||
using Resolver =
|
||||
std::function<Result<TypeHolder>(KernelContext*, const std::vector<TypeHolder>&)>;
|
||||
|
||||
/// \brief Output an exact type
|
||||
OutputType(std::shared_ptr<DataType> type) // NOLINT implicit construction
|
||||
: kind_(FIXED), type_(std::move(type)) {}
|
||||
|
||||
/// \brief Output a computed type depending on actual input types
|
||||
template <typename Fn>
|
||||
OutputType(Fn resolver) // NOLINT implicit construction
|
||||
: kind_(COMPUTED), resolver_(std::move(resolver)) {}
|
||||
|
||||
OutputType(const OutputType& other) {
|
||||
this->kind_ = other.kind_;
|
||||
this->type_ = other.type_;
|
||||
this->resolver_ = other.resolver_;
|
||||
}
|
||||
|
||||
OutputType(OutputType&& other) {
|
||||
this->kind_ = other.kind_;
|
||||
this->type_ = std::move(other.type_);
|
||||
this->resolver_ = other.resolver_;
|
||||
}
|
||||
|
||||
OutputType& operator=(const OutputType&) = default;
|
||||
OutputType& operator=(OutputType&&) = default;
|
||||
|
||||
/// \brief Return the type of the expected output value of the kernel given
|
||||
/// the input argument types. The resolver may make use of state information
|
||||
/// kept in the KernelContext.
|
||||
Result<TypeHolder> Resolve(KernelContext* ctx,
|
||||
const std::vector<TypeHolder>& args) const;
|
||||
|
||||
/// \brief The exact output value type for the FIXED kind.
|
||||
const std::shared_ptr<DataType>& type() const;
|
||||
|
||||
/// \brief For use with COMPUTED resolution strategy. It may be more
|
||||
/// convenient to invoke this with OutputType::Resolve returned from this
|
||||
/// method.
|
||||
const Resolver& resolver() const;
|
||||
|
||||
/// \brief Render a human-readable string representation.
|
||||
std::string ToString() const;
|
||||
|
||||
/// \brief Return the kind of type resolution of this output type, whether
|
||||
/// fixed/invariant or computed by a resolver.
|
||||
ResolveKind kind() const { return kind_; }
|
||||
|
||||
private:
|
||||
ResolveKind kind_;
|
||||
|
||||
// For FIXED resolution
|
||||
std::shared_ptr<DataType> type_;
|
||||
|
||||
// For COMPUTED resolution
|
||||
Resolver resolver_ = NULLPTR;
|
||||
};
|
||||
|
||||
/// \brief Additional constraints to apply to the input types of a kernel when matching a
|
||||
/// specific kernel signature.
|
||||
class ARROW_EXPORT MatchConstraint {
|
||||
public:
|
||||
virtual ~MatchConstraint() = default;
|
||||
|
||||
/// \brief Return true if the input types satisfy the constraint.
|
||||
virtual bool Matches(const std::vector<TypeHolder>& types) const = 0;
|
||||
|
||||
/// \brief Convenience function to create a MatchConstraint from a match function.
|
||||
static std::shared_ptr<MatchConstraint> Make(
|
||||
std::function<bool(const std::vector<TypeHolder>&)> matches);
|
||||
};
|
||||
|
||||
/// \brief Constraint that all input types are decimal types and have the same scale.
|
||||
ARROW_EXPORT std::shared_ptr<MatchConstraint> DecimalsHaveSameScale();
|
||||
|
||||
/// \brief Holds the input types, optional match constraint and output type of the kernel.
|
||||
///
|
||||
/// VarArgs functions with minimum N arguments should pass up to N input types to be
|
||||
/// used to validate the input types of a function invocation. The first N-1 types
|
||||
/// will be matched against the first N-1 arguments, and the last type will be
|
||||
/// matched against the remaining arguments.
|
||||
class ARROW_EXPORT KernelSignature {
|
||||
public:
|
||||
KernelSignature(std::vector<InputType> in_types, OutputType out_type,
|
||||
bool is_varargs = false,
|
||||
std::shared_ptr<MatchConstraint> constraint = NULLPTR);
|
||||
|
||||
/// \brief Convenience ctor since make_shared can be awkward
|
||||
static std::shared_ptr<KernelSignature> Make(
|
||||
std::vector<InputType> in_types, OutputType out_type, bool is_varargs = false,
|
||||
std::shared_ptr<MatchConstraint> constraint = NULLPTR);
|
||||
|
||||
/// \brief Return true if the signature is compatible with the list of input
|
||||
/// value descriptors and satisfies the match constraint, if any.
|
||||
bool MatchesInputs(const std::vector<TypeHolder>& types) const;
|
||||
|
||||
/// \brief Returns true if the input types of each signature are
|
||||
/// equal. Well-formed functions should have a deterministic output type
|
||||
/// given input types, but currently it is the responsibility of the
|
||||
/// developer to ensure this.
|
||||
bool Equals(const KernelSignature& other) const;
|
||||
|
||||
bool operator==(const KernelSignature& other) const { return this->Equals(other); }
|
||||
|
||||
bool operator!=(const KernelSignature& other) const { return !(*this == other); }
|
||||
|
||||
/// \brief Compute a hash code for the signature
|
||||
size_t Hash() const;
|
||||
|
||||
/// \brief The input types for the kernel. For VarArgs functions, this should
|
||||
/// generally contain a single validator to use for validating all of the
|
||||
/// function arguments.
|
||||
const std::vector<InputType>& in_types() const { return in_types_; }
|
||||
|
||||
/// \brief The output type for the kernel. Use Resolve to return the
|
||||
/// exact output given input argument types, since many kernels'
|
||||
/// output types depend on their input types (or their type
|
||||
/// metadata).
|
||||
const OutputType& out_type() const { return out_type_; }
|
||||
|
||||
/// \brief Render a human-readable string representation
|
||||
std::string ToString() const;
|
||||
|
||||
bool is_varargs() const { return is_varargs_; }
|
||||
|
||||
private:
|
||||
std::vector<InputType> in_types_;
|
||||
OutputType out_type_;
|
||||
bool is_varargs_;
|
||||
std::shared_ptr<MatchConstraint> constraint_;
|
||||
|
||||
// For caching the hash code after it's computed the first time
|
||||
mutable uint64_t hash_code_;
|
||||
};
|
||||
|
||||
/// \brief SIMD instruction-set levels. A function may contain multiple
/// variants of a kernel for a given type combination for different SIMD
/// levels. Based on the active system's CPU info or the user's preferences,
/// we can elect to use one over the other.
struct SimdLevel {
  enum type {
    NONE = 0,
    SSE4_2 = 1,
    AVX = 2,
    AVX2 = 3,
    AVX512 = 4,
    NEON = 5,
    // One past the last real level.
    MAX = 6
  };
};
|
||||
|
||||
/// \brief The strategy to use for propagating or otherwise populating the
/// validity bitmap of a kernel output.
struct NullHandling {
  enum type {
    /// Compute the output validity bitmap by intersecting the validity
    /// bitmaps of the arguments using bitwise-and operations. This means that
    /// values in the output are valid/non-null only if the corresponding
    /// values in all input arguments were valid/non-null. Kernels generally
    /// need not touch the bitmap thereafter, but a kernel's exec function is
    /// permitted to alter the bitmap after the null intersection is computed
    /// if it needs to.
    INTERSECTION = 0,

    /// Kernel expects a pre-allocated buffer to write the result bitmap
    /// into. The preallocated memory is not zeroed (except for the last
    /// byte), so the kernel should ensure to completely populate the bitmap.
    COMPUTED_PREALLOCATE = 1,

    /// Kernel allocates and sets the validity bitmap of the output.
    COMPUTED_NO_PREALLOCATE = 2,

    /// Kernel output is never null and a validity bitmap does not need to be
    /// allocated.
    OUTPUT_NOT_NULL = 3
  };
};
|
||||
|
||||
/// \brief The preference for memory preallocation of fixed-width type outputs
/// in kernel execution.
struct MemAllocation {
  enum type {
    /// For data types that support pre-allocation (i.e. fixed-width), the
    /// kernel expects to be provided a pre-allocated data buffer to write
    /// into. Non-fixed-width types must always allocate their own data
    /// buffers. The allocation is made for the same length as the execution
    /// batch, so vector kernels yielding differently sized output should not
    /// use this.
    ///
    /// It is valid for the data to not be preallocated but the validity
    /// bitmap is (or is computed using the intersection/bitwise-and method).
    ///
    /// For variable-size output types like BinaryType or StringType, or for
    /// nested types, this option has no effect.
    PREALLOCATE = 0,

    /// The kernel is responsible for allocating its own data buffer for
    /// fixed-width type outputs.
    NO_PREALLOCATE = 1
  };
};
|
||||
|
||||
struct Kernel;
|
||||
|
||||
/// \brief Arguments to pass to an KernelInit function. A struct is used to help
|
||||
/// avoid API breakage should the arguments passed need to be expanded.
|
||||
struct KernelInitArgs {
|
||||
/// \brief A pointer to the kernel being initialized. The init function may
|
||||
/// depend on the kernel's KernelSignature or other data contained there.
|
||||
const Kernel* kernel;
|
||||
|
||||
/// \brief The types of the input arguments that the kernel is
|
||||
/// about to be executed against.
|
||||
const std::vector<TypeHolder>& inputs;
|
||||
|
||||
/// \brief Opaque options specific to this kernel. May be nullptr for functions
|
||||
/// that do not require options.
|
||||
const FunctionOptions* options;
|
||||
};
|
||||
|
||||
/// \brief Common initializer function for all kernel types.
|
||||
using KernelInit = std::function<Result<std::unique_ptr<KernelState>>(
|
||||
KernelContext*, const KernelInitArgs&)>;
|
||||
|
||||
/// \brief Base type for kernels. Contains the function signature and
|
||||
/// optionally the state initialization function, along with some common
|
||||
/// attributes
|
||||
struct ARROW_EXPORT Kernel {
|
||||
Kernel() = default;
|
||||
|
||||
Kernel(std::shared_ptr<KernelSignature> sig, KernelInit init)
|
||||
: signature(std::move(sig)), init(std::move(init)) {}
|
||||
|
||||
Kernel(std::vector<InputType> in_types, OutputType out_type, KernelInit init)
|
||||
: Kernel(KernelSignature::Make(std::move(in_types), std::move(out_type)),
|
||||
std::move(init)) {}
|
||||
|
||||
/// \brief The "signature" of the kernel containing the InputType input
|
||||
/// argument validators and OutputType output type resolver.
|
||||
std::shared_ptr<KernelSignature> signature;
|
||||
|
||||
/// \brief Create a new KernelState for invocations of this kernel, e.g. to
|
||||
/// set up any options or state relevant for execution.
|
||||
KernelInit init;
|
||||
|
||||
/// \brief Create a vector of new KernelState for invocations of this kernel.
|
||||
static Status InitAll(KernelContext*, const KernelInitArgs&,
|
||||
std::vector<std::unique_ptr<KernelState>>*);
|
||||
|
||||
/// \brief Indicates whether execution can benefit from parallelization
|
||||
/// (splitting large chunks into smaller chunks and using multiple
|
||||
/// threads). Some kernels may not support parallel execution at
|
||||
/// all. Synchronization and concurrency-related issues are currently the
|
||||
/// responsibility of the Kernel's implementation.
|
||||
bool parallelizable = true;
|
||||
|
||||
/// \brief Indicates the level of SIMD instruction support in the host CPU is
|
||||
/// required to use the function. The intention is for functions to be able to
|
||||
/// contain multiple kernels with the same signature but different levels of SIMD,
|
||||
/// so that the most optimized kernel supported on a host's processor can be chosen.
|
||||
SimdLevel::type simd_level = SimdLevel::NONE;
|
||||
|
||||
// Additional kernel-specific data
|
||||
std::shared_ptr<KernelState> data;
|
||||
};
|
||||
|
||||
/// \brief The scalar kernel execution API that must be implemented for SCALAR
|
||||
/// kernel types. This includes both stateless and stateful kernels. Kernels
|
||||
/// depending on some execution state access that state via subclasses of
|
||||
/// KernelState set on the KernelContext object. Implementations should
|
||||
/// endeavor to write into pre-allocated memory if they are able, though for
|
||||
/// some kernels (e.g. in cases when a builder like StringBuilder must be
|
||||
/// employed) this may not be possible.
|
||||
using ArrayKernelExec = Status (*)(KernelContext*, const ExecSpan&, ExecResult*);
|
||||
|
||||
/// \brief Kernel data structure for implementations of ScalarFunction. In
|
||||
/// addition to the members found in Kernel, contains the null handling
|
||||
/// and memory pre-allocation preferences.
|
||||
struct ARROW_EXPORT ScalarKernel : public Kernel {
|
||||
ScalarKernel() = default;
|
||||
|
||||
ScalarKernel(std::shared_ptr<KernelSignature> sig, ArrayKernelExec exec,
|
||||
KernelInit init = NULLPTR)
|
||||
: Kernel(std::move(sig), init), exec(exec) {}
|
||||
|
||||
ScalarKernel(std::vector<InputType> in_types, OutputType out_type, ArrayKernelExec exec,
|
||||
KernelInit init = NULLPTR)
|
||||
: Kernel(std::move(in_types), std::move(out_type), std::move(init)), exec(exec) {}
|
||||
|
||||
/// \brief Perform a single invocation of this kernel. Depending on the
|
||||
/// implementation, it may only write into preallocated memory, while in some
|
||||
/// cases it will allocate its own memory. Any required state is managed
|
||||
/// through the KernelContext.
|
||||
ArrayKernelExec exec;
|
||||
|
||||
/// \brief Writing execution results into larger contiguous allocations
|
||||
/// requires that the kernel be able to write into sliced output ArrayData*,
|
||||
/// including sliced output validity bitmaps. Some kernel implementations may
|
||||
/// not be able to do this, so setting this to false disables this
|
||||
/// functionality.
|
||||
bool can_write_into_slices = true;
|
||||
|
||||
// For scalar functions preallocated data and intersecting arg validity
|
||||
// bitmaps is a reasonable default
|
||||
NullHandling::type null_handling = NullHandling::INTERSECTION;
|
||||
MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// VectorKernel (for VectorFunction)
|
||||
|
||||
/// \brief Kernel data structure for implementations of VectorFunction. In
|
||||
/// contains an optional finalizer function, the null handling and memory
|
||||
/// pre-allocation preferences (which have different defaults from
|
||||
/// ScalarKernel), and some other execution-related options.
|
||||
struct ARROW_EXPORT VectorKernel : public Kernel {
|
||||
/// \brief See VectorKernel::finalize member for usage
|
||||
using FinalizeFunc = std::function<Status(KernelContext*, std::vector<Datum>*)>;
|
||||
|
||||
/// \brief Function for executing a stateful VectorKernel against a
|
||||
/// ChunkedArray input. Does not need to be defined for all VectorKernels
|
||||
using ChunkedExec = Status (*)(KernelContext*, const ExecBatch&, Datum* out);
|
||||
|
||||
VectorKernel() = default;
|
||||
|
||||
VectorKernel(std::vector<InputType> in_types, OutputType out_type, ArrayKernelExec exec,
|
||||
KernelInit init = NULLPTR, FinalizeFunc finalize = NULLPTR)
|
||||
: Kernel(std::move(in_types), std::move(out_type), std::move(init)),
|
||||
exec(exec),
|
||||
finalize(std::move(finalize)) {}
|
||||
|
||||
VectorKernel(std::shared_ptr<KernelSignature> sig, ArrayKernelExec exec,
|
||||
KernelInit init = NULLPTR, FinalizeFunc finalize = NULLPTR)
|
||||
: Kernel(std::move(sig), std::move(init)),
|
||||
exec(exec),
|
||||
finalize(std::move(finalize)) {}
|
||||
|
||||
/// \brief Perform a single invocation of this kernel. Any required state is
|
||||
/// managed through the KernelContext.
|
||||
ArrayKernelExec exec;
|
||||
|
||||
/// \brief Execute the kernel on a ChunkedArray. Does not need to be defined
|
||||
ChunkedExec exec_chunked = NULLPTR;
|
||||
|
||||
/// \brief For VectorKernel, convert intermediate results into finalized
|
||||
/// results. Mutates input argument. Some kernels may accumulate state
|
||||
/// (example: hashing-related functions) through processing chunked inputs, and
|
||||
/// then need to attach some accumulated state to each of the outputs of
|
||||
/// processing each chunk of data.
|
||||
FinalizeFunc finalize;
|
||||
|
||||
/// Since vector kernels generally are implemented rather differently from
|
||||
/// scalar/elementwise kernels (and they may not even yield arrays of the same
|
||||
/// size), so we make the developer opt-in to any memory preallocation rather
|
||||
/// than having to turn it off.
|
||||
NullHandling::type null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
|
||||
MemAllocation::type mem_allocation = MemAllocation::NO_PREALLOCATE;
|
||||
|
||||
/// \brief Writing execution results into larger contiguous allocations
|
||||
/// requires that the kernel be able to write into sliced output ArrayData*,
|
||||
/// including sliced output validity bitmaps. Some kernel implementations may
|
||||
/// not be able to do this, so setting this to false disables this
|
||||
/// functionality.
|
||||
bool can_write_into_slices = true;
|
||||
|
||||
/// Some vector kernels can do chunkwise execution using ExecSpanIterator,
|
||||
/// in some cases accumulating some state. Other kernels (like Take) need to
|
||||
/// be passed whole arrays and don't work on ChunkedArray inputs
|
||||
bool can_execute_chunkwise = true;
|
||||
|
||||
/// Some kernels (like unique and value_counts) yield non-chunked output from
|
||||
/// chunked-array inputs. This option controls how the results are boxed when
|
||||
/// returned from ExecVectorFunction
|
||||
///
|
||||
/// true -> ChunkedArray
|
||||
/// false -> Array
|
||||
bool output_chunked = true;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// ScalarAggregateKernel (for ScalarAggregateFunction)
|
||||
|
||||
using ScalarAggregateConsume = Status (*)(KernelContext*, const ExecSpan&);
|
||||
using ScalarAggregateMerge = Status (*)(KernelContext*, KernelState&&, KernelState*);
|
||||
// Finalize returns Datum to permit multiple return values
|
||||
using ScalarAggregateFinalize = Status (*)(KernelContext*, Datum*);
|
||||
|
||||
/// \brief Kernel data structure for implementations of
|
||||
/// ScalarAggregateFunction. The four necessary components of an aggregation
|
||||
/// kernel are the init, consume, merge, and finalize functions.
|
||||
///
|
||||
/// * init: creates a new KernelState for a kernel.
|
||||
/// * consume: processes an ExecSpan and updates the KernelState found in the
|
||||
/// KernelContext.
|
||||
/// * merge: combines one KernelState with another.
|
||||
/// * finalize: produces the end result of the aggregation using the
|
||||
/// KernelState in the KernelContext.
|
||||
struct ARROW_EXPORT ScalarAggregateKernel : public Kernel {
|
||||
ScalarAggregateKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
|
||||
ScalarAggregateConsume consume, ScalarAggregateMerge merge,
|
||||
ScalarAggregateFinalize finalize, const bool ordered)
|
||||
: Kernel(std::move(sig), std::move(init)),
|
||||
consume(consume),
|
||||
merge(merge),
|
||||
finalize(finalize),
|
||||
ordered(ordered) {}
|
||||
|
||||
ScalarAggregateKernel(std::vector<InputType> in_types, OutputType out_type,
|
||||
KernelInit init, ScalarAggregateConsume consume,
|
||||
ScalarAggregateMerge merge, ScalarAggregateFinalize finalize,
|
||||
const bool ordered)
|
||||
: ScalarAggregateKernel(
|
||||
KernelSignature::Make(std::move(in_types), std::move(out_type)),
|
||||
std::move(init), consume, merge, finalize, ordered) {}
|
||||
|
||||
/// \brief Merge a vector of KernelStates into a single KernelState.
|
||||
/// The merged state will be returned and will be set on the KernelContext.
|
||||
static Result<std::unique_ptr<KernelState>> MergeAll(
|
||||
const ScalarAggregateKernel* kernel, KernelContext* ctx,
|
||||
std::vector<std::unique_ptr<KernelState>> states);
|
||||
|
||||
ScalarAggregateConsume consume;
|
||||
ScalarAggregateMerge merge;
|
||||
ScalarAggregateFinalize finalize;
|
||||
/// \brief Whether this kernel requires ordering
|
||||
/// Some aggregations, such as, "first", requires some kind of input order. The
|
||||
/// order can be implicit, e.g., the order of the input data, or explicit, e.g.
|
||||
/// the ordering specified with a window aggregation.
|
||||
/// The caller of the aggregate kernel is responsible for passing data in some
|
||||
/// defined order to the kernel. The flag here is a way for the kernel to tell
|
||||
/// the caller that data passed to the kernel must be defined in some order.
|
||||
bool ordered = false;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// HashAggregateKernel (for HashAggregateFunction)
|
||||
|
||||
using HashAggregateResize = Status (*)(KernelContext*, int64_t);
|
||||
using HashAggregateConsume = Status (*)(KernelContext*, const ExecSpan&);
|
||||
using HashAggregateMerge = Status (*)(KernelContext*, KernelState&&, const ArrayData&);
|
||||
|
||||
// Finalize returns Datum to permit multiple return values
|
||||
using HashAggregateFinalize = Status (*)(KernelContext*, Datum*);
|
||||
|
||||
/// \brief Kernel data structure for implementations of
|
||||
/// HashAggregateFunction. The four necessary components of an aggregation
|
||||
/// kernel are the init, consume, merge, and finalize functions.
|
||||
///
|
||||
/// * init: creates a new KernelState for a kernel.
|
||||
/// * resize: ensure that the KernelState can accommodate the specified number of groups.
|
||||
/// * consume: processes an ExecSpan (which includes the argument as well
|
||||
/// as an array of group identifiers) and updates the KernelState found in the
|
||||
/// KernelContext.
|
||||
/// * merge: combines one KernelState with another.
|
||||
/// * finalize: produces the end result of the aggregation using the
|
||||
/// KernelState in the KernelContext.
|
||||
struct ARROW_EXPORT HashAggregateKernel : public Kernel {
|
||||
HashAggregateKernel() = default;
|
||||
|
||||
HashAggregateKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
|
||||
HashAggregateResize resize, HashAggregateConsume consume,
|
||||
HashAggregateMerge merge, HashAggregateFinalize finalize,
|
||||
const bool ordered)
|
||||
: Kernel(std::move(sig), std::move(init)),
|
||||
resize(resize),
|
||||
consume(consume),
|
||||
merge(merge),
|
||||
finalize(finalize),
|
||||
ordered(ordered) {}
|
||||
|
||||
HashAggregateKernel(std::vector<InputType> in_types, OutputType out_type,
|
||||
KernelInit init, HashAggregateConsume consume,
|
||||
HashAggregateResize resize, HashAggregateMerge merge,
|
||||
HashAggregateFinalize finalize, const bool ordered)
|
||||
: HashAggregateKernel(
|
||||
KernelSignature::Make(std::move(in_types), std::move(out_type)),
|
||||
std::move(init), resize, consume, merge, finalize, ordered) {}
|
||||
|
||||
HashAggregateResize resize;
|
||||
HashAggregateConsume consume;
|
||||
HashAggregateMerge merge;
|
||||
HashAggregateFinalize finalize;
|
||||
/// @brief whether the summarizer requires ordering
|
||||
/// This is similar to ScalarAggregateKernel. See ScalarAggregateKernel
|
||||
/// for detailed doc of this variable.
|
||||
bool ordered = false;
|
||||
};
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,120 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/compare.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace compute {
|
||||
|
||||
/// \brief Direction in which values are arranged when sorting.
enum class SortOrder {
  /// Arrange values in increasing order
  Ascending = 0,
  /// Arrange values in decreasing order
  Descending = 1,
};
|
||||
|
||||
/// \brief Where nulls and NaNs are placed relative to non-null values.
enum class NullPlacement {
  /// Place nulls and NaNs before any non-null values.
  /// NaNs will come after nulls.
  AtStart = 0,
  /// Place nulls and NaNs after any non-null values.
  /// NaNs will come before nulls.
  AtEnd = 1,
};
|
||||
|
||||
/// \brief One sort key for PartitionNthIndices (TODO) and SortIndices
|
||||
class ARROW_EXPORT SortKey : public util::EqualityComparable<SortKey> {
|
||||
public:
|
||||
explicit SortKey(FieldRef target, SortOrder order = SortOrder::Ascending)
|
||||
: target(std::move(target)), order(order) {}
|
||||
|
||||
bool Equals(const SortKey& other) const;
|
||||
std::string ToString() const;
|
||||
|
||||
/// A FieldRef targeting the sort column.
|
||||
FieldRef target;
|
||||
/// How to order by this sort key.
|
||||
SortOrder order;
|
||||
};
|
||||
|
||||
class ARROW_EXPORT Ordering : public util::EqualityComparable<Ordering> {
 public:
  /// \brief Build an ordering from sort keys and a null placement.
  ///
  /// Passing an empty vector yields an unordered ordering (see Unordered()).
  Ordering(std::vector<SortKey> sort_keys,
           NullPlacement null_placement = NullPlacement::AtStart)
      : sort_keys_(std::move(sort_keys)), null_placement_(null_placement) {}

  /// true if data ordered by other is also ordered by this
  ///
  /// For example, if data is ordered by [a, b, c] then it is also ordered
  /// by [a, b] but not by [b, c] or [a, b, c, d].
  ///
  /// [a, b].IsSuborderOf([a, b, c]) - true
  /// [a, b, c].IsSuborderOf([a, b, c]) - true
  /// [b, c].IsSuborderOf([a, b, c]) - false
  /// [a, b, c, d].IsSuborderOf([a, b, c]) - false
  ///
  /// The implicit ordering is not a suborder of any other ordering and
  /// no other ordering is a suborder of it. The implicit ordering is not a
  /// suborder of itself.
  ///
  /// The unordered ordering is a suborder of all other orderings but no
  /// other ordering is a suborder of it. The unordered ordering is a suborder
  /// of itself.
  ///
  /// The unordered ordering is a suborder of the implicit ordering.
  bool IsSuborderOf(const Ordering& other) const;

  bool Equals(const Ordering& other) const;
  std::string ToString() const;

  /// \brief Whether this is the implicit ordering.
  bool is_implicit() const { return is_implicit_; }

  /// \brief Whether this ordering is neither implicit nor has any sort key.
  bool is_unordered() const {
    if (is_implicit_) {
      return false;
    }
    return sort_keys_.empty();
  }

  const std::vector<SortKey>& sort_keys() const { return sort_keys_; }
  NullPlacement null_placement() const { return null_placement_; }

  /// \brief The shared instance representing the implicit ordering.
  static const Ordering& Implicit() {
    static const Ordering kImplicit(/*is_implicit=*/true);
    return kImplicit;
  }

  /// \brief The shared instance representing "no ordering".
  ///
  /// It is also possible to get an unordered ordering by passing in an empty
  /// vector using the normal constructor. This is ok and useful when ordering
  /// comes from user input.
  static const Ordering& Unordered() {
    static const Ordering kUnordered(/*is_implicit=*/false);
    return kUnordered;
  }

 private:
  // Used only by Implicit() and Unordered(); leaves the sort keys empty.
  explicit Ordering(bool is_implicit)
      : null_placement_(NullPlacement::AtStart), is_implicit_(is_implicit) {}

  /// Column key(s) to order by and how to order by these sort keys.
  std::vector<SortKey> sort_keys_;
  /// Whether nulls and NaNs are placed at the start or at the end
  NullPlacement null_placement_;
  bool is_implicit_ = false;
};
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,126 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// NOTE: API is EXPERIMENTAL and will change without going through a
|
||||
// deprecation cycle
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace compute {
|
||||
|
||||
class Function;
|
||||
class FunctionOptionsType;
|
||||
|
||||
/// \brief A mutable central function registry for built-in functions as well
|
||||
/// as user-defined functions. Functions are implementations of
|
||||
/// arrow::compute::Function.
|
||||
///
|
||||
/// Generally, each function contains kernels which are implementations of a
|
||||
/// function for a specific argument signature. After looking up a function in
|
||||
/// the registry, one can either execute it eagerly with Function::Execute or
|
||||
/// use one of the function's dispatch methods to pick a suitable kernel for
|
||||
/// lower-level function execution.
|
||||
///
/// Instances are obtained through the static Make() factories; the
/// constructors are private.
class ARROW_EXPORT FunctionRegistry {
|
||||
public:
|
||||
~FunctionRegistry();
|
||||
|
||||
/// \brief Construct a new registry.
|
||||
///
|
||||
/// Most users only need to use the global registry.
|
||||
static std::unique_ptr<FunctionRegistry> Make();
|
||||
|
||||
/// \brief Construct a new nested registry with the given parent.
|
||||
///
|
||||
/// Most users only need to use the global registry. The returned registry never changes
|
||||
/// its parent, even when an operation allows overwriting.
|
||||
///
/// NOTE(review): `parent` is a raw, non-owning pointer, so it presumably has to
/// outlive the returned registry -- confirm against the implementation.
static std::unique_ptr<FunctionRegistry> Make(FunctionRegistry* parent);
|
||||
|
||||
/// \brief Check whether a new function can be added to the registry.
|
||||
///
|
||||
/// \returns Status::KeyError if a function with the same name is already registered.
|
||||
Status CanAddFunction(std::shared_ptr<Function> function, bool allow_overwrite = false);
|
||||
|
||||
/// \brief Add a new function to the registry.
|
||||
///
|
||||
/// \returns Status::KeyError if a function with the same name is already registered.
|
||||
Status AddFunction(std::shared_ptr<Function> function, bool allow_overwrite = false);
|
||||
|
||||
/// \brief Check whether an alias can be added for the given function name.
|
||||
///
|
||||
/// \returns Status::KeyError if the function with the given name is not registered.
|
||||
Status CanAddAlias(const std::string& target_name, const std::string& source_name);
|
||||
|
||||
/// \brief Add an alias for the given function name.
|
||||
///
|
||||
/// \returns Status::KeyError if the function with the given name is not registered.
|
||||
Status AddAlias(const std::string& target_name, const std::string& source_name);
|
||||
|
||||
/// \brief Check whether a new function options type can be added to the registry.
|
||||
///
|
||||
/// \returns Status::KeyError if a function options type with the same name is already
|
||||
/// registered.
|
||||
Status CanAddFunctionOptionsType(const FunctionOptionsType* options_type,
|
||||
bool allow_overwrite = false);
|
||||
|
||||
/// \brief Add a new function options type to the registry.
|
||||
///
|
||||
/// \returns Status::KeyError if a function options type with the same name is already
|
||||
/// registered.
|
||||
Status AddFunctionOptionsType(const FunctionOptionsType* options_type,
|
||||
bool allow_overwrite = false);
|
||||
|
||||
/// \brief Retrieve a function by name from the registry.
|
||||
Result<std::shared_ptr<Function>> GetFunction(const std::string& name) const;
|
||||
|
||||
/// \brief Return a vector of all entry names in the registry.
|
||||
///
|
||||
/// Helpful for displaying a manifest of available functions.
|
||||
std::vector<std::string> GetFunctionNames() const;
|
||||
|
||||
/// \brief Retrieve a function options type by name from the registry.
|
||||
Result<const FunctionOptionsType*> GetFunctionOptionsType(
|
||||
const std::string& name) const;
|
||||
|
||||
/// \brief The number of currently registered functions.
|
||||
int num_functions() const;
|
||||
|
||||
/// \brief The cast function object registered in AddFunction.
|
||||
///
|
||||
/// Helpful for retrieving the cast function when it is needed.
|
||||
const Function* cast_function() const;
|
||||
|
||||
private:
|
||||
FunctionRegistry();
|
||||
|
||||
// Use PIMPL pattern to not have std::unordered_map here
|
||||
class FunctionRegistryImpl;
|
||||
std::unique_ptr<FunctionRegistryImpl> impl_;
|
||||
|
||||
// Builds a registry around an already-created implementation object.
explicit FunctionRegistry(FunctionRegistryImpl* impl);
|
||||
};
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,198 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/kernel.h"
|
||||
#include "arrow/compute/visibility.h"
|
||||
#include "arrow/datum.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace compute {
|
||||
|
||||
/// \brief A segment
|
||||
/// A segment group is a chunk of continuous rows that have the same segment key. (For
|
||||
/// example, in ordered time series processing, segment key can be "date", and a segment
|
||||
/// group can be all the rows that belong to the same date.) A segment group can span
|
||||
/// across multiple exec batches. A segment is a chunk of continuous rows that has the
|
||||
/// same segment key within a given batch. When a segment group spans across batches, it
|
||||
/// will have multiple segments. A segment never spans across batches. The segment data
|
||||
/// structure only makes sense when used along with an exec batch.
|
||||
struct ARROW_COMPUTE_EXPORT Segment {
|
||||
/// \brief the offset into the batch where the segment starts
|
||||
int64_t offset;
|
||||
/// \brief the length of the segment
|
||||
int64_t length;
|
||||
/// \brief whether the segment may be extended by a next one
|
||||
bool is_open;
|
||||
/// \brief whether the segment extends a preceding one
|
||||
bool extends;
|
||||
};
|
||||
|
||||
/// \brief Member-wise equality of two segments.
///
/// Two segments compare equal only when offset, length and both flags match.
inline bool operator==(const Segment& segment1, const Segment& segment2) {
  if (segment1.offset != segment2.offset) return false;
  if (segment1.length != segment2.length) return false;
  if (segment1.is_open != segment2.is_open) return false;
  return segment1.extends == segment2.extends;
}
|
||||
/// \brief Negation of operator== for segments.
inline bool operator!=(const Segment& segment1, const Segment& segment2) {
  const bool are_equal = (segment1 == segment2);
  return !are_equal;
}
|
||||
|
||||
/// \brief a helper class to divide a batch into segments of equal values
|
||||
///
|
||||
/// For example, given a batch with two columns specified as segment keys:
|
||||
///
|
||||
///   A A [other columns]...
|
||||
///   A A ...
|
||||
///   A B ...
|
||||
///   A B ...
|
||||
///   A A ...
|
||||
///
|
||||
/// Then the batch could be divided into 3 segments. The first would be rows 0 & 1,
|
||||
/// the second would be rows 2 & 3, and the third would be row 4.
|
||||
///
|
||||
/// Further, a segmenter keeps track of the last value seen. This allows it to calculate
|
||||
/// segment groups which span batches. In our above example the last segment we emit would
|
||||
/// set the "open" flag, which indicates whether the segment may extend into the next batch.
|
||||
///
|
||||
/// If the next call to the segmenter starts with `A A` then that segment would set the
|
||||
/// "extends" flag, which indicates whether the segment continues the last open segment.
|
||||
class ARROW_COMPUTE_EXPORT RowSegmenter {
|
||||
public:
|
||||
virtual ~RowSegmenter() = default;
|
||||
|
||||
/// \brief Construct a Segmenter which segments on the specified key types
|
||||
///
|
||||
/// \param[in] key_types the specified key types
|
||||
/// \param[in] nullable_keys whether values of the specified keys may be null
|
||||
/// \param[in] ctx the execution context to use
|
||||
static Result<std::unique_ptr<RowSegmenter>> Make(
|
||||
const std::vector<TypeHolder>& key_types, bool nullable_keys, ExecContext* ctx);
|
||||
|
||||
/// \brief Return the key types of this segmenter
|
||||
virtual const std::vector<TypeHolder>& key_types() const = 0;
|
||||
|
||||
/// \brief Reset this segmenter
|
||||
///
|
||||
/// A segmenter normally extends (see `Segment`) a segment from one batch to the next.
|
||||
/// If segment-extension is undesirable, for example when each batch is processed
|
||||
/// independently, then `Reset` should be invoked before processing the next batch.
|
||||
virtual Status Reset() = 0;
|
||||
|
||||
/// \brief Get all segments for the given batch
|
||||
virtual Result<std::vector<Segment>> GetSegments(const ExecSpan& batch) = 0;
|
||||
};
|
||||
|
||||
/// \brief Consumes batches of keys and yields batches of the group ids.
|
||||
class ARROW_COMPUTE_EXPORT Grouper {
|
||||
public:
|
||||
virtual ~Grouper() = default;
|
||||
|
||||
/// \brief Construct a Grouper which receives the specified key types
|
||||
static Result<std::unique_ptr<Grouper>> Make(const std::vector<TypeHolder>& key_types,
|
||||
ExecContext* ctx = default_exec_context());
|
||||
|
||||
/// \brief Reset all intermediate state, make the grouper logically as just `Make`ed.
|
||||
/// The underlying buffers, if any, may or may not be released though.
|
||||
virtual Status Reset() = 0;
|
||||
|
||||
/// \brief Consume a batch of keys, producing the corresponding group ids as an integer
|
||||
/// array, over a slice defined by an offset and length, which defaults to the batch length.
|
||||
/// Currently only uint32 indices will be produced, eventually the bit width will only
|
||||
/// be as wide as necessary.
|
||||
///
/// The declared default `length = -1` is the sentinel for "use the batch length".
virtual Result<Datum> Consume(const ExecSpan& batch, int64_t offset = 0,
|
||||
int64_t length = -1) = 0;
|
||||
|
||||
/// \brief Like Consume, but groups not already encountered emit null instead of
|
||||
/// generating a new group id.
|
||||
virtual Result<Datum> Lookup(const ExecSpan& batch, int64_t offset = 0,
|
||||
int64_t length = -1) = 0;
|
||||
|
||||
/// \brief Like Consume, but only populates the Grouper without returning the group ids.
|
||||
virtual Status Populate(const ExecSpan& batch, int64_t offset = 0,
|
||||
int64_t length = -1) = 0;
|
||||
|
||||
/// \brief Get current unique keys. May be called multiple times.
|
||||
virtual Result<ExecBatch> GetUniques() = 0;
|
||||
|
||||
/// \brief Get the current number of groups.
|
||||
virtual uint32_t num_groups() const = 0;
|
||||
|
||||
/// \brief Assemble lists of indices of identical elements.
|
||||
///
|
||||
/// \param[in] ids An unsigned, all-valid integral array which will be
|
||||
///                used as grouping criteria.
|
||||
/// \param[in] num_groups An upper bound for the elements of ids
|
||||
/// \param[in] ctx Execution context to use during the operation
|
||||
/// \return A num_groups-long ListArray where the slot at i contains a
|
||||
///         list of indices where i appears in ids.
|
||||
///
|
||||
///   MakeGroupings([
|
||||
///       2,
|
||||
///       2,
|
||||
///       5,
|
||||
///       5,
|
||||
///       2,
|
||||
///       3
|
||||
///   ], 8) == [
|
||||
///       [],
|
||||
///       [],
|
||||
///       [0, 1, 4],
|
||||
///       [5],
|
||||
///       [],
|
||||
///       [2, 3],
|
||||
///       [],
|
||||
///       []
|
||||
///   ]
|
||||
static Result<std::shared_ptr<ListArray>> MakeGroupings(
|
||||
const UInt32Array& ids, uint32_t num_groups,
|
||||
ExecContext* ctx = default_exec_context());
|
||||
|
||||
/// \brief Produce a ListArray whose slots are selections of `array` which correspond to
|
||||
/// the provided groupings.
|
||||
///
|
||||
/// For example,
|
||||
///   ApplyGroupings([
|
||||
///       [],
|
||||
///       [],
|
||||
///       [0, 1, 4],
|
||||
///       [5],
|
||||
///       [],
|
||||
///       [2, 3],
|
||||
///       [],
|
||||
///       []
|
||||
///   ], [2, 2, 5, 5, 2, 3]) == [
|
||||
///       [],
|
||||
///       [],
|
||||
///       [2, 2, 2],
|
||||
///       [3],
|
||||
///       [],
|
||||
///       [5, 5],
|
||||
///       [],
|
||||
///       []
|
||||
///   ]
|
||||
static Result<std::shared_ptr<ListArray>> ApplyGroupings(
|
||||
const ListArray& groupings, const Array& array,
|
||||
ExecContext* ctx = default_exec_context());
|
||||
};
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,59 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
struct Datum;
|
||||
struct TypeHolder;
|
||||
|
||||
namespace compute {
|
||||
|
||||
class Function;
|
||||
class ScalarAggregateFunction;
|
||||
class FunctionExecutor;
|
||||
class FunctionOptions;
|
||||
class FunctionRegistry;
|
||||
|
||||
/// \brief Return the process-global function registry.
|
||||
// Defined in registry.cc
|
||||
ARROW_EXPORT FunctionRegistry* GetFunctionRegistry();
|
||||
|
||||
class CastOptions;
|
||||
|
||||
struct ExecBatch;
|
||||
class ExecContext;
|
||||
struct ExecValue;
|
||||
class KernelContext;
|
||||
|
||||
struct Kernel;
|
||||
struct ScalarKernel;
|
||||
struct ScalarAggregateKernel;
|
||||
struct VectorKernel;
|
||||
|
||||
struct KernelState;
|
||||
|
||||
class Expression;
|
||||
|
||||
ARROW_EXPORT ExecContext* default_exec_context();
|
||||
ARROW_EXPORT ExecContext* threaded_exec_context();
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,221 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/expression.h"
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
#include "arrow/compute/visibility.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/util/cpu_info.h"
|
||||
#include "arrow/util/simd.h"
|
||||
|
||||
// Portable byte-swap and rotate-left helpers.
//
// BYTESWAP(x)  reverses the byte order of a 64-bit value.
// ROTL(x, n)   rotates a 32-bit unsigned value left by n bits.
// ROTL64(x, n) rotates a 64-bit unsigned value left by n bits.
//
// In the GCC/Clang branch the right-shift count is masked, so n == 0 does not
// shift by the full word width. Every macro argument is parenthesized,
// including under the unary minus, so that expression arguments such as
// `a + b` rotate by the intended amount.
#if defined(__clang__) || defined(__GNUC__)
#  define BYTESWAP(x) __builtin_bswap64(x)
#  define ROTL(x, n) (((x) << (n)) | ((x) >> ((-(n)) & 31)))
#  define ROTL64(x, n) (((x) << (n)) | ((x) >> ((-(n)) & 63)))
#elif defined(_MSC_VER)
#  include <intrin.h>
#  define BYTESWAP(x) _byteswap_uint64(x)
#  define ROTL(x, n) _rotl((x), (n))
#  define ROTL64(x, n) _rotl64((x), (n))
#endif
|
||||
|
||||
namespace arrow {
|
||||
namespace util {
|
||||
|
||||
// Some platforms typedef int64_t as long int instead of long long int,
|
||||
// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics
|
||||
// which need long long.
|
||||
// We use the cast to the type below in these intrinsics to make the code
|
||||
// compile in all cases.
|
||||
//
|
||||
using int64_for_gather_t = const long long int; // NOLINT runtime-int
|
||||
|
||||
// The MiniBatch... family of classes allocates its vectors from
// TempVectorStack and is limited to vectors of at most 1024 elements.
//
// Instances should live on the stack only, which guarantees that vectors are
// allocated from and returned to TempVectorStack in the right sequence.
//
class MiniBatch {
 public:
  // Base-2 logarithm of the mini-batch length.
  static constexpr int kLogMiniBatchLength{10};
  // Mini-batch length in elements (2^10 == 1024).
  static constexpr int kMiniBatchLength{1 << kLogMiniBatchLength};
};
|
||||
|
||||
namespace bit_util {
|
||||
|
||||
ARROW_COMPUTE_EXPORT void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
|
||||
const int num_bits, const uint8_t* bits,
|
||||
int* num_indexes, uint16_t* indexes,
|
||||
int bit_offset = 0);
|
||||
|
||||
ARROW_COMPUTE_EXPORT void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
|
||||
const int num_bits, const uint8_t* bits,
|
||||
const uint16_t* input_indexes,
|
||||
int* num_indexes, uint16_t* indexes,
|
||||
int bit_offset = 0);
|
||||
|
||||
// Input and output indexes may be pointing to the same data (in-place filtering).
|
||||
ARROW_COMPUTE_EXPORT void bits_split_indexes(int64_t hardware_flags, const int num_bits,
|
||||
const uint8_t* bits, int* num_indexes_bit0,
|
||||
uint16_t* indexes_bit0,
|
||||
uint16_t* indexes_bit1, int bit_offset = 0);
|
||||
|
||||
// Bit 1 is replaced with byte 0xFF.
|
||||
ARROW_COMPUTE_EXPORT void bits_to_bytes(int64_t hardware_flags, const int num_bits,
|
||||
const uint8_t* bits, uint8_t* bytes,
|
||||
int bit_offset = 0);
|
||||
|
||||
// Return highest bit of each byte.
|
||||
ARROW_COMPUTE_EXPORT void bytes_to_bits(int64_t hardware_flags, const int num_bits,
|
||||
const uint8_t* bytes, uint8_t* bits,
|
||||
int bit_offset = 0);
|
||||
|
||||
ARROW_COMPUTE_EXPORT bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
|
||||
uint32_t num_bytes);
|
||||
|
||||
#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
|
||||
// The functions below use BMI2 instructions, be careful before calling!
|
||||
|
||||
namespace avx2 {
|
||||
ARROW_COMPUTE_EXPORT void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
|
||||
const uint8_t* bits,
|
||||
const uint16_t* input_indexes,
|
||||
int* num_indexes, uint16_t* indexes);
|
||||
ARROW_COMPUTE_EXPORT void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
|
||||
const uint8_t* bits, int* num_indexes,
|
||||
uint16_t* indexes,
|
||||
uint16_t base_index = 0);
|
||||
ARROW_COMPUTE_EXPORT void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits,
|
||||
uint8_t* bytes);
|
||||
ARROW_COMPUTE_EXPORT void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes,
|
||||
uint8_t* bits);
|
||||
ARROW_COMPUTE_EXPORT bool are_all_bytes_zero_avx2(const uint8_t* bytes,
|
||||
uint32_t num_bytes);
|
||||
} // namespace avx2
|
||||
|
||||
#endif
|
||||
|
||||
} // namespace bit_util
|
||||
} // namespace util
|
||||
|
||||
namespace compute {
|
||||
|
||||
/// Modify an Expression with pre-order and post-order visitation.
|
||||
/// `pre` will be invoked on each Expression. `pre` will visit Calls before their
|
||||
/// arguments, `post_call` will visit Calls (and no other Expressions) after their
|
||||
/// arguments. Visitors should return the Identical expression to indicate no change; this
|
||||
/// will prevent unnecessary construction in the common case where a modification is not
|
||||
/// possible/necessary/...
|
||||
///
|
||||
/// If an argument was modified, `post_call` visits a reconstructed Call with the modified
|
||||
/// arguments but also receives a pointer to the unmodified Expression as a second
|
||||
/// argument. If no arguments were modified the unmodified Expression* will be nullptr.
|
||||
template <typename PreVisit, typename PostVisitCall>
|
||||
Result<Expression> ModifyExpression(Expression expr, const PreVisit& pre,
|
||||
const PostVisitCall& post_call) {
|
||||
ARROW_ASSIGN_OR_RAISE(expr, Result<Expression>(pre(std::move(expr))));
|
||||
|
||||
auto call = expr.call();
|
||||
if (!call) return expr;
|
||||
|
||||
bool at_least_one_modified = false;
|
||||
std::vector<Expression> modified_arguments;
|
||||
|
||||
for (size_t i = 0; i < call->arguments.size(); ++i) {
|
||||
ARROW_ASSIGN_OR_RAISE(auto modified_argument,
|
||||
ModifyExpression(call->arguments[i], pre, post_call));
|
||||
|
||||
if (Expression::Identical(modified_argument, call->arguments[i])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!at_least_one_modified) {
|
||||
modified_arguments = call->arguments;
|
||||
at_least_one_modified = true;
|
||||
}
|
||||
|
||||
modified_arguments[i] = std::move(modified_argument);
|
||||
}
|
||||
|
||||
if (at_least_one_modified) {
|
||||
// reconstruct the call expression with the modified arguments
|
||||
auto modified_call = *call;
|
||||
modified_call.arguments = std::move(modified_arguments);
|
||||
return post_call(Expression(std::move(modified_call)), &expr);
|
||||
}
|
||||
|
||||
return post_call(std::move(expr), NULLPTR);
|
||||
}
|
||||
|
||||
// Helper class to calculate the modified number of rows to process using SIMD.
|
||||
//
|
||||
// Some array elements at the end will be skipped in order to avoid buffer
|
||||
// overrun, when doing memory loads and stores using larger word size than a
|
||||
// single array element.
|
||||
//
|
||||
class TailSkipForSIMD {
|
||||
public:
|
||||
static int64_t FixBitAccess(int num_bytes_accessed_together, int64_t num_rows,
|
||||
int bit_offset) {
|
||||
int64_t num_bytes = bit_util::BytesForBits(num_rows + bit_offset);
|
||||
int64_t num_bytes_safe =
|
||||
std::max(static_cast<int64_t>(0LL), num_bytes - num_bytes_accessed_together + 1);
|
||||
int64_t num_rows_safe =
|
||||
std::max(static_cast<int64_t>(0LL), 8 * num_bytes_safe - bit_offset);
|
||||
return std::min(num_rows_safe, num_rows);
|
||||
}
|
||||
static int64_t FixBinaryAccess(int num_bytes_accessed_together, int64_t num_rows,
|
||||
int64_t length) {
|
||||
int64_t num_rows_to_skip = bit_util::CeilDiv(length, num_bytes_accessed_together);
|
||||
int64_t num_rows_safe =
|
||||
std::max(static_cast<int64_t>(0LL), num_rows - num_rows_to_skip);
|
||||
return num_rows_safe;
|
||||
}
|
||||
// Number of leading rows of a variable-length binary column that can be
// processed with `num_bytes_accessed_together`-byte loads/stores without
// touching memory past the end of the data buffer.
//
// `offsets` is indexed from 0 through `num_rows` inclusive; `offsets[num_rows]`
// is used as the end of the data. Trailing rows are dropped for as long as
// `offsets[num_rows_safe]` plus one wide access would end past that point.
//
// The comparison is carried out in 64 bits: adding the access width to a
// uint32_t offset close to UINT32_MAX would otherwise wrap around and make an
// unsafe row look safe.
static int64_t FixVarBinaryAccess(int num_bytes_accessed_together, int64_t num_rows,
                                  const uint32_t* offsets) {
  const int64_t access_width = num_bytes_accessed_together;
  int64_t num_rows_safe = num_rows;
  // offsets[] is only read while num_rows_safe > 0, so a call with
  // num_rows == 0 never dereferences the pointer.
  while (num_rows_safe > 0 &&
         static_cast<int64_t>(offsets[num_rows_safe]) + access_width >
             static_cast<int64_t>(offsets[num_rows])) {
    --num_rows_safe;
  }
  return num_rows_safe;
}
|
||||
// Trim a selection vector so that it only refers to rows below
// `num_rows_safe`. Entries are dropped from the back while the last remaining
// one is >= `num_rows_safe`; the count of entries kept is returned.
static int FixSelection(int64_t num_rows_safe, int num_selected,
                        const uint16_t* selection) {
  int kept = num_selected;
  for (; kept > 0; --kept) {
    if (selection[kept - 1] < num_rows_safe) {
      break;
    }
  }
  return kept;
}
|
||||
};
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,49 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#if defined(_WIN32) || defined(__CYGWIN__)
|
||||
# if defined(_MSC_VER)
|
||||
# pragma warning(push)
|
||||
# pragma warning(disable : 4251)
|
||||
# else
|
||||
# pragma GCC diagnostic ignored "-Wattributes"
|
||||
# endif
|
||||
|
||||
# ifdef ARROW_COMPUTE_STATIC
|
||||
# define ARROW_COMPUTE_EXPORT
|
||||
# elif defined(ARROW_COMPUTE_EXPORTING)
|
||||
# define ARROW_COMPUTE_EXPORT __declspec(dllexport)
|
||||
# else
|
||||
# define ARROW_COMPUTE_EXPORT __declspec(dllimport)
|
||||
# endif
|
||||
|
||||
# define ARROW_COMPUTE_NO_EXPORT
|
||||
|
||||
# if defined(_MSC_VER)
|
||||
# pragma warning(pop)
|
||||
# endif
|
||||
|
||||
#else // Not Windows
|
||||
# ifndef ARROW_COMPUTE_EXPORT
|
||||
# define ARROW_COMPUTE_EXPORT __attribute__((visibility("default")))
|
||||
# endif
|
||||
# ifndef ARROW_COMPUTE_NO_EXPORT
|
||||
# define ARROW_COMPUTE_NO_EXPORT __attribute__((visibility("hidden")))
|
||||
# endif
|
||||
#endif
|
||||
@@ -0,0 +1,98 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/config.h" // IWYU pragma: export
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \brief Information about the build of the Arrow library.
///
/// The integer members carry default member initializers so that a
/// default-constructed BuildInfo never holds indeterminate values. The struct
/// remains an aggregate, so brace initialization keeps working.
struct BuildInfo {
  /// The packed version number, e.g. 1002003 (decimal) for Arrow 1.2.3
  int version = 0;
  /// The "major" version number, e.g. 1 for Arrow 1.2.3
  int version_major = 0;
  /// The "minor" version number, e.g. 2 for Arrow 1.2.3
  int version_minor = 0;
  /// The "patch" version number, e.g. 3 for Arrow 1.2.3
  int version_patch = 0;
  /// The version string, e.g. "1.2.3"
  std::string version_string;
  /// NOTE(review): presumably the shared-library (SO) version -- confirm
  std::string so_version;
  /// NOTE(review): presumably the full shared-library (SO) version -- confirm
  std::string full_so_version;

  /// The CMake compiler identifier, e.g. "GNU"
  std::string compiler_id;
  /// NOTE(review): presumably the version of that compiler -- confirm
  std::string compiler_version;
  /// NOTE(review): presumably the flags the library was compiled with -- confirm
  std::string compiler_flags;

  /// The git changeset id, if available
  std::string git_id;
  /// The git changeset description, if available
  std::string git_description;
  /// NOTE(review): meaning not visible from this header -- confirm
  std::string package_kind;

  /// The uppercase build type, e.g. "DEBUG" or "RELEASE"
  std::string build_type;
};
|
||||
|
||||
/// \brief Information about the runtime environment of the Arrow library.
///
/// `using_os_timezone_db` carries a default member initializer so that a
/// default-constructed RuntimeInfo never holds an indeterminate bool. The
/// struct remains an aggregate, so brace initialization keeps working.
struct RuntimeInfo {
  /// The enabled SIMD level
  ///
  /// This can be less than `detected_simd_level` if the ARROW_USER_SIMD_LEVEL
  /// environment variable is set to another value.
  std::string simd_level;

  /// The SIMD level available on the OS and CPU
  std::string detected_simd_level;

  /// Whether using the OS-based timezone database
  /// This is set at compile-time.
  bool using_os_timezone_db = false;

  /// The path to the timezone database; by default None.
  std::optional<std::string> timezone_db_path;
};
|
||||
|
||||
/// \brief Get runtime build info.
|
||||
///
|
||||
/// The returned values correspond to exact loaded version of the Arrow library,
|
||||
/// rather than the values frozen at application compile-time through the `ARROW_*`
|
||||
/// preprocessor definitions.
|
||||
ARROW_EXPORT
|
||||
const BuildInfo& GetBuildInfo();
|
||||
|
||||
/// \brief Get runtime info.
|
||||
///
|
||||
ARROW_EXPORT
|
||||
RuntimeInfo GetRuntimeInfo();
|
||||
|
||||
/// \brief Library-wide options, passed to Initialize().
struct GlobalOptions {
|
||||
/// Path to text timezone database. This is only configurable on Windows,
|
||||
/// which does not have a compatible OS timezone database.
|
||||
/// Left empty (std::nullopt) when no path is supplied.
std::optional<std::string> timezone_db_path;
|
||||
};
|
||||
|
||||
ARROW_EXPORT
|
||||
Status Initialize(const GlobalOptions& options) noexcept;
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,22 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/csv/options.h"
|
||||
#include "arrow/csv/reader.h"
|
||||
#include "arrow/csv/writer.h"
|
||||
@@ -0,0 +1,36 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/csv/options.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/delimiting.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace csv {
|
||||
|
||||
ARROW_EXPORT
|
||||
std::unique_ptr<Chunker> MakeChunker(const ParseOptions& options);
|
||||
|
||||
} // namespace csv
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,78 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace csv {
|
||||
|
||||
class BlockParser;
|
||||
struct ConvertOptions;
|
||||
|
||||
class ARROW_EXPORT ColumnBuilder {
|
||||
public:
|
||||
virtual ~ColumnBuilder() = default;
|
||||
|
||||
/// Spawn a task that will try to convert and append the given CSV block.
|
||||
/// All calls to Append() should happen on the same thread, otherwise
|
||||
/// call Insert() instead.
|
||||
virtual void Append(const std::shared_ptr<BlockParser>& parser) = 0;
|
||||
|
||||
/// Spawn a task that will try to convert and insert the given CSV block
|
||||
virtual void Insert(int64_t block_index,
|
||||
const std::shared_ptr<BlockParser>& parser) = 0;
|
||||
|
||||
/// Return the final chunked array. The TaskGroup _must_ have finished!
|
||||
virtual Result<std::shared_ptr<ChunkedArray>> Finish() = 0;
|
||||
|
||||
std::shared_ptr<arrow::internal::TaskGroup> task_group() { return task_group_; }
|
||||
|
||||
/// Construct a strictly-typed ColumnBuilder.
|
||||
static Result<std::shared_ptr<ColumnBuilder>> Make(
|
||||
MemoryPool* pool, const std::shared_ptr<DataType>& type, int32_t col_index,
|
||||
const ConvertOptions& options,
|
||||
const std::shared_ptr<arrow::internal::TaskGroup>& task_group);
|
||||
|
||||
/// Construct a type-inferring ColumnBuilder.
|
||||
static Result<std::shared_ptr<ColumnBuilder>> Make(
|
||||
MemoryPool* pool, int32_t col_index, const ConvertOptions& options,
|
||||
const std::shared_ptr<arrow::internal::TaskGroup>& task_group);
|
||||
|
||||
/// Construct a ColumnBuilder for a column of nulls
|
||||
/// (i.e. not present in the CSV file).
|
||||
static Result<std::shared_ptr<ColumnBuilder>> MakeNull(
|
||||
MemoryPool* pool, const std::shared_ptr<DataType>& type,
|
||||
const std::shared_ptr<arrow::internal::TaskGroup>& task_group);
|
||||
|
||||
protected:
|
||||
explicit ColumnBuilder(std::shared_ptr<arrow::internal::TaskGroup> task_group)
|
||||
: task_group_(std::move(task_group)) {}
|
||||
|
||||
std::shared_ptr<arrow::internal::TaskGroup> task_group_;
|
||||
};
|
||||
|
||||
} // namespace csv
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,64 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace csv {
|
||||
|
||||
class BlockParser;
|
||||
struct ConvertOptions;
|
||||
|
||||
class ARROW_EXPORT ColumnDecoder {
|
||||
public:
|
||||
virtual ~ColumnDecoder() = default;
|
||||
|
||||
/// Spawn a task that will try to convert and insert the given CSV block
|
||||
virtual Future<std::shared_ptr<Array>> Decode(
|
||||
const std::shared_ptr<BlockParser>& parser) = 0;
|
||||
|
||||
/// Construct a strictly-typed ColumnDecoder.
|
||||
static Result<std::shared_ptr<ColumnDecoder>> Make(MemoryPool* pool,
|
||||
std::shared_ptr<DataType> type,
|
||||
int32_t col_index,
|
||||
const ConvertOptions& options);
|
||||
|
||||
/// Construct a type-inferring ColumnDecoder.
|
||||
/// Inference will run only on the first block, the type will be frozen afterwards.
|
||||
static Result<std::shared_ptr<ColumnDecoder>> Make(MemoryPool* pool, int32_t col_index,
|
||||
const ConvertOptions& options);
|
||||
|
||||
/// Construct a ColumnDecoder for a column of nulls
|
||||
/// (i.e. not present in the CSV file).
|
||||
static Result<std::shared_ptr<ColumnDecoder>> MakeNull(MemoryPool* pool,
|
||||
std::shared_ptr<DataType> type);
|
||||
|
||||
protected:
|
||||
ColumnDecoder() = default;
|
||||
};
|
||||
|
||||
} // namespace csv
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,82 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/csv/options.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace csv {
|
||||
|
||||
class BlockParser;
|
||||
|
||||
class ARROW_EXPORT Converter {
|
||||
public:
|
||||
Converter(const std::shared_ptr<DataType>& type, const ConvertOptions& options,
|
||||
MemoryPool* pool);
|
||||
virtual ~Converter() = default;
|
||||
|
||||
virtual Result<std::shared_ptr<Array>> Convert(const BlockParser& parser,
|
||||
int32_t col_index) = 0;
|
||||
|
||||
std::shared_ptr<DataType> type() const { return type_; }
|
||||
|
||||
// Create a Converter for the given data type
|
||||
static Result<std::shared_ptr<Converter>> Make(
|
||||
const std::shared_ptr<DataType>& type, const ConvertOptions& options,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
protected:
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(Converter);
|
||||
|
||||
virtual Status Initialize() = 0;
|
||||
|
||||
// CAUTION: ConvertOptions can grow large (if it customizes hundreds or
|
||||
// thousands of columns), so avoid copying it in each Converter.
|
||||
const ConvertOptions& options_;
|
||||
MemoryPool* pool_;
|
||||
std::shared_ptr<DataType> type_;
|
||||
};
|
||||
|
||||
/// \brief Converter variant parameterized by a dictionary value type.
class ARROW_EXPORT DictionaryConverter : public Converter {
 public:
  DictionaryConverter(const std::shared_ptr<DataType>& value_type,
                      const ConvertOptions& options, MemoryPool* pool);

  /// \brief Set the maximum allowed dictionary length
  ///
  /// If the dictionary length goes above this value, conversion will fail
  /// with Status::IndexError.
  virtual void SetMaxCardinality(int32_t max_length) = 0;

  /// \brief Create a Converter for the given dictionary value type
  ///
  /// The dictionary index type will always be Int32.
  static Result<std::shared_ptr<DictionaryConverter>> Make(
      const std::shared_ptr<DataType>& value_type, const ConvertOptions& options,
      MemoryPool* pool = default_memory_pool());

 protected:
  // Type of the dictionary values (as opposed to the dictionary indices).
  std::shared_ptr<DataType> value_type_;
};
|
||||
|
||||
} // namespace csv
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,55 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
#include <functional>
#include <string_view>
|
||||
|
||||
namespace arrow {
|
||||
namespace csv {
|
||||
|
||||
/// \brief Description of an invalid row
///
/// Every field has a default member initializer so that a default-constructed
/// InvalidRow never carries indeterminate values.  The struct remains an
/// aggregate, so brace-initialization with explicit values keeps working.
struct InvalidRow {
  /// \brief Number of columns expected in the row
  int32_t expected_columns = 0;
  /// \brief Actual number of columns found in the row
  int32_t actual_columns = 0;
  /// \brief The physical row number if known or -1
  ///
  /// This number is one-based and also accounts for non-data rows (such as
  /// CSV header rows).  Defaults to -1, i.e. "unknown".
  int64_t number = -1;
  /// \brief View of the entire row. Memory will be freed after callback returns
  const std::string_view text = {};
};
|
||||
|
||||
/// \brief Result returned by an InvalidRowHandler
///
/// Tells the parser what to do with the row that was reported as invalid.
enum class InvalidRowResult {
  /// Generate an error describing this row
  Error,
  /// Skip over this row
  Skip,
};
|
||||
|
||||
/// \brief callback for handling a row with an invalid number of columns while parsing
|
||||
/// \return result indicating if an error should be returned from the parser or the row is
|
||||
/// skipped
|
||||
using InvalidRowHandler = std::function<InvalidRowResult(const InvalidRow&)>;
|
||||
|
||||
} // namespace csv
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,226 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/csv/invalid_row.h"
|
||||
#include "arrow/csv/type_fwd.h"
|
||||
#include "arrow/io/interfaces.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class DataType;
|
||||
class TimestampParser;
|
||||
|
||||
namespace csv {
|
||||
|
||||
// Silly workaround for https://github.com/michaeljones/breathe/issues/453
//
// Declared `inline` so that every translation unit including this header
// shares one definition instead of getting its own internal-linkage copy.
inline constexpr char kDefaultEscapeChar = '\\';
|
||||
|
||||
struct ARROW_EXPORT ParseOptions {
|
||||
// Parsing options
|
||||
|
||||
/// Field delimiter
|
||||
char delimiter = ',';
|
||||
/// Whether quoting is used
|
||||
bool quoting = true;
|
||||
/// Quoting character (if `quoting` is true)
|
||||
char quote_char = '"';
|
||||
/// Whether a quote inside a value is double-quoted
|
||||
bool double_quote = true;
|
||||
/// Whether escaping is used
|
||||
bool escaping = false;
|
||||
/// Escaping character (if `escaping` is true)
|
||||
char escape_char = kDefaultEscapeChar;
|
||||
/// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters
|
||||
bool newlines_in_values = false;
|
||||
/// Whether empty lines are ignored. If false, an empty line represents
|
||||
/// a single empty value (assuming a one-column CSV file).
|
||||
bool ignore_empty_lines = true;
|
||||
/// A handler function for rows which do not have the correct number of columns
|
||||
InvalidRowHandler invalid_row_handler;
|
||||
|
||||
/// Create parsing options with default values
|
||||
static ParseOptions Defaults();
|
||||
|
||||
/// \brief Test that all set options are valid
|
||||
Status Validate() const;
|
||||
};
|
||||
|
||||
struct ARROW_EXPORT ConvertOptions {
|
||||
// Conversion options
|
||||
|
||||
/// Whether to check UTF8 validity of string columns
|
||||
bool check_utf8 = true;
|
||||
/// Optional per-column types (disabling type inference on those columns)
|
||||
std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
|
||||
/// Recognized spellings for null values
|
||||
std::vector<std::string> null_values;
|
||||
/// Recognized spellings for boolean true values
|
||||
std::vector<std::string> true_values;
|
||||
/// Recognized spellings for boolean false values
|
||||
std::vector<std::string> false_values;
|
||||
|
||||
/// Whether string / binary columns can have null values.
|
||||
///
|
||||
/// If true, then strings in "null_values" are considered null for string columns.
|
||||
/// If false, then all strings are valid string values.
|
||||
bool strings_can_be_null = false;
|
||||
|
||||
/// Whether quoted values can be null.
|
||||
///
|
||||
/// If true, then strings in "null_values" are also considered null when they
|
||||
/// appear quoted in the CSV file. Otherwise, quoted values are never considered null.
|
||||
bool quoted_strings_can_be_null = true;
|
||||
|
||||
/// Whether to try to automatically dict-encode string / binary data.
|
||||
/// If true, then when type inference detects a string or binary column,
|
||||
/// it is dict-encoded up to `auto_dict_max_cardinality` distinct values
|
||||
/// (per chunk), after which it switches to regular encoding.
|
||||
///
|
||||
/// This setting is ignored for non-inferred columns (those in `column_types`).
|
||||
bool auto_dict_encode = false;
|
||||
int32_t auto_dict_max_cardinality = 50;
|
||||
|
||||
/// Decimal point character for floating-point and decimal data
|
||||
char decimal_point = '.';
|
||||
|
||||
// XXX Should we have a separate FilterOptions?
|
||||
|
||||
/// If non-empty, indicates the names of columns from the CSV file that should
|
||||
/// be actually read and converted (in the vector's order).
|
||||
/// Columns not in this vector will be ignored.
|
||||
std::vector<std::string> include_columns;
|
||||
/// If false, columns in `include_columns` but not in the CSV file will error out.
|
||||
/// If true, columns in `include_columns` but not in the CSV file will produce
|
||||
/// a column of nulls (whose type is selected using `column_types`,
|
||||
/// or null by default)
|
||||
/// This option is ignored if `include_columns` is empty.
|
||||
bool include_missing_columns = false;
|
||||
|
||||
/// User-defined timestamp parsers, using the virtual parser interface in
|
||||
/// arrow/util/value_parsing.h. More than one parser can be specified, and
|
||||
/// the CSV conversion logic will try parsing values starting from the
|
||||
/// beginning of this vector. If no parsers are specified, we use the default
|
||||
/// built-in ISO-8601 parser.
|
||||
std::vector<std::shared_ptr<TimestampParser>> timestamp_parsers;
|
||||
|
||||
/// Create conversion options with default values, including conventional
|
||||
/// values for `null_values`, `true_values` and `false_values`
|
||||
static ConvertOptions Defaults();
|
||||
|
||||
/// \brief Test that all set options are valid
|
||||
Status Validate() const;
|
||||
};
|
||||
|
||||
struct ARROW_EXPORT ReadOptions {
|
||||
// Reader options
|
||||
|
||||
/// Whether to use the global CPU thread pool
|
||||
bool use_threads = true;
|
||||
|
||||
/// \brief Block size we request from the IO layer.
|
||||
///
|
||||
/// This will determine multi-threading granularity as well as
|
||||
/// the size of individual record batches.
|
||||
/// Minimum valid value for block size is 1
|
||||
int32_t block_size = 1 << 20; // 1 MB
|
||||
|
||||
/// Number of header rows to skip (not including the row of column names, if any)
|
||||
int32_t skip_rows = 0;
|
||||
|
||||
/// Number of rows to skip after the column names are read, if any
|
||||
int32_t skip_rows_after_names = 0;
|
||||
|
||||
/// Column names for the target table.
|
||||
/// If empty, fall back on autogenerate_column_names.
|
||||
std::vector<std::string> column_names;
|
||||
|
||||
/// Whether to autogenerate column names if `column_names` is empty.
|
||||
/// If true, column names will be of the form "f0", "f1"...
|
||||
/// If false, column names will be read from the first CSV row after `skip_rows`.
|
||||
bool autogenerate_column_names = false;
|
||||
|
||||
/// Create read options with default values
|
||||
static ReadOptions Defaults();
|
||||
|
||||
/// \brief Test that all set options are valid
|
||||
Status Validate() const;
|
||||
};
|
||||
|
||||
/// \brief Quoting style for CSV writing
|
||||
enum class ARROW_EXPORT QuotingStyle {
|
||||
/// Only enclose values in quotes which need them, because their CSV rendering can
|
||||
/// contain quotes itself (e.g. strings or binary values)
|
||||
Needed,
|
||||
/// Enclose all valid values in quotes. Nulls are not quoted. May cause readers to
|
||||
/// interpret all values as strings if schema is inferred.
|
||||
AllValid,
|
||||
/// Do not enclose any values in quotes. Prevents values from containing quotes ("),
|
||||
/// cell delimiters (,) or line endings (\\r, \\n), (following RFC4180). If values
|
||||
/// contain these characters, an error is caused when attempting to write.
|
||||
None
|
||||
};
|
||||
|
||||
struct ARROW_EXPORT WriteOptions {
|
||||
/// Whether to write an initial header line with column names
|
||||
bool include_header = true;
|
||||
|
||||
/// \brief Maximum number of rows processed at a time
|
||||
///
|
||||
/// The CSV writer converts and writes data in batches of N rows.
|
||||
/// This number can impact performance.
|
||||
int32_t batch_size = 1024;
|
||||
|
||||
/// Field delimiter
|
||||
char delimiter = ',';
|
||||
|
||||
/// \brief The string to write for null values. Quotes are not allowed in this string.
|
||||
std::string null_string;
|
||||
|
||||
/// \brief IO context for writing.
|
||||
io::IOContext io_context;
|
||||
|
||||
/// \brief The end of line character to use for ending rows
|
||||
std::string eol = "\n";
|
||||
|
||||
/// \brief Quoting style
|
||||
QuotingStyle quoting_style = QuotingStyle::Needed;
|
||||
|
||||
/// \brief Quoting style of header
|
||||
///
|
||||
/// Note that `QuotingStyle::Needed` and `QuotingStyle::AllValid` have the same
|
||||
/// effect of quoting all column names.
|
||||
QuotingStyle quoting_header = QuotingStyle::Needed;
|
||||
|
||||
/// Create write options with default values
|
||||
static WriteOptions Defaults();
|
||||
|
||||
/// \brief Test that all set options are valid
|
||||
Status Validate() const;
|
||||
};
|
||||
|
||||
} // namespace csv
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,228 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/csv/options.h"
|
||||
#include "arrow/csv/type_fwd.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class MemoryPool;
|
||||
|
||||
namespace csv {
|
||||
|
||||
/// Skip at most num_rows from the given input. The input pointer is updated
|
||||
/// and the number of actually skipped rows is returns (may be less than
|
||||
/// requested if the input is too short).
|
||||
ARROW_EXPORT
|
||||
int32_t SkipRows(const uint8_t* data, uint32_t size, int32_t num_rows,
|
||||
const uint8_t** out_data);
|
||||
|
||||
class BlockParserImpl;
|
||||
|
||||
namespace detail {
|
||||
|
||||
/// \brief Compact descriptor of one parsed value boundary
///
/// DataBatch reads consecutive descriptors as [start, stop) offsets into the
/// parsed data, and takes the quoted flag from the `stop` descriptor.
struct ParsedValueDesc {
  // 31-bit offset into the parsed data
  uint32_t offset : 31;
  // 1-bit flag passed to visitors as `quoted`
  bool quoted : 1;
};
|
||||
|
||||
class ARROW_EXPORT DataBatch {
|
||||
public:
|
||||
explicit DataBatch(int32_t num_cols) : num_cols_(num_cols) {}
|
||||
|
||||
/// \brief Return the number of parsed rows (not skipped)
|
||||
int32_t num_rows() const { return num_rows_; }
|
||||
/// \brief Return the number of parsed columns
|
||||
int32_t num_cols() const { return num_cols_; }
|
||||
/// \brief Return the total size in bytes of parsed data
|
||||
uint32_t num_bytes() const { return parsed_size_; }
|
||||
/// \brief Return the number of skipped rows
|
||||
int32_t num_skipped_rows() const { return static_cast<int32_t>(skipped_rows_.size()); }
|
||||
|
||||
template <typename Visitor>
|
||||
Status VisitColumn(int32_t col_index, int64_t first_row, Visitor&& visit) const {
|
||||
using detail::ParsedValueDesc;
|
||||
|
||||
int32_t batch_row = 0;
|
||||
for (size_t buf_index = 0; buf_index < values_buffers_.size(); ++buf_index) {
|
||||
const auto& values_buffer = values_buffers_[buf_index];
|
||||
const auto values = reinterpret_cast<const ParsedValueDesc*>(values_buffer->data());
|
||||
const auto max_pos =
|
||||
static_cast<int32_t>(values_buffer->size() / sizeof(ParsedValueDesc)) - 1;
|
||||
for (int32_t pos = col_index; pos < max_pos; pos += num_cols_, ++batch_row) {
|
||||
auto start = values[pos].offset;
|
||||
auto stop = values[pos + 1].offset;
|
||||
auto quoted = values[pos + 1].quoted;
|
||||
Status status = visit(parsed_ + start, stop - start, quoted);
|
||||
if (ARROW_PREDICT_FALSE(!status.ok())) {
|
||||
return DecorateWithRowNumber(std::move(status), first_row, batch_row);
|
||||
}
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename Visitor>
|
||||
Status VisitLastRow(Visitor&& visit) const {
|
||||
using detail::ParsedValueDesc;
|
||||
|
||||
const auto& values_buffer = values_buffers_.back();
|
||||
const auto values = reinterpret_cast<const ParsedValueDesc*>(values_buffer->data());
|
||||
const auto start_pos =
|
||||
static_cast<int32_t>(values_buffer->size() / sizeof(ParsedValueDesc)) -
|
||||
num_cols_ - 1;
|
||||
for (int32_t col_index = 0; col_index < num_cols_; ++col_index) {
|
||||
auto start = values[start_pos + col_index].offset;
|
||||
auto stop = values[start_pos + col_index + 1].offset;
|
||||
auto quoted = values[start_pos + col_index + 1].quoted;
|
||||
ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
protected:
|
||||
Status DecorateWithRowNumber(Status&& status, int64_t first_row,
|
||||
int32_t batch_row) const {
|
||||
if (first_row >= 0) {
|
||||
// `skipped_rows_` is in ascending order by construction, so use bisection
|
||||
// to find out how many rows were skipped before `batch_row`.
|
||||
const auto skips_before =
|
||||
std::upper_bound(skipped_rows_.begin(), skipped_rows_.end(), batch_row) -
|
||||
skipped_rows_.begin();
|
||||
status = status.WithMessage("Row #", batch_row + skips_before + first_row, ": ",
|
||||
status.message());
|
||||
}
|
||||
// Use return_if so that when extra context is enabled it will be added
|
||||
ARROW_RETURN_IF_(true, std::move(status), ARROW_STRINGIFY(status));
|
||||
return std::move(status);
|
||||
}
|
||||
|
||||
// The number of rows in this batch (not including any skipped ones)
|
||||
int32_t num_rows_ = 0;
|
||||
// The number of columns
|
||||
int32_t num_cols_ = 0;
|
||||
|
||||
// XXX should we ensure the parsed buffer is padded with 8 or 16 excess zero bytes?
|
||||
// It may help with null parsing...
|
||||
std::vector<std::shared_ptr<Buffer>> values_buffers_;
|
||||
std::shared_ptr<Buffer> parsed_buffer_;
|
||||
const uint8_t* parsed_ = NULLPTR;
|
||||
int32_t parsed_size_ = 0;
|
||||
|
||||
// Record the current num_rows_ each time a row is skipped
|
||||
std::vector<int32_t> skipped_rows_;
|
||||
|
||||
friend class ::arrow::csv::BlockParserImpl;
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Default value of BlockParser's `max_num_rows` constructor argument.
// Declared `inline` so that every translation unit including this header
// shares one definition instead of getting its own internal-linkage copy.
inline constexpr int32_t kMaxParserNumRows = 100000;
|
||||
|
||||
/// \class BlockParser
|
||||
/// \brief A reusable block-based parser for CSV data
|
||||
///
|
||||
/// The parser takes a block of CSV data and delimits rows and fields,
|
||||
/// unquoting and unescaping them on the fly. Parsed data is own by the
|
||||
/// parser, so the original buffer can be discarded after Parse() returns.
|
||||
///
|
||||
/// If the block is truncated (i.e. not all data can be parsed), it is up
|
||||
/// to the caller to arrange the next block to start with the trailing data.
|
||||
/// Also, if the previous block ends with CR (0x0d) and a new block starts
|
||||
/// with LF (0x0a), the parser will consider the leading newline as an empty
|
||||
/// line; the caller should therefore strip it.
|
||||
class ARROW_EXPORT BlockParser {
|
||||
public:
|
||||
explicit BlockParser(ParseOptions options, int32_t num_cols = -1,
|
||||
int64_t first_row = -1, int32_t max_num_rows = kMaxParserNumRows);
|
||||
explicit BlockParser(MemoryPool* pool, ParseOptions options, int32_t num_cols = -1,
|
||||
int64_t first_row = -1, int32_t max_num_rows = kMaxParserNumRows);
|
||||
~BlockParser();
|
||||
|
||||
/// \brief Parse a block of data
|
||||
///
|
||||
/// Parse a block of CSV data, ingesting up to max_num_rows rows.
|
||||
/// The number of bytes actually parsed is returned in out_size.
|
||||
Status Parse(std::string_view data, uint32_t* out_size);
|
||||
|
||||
/// \brief Parse sequential blocks of data
|
||||
///
|
||||
/// Only the last block is allowed to be truncated.
|
||||
Status Parse(const std::vector<std::string_view>& data, uint32_t* out_size);
|
||||
|
||||
/// \brief Parse the final block of data
|
||||
///
|
||||
/// Like Parse(), but called with the final block in a file.
|
||||
/// The last row may lack a trailing line separator.
|
||||
Status ParseFinal(std::string_view data, uint32_t* out_size);
|
||||
|
||||
/// \brief Parse the final sequential blocks of data
|
||||
///
|
||||
/// Only the last block is allowed to be truncated.
|
||||
Status ParseFinal(const std::vector<std::string_view>& data, uint32_t* out_size);
|
||||
|
||||
/// \brief Return the number of parsed rows
|
||||
int32_t num_rows() const { return parsed_batch().num_rows(); }
|
||||
/// \brief Return the number of parsed columns
|
||||
int32_t num_cols() const { return parsed_batch().num_cols(); }
|
||||
/// \brief Return the total size in bytes of parsed data
|
||||
uint32_t num_bytes() const { return parsed_batch().num_bytes(); }
|
||||
|
||||
/// \brief Return the total number of rows including rows which were skipped
|
||||
int32_t total_num_rows() const {
|
||||
return parsed_batch().num_rows() + parsed_batch().num_skipped_rows();
|
||||
}
|
||||
|
||||
/// \brief Return the row number of the first row in the block or -1 if unsupported
|
||||
int64_t first_row_num() const;
|
||||
|
||||
/// \brief Visit parsed values in a column
|
||||
///
|
||||
/// The signature of the visitor is
|
||||
/// Status(const uint8_t* data, uint32_t size, bool quoted)
|
||||
template <typename Visitor>
|
||||
Status VisitColumn(int32_t col_index, Visitor&& visit) const {
|
||||
return parsed_batch().VisitColumn(col_index, first_row_num(),
|
||||
std::forward<Visitor>(visit));
|
||||
}
|
||||
|
||||
template <typename Visitor>
|
||||
Status VisitLastRow(Visitor&& visit) const {
|
||||
return parsed_batch().VisitLastRow(std::forward<Visitor>(visit));
|
||||
}
|
||||
|
||||
protected:
|
||||
std::unique_ptr<BlockParserImpl> impl_;
|
||||
|
||||
const detail::DataBatch& parsed_batch() const;
|
||||
};
|
||||
|
||||
} // namespace csv
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,112 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/csv/options.h" // IWYU pragma: keep
|
||||
#include "arrow/io/interfaces.h"
|
||||
#include "arrow/record_batch.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/future.h"
|
||||
#include "arrow/util/thread_pool.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace io {
|
||||
class InputStream;
|
||||
} // namespace io
|
||||
|
||||
namespace csv {
|
||||
|
||||
/// A class that reads an entire CSV file into an Arrow Table
|
||||
class ARROW_EXPORT TableReader {
|
||||
public:
|
||||
virtual ~TableReader() = default;
|
||||
|
||||
/// Read the entire CSV file and convert it to an Arrow Table
|
||||
virtual Result<std::shared_ptr<Table>> Read() = 0;
|
||||
/// Read the entire CSV file and convert it to an Arrow Table; the table is
/// delivered through the returned Future instead of a Result
|
||||
virtual Future<std::shared_ptr<Table>> ReadAsync() = 0;
|
||||
|
||||
/// Create a TableReader instance
///
/// Takes the I/O context, the input stream to read from, and the read, parse
/// and convert options (in that order).
|
||||
static Result<std::shared_ptr<TableReader>> Make(io::IOContext io_context,
|
||||
std::shared_ptr<io::InputStream> input,
|
||||
const ReadOptions&,
|
||||
const ParseOptions&,
|
||||
const ConvertOptions&);
|
||||
};
|
||||
|
||||
/// \brief A class that reads a CSV file incrementally
|
||||
///
|
||||
/// Caveats:
|
||||
/// - For now, this is always single-threaded (regardless of `ReadOptions::use_threads`).
|
||||
/// - Type inference is done on the first block and types are frozen afterwards;
|
||||
///   to make sure the right data types are inferred, either set
|
||||
///   `ReadOptions::block_size` to a large enough value, or use
|
||||
///   `ConvertOptions::column_types` to set the desired data types explicitly.
|
||||
class ARROW_EXPORT StreamingReader : public RecordBatchReader {
|
||||
public:
|
||||
virtual ~StreamingReader() = default;
|
||||
|
||||
/// \brief Asynchronous read: the RecordBatch is delivered through the
/// returned Future
|
||||
virtual Future<std::shared_ptr<RecordBatch>> ReadNextAsync() = 0;
|
||||
|
||||
/// \brief Return the number of bytes which have been read and processed
|
||||
///
|
||||
/// The returned number includes CSV bytes which the StreamingReader has
|
||||
/// finished processing, but not bytes for which some processing (e.g.
|
||||
/// CSV parsing or conversion to Arrow layout) is still ongoing.
|
||||
///
|
||||
/// Furthermore, the following rules apply:
|
||||
/// - bytes skipped by `ReadOptions.skip_rows` are counted as being read before
|
||||
///   any records are returned.
|
||||
/// - bytes read while parsing the header are counted as being read before any
|
||||
///   records are returned.
|
||||
/// - bytes skipped by `ReadOptions.skip_rows_after_names` are counted after the
|
||||
///   first batch is returned.
|
||||
virtual int64_t bytes_read() const = 0;
|
||||
|
||||
/// Create a StreamingReader instance
|
||||
///
|
||||
/// This involves some I/O as the first batch must be loaded during the creation
/// process,
|
||||
/// so the reader is returned as a future.
|
||||
///
|
||||
/// Currently, the StreamingReader is not async-reentrant and does not do any fan-out
|
||||
/// parsing (see ARROW-11889)
|
||||
static Future<std::shared_ptr<StreamingReader>> MakeAsync(
|
||||
io::IOContext io_context, std::shared_ptr<io::InputStream> input,
|
||||
arrow::internal::Executor* cpu_executor, const ReadOptions&, const ParseOptions&,
|
||||
const ConvertOptions&);
|
||||
|
||||
/// Create a StreamingReader instance
///
/// Unlike MakeAsync, this overload takes no executor argument and returns the
/// reader in a Result rather than a Future.
|
||||
static Result<std::shared_ptr<StreamingReader>> Make(
|
||||
io::IOContext io_context, std::shared_ptr<io::InputStream> input,
|
||||
const ReadOptions&, const ParseOptions&, const ConvertOptions&);
|
||||
};
|
||||
|
||||
/// \brief Count the logical rows of data in a CSV file (i.e. the
|
||||
/// number of rows you would get if you read the file into a table).
///
/// The count is delivered through the returned Future<int64_t>.
|
||||
ARROW_EXPORT
|
||||
Future<int64_t> CountRowsAsync(io::IOContext io_context,
|
||||
std::shared_ptr<io::InputStream> input,
|
||||
arrow::internal::Executor* cpu_executor,
|
||||
const ReadOptions&, const ParseOptions&);
|
||||
|
||||
} // namespace csv
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,55 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/csv/parser.h"
|
||||
#include "arrow/testing/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace csv {
|
||||
|
||||
// Build a single string of CSV data from a vector of lines.
// NOTE(review): how the lines are joined/terminated is decided by the
// implementation, which is not visible in this header -- confirm there.
ARROW_TESTING_EXPORT
|
||||
std::string MakeCSVData(std::vector<std::string> lines);
|
||||
|
||||
// Make a BlockParser from a vector of lines representing a CSV file.
// The parser is handed back through the `out` pointer. This overload takes
// explicit parse options, a column count and a memory pool.
|
||||
ARROW_TESTING_EXPORT
|
||||
void MakeCSVParser(std::vector<std::string> lines, ParseOptions options, int32_t num_cols,
|
||||
MemoryPool* pool, std::shared_ptr<BlockParser>* out);
|
||||
|
||||
// Same as above, without the `num_cols` and `pool` arguments.
ARROW_TESTING_EXPORT
|
||||
void MakeCSVParser(std::vector<std::string> lines, ParseOptions options,
|
||||
std::shared_ptr<BlockParser>* out);
|
||||
|
||||
// Same as above, without explicit parse options.
ARROW_TESTING_EXPORT
|
||||
void MakeCSVParser(std::vector<std::string> lines, std::shared_ptr<BlockParser>* out);
|
||||
|
||||
// Make a BlockParser from a vector of strings representing a single CSV column.
// The parser is handed back through the `out` pointer.
|
||||
ARROW_TESTING_EXPORT
|
||||
void MakeColumnParser(std::vector<std::string> items, std::shared_ptr<BlockParser>* out);
|
||||
|
||||
// Make a Buffer of sample CSV data sized by `num_rows`.
// `is_valid` is an optional per-row predicate; it defaults to an empty
// std::function.
// NOTE(review): what an invalid row looks like, and how an empty `is_valid` is
// treated, is decided by the implementation -- confirm there.
ARROW_TESTING_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> MakeSampleCsvBuffer(
|
||||
size_t num_rows, std::function<bool(size_t row_num)> is_valid = {});
|
||||
|
||||
} // namespace csv
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,28 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
namespace arrow {
|
||||
namespace csv {
|
||||
|
||||
class TableReader;
|
||||
struct ConvertOptions;
|
||||
struct ReadOptions;
|
||||
struct ParseOptions;
|
||||
struct WriteOptions;
|
||||
|
||||
} // namespace csv
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,90 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/csv/options.h"
|
||||
#include "arrow/io/interfaces.h"
|
||||
#include "arrow/ipc/type_fwd.h"
|
||||
#include "arrow/record_batch.h"
|
||||
#include "arrow/table.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace csv {
|
||||
|
||||
// Functionality for converting Arrow data to comma-separated values (CSV) text.
|
||||
// This library supports all primitive types that can be cast to a StringArray or
|
||||
// a LargeStringArray.
|
||||
// It applies the following formatting rules:
|
||||
// - For non-binary types no quotes surround values. Nulls are represented as the empty
|
||||
// string.
|
||||
// - For binary types all non-null data is quoted (and quotes within data are escaped
|
||||
// with an additional quote).
|
||||
// Null values are empty and unquoted.
|
||||
|
||||
/// \defgroup csv-write-functions High-level functions for writing CSV files
|
||||
/// @{
|
||||
|
||||
/// \brief Convert table to CSV and write the result to output.
|
||||
/// Experimental
///
/// \param[in] table the table to convert
/// \param[in] options options for serialization
/// \param[in] output output stream to write to (passed as a raw pointer)
|
||||
ARROW_EXPORT Status WriteCSV(const Table& table, const WriteOptions& options,
|
||||
arrow::io::OutputStream* output);
|
||||
/// \brief Convert batch to CSV and write the result to output.
|
||||
/// Experimental
///
/// \param[in] batch the record batch to convert
/// \param[in] options options for serialization
/// \param[in] output output stream to write to (passed as a raw pointer)
|
||||
ARROW_EXPORT Status WriteCSV(const RecordBatch& batch, const WriteOptions& options,
|
||||
arrow::io::OutputStream* output);
|
||||
/// \brief Convert batches read through a RecordBatchReader
|
||||
/// to CSV and write the results to output.
|
||||
/// Experimental
///
/// \param[in] reader the reader supplying the batches to convert
/// \param[in] options options for serialization
/// \param[in] output output stream to write to (passed as a raw pointer)
|
||||
ARROW_EXPORT Status WriteCSV(const std::shared_ptr<RecordBatchReader>& reader,
|
||||
const WriteOptions& options,
|
||||
arrow::io::OutputStream* output);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \defgroup csv-writer-factories Functions for creating an incremental CSV writer
|
||||
/// @{
|
||||
|
||||
/// \brief Create a new CSV writer. User is responsible for closing the
|
||||
/// actual OutputStream.
|
||||
///
|
||||
/// \param[in] sink output stream to write to
|
||||
/// \param[in] schema the schema of the record batches to be written
|
||||
/// \param[in] options options for serialization; defaults to
/// WriteOptions::Defaults()
|
||||
/// \return the new writer, as Result<std::shared_ptr<ipc::RecordBatchWriter>>
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<ipc::RecordBatchWriter>> MakeCSVWriter(
|
||||
std::shared_ptr<io::OutputStream> sink, const std::shared_ptr<Schema>& schema,
|
||||
const WriteOptions& options = WriteOptions::Defaults());
|
||||
|
||||
/// \brief Create a new CSV writer.
|
||||
///
|
||||
/// \param[in] sink output stream to write to (does not take ownership)
|
||||
/// \param[in] schema the schema of the record batches to be written
|
||||
/// \param[in] options options for serialization; defaults to
/// WriteOptions::Defaults()
|
||||
/// \return the new writer, as Result<std::shared_ptr<ipc::RecordBatchWriter>>
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<ipc::RecordBatchWriter>> MakeCSVWriter(
|
||||
io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
|
||||
const WriteOptions& options = WriteOptions::Defaults());
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace csv
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,39 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/compute/expression.h"
|
||||
#include "arrow/dataset/dataset.h"
|
||||
#include "arrow/dataset/discovery.h"
|
||||
#include "arrow/dataset/file_base.h"
|
||||
#ifdef ARROW_CSV
|
||||
# include "arrow/dataset/file_csv.h"
|
||||
#endif
|
||||
#ifdef ARROW_JSON
|
||||
# include "arrow/dataset/file_json.h"
|
||||
#endif
|
||||
#include "arrow/dataset/file_ipc.h"
|
||||
#ifdef ARROW_ORC
|
||||
# include "arrow/dataset/file_orc.h"
|
||||
#endif
|
||||
#ifdef ARROW_PARQUET
|
||||
# include "arrow/dataset/file_parquet.h"
|
||||
#endif
|
||||
#include "arrow/dataset/scanner.h"
|
||||
@@ -0,0 +1,491 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/expression.h"
|
||||
#include "arrow/dataset/type_fwd.h"
|
||||
#include "arrow/dataset/visibility.h"
|
||||
#include "arrow/util/async_generator_fwd.h"
|
||||
#include "arrow/util/future.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/mutex.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace internal {
|
||||
class Executor;
|
||||
} // namespace internal
|
||||
|
||||
namespace dataset {
|
||||
|
||||
using RecordBatchGenerator = std::function<Future<std::shared_ptr<RecordBatch>>()>;
|
||||
|
||||
/// \brief Description of a column to scan
|
||||
struct ARROW_DS_EXPORT FragmentSelectionColumn {
|
||||
/// \brief The path to the column to load
|
||||
FieldPath path;
|
||||
/// \brief The type of the column in the dataset schema
|
||||
///
|
||||
/// A format may choose to ignore this field completely.  For example, when
|
||||
/// reading from IPC the reader can just return the column in the data type
|
||||
/// that is stored on disk.  There is no point in doing anything special.
|
||||
///
|
||||
/// However, some formats may be capable of casting on the fly.  For example,
|
||||
/// when reading from CSV, if we know the target type of the column, we can
|
||||
/// convert from string to the target type as we read.
///
/// NOTE(review): this is a raw pointer with no default initializer; ownership
/// and lifetime are not expressed by the type -- presumably non-owning, confirm
/// with the code that fills this struct in.
|
||||
DataType* requested_type;
|
||||
};
|
||||
|
||||
/// \brief A list of columns that should be loaded from a fragment
|
||||
///
|
||||
/// The paths in this selection should refer to the fragment schema.  This class
|
||||
/// has a virtual destructor because evolution strategies are expected to
|
||||
/// extend it with any information needed to later evolve the batches.
|
||||
///
|
||||
/// For example, in the basic evolution strategy, we keep track of which columns
|
||||
/// were missing from the file so that we can fill those in with null when evolving.
|
||||
class ARROW_DS_EXPORT FragmentSelection {
|
||||
public:
|
||||
/// Construct a selection that takes ownership of `columns` (moved into the
/// object)
explicit FragmentSelection(std::vector<FragmentSelectionColumn> columns)
|
||||
: columns_(std::move(columns)) {}
|
||||
virtual ~FragmentSelection() = default;
|
||||
/// The columns that should be loaded from the fragment
|
||||
const std::vector<FragmentSelectionColumn>& columns() const { return columns_; }
|
||||
|
||||
private:
|
||||
// Backing storage for columns()
std::vector<FragmentSelectionColumn> columns_;
|
||||
};
|
||||
|
||||
/// \brief Instructions for scanning a particular fragment
|
||||
///
|
||||
/// The fragment scan request is derived from ScanV2Options.  The main
|
||||
/// difference is that the scan options are based on the dataset schema
|
||||
/// while the fragment request is based on the fragment schema.
|
||||
struct ARROW_DS_EXPORT FragmentScanRequest {
|
||||
/// \brief A row filter
|
||||
///
|
||||
/// The filter expression should be written against the fragment schema.
/// Defaults to `compute::literal(true)`.
|
||||
///
|
||||
/// \see ScanV2Options for details on how this filter should be applied
|
||||
compute::Expression filter = compute::literal(true);
|
||||
|
||||
/// \brief The columns to scan
|
||||
///
|
||||
/// The column paths in this selection refer to the fragment schema
|
||||
///
|
||||
/// Note: This is NOT a simple list of top-level column indices.
|
||||
/// For more details \see ScanV2Options
|
||||
///
|
||||
/// If possible a fragment should only read from disk the data needed
|
||||
/// to satisfy these columns.  If a format cannot partially read a nested
|
||||
/// column (e.g. JSON) then it must apply the column selection (in memory)
|
||||
/// before returning the scanned batch.
|
||||
std::shared_ptr<FragmentSelection> fragment_selection;
|
||||
/// \brief Options specific to the format being scanned
///
/// NOTE(review): raw pointer with no default initializer; lifetime is not
/// expressed by the type -- presumably non-owning, confirm with callers.
|
||||
const FragmentScanOptions* format_scan_options;
|
||||
};
|
||||
|
||||
/// \brief An iterator-like object that can yield batches created from a fragment
|
||||
class ARROW_DS_EXPORT FragmentScanner {
|
||||
public:
|
||||
/// This instance will only be destroyed after all ongoing scan futures
|
||||
/// have been completed.
|
||||
///
|
||||
/// This means any callbacks created as part of the scan can safely
|
||||
/// capture `this`.
|
||||
virtual ~FragmentScanner() = default;
|
||||
/// \brief Scan a batch of data from the file
|
||||
/// \param batch_number The index of the batch to read
/// \return a Future that delivers the scanned RecordBatch
|
||||
virtual Future<std::shared_ptr<RecordBatch>> ScanBatch(int batch_number) = 0;
|
||||
/// \brief Calculate an estimate of how many data bytes the given batch will represent
|
||||
///
|
||||
/// "Data bytes" should be the total size of all the buffers once the data has been
|
||||
/// decoded into the Arrow format.
/// \param batch_number The index of the batch to estimate
|
||||
virtual int64_t EstimatedDataBytes(int batch_number) = 0;
|
||||
/// \brief The number of batches in the fragment to scan
|
||||
virtual int NumBatches() = 0;
|
||||
};
|
||||
|
||||
/// \brief Information learned about a fragment through inspection
|
||||
///
|
||||
/// This information can be used to figure out which fields need
|
||||
/// to be read from a file and how the data read in should be evolved
|
||||
/// to match the dataset schema.
|
||||
///
|
||||
/// For example, from a CSV file we can inspect and learn the column
|
||||
/// names and use those column names to determine which columns to load
|
||||
/// from the CSV file.
|
||||
struct ARROW_DS_EXPORT InspectedFragment {
|
||||
/// Construct from the column names learned during inspection (moved into the
/// `column_names` member)
explicit InspectedFragment(std::vector<std::string> column_names)
|
||||
: column_names(std::move(column_names)) {}
|
||||
/// The column names learned during inspection
std::vector<std::string> column_names;
|
||||
};
|
||||
|
||||
/// \brief A granular piece of a Dataset, such as an individual file.
|
||||
///
|
||||
/// A Fragment can be read/scanned separately from other fragments. It yields a
|
||||
/// collection of RecordBatches when scanned.
|
||||
///
|
||||
/// Note that Fragments have well defined physical schemas which are reconciled by
|
||||
/// the Datasets which contain them; these physical schemas may differ from a parent
|
||||
/// Dataset's schema and the physical schemas of sibling Fragments.
|
||||
class ARROW_DS_EXPORT Fragment : public std::enable_shared_from_this<Fragment> {
|
||||
public:
|
||||
/// \brief An expression that represents no known partition information
|
||||
static const compute::Expression kNoPartitionInformation;
|
||||
|
||||
/// \brief Return the physical schema of the Fragment.
|
||||
///
|
||||
/// The physical schema is also called the writer schema.
|
||||
/// This method is blocking and may suffer from high-latency filesystems.
|
||||
/// The schema is cached after being read once, or may be specified at construction.
|
||||
Result<std::shared_ptr<Schema>> ReadPhysicalSchema();
|
||||
|
||||
/// An asynchronous version of Scan
///
/// Returns a RecordBatchGenerator, i.e. a callable producing a
/// Future<std::shared_ptr<RecordBatch>> on each invocation.
|
||||
virtual Result<RecordBatchGenerator> ScanBatchesAsync(
|
||||
const std::shared_ptr<ScanOptions>& options) = 0;
|
||||
|
||||
/// \brief Inspect a fragment to learn basic information
|
||||
///
|
||||
/// This will be called before a scan and a fragment should attach whatever
|
||||
/// information will be needed to figure out an evolution strategy.  This information
|
||||
/// will then be passed to the call to BeginScan.
|
||||
virtual Future<std::shared_ptr<InspectedFragment>> InspectFragment(
|
||||
const FragmentScanOptions* format_options, compute::ExecContext* exec_context);
|
||||
|
||||
/// \brief Start a scan operation
///
/// \param request what to scan (filter, column selection, format options)
/// \param inspected_fragment the information produced by InspectFragment
/// \return a Future that delivers the FragmentScanner for this scan
|
||||
virtual Future<std::shared_ptr<FragmentScanner>> BeginScan(
|
||||
const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
|
||||
const FragmentScanOptions* format_options, compute::ExecContext* exec_context);
|
||||
|
||||
/// \brief Count the number of rows in this fragment matching the filter using metadata
|
||||
/// only. That is, this method may perform I/O, but will not load data.
|
||||
///
|
||||
/// If this is not possible, resolve with an empty optional. The fragment can perform
|
||||
/// I/O (e.g. to read metadata) before deciding whether it can satisfy the request.
|
||||
virtual Future<std::optional<int64_t>> CountRows(
|
||||
compute::Expression predicate, const std::shared_ptr<ScanOptions>& options);
|
||||
|
||||
/// \brief Clear any metadata that may have been cached by this object.
|
||||
///
|
||||
/// A fragment may typically cache metadata to speed up repeated accesses.
|
||||
/// In use cases when memory use is more critical than CPU time, calling
|
||||
/// this function can help reclaim memory.
|
||||
virtual Status ClearCachedMetadata();
|
||||
|
||||
/// \brief The name identifying the kind of Fragment
virtual std::string type_name() const = 0;
|
||||
/// \brief A string description; the default implementation returns type_name()
virtual std::string ToString() const { return type_name(); }
|
||||
|
||||
/// \brief An expression which evaluates to true for all data viewed by this
|
||||
/// Fragment.
|
||||
const compute::Expression& partition_expression() const {
|
||||
return partition_expression_;
|
||||
}
|
||||
|
||||
virtual ~Fragment() = default;
|
||||
|
||||
protected:
|
||||
Fragment() = default;
|
||||
explicit Fragment(compute::Expression partition_expression,
|
||||
std::shared_ptr<Schema> physical_schema);
|
||||
|
||||
// Subclass hook that produces the physical schema.
// NOTE(review): presumably invoked by ReadPhysicalSchema() when no schema is
// cached or given -- confirm in the implementation file.
virtual Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() = 0;
|
||||
|
||||
// NOTE(review): presumably guards the cached physical schema -- confirm.
util::Mutex physical_schema_mutex_;
|
||||
// Backing storage for partition_expression(); defaults to literal(true)
compute::Expression partition_expression_ = compute::literal(true);
|
||||
// The physical schema that is inferred from the Fragment
|
||||
std::shared_ptr<Schema> physical_schema_;
|
||||
// The physical schema that was passed to the Fragment constructor
|
||||
std::shared_ptr<Schema> given_physical_schema_;
|
||||
};
|
||||
|
||||
/// \brief Per-scan options for fragment(s) in a dataset.
|
||||
///
|
||||
/// These options are not intrinsic to the format or fragment itself, but do affect
|
||||
/// the results of a scan. These are options which make sense to change between
|
||||
/// repeated reads of the same dataset, such as format-specific conversion options
|
||||
/// (that do not affect the schema).
|
||||
///
|
||||
/// \ingroup dataset-scanning
|
||||
class ARROW_DS_EXPORT FragmentScanOptions {
|
||||
public:
|
||||
/// \brief The name identifying the kind of options
virtual std::string type_name() const = 0;
|
||||
/// \brief A string description; the default implementation returns type_name()
virtual std::string ToString() const { return type_name(); }
|
||||
virtual ~FragmentScanOptions() = default;
|
||||
};
|
||||
|
||||
/// \defgroup dataset-implementations Concrete implementations
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief A trivial Fragment that yields ScanTask out of a fixed set of
|
||||
/// RecordBatches.
|
||||
class ARROW_DS_EXPORT InMemoryFragment : public Fragment {
|
||||
public:
|
||||
// Nested scanner type; only forward-declared in this header.
class Scanner;
|
||||
/// Construct from an explicit schema and a vector of record batches.  The
/// trailing (unnamed) expression parameter defaults to literal(true).
InMemoryFragment(std::shared_ptr<Schema> schema, RecordBatchVector record_batches,
|
||||
compute::Expression = compute::literal(true));
|
||||
/// Construct from a vector of record batches without passing a schema.
/// NOTE(review): how the schema is obtained in this case is decided by the
/// implementation, which is not visible here.
explicit InMemoryFragment(RecordBatchVector record_batches,
|
||||
compute::Expression = compute::literal(true));
|
||||
|
||||
Result<RecordBatchGenerator> ScanBatchesAsync(
|
||||
const std::shared_ptr<ScanOptions>& options) override;
|
||||
Future<std::optional<int64_t>> CountRows(
|
||||
compute::Expression predicate,
|
||||
const std::shared_ptr<ScanOptions>& options) override;
|
||||
|
||||
Future<std::shared_ptr<InspectedFragment>> InspectFragment(
|
||||
const FragmentScanOptions* format_options,
|
||||
compute::ExecContext* exec_context) override;
|
||||
Future<std::shared_ptr<FragmentScanner>> BeginScan(
|
||||
const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
|
||||
const FragmentScanOptions* format_options,
|
||||
compute::ExecContext* exec_context) override;
|
||||
|
||||
/// Always returns the string "in-memory"
std::string type_name() const override { return "in-memory"; }
|
||||
|
||||
protected:
|
||||
Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override;
|
||||
|
||||
// The record batches held by this fragment
RecordBatchVector record_batches_;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
using FragmentGenerator = AsyncGenerator<std::shared_ptr<Fragment>>;
|
||||
|
||||
/// \brief Rules for converting the dataset schema to and from fragment schemas
|
||||
class ARROW_DS_EXPORT FragmentEvolutionStrategy {
|
||||
public:
|
||||
/// This instance will only be destroyed when all scan operations for the
|
||||
/// fragment have completed.
|
||||
virtual ~FragmentEvolutionStrategy() = default;
|
||||
/// \brief A guarantee that applies to all batches of this fragment
|
||||
///
|
||||
/// For example, if a fragment is missing one of the fields in the dataset
|
||||
/// schema then a typical evolution strategy is to set that field to null.
|
||||
///
|
||||
/// So if the column at index 3 is missing then the guarantee is
|
||||
/// FieldRef(3) == null
|
||||
///
|
||||
/// Individual field guarantees should be AND'd together and returned
|
||||
/// as a single expression.
///
/// \param dataset_schema_selection the selected fields, as paths into the
/// dataset schema
|
||||
virtual Result<compute::Expression> GetGuarantee(
|
||||
const std::vector<FieldPath>& dataset_schema_selection) const = 0;
|
||||
|
||||
/// \brief Return a fragment schema selection given a dataset schema selection
|
||||
///
|
||||
/// For example, if the user wants fields 2 & 4 of the dataset schema and
|
||||
/// in this fragment the field 2 is missing and the field 4 is at index 1 then
|
||||
/// this should return {1}
///
/// \param dataset_schema_selection the selected fields, as paths into the
/// dataset schema
|
||||
virtual Result<std::unique_ptr<FragmentSelection>> DevolveSelection(
|
||||
const std::vector<FieldPath>& dataset_schema_selection) const = 0;
|
||||
|
||||
/// \brief Return a filter expression bound to the fragment schema given
|
||||
/// a filter expression bound to the dataset schema
|
||||
///
|
||||
/// The dataset scan filter will first be simplified by the guarantee returned
|
||||
/// by GetGuarantee.  This means an evolution that only handles dropping or casting
|
||||
/// fields doesn't need to do anything here except return the given filter.
|
||||
///
|
||||
/// On the other hand, an evolution that is doing some kind of aliasing will likely
|
||||
/// need to convert field references in the filter to the aliased field references
|
||||
/// where appropriate.
|
||||
virtual Result<compute::Expression> DevolveFilter(
|
||||
const compute::Expression& filter) const = 0;
|
||||
|
||||
/// \brief Convert a batch from the fragment schema to the dataset schema
|
||||
///
|
||||
/// Typically this involves casting columns from the data type stored on disk
|
||||
/// to the data type of the dataset schema.  For example, this fragment might
|
||||
/// have columns stored as int32 and the dataset schema might have int64 for
|
||||
/// the column.  In this case we should cast the column from int32 to int64.
|
||||
///
|
||||
/// Note: A fragment may perform this cast as the data is read from disk.  In
|
||||
/// that case a cast might not be needed.
///
/// \param batch the batch to convert
/// \param dataset_selection the selected fields, as paths into the dataset schema
/// \param selection the fragment selection (see DevolveSelection)
|
||||
virtual Result<compute::ExecBatch> EvolveBatch(
|
||||
const std::shared_ptr<RecordBatch>& batch,
|
||||
const std::vector<FieldPath>& dataset_selection,
|
||||
const FragmentSelection& selection) const = 0;
|
||||
|
||||
/// \brief Return a string description of this strategy
|
||||
virtual std::string ToString() const = 0;
|
||||
};
|
||||
|
||||
/// \brief Lookup to create a FragmentEvolutionStrategy for a given fragment
|
||||
class ARROW_DS_EXPORT DatasetEvolutionStrategy {
|
||||
public:
|
||||
virtual ~DatasetEvolutionStrategy() = default;
|
||||
/// \brief Create a strategy for evolving from the given fragment
|
||||
/// to the schema of the given dataset
///
/// \param dataset the dataset whose schema is the evolution target
/// \param fragment the fragment to evolve from
/// \param inspected_fragment inspection results for `fragment`
/// \return the strategy, owned by the caller (std::unique_ptr)
|
||||
virtual std::unique_ptr<FragmentEvolutionStrategy> GetStrategy(
|
||||
const Dataset& dataset, const Fragment& fragment,
|
||||
const InspectedFragment& inspected_fragment) = 0;
|
||||
|
||||
/// \brief Return a string description of this strategy
|
||||
virtual std::string ToString() const = 0;
|
||||
};
|
||||
|
||||
/// \brief Create the basic DatasetEvolutionStrategy
///
/// The caller owns the returned strategy (std::unique_ptr).  This factory is
/// also used as the default initializer of `Dataset::evolution_strategy_`.
ARROW_DS_EXPORT std::unique_ptr<DatasetEvolutionStrategy>
|
||||
MakeBasicDatasetEvolutionStrategy();
|
||||
|
||||
/// \brief A container of zero or more Fragments.
|
||||
///
|
||||
/// A Dataset acts as a union of Fragments, e.g. files deeply nested in a
|
||||
/// directory. A Dataset has a schema to which Fragments must align during a
|
||||
/// scan operation. This is analogous to Avro's reader and writer schema.
|
||||
class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this<Dataset> {
|
||||
public:
|
||||
/// \brief Begin to build a new Scan operation against this Dataset
|
||||
Result<std::shared_ptr<ScannerBuilder>> NewScan();
|
||||
|
||||
/// \brief GetFragments returns an iterator of Fragments given a predicate.
|
||||
Result<FragmentIterator> GetFragments(compute::Expression predicate);
|
||||
/// \brief Overload of GetFragments that takes no predicate.
Result<FragmentIterator> GetFragments();
|
||||
|
||||
/// \brief Async versions of `GetFragments`.
|
||||
Result<FragmentGenerator> GetFragmentsAsync(compute::Expression predicate);
|
||||
Result<FragmentGenerator> GetFragmentsAsync();
|
||||
|
||||
/// \brief The schema of this Dataset
const std::shared_ptr<Schema>& schema() const { return schema_; }
|
||||
|
||||
/// \brief An expression which evaluates to true for all data viewed by this Dataset.
|
||||
/// May be null, which indicates no information is available.
///
/// NOTE(review): the backing member is default-initialized to
/// `compute::literal(true)`; confirm when, if ever, a null expression is
/// actually stored.
|
||||
const compute::Expression& partition_expression() const {
|
||||
return partition_expression_;
|
||||
}
|
||||
|
||||
/// \brief The name identifying the kind of Dataset
|
||||
virtual std::string type_name() const = 0;
|
||||
|
||||
/// \brief Return a copy of this Dataset with a different schema.
|
||||
///
|
||||
/// The copy will view the same Fragments. If the new schema is not compatible with the
|
||||
/// original dataset's schema then an error will be raised.
|
||||
virtual Result<std::shared_ptr<Dataset>> ReplaceSchema(
|
||||
std::shared_ptr<Schema> schema) const = 0;
|
||||
|
||||
/// \brief Rules used by this dataset to handle schema evolution
///
/// Returns a raw pointer to the strategy; the Dataset keeps ownership
/// through its std::unique_ptr member.
|
||||
DatasetEvolutionStrategy* evolution_strategy() { return evolution_strategy_.get(); }
|
||||
|
||||
virtual ~Dataset() = default;
|
||||
|
||||
protected:
|
||||
explicit Dataset(std::shared_ptr<Schema> schema) : schema_(std::move(schema)) {}
|
||||
|
||||
Dataset(std::shared_ptr<Schema> schema, compute::Expression partition_expression);
|
||||
|
||||
// Subclass hook: produce the fragments of this dataset for `predicate`.
virtual Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) = 0;
|
||||
/// \brief Default non-virtual implementation method for the base
|
||||
/// `GetFragmentsAsyncImpl` method, which creates a fragment generator for
|
||||
/// the dataset, possibly filtering results with a predicate (forwarding to
|
||||
/// the synchronous `GetFragmentsImpl` method and moving the computations
|
||||
/// to the background, using the IO thread pool).
|
||||
///
|
||||
/// Currently, `executor` is always the same as `internal::GetCPUThreadPool()`,
|
||||
/// which means the results from the underlying fragment generator will be
|
||||
/// transferred to the default CPU thread pool. The generator itself is
|
||||
/// offloaded to run on the default IO thread pool.
///
/// NOTE(review): the declaration below is `virtual`, which does not match the
/// "non-virtual" wording above -- confirm which one is intended.
|
||||
virtual Result<FragmentGenerator> GetFragmentsAsyncImpl(
|
||||
compute::Expression predicate, arrow::internal::Executor* executor);
|
||||
|
||||
// Backing storage for schema()
std::shared_ptr<Schema> schema_;
|
||||
// Backing storage for partition_expression(); defaults to literal(true)
compute::Expression partition_expression_ = compute::literal(true);
|
||||
// Backing storage for evolution_strategy(); defaults to the basic strategy
std::unique_ptr<DatasetEvolutionStrategy> evolution_strategy_ =
|
||||
MakeBasicDatasetEvolutionStrategy();
|
||||
};
|
||||
|
||||
/// \addtogroup dataset-implementations
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief A Dataset which yields fragments wrapping a stream of record batches.
|
||||
///
|
||||
/// The record batches must match the schema provided to the dataset at construction.
|
||||
class ARROW_DS_EXPORT InMemoryDataset : public Dataset {
|
||||
public:
|
||||
class RecordBatchGenerator {
|
||||
public:
|
||||
virtual ~RecordBatchGenerator() = default;
|
||||
virtual RecordBatchIterator Get() const = 0;
|
||||
};
|
||||
|
||||
/// Construct a dataset from a schema and a factory of record batch iterators.
|
||||
InMemoryDataset(std::shared_ptr<Schema> schema,
|
||||
std::shared_ptr<RecordBatchGenerator> get_batches)
|
||||
: Dataset(std::move(schema)), get_batches_(std::move(get_batches)) {}
|
||||
|
||||
/// Convenience constructor taking a fixed list of batches
|
||||
InMemoryDataset(std::shared_ptr<Schema> schema, RecordBatchVector batches);
|
||||
|
||||
/// Convenience constructor taking a Table
|
||||
explicit InMemoryDataset(std::shared_ptr<Table> table);
|
||||
|
||||
std::string type_name() const override { return "in-memory"; }
|
||||
|
||||
Result<std::shared_ptr<Dataset>> ReplaceSchema(
|
||||
std::shared_ptr<Schema> schema) const override;
|
||||
|
||||
protected:
|
||||
Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override;
|
||||
|
||||
std::shared_ptr<RecordBatchGenerator> get_batches_;
|
||||
};
|
||||
|
||||
/// \brief A Dataset wrapping child Datasets.
|
||||
class ARROW_DS_EXPORT UnionDataset : public Dataset {
|
||||
public:
|
||||
/// \brief Construct a UnionDataset wrapping child Datasets.
|
||||
///
|
||||
/// \param[in] schema the schema of the resulting dataset.
|
||||
/// \param[in] children one or more child Datasets. Their schemas must be identical to
|
||||
/// schema.
|
||||
static Result<std::shared_ptr<UnionDataset>> Make(std::shared_ptr<Schema> schema,
|
||||
DatasetVector children);
|
||||
|
||||
const DatasetVector& children() const { return children_; }
|
||||
|
||||
std::string type_name() const override { return "union"; }
|
||||
|
||||
Result<std::shared_ptr<Dataset>> ReplaceSchema(
|
||||
std::shared_ptr<Schema> schema) const override;
|
||||
|
||||
protected:
|
||||
Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override;
|
||||
|
||||
explicit UnionDataset(std::shared_ptr<Schema> schema, DatasetVector children)
|
||||
: Dataset(std::move(schema)), children_(std::move(children)) {}
|
||||
|
||||
DatasetVector children_;
|
||||
|
||||
friend class UnionDatasetFactory;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,103 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "arrow/dataset/file_base.h"
|
||||
#include "arrow/record_batch.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/async_util.h"
|
||||
#include "arrow/util/future.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace dataset {
|
||||
namespace internal {
|
||||
|
||||
// Default limit on the number of rows the dataset writer will queue before it asks
// for back pressure. This lines up with our other defaults in the scanner and
// execution plan.
//
// Declared `inline` so that every translation unit including this header shares a
// single definition instead of getting its own internal-linkage copy.
inline constexpr uint64_t kDefaultDatasetWriterMaxRowsQueued = 8 * 1024 * 1024;
|
||||
|
||||
/// \brief Utility class that manages a set of writers to different paths
|
||||
///
|
||||
/// Writers may be closed and reopened (and a new file created) based on the dataset
|
||||
/// write options (for example, max_rows_per_file or max_open_files)
|
||||
///
|
||||
/// The dataset writer enforces its own back pressure based on the # of rows (as opposed
|
||||
/// to # of batches which is how it is typically enforced elsewhere) and # of files.
|
||||
class ARROW_DS_EXPORT DatasetWriter {
|
||||
public:
|
||||
/// \brief Create a dataset writer
|
||||
///
|
||||
/// Will fail if basename_template is invalid or if there is existing data and
|
||||
/// existing_data_behavior is kError
|
||||
///
|
||||
/// \param write_options options to control how the data should be written
|
||||
/// \param max_rows_queued max # of rows allowed to be queued before the dataset_writer
|
||||
/// will ask for backpressure
|
||||
static Result<std::unique_ptr<DatasetWriter>> Make(
|
||||
FileSystemDatasetWriteOptions write_options, util::AsyncTaskScheduler* scheduler,
|
||||
std::function<void()> pause_callback, std::function<void()> resume_callback,
|
||||
std::function<void()> finish_callback,
|
||||
uint64_t max_rows_queued = kDefaultDatasetWriterMaxRowsQueued);
|
||||
|
||||
~DatasetWriter();
|
||||
|
||||
/// \brief Write a batch to the dataset
|
||||
/// \param[in] batch The batch to write
|
||||
/// \param[in] directory The directory to write to
|
||||
///
|
||||
/// Note: The written filename will be {directory}/{filename_factory(i)} where i is a
|
||||
/// counter controlled by `max_open_files` and `max_rows_per_file`
|
||||
///
|
||||
/// If multiple WriteRecordBatch calls arrive with the same `directory` then the batches
|
||||
/// may be written to the same file.
|
||||
///
|
||||
/// The returned future will be marked finished when the record batch has been queued
|
||||
/// to be written. If the returned future is unfinished then this indicates the dataset
|
||||
/// writer's queue is full and the data provider should pause.
|
||||
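///
/// NOTE(review): this method is declared `void`, so the "returned future" wording in
/// this comment looks stale. Presumably back pressure is now signalled through the
/// pause/resume callbacks given to Make() -- confirm against the implementation.
///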
|
||||
/// This method is NOT async reentrant. The returned future will only be unfinished
|
||||
/// if back pressure needs to be applied. Async reentrancy is not necessary for
|
||||
/// concurrent writes to happen. Calling this method again before the previous future
|
||||
/// completes will not just violate max_rows_queued but likely lead to race conditions.
|
||||
///
|
||||
/// One thing to note is that the ordering of your data can affect your maximum
|
||||
/// potential parallelism. If this seems odd then consider a dataset where the first
|
||||
/// 1000 batches go to the same directory and then the 1001st batch goes to a different
|
||||
/// directory. The only way to get two parallel writes immediately would be to queue
|
||||
/// all 1000 pending writes to the first directory.
|
||||
void WriteRecordBatch(std::shared_ptr<RecordBatch> batch, const std::string& directory,
|
||||
const std::string& prefix = "");
|
||||
|
||||
/// Finish all pending writes and close any open files
|
||||
void Finish();
|
||||
|
||||
protected:
|
||||
DatasetWriter(FileSystemDatasetWriteOptions write_options,
|
||||
util::AsyncTaskScheduler* scheduler, std::function<void()> pause_callback,
|
||||
std::function<void()> resume_callback,
|
||||
std::function<void()> finish_callback,
|
||||
uint64_t max_rows_queued = kDefaultDatasetWriterMaxRowsQueued);
|
||||
|
||||
class DatasetWriterImpl;
|
||||
std::unique_ptr<DatasetWriterImpl> impl_;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,275 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
/// Logic for automatically determining the structure of multi-file
|
||||
/// dataset with possible partitioning according to available
|
||||
/// partitioning
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <variant>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/dataset/partition.h"
|
||||
#include "arrow/dataset/type_fwd.h"
|
||||
#include "arrow/dataset/visibility.h"
|
||||
#include "arrow/filesystem/type_fwd.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace dataset {
|
||||
|
||||
/// \defgroup dataset-discovery Discovery API
|
||||
///
|
||||
/// @{
|
||||
|
||||
struct InspectOptions {
|
||||
/// See `fragments` property.
|
||||
static constexpr int kInspectAllFragments = -1;
|
||||
|
||||
/// Indicate how many fragments should be inspected to infer the unified dataset
|
||||
/// schema. Limiting the number of fragments accessed improves the latency of
|
||||
/// the discovery process when dealing with a high number of fragments and/or
|
||||
/// high latency file systems.
|
||||
///
|
||||
/// The default value of `1` inspects the schema of the first (in no particular
|
||||
/// order) fragment only. If the dataset has a uniform schema for all fragments,
|
||||
/// this default is the optimal value. In order to inspect all fragments and
|
||||
/// robustly unify their potentially varying schemas, set this option to
|
||||
/// `kInspectAllFragments`. A value of `0` disables inspection of fragments
|
||||
/// altogether so only the partitioning schema will be inspected.
|
||||
int fragments = 1;
|
||||
|
||||
/// Control how to unify types. By default, types are merged strictly (the
|
||||
/// type must match exactly, except nulls can be merged with other types).
|
||||
Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults();
|
||||
};
|
||||
|
||||
struct FinishOptions {
|
||||
/// Finalize the dataset with this given schema. If the schema is not
|
||||
/// provided, infer the schema via Inspect; see the `inspect_options`
|
||||
/// property.
|
||||
std::shared_ptr<Schema> schema = NULLPTR;
|
||||
|
||||
/// If the schema is not provided, it will be discovered by passing the
|
||||
/// following options to `DatasetFactory::Inspect`.
|
||||
InspectOptions inspect_options{};
|
||||
|
||||
/// Indicate if the given Schema (when specified), should be validated against
|
||||
/// the fragments' schemas. `inspect_options` will control how many fragments
|
||||
/// are checked.
|
||||
bool validate_fragments = false;
|
||||
};
|
||||
|
||||
/// \brief DatasetFactory provides a way to inspect/discover a Dataset's expected
|
||||
/// schema before materializing said Dataset.
|
||||
class ARROW_DS_EXPORT DatasetFactory {
|
||||
public:
|
||||
/// \brief Get the schemas of the Fragments and Partitioning.
|
||||
virtual Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
|
||||
InspectOptions options) = 0;
|
||||
|
||||
/// \brief Get unified schema for the resulting Dataset.
|
||||
Result<std::shared_ptr<Schema>> Inspect(InspectOptions options = {});
|
||||
|
||||
/// \brief Create a Dataset
|
||||
Result<std::shared_ptr<Dataset>> Finish();
|
||||
/// \brief Create a Dataset with the given schema (see \a FinishOptions::schema)
|
||||
Result<std::shared_ptr<Dataset>> Finish(std::shared_ptr<Schema> schema);
|
||||
/// \brief Create a Dataset with the given options
|
||||
virtual Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) = 0;
|
||||
|
||||
/// \brief Optional root partition for the resulting Dataset.
|
||||
const compute::Expression& root_partition() const { return root_partition_; }
|
||||
/// \brief Set the root partition for the resulting Dataset.
|
||||
Status SetRootPartition(compute::Expression partition) {
|
||||
root_partition_ = std::move(partition);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual ~DatasetFactory() = default;
|
||||
|
||||
protected:
|
||||
DatasetFactory();
|
||||
|
||||
compute::Expression root_partition_;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
/// \brief DatasetFactory provides a way to inspect/discover a Dataset's
|
||||
/// expected schema before materialization.
|
||||
/// \ingroup dataset-implementations
|
||||
class ARROW_DS_EXPORT UnionDatasetFactory : public DatasetFactory {
|
||||
public:
|
||||
static Result<std::shared_ptr<DatasetFactory>> Make(
|
||||
std::vector<std::shared_ptr<DatasetFactory>> factories);
|
||||
|
||||
/// \brief Return the list of child DatasetFactory
|
||||
const std::vector<std::shared_ptr<DatasetFactory>>& factories() const {
|
||||
return factories_;
|
||||
}
|
||||
|
||||
/// \brief Get the schemas of the Datasets.
|
||||
///
|
||||
/// Instead of applying options globally, it applies at each child factory.
|
||||
/// This will not respect `options.fragments` exactly, but will respect the
|
||||
/// spirit of peeking the first fragments or all of them.
|
||||
Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
|
||||
InspectOptions options) override;
|
||||
|
||||
/// \brief Create a Dataset.
|
||||
Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override;
|
||||
|
||||
protected:
|
||||
explicit UnionDatasetFactory(std::vector<std::shared_ptr<DatasetFactory>> factories);
|
||||
|
||||
std::vector<std::shared_ptr<DatasetFactory>> factories_;
|
||||
};
|
||||
|
||||
/// \ingroup dataset-filesystem
|
||||
struct FileSystemFactoryOptions {
|
||||
/// Either an explicit Partitioning or a PartitioningFactory to discover one.
|
||||
///
|
||||
/// If a factory is provided, it will be used to infer a schema for partition fields
|
||||
/// based on file and directory paths then construct a Partitioning. The default
|
||||
/// is a Partitioning which will yield no partition information.
|
||||
///
|
||||
/// The (explicit or discovered) partitioning will be applied to discovered files
|
||||
/// and the resulting partition information embedded in the Dataset.
|
||||
PartitioningOrFactory partitioning{Partitioning::Default()};
|
||||
|
||||
/// For the purposes of applying the partitioning, paths will be stripped
|
||||
/// of the partition_base_dir. Files not matching the partition_base_dir
|
||||
/// prefix will be skipped for partition discovery. The ignored files will still
|
||||
/// be part of the Dataset, but will not have partition information.
|
||||
///
|
||||
/// Example:
|
||||
/// partition_base_dir = "/dataset";
|
||||
///
|
||||
/// - "/dataset/US/sales.csv" -> "US/sales.csv" will be given to the partitioning
|
||||
///
|
||||
/// - "/home/john/late_sales.csv" -> Will be ignored for partition discovery.
|
||||
///
|
||||
/// This is useful for partitioning which parses directory when ordering
|
||||
/// is important, e.g. DirectoryPartitioning.
|
||||
std::string partition_base_dir;
|
||||
|
||||
/// Invalid files (via selector or explicitly) will be excluded by checking
|
||||
/// with the FileFormat::IsSupported method. This will incur IO for each file
|
||||
/// in a serial and single threaded fashion. Disabling this feature will skip the
|
||||
/// IO, but unsupported files may be present in the Dataset
|
||||
/// (resulting in an error at scan time).
|
||||
bool exclude_invalid_files = false;
|
||||
|
||||
/// When discovering from a Selector (and not from an explicit file list), ignore
|
||||
/// files and directories matching any of these prefixes.
|
||||
///
|
||||
/// Example (with selector = "/dataset/**"):
|
||||
/// selector_ignore_prefixes = {"_", ".DS_STORE" };
|
||||
///
|
||||
/// - "/dataset/data.csv" -> not ignored
|
||||
/// - "/dataset/_metadata" -> ignored
|
||||
/// - "/dataset/.DS_STORE" -> ignored
|
||||
/// - "/dataset/_hidden/dat" -> ignored
|
||||
/// - "/dataset/nested/.DS_STORE" -> ignored
|
||||
std::vector<std::string> selector_ignore_prefixes = {
|
||||
".",
|
||||
"_",
|
||||
};
|
||||
};
|
||||
|
||||
/// \brief FileSystemDatasetFactory creates a Dataset from a vector of
|
||||
/// fs::FileInfo or a fs::FileSelector.
|
||||
/// \ingroup dataset-filesystem
|
||||
class ARROW_DS_EXPORT FileSystemDatasetFactory : public DatasetFactory {
|
||||
public:
|
||||
/// \brief Build a FileSystemDatasetFactory from an explicit list of
|
||||
/// paths.
|
||||
///
|
||||
/// \param[in] filesystem passed to FileSystemDataset
|
||||
/// \param[in] paths passed to FileSystemDataset
|
||||
/// \param[in] format passed to FileSystemDataset
|
||||
/// \param[in] options see FileSystemFactoryOptions for more information.
|
||||
static Result<std::shared_ptr<DatasetFactory>> Make(
|
||||
std::shared_ptr<fs::FileSystem> filesystem, const std::vector<std::string>& paths,
|
||||
std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);
|
||||
|
||||
/// \brief Build a FileSystemDatasetFactory from a fs::FileSelector.
|
||||
///
|
||||
/// The selector will expand to a vector of FileInfo. The expansion/crawling
|
||||
/// is performed in this function call. Thus, the finalized Dataset is
|
||||
/// working with a snapshot of the filesystem.
|
||||
///
|
||||
/// If options.partition_base_dir is not provided, it will be overwritten
|
||||
/// with selector.base_dir.
|
||||
///
|
||||
/// \param[in] filesystem passed to FileSystemDataset
|
||||
/// \param[in] selector used to crawl and search files
|
||||
/// \param[in] format passed to FileSystemDataset
|
||||
/// \param[in] options see FileSystemFactoryOptions for more information.
|
||||
static Result<std::shared_ptr<DatasetFactory>> Make(
|
||||
std::shared_ptr<fs::FileSystem> filesystem, fs::FileSelector selector,
|
||||
std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);
|
||||
|
||||
/// \brief Build a FileSystemDatasetFactory from a URI including filesystem
|
||||
/// information.
|
||||
///
|
||||
/// \param[in] uri passed to FileSystemDataset
|
||||
/// \param[in] format passed to FileSystemDataset
|
||||
/// \param[in] options see FileSystemFactoryOptions for more information.
|
||||
static Result<std::shared_ptr<DatasetFactory>> Make(std::string uri,
|
||||
std::shared_ptr<FileFormat> format,
|
||||
FileSystemFactoryOptions options);
|
||||
|
||||
/// \brief Build a FileSystemDatasetFactory from an explicit list of
|
||||
/// file information.
|
||||
///
|
||||
/// \param[in] filesystem passed to FileSystemDataset
|
||||
/// \param[in] files passed to FileSystemDataset
|
||||
/// \param[in] format passed to FileSystemDataset
|
||||
/// \param[in] options see FileSystemFactoryOptions for more information.
|
||||
static Result<std::shared_ptr<DatasetFactory>> Make(
|
||||
std::shared_ptr<fs::FileSystem> filesystem, const std::vector<fs::FileInfo>& files,
|
||||
std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);
|
||||
|
||||
Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
|
||||
InspectOptions options) override;
|
||||
|
||||
Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override;
|
||||
|
||||
protected:
|
||||
FileSystemDatasetFactory(std::vector<fs::FileInfo> files,
|
||||
std::shared_ptr<fs::FileSystem> filesystem,
|
||||
std::shared_ptr<FileFormat> format,
|
||||
FileSystemFactoryOptions options);
|
||||
|
||||
Result<std::shared_ptr<Schema>> PartitionSchema();
|
||||
|
||||
std::vector<fs::FileInfo> files_;
|
||||
std::shared_ptr<fs::FileSystem> fs_;
|
||||
std::shared_ptr<FileFormat> format_;
|
||||
FileSystemFactoryOptions options_;
|
||||
};
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,499 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/dataset/dataset.h"
|
||||
#include "arrow/dataset/partition.h"
|
||||
#include "arrow/dataset/scanner.h"
|
||||
#include "arrow/dataset/type_fwd.h"
|
||||
#include "arrow/dataset/visibility.h"
|
||||
#include "arrow/filesystem/filesystem.h"
|
||||
#include "arrow/io/file.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/compression.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace dataset {
|
||||
|
||||
/// \defgroup dataset-file-formats File formats for reading and writing datasets
|
||||
/// \defgroup dataset-filesystem File system datasets
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief The path and filesystem where an actual file is located or a buffer which can
|
||||
/// be read like a file
|
||||
class ARROW_DS_EXPORT FileSource : public util::EqualityComparable<FileSource> {
|
||||
public:
|
||||
FileSource(std::string path, std::shared_ptr<fs::FileSystem> filesystem,
|
||||
Compression::type compression = Compression::UNCOMPRESSED)
|
||||
: file_info_(std::move(path)),
|
||||
filesystem_(std::move(filesystem)),
|
||||
compression_(compression) {}
|
||||
|
||||
FileSource(fs::FileInfo info, std::shared_ptr<fs::FileSystem> filesystem,
|
||||
Compression::type compression = Compression::UNCOMPRESSED)
|
||||
: file_info_(std::move(info)),
|
||||
filesystem_(std::move(filesystem)),
|
||||
compression_(compression) {}
|
||||
|
||||
explicit FileSource(std::shared_ptr<Buffer> buffer,
|
||||
Compression::type compression = Compression::UNCOMPRESSED)
|
||||
: buffer_(std::move(buffer)), compression_(compression) {}
|
||||
|
||||
using CustomOpen = std::function<Result<std::shared_ptr<io::RandomAccessFile>>()>;
|
||||
FileSource(CustomOpen open, int64_t size)
|
||||
: custom_open_(std::move(open)), custom_size_(size) {}
|
||||
|
||||
using CustomOpenWithCompression =
|
||||
std::function<Result<std::shared_ptr<io::RandomAccessFile>>(Compression::type)>;
|
||||
FileSource(CustomOpenWithCompression open_with_compression, int64_t size,
|
||||
Compression::type compression = Compression::UNCOMPRESSED)
|
||||
: custom_open_(std::bind(std::move(open_with_compression), compression)),
|
||||
custom_size_(size),
|
||||
compression_(compression) {}
|
||||
|
||||
/// \brief Construct from an existing file object and its size.
///
/// The stored opener keeps `file` alive and returns `ToResult(file)` each time it is
/// invoked.
///
/// \param[in] file the file object held by the stored opener
/// \param[in] size stored as the custom size of this source
/// \param[in] compression stored as this source's compression
FileSource(std::shared_ptr<io::RandomAccessFile> file, int64_t size,
           Compression::type compression = Compression::UNCOMPRESSED)
    // Capture exactly what is needed, and move the pointer into the closure rather
    // than copying it through a capture-default.
    : custom_open_([file = std::move(file)] { return ToResult(file); }),
      custom_size_(size),
      compression_(compression) {}
|
||||
|
||||
explicit FileSource(std::shared_ptr<io::RandomAccessFile> file,
|
||||
Compression::type compression = Compression::UNCOMPRESSED);
|
||||
|
||||
FileSource() : custom_open_(CustomOpen{&InvalidOpen}) {}
|
||||
|
||||
/// \brief Build one FileSource per path, all referring to the filesystem `fs`.
///
/// \param[in] fs filesystem handed to every created FileSource
/// \param[in] paths taken by value so each string can be moved into its FileSource
/// \return the sources, in the same order as `paths`
static std::vector<FileSource> FromPaths(const std::shared_ptr<fs::FileSystem>& fs,
                                         std::vector<std::string> paths) {
  std::vector<FileSource> sources;
  // The final size is known up front: allocate once instead of growing repeatedly.
  sources.reserve(paths.size());
  for (auto& path : paths) {
    sources.emplace_back(std::move(path), fs);
  }
  return sources;
}
|
||||
|
||||
/// \brief Return the type of raw compression on the file, if any.
|
||||
Compression::type compression() const { return compression_; }
|
||||
|
||||
/// \brief Return the file path, if any. Only valid when file source wraps a path.
|
||||
const std::string& path() const {
|
||||
static std::string buffer_path = "<Buffer>";
|
||||
static std::string custom_open_path = "<Buffer>";
|
||||
return filesystem_ ? file_info_.path() : buffer_ ? buffer_path : custom_open_path;
|
||||
}
|
||||
|
||||
/// \brief Return the filesystem, if any. Otherwise returns nullptr
|
||||
const std::shared_ptr<fs::FileSystem>& filesystem() const { return filesystem_; }
|
||||
|
||||
/// \brief Return the buffer containing the file, if any. Otherwise returns nullptr
|
||||
const std::shared_ptr<Buffer>& buffer() const { return buffer_; }
|
||||
|
||||
/// \brief Get a RandomAccessFile which views this file source
|
||||
Result<std::shared_ptr<io::RandomAccessFile>> Open() const;
|
||||
Future<std::shared_ptr<io::RandomAccessFile>> OpenAsync() const;
|
||||
|
||||
/// \brief Get the size (in bytes) of the file or buffer
|
||||
/// If the file is compressed this should be the compressed (on-disk) size.
|
||||
int64_t Size() const;
|
||||
|
||||
/// \brief Get an InputStream which views this file source (and decompresses if needed)
|
||||
/// \param[in] compression If nullopt, guess the compression scheme from the
|
||||
/// filename, else decompress with the given codec
|
||||
Result<std::shared_ptr<io::InputStream>> OpenCompressed(
|
||||
std::optional<Compression::type> compression = std::nullopt) const;
|
||||
|
||||
/// \brief equality comparison with another FileSource
|
||||
bool Equals(const FileSource& other) const;
|
||||
|
||||
private:
|
||||
static Result<std::shared_ptr<io::RandomAccessFile>> InvalidOpen() {
|
||||
return Status::Invalid("Called Open() on an uninitialized FileSource");
|
||||
}
|
||||
|
||||
fs::FileInfo file_info_;
|
||||
std::shared_ptr<fs::FileSystem> filesystem_;
|
||||
std::shared_ptr<Buffer> buffer_;
|
||||
CustomOpen custom_open_;
|
||||
int64_t custom_size_ = 0;
|
||||
Compression::type compression_ = Compression::UNCOMPRESSED;
|
||||
};
|
||||
|
||||
/// \brief Base class for file format implementation
|
||||
class ARROW_DS_EXPORT FileFormat : public std::enable_shared_from_this<FileFormat> {
|
||||
public:
|
||||
/// Options affecting how this format is scanned.
|
||||
///
|
||||
/// The options here can be overridden at scan time.
|
||||
std::shared_ptr<FragmentScanOptions> default_fragment_scan_options;
|
||||
|
||||
virtual ~FileFormat() = default;
|
||||
|
||||
/// \brief The name identifying the kind of file format
|
||||
virtual std::string type_name() const = 0;
|
||||
|
||||
virtual bool Equals(const FileFormat& other) const = 0;
|
||||
|
||||
/// \brief Indicate if the FileSource is supported/readable by this format.
|
||||
virtual Result<bool> IsSupported(const FileSource& source) const = 0;
|
||||
|
||||
/// \brief Return the schema of the file if possible.
|
||||
virtual Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const = 0;
|
||||
|
||||
/// \brief Learn what we need about the file before we start scanning it
|
||||
virtual Future<std::shared_ptr<InspectedFragment>> InspectFragment(
|
||||
const FileSource& source, const FragmentScanOptions* format_options,
|
||||
compute::ExecContext* exec_context) const;
|
||||
|
||||
virtual Result<RecordBatchGenerator> ScanBatchesAsync(
|
||||
const std::shared_ptr<ScanOptions>& options,
|
||||
const std::shared_ptr<FileFragment>& file) const = 0;
|
||||
|
||||
virtual Future<std::optional<int64_t>> CountRows(
|
||||
const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
|
||||
const std::shared_ptr<ScanOptions>& options);
|
||||
|
||||
virtual Future<std::shared_ptr<FragmentScanner>> BeginScan(
|
||||
const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
|
||||
const FragmentScanOptions* format_options,
|
||||
compute::ExecContext* exec_context) const;
|
||||
|
||||
/// \brief Open a fragment
|
||||
virtual Result<std::shared_ptr<FileFragment>> MakeFragment(
|
||||
FileSource source, compute::Expression partition_expression,
|
||||
std::shared_ptr<Schema> physical_schema);
|
||||
|
||||
/// \brief Create a FileFragment for a FileSource.
|
||||
Result<std::shared_ptr<FileFragment>> MakeFragment(
|
||||
FileSource source, compute::Expression partition_expression);
|
||||
|
||||
/// \brief Create a FileFragment for a FileSource.
|
||||
Result<std::shared_ptr<FileFragment>> MakeFragment(
|
||||
FileSource source, std::shared_ptr<Schema> physical_schema = NULLPTR);
|
||||
|
||||
/// \brief Create a writer for this format.
|
||||
virtual Result<std::shared_ptr<FileWriter>> MakeWriter(
|
||||
std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
|
||||
std::shared_ptr<FileWriteOptions> options,
|
||||
fs::FileLocator destination_locator) const = 0;
|
||||
|
||||
/// \brief Get default write options for this format.
|
||||
///
|
||||
/// May return null shared_ptr if this file format does not yet support
|
||||
/// writing datasets.
|
||||
virtual std::shared_ptr<FileWriteOptions> DefaultWriteOptions() = 0;
|
||||
|
||||
protected:
|
||||
explicit FileFormat(std::shared_ptr<FragmentScanOptions> default_fragment_scan_options)
|
||||
: default_fragment_scan_options(std::move(default_fragment_scan_options)) {}
|
||||
};
|
||||
|
||||
/// \brief A Fragment that is stored in a file with a known format
|
||||
class ARROW_DS_EXPORT FileFragment : public Fragment,
|
||||
public util::EqualityComparable<FileFragment> {
|
||||
public:
|
||||
Result<RecordBatchGenerator> ScanBatchesAsync(
|
||||
const std::shared_ptr<ScanOptions>& options) override;
|
||||
Future<std::optional<int64_t>> CountRows(
|
||||
compute::Expression predicate,
|
||||
const std::shared_ptr<ScanOptions>& options) override;
|
||||
Future<std::shared_ptr<FragmentScanner>> BeginScan(
|
||||
const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
|
||||
const FragmentScanOptions* format_options,
|
||||
compute::ExecContext* exec_context) override;
|
||||
Future<std::shared_ptr<InspectedFragment>> InspectFragment(
|
||||
const FragmentScanOptions* format_options,
|
||||
compute::ExecContext* exec_context) override;
|
||||
|
||||
std::string type_name() const override { return format_->type_name(); }
|
||||
/// \brief Describe this fragment by the path reported by its file source.
std::string ToString() const override { return source_.path(); }
|
||||
|
||||
const FileSource& source() const { return source_; }
|
||||
const std::shared_ptr<FileFormat>& format() const { return format_; }
|
||||
|
||||
bool Equals(const FileFragment& other) const;
|
||||
|
||||
protected:
|
||||
FileFragment(FileSource source, std::shared_ptr<FileFormat> format,
|
||||
compute::Expression partition_expression,
|
||||
std::shared_ptr<Schema> physical_schema)
|
||||
: Fragment(std::move(partition_expression), std::move(physical_schema)),
|
||||
source_(std::move(source)),
|
||||
format_(std::move(format)) {}
|
||||
|
||||
Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override;
|
||||
|
||||
FileSource source_;
|
||||
std::shared_ptr<FileFormat> format_;
|
||||
|
||||
friend class FileFormat;
|
||||
};
|
||||
|
||||
/// \brief A Dataset of FileFragments.
|
||||
///
|
||||
/// A FileSystemDataset is composed of one or more FileFragment. The fragments
|
||||
/// are independent and don't need to share the same format and/or filesystem.
|
||||
class ARROW_DS_EXPORT FileSystemDataset : public Dataset {
|
||||
public:
|
||||
/// \brief Create a FileSystemDataset.
|
||||
///
|
||||
/// \param[in] schema the schema of the dataset
|
||||
/// \param[in] root_partition the partition expression of the dataset
|
||||
/// \param[in] format the format of each FileFragment.
|
||||
/// \param[in] filesystem the filesystem of each FileFragment, or nullptr if the
|
||||
/// fragments wrap buffers.
|
||||
/// \param[in] fragments list of fragments to create the dataset from.
|
||||
/// \param[in] partitioning the Partitioning object in case the dataset is created
|
||||
/// with a known partitioning (e.g. from a discovered partitioning
|
||||
/// through a DatasetFactory), or nullptr if not known.
|
||||
///
|
||||
/// Note that fragments wrapping files resident in differing filesystems are not
|
||||
/// permitted; to work with multiple filesystems use a UnionDataset.
|
||||
///
|
||||
/// \return A constructed dataset.
|
||||
static Result<std::shared_ptr<FileSystemDataset>> Make(
|
||||
std::shared_ptr<Schema> schema, compute::Expression root_partition,
|
||||
std::shared_ptr<FileFormat> format, std::shared_ptr<fs::FileSystem> filesystem,
|
||||
std::vector<std::shared_ptr<FileFragment>> fragments,
|
||||
std::shared_ptr<Partitioning> partitioning = NULLPTR);
|
||||
|
||||
/// \brief Write a dataset.
|
||||
static Status Write(const FileSystemDatasetWriteOptions& write_options,
|
||||
std::shared_ptr<Scanner> scanner);
|
||||
|
||||
/// \brief Return the type name of the dataset.
|
||||
std::string type_name() const override { return "filesystem"; }
|
||||
|
||||
/// \brief Replace the schema of the dataset.
|
||||
Result<std::shared_ptr<Dataset>> ReplaceSchema(
|
||||
std::shared_ptr<Schema> schema) const override;
|
||||
|
||||
/// \brief Return the path of files.
|
||||
std::vector<std::string> files() const;
|
||||
|
||||
/// \brief Return the format.
|
||||
const std::shared_ptr<FileFormat>& format() const { return format_; }
|
||||
|
||||
/// \brief Return the filesystem. May be nullptr if the fragments wrap buffers.
|
||||
const std::shared_ptr<fs::FileSystem>& filesystem() const { return filesystem_; }
|
||||
|
||||
/// \brief Return the partitioning. May be nullptr if the dataset was not constructed
|
||||
/// with a partitioning.
|
||||
const std::shared_ptr<Partitioning>& partitioning() const { return partitioning_; }
|
||||
|
||||
std::string ToString() const;
|
||||
|
||||
protected:
|
||||
struct FragmentSubtrees;
|
||||
|
||||
explicit FileSystemDataset(std::shared_ptr<Schema> schema)
|
||||
: Dataset(std::move(schema)) {}
|
||||
|
||||
FileSystemDataset(std::shared_ptr<Schema> schema,
|
||||
compute::Expression partition_expression)
|
||||
: Dataset(std::move(schema), partition_expression) {}
|
||||
|
||||
Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override;
|
||||
|
||||
void SetupSubtreePruning();
|
||||
|
||||
std::shared_ptr<FileFormat> format_;
|
||||
std::shared_ptr<fs::FileSystem> filesystem_;
|
||||
std::vector<std::shared_ptr<FileFragment>> fragments_;
|
||||
std::shared_ptr<Partitioning> partitioning_;
|
||||
|
||||
std::shared_ptr<FragmentSubtrees> subtrees_;
|
||||
};
|
||||
|
||||
/// \brief Options for writing a file of this format.
|
||||
class ARROW_DS_EXPORT FileWriteOptions {
|
||||
public:
|
||||
virtual ~FileWriteOptions() = default;
|
||||
|
||||
const std::shared_ptr<FileFormat>& format() const { return format_; }
|
||||
|
||||
std::string type_name() const { return format_->type_name(); }
|
||||
|
||||
protected:
|
||||
explicit FileWriteOptions(std::shared_ptr<FileFormat> format)
|
||||
: format_(std::move(format)) {}
|
||||
|
||||
std::shared_ptr<FileFormat> format_;
|
||||
};
|
||||
|
||||
/// \brief A writer for this format.
|
||||
class ARROW_DS_EXPORT FileWriter {
|
||||
public:
|
||||
virtual ~FileWriter() = default;
|
||||
|
||||
/// \brief Write the given batch.
|
||||
virtual Status Write(const std::shared_ptr<RecordBatch>& batch) = 0;
|
||||
|
||||
/// \brief Write all batches from the reader.
|
||||
Status Write(RecordBatchReader* batches);
|
||||
|
||||
/// \brief Indicate that writing is done.
|
||||
virtual Future<> Finish();
|
||||
|
||||
const std::shared_ptr<FileFormat>& format() const { return options_->format(); }
|
||||
const std::shared_ptr<Schema>& schema() const { return schema_; }
|
||||
const std::shared_ptr<FileWriteOptions>& options() const { return options_; }
|
||||
const fs::FileLocator& destination() const { return destination_locator_; }
|
||||
|
||||
/// \brief After Finish() is called, provides number of bytes written to file.
|
||||
Result<int64_t> GetBytesWritten() const;
|
||||
|
||||
protected:
|
||||
FileWriter(std::shared_ptr<Schema> schema, std::shared_ptr<FileWriteOptions> options,
|
||||
std::shared_ptr<io::OutputStream> destination,
|
||||
fs::FileLocator destination_locator)
|
||||
: schema_(std::move(schema)),
|
||||
options_(std::move(options)),
|
||||
destination_(std::move(destination)),
|
||||
destination_locator_(std::move(destination_locator)) {}
|
||||
|
||||
virtual Future<> FinishInternal() = 0;
|
||||
|
||||
std::shared_ptr<Schema> schema_;
|
||||
std::shared_ptr<FileWriteOptions> options_;
|
||||
std::shared_ptr<io::OutputStream> destination_;
|
||||
fs::FileLocator destination_locator_;
|
||||
std::optional<int64_t> bytes_written_;
|
||||
};
|
||||
|
||||
/// \brief Options for writing a dataset.
|
||||
struct ARROW_DS_EXPORT FileSystemDatasetWriteOptions {
|
||||
/// Options for individual fragment writing.
|
||||
std::shared_ptr<FileWriteOptions> file_write_options;
|
||||
|
||||
/// FileSystem into which a dataset will be written.
|
||||
std::shared_ptr<fs::FileSystem> filesystem;
|
||||
|
||||
/// Root directory into which the dataset will be written.
|
||||
std::string base_dir;
|
||||
|
||||
/// Partitioning used to generate fragment paths.
|
||||
std::shared_ptr<Partitioning> partitioning;
|
||||
|
||||
/// If true the order of rows in the dataset is preserved when writing with
|
||||
/// multiple threads. This may cause notable performance degradation.
|
||||
bool preserve_order = false;
|
||||
|
||||
/// Maximum number of partitions any batch may be written into, default is 1K.
|
||||
int max_partitions = 1024;
|
||||
|
||||
/// Template string used to generate fragment basenames.
|
||||
/// {i} will be replaced by an auto incremented integer.
|
||||
std::string basename_template;
|
||||
|
||||
/// A functor which will be applied on an incremented counter. The result will be
|
||||
/// inserted into the basename_template in place of {i}.
|
||||
///
|
||||
/// This can be used, for example, to left-pad the file counter.
|
||||
std::function<std::string(int)> basename_template_functor;
|
||||
|
||||
/// If greater than 0 then this will limit the maximum number of files that can be left
|
||||
/// open. If an attempt is made to open too many files then the least recently used file
|
||||
/// will be closed. If this setting is set too low you may end up fragmenting your data
|
||||
/// into many small files.
|
||||
///
|
||||
/// The default is 900 which also allows some # of files to be open by the scanner
|
||||
/// before hitting the default Linux limit of 1024
|
||||
uint32_t max_open_files = 900;
|
||||
|
||||
/// If greater than 0 then this will limit how many rows are placed in any single file.
|
||||
/// Otherwise there will be no limit and one file will be created in each output
|
||||
/// directory unless files need to be closed to respect max_open_files
|
||||
uint64_t max_rows_per_file = 0;
|
||||
|
||||
/// If greater than 0 then this will cause the dataset writer to batch incoming data
|
||||
/// and only write the row groups to the disk when sufficient rows have accumulated.
|
||||
/// The final row group size may be less than this value and other options such as
|
||||
/// `max_open_files` or `max_rows_per_file` lead to smaller row group sizes.
|
||||
uint64_t min_rows_per_group = 0;
|
||||
|
||||
/// If greater than 0 then the dataset writer may split up large incoming batches into
|
||||
/// multiple row groups. If this value is set then min_rows_per_group should also be
|
||||
/// set or else you may end up with very small row groups (e.g. if the incoming row
|
||||
/// group size is just barely larger than this value).
|
||||
uint64_t max_rows_per_group = 1 << 20;
|
||||
|
||||
/// Controls what happens if an output directory already exists.
|
||||
ExistingDataBehavior existing_data_behavior = ExistingDataBehavior::kError;
|
||||
|
||||
/// \brief If false the dataset writer will not create directories
|
||||
/// This is mainly intended for filesystems that do not require directories such as S3.
|
||||
bool create_dir = true;
|
||||
|
||||
/// Callback to be invoked against all FileWriters before
|
||||
/// they are finalized with FileWriter::Finish().
|
||||
std::function<Status(FileWriter*)> writer_pre_finish = [](FileWriter*) {
|
||||
return Status::OK();
|
||||
};
|
||||
|
||||
/// Callback to be invoked against all FileWriters after they have
|
||||
/// called FileWriter::Finish().
|
||||
std::function<Status(FileWriter*)> writer_post_finish = [](FileWriter*) {
|
||||
return Status::OK();
|
||||
};
|
||||
|
||||
const std::shared_ptr<FileFormat>& format() const {
|
||||
return file_write_options->format();
|
||||
}
|
||||
};
|
||||
|
||||
/// \brief Wraps FileSystemDatasetWriteOptions for consumption as compute::ExecNodeOptions
|
||||
class ARROW_DS_EXPORT WriteNodeOptions : public acero::ExecNodeOptions {
|
||||
public:
|
||||
explicit WriteNodeOptions(
|
||||
FileSystemDatasetWriteOptions options,
|
||||
std::shared_ptr<const KeyValueMetadata> custom_metadata = NULLPTR)
|
||||
: write_options(std::move(options)), custom_metadata(std::move(custom_metadata)) {}
|
||||
|
||||
/// \brief Options to control how to write the dataset
|
||||
FileSystemDatasetWriteOptions write_options;
|
||||
/// \brief Optional schema to attach to all written batches
|
||||
///
|
||||
/// By default, we will use the output schema of the input.
|
||||
///
|
||||
/// This can be used to alter schema metadata, field nullability, or field metadata.
|
||||
/// However, this cannot be used to change the type of data. If the custom schema does
|
||||
/// not have the same number of fields and the same data types as the input then the
|
||||
/// plan will fail.
|
||||
std::shared_ptr<Schema> custom_schema;
|
||||
/// \brief Optional metadata to attach to written batches
|
||||
std::shared_ptr<const KeyValueMetadata> custom_metadata;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
namespace internal {
|
||||
ARROW_DS_EXPORT void InitializeDatasetWriter(arrow::acero::ExecFactoryRegistry* registry);
|
||||
}
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,144 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "arrow/csv/options.h"
|
||||
#include "arrow/dataset/dataset.h"
|
||||
#include "arrow/dataset/file_base.h"
|
||||
#include "arrow/dataset/type_fwd.h"
|
||||
#include "arrow/dataset/visibility.h"
|
||||
#include "arrow/ipc/type_fwd.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/compression.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace dataset {
|
||||
|
||||
/// Type name reported by the CSV file format and its scan options.
///
/// Declared `inline` so every translation unit including this header refers to
/// the same object; the inline type_name() bodies below odr-use it.
inline constexpr char kCsvTypeName[] = "csv";
|
||||
|
||||
/// \addtogroup dataset-file-formats
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief A FileFormat implementation that reads from and writes to Csv files
|
||||
class ARROW_DS_EXPORT CsvFileFormat : public FileFormat {
|
||||
public:
|
||||
// TODO(ARROW-18328) Remove this, moved to CsvFragmentScanOptions
|
||||
/// Options affecting the parsing of CSV files
|
||||
csv::ParseOptions parse_options = csv::ParseOptions::Defaults();
|
||||
|
||||
CsvFileFormat();
|
||||
|
||||
std::string type_name() const override { return kCsvTypeName; }
|
||||
|
||||
bool Equals(const FileFormat& other) const override;
|
||||
|
||||
Result<bool> IsSupported(const FileSource& source) const override;
|
||||
|
||||
/// \brief Return the schema of the file if possible.
|
||||
Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;
|
||||
|
||||
Future<std::shared_ptr<FragmentScanner>> BeginScan(
|
||||
const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
|
||||
const FragmentScanOptions* format_options,
|
||||
compute::ExecContext* exec_context) const override;
|
||||
|
||||
Result<RecordBatchGenerator> ScanBatchesAsync(
|
||||
const std::shared_ptr<ScanOptions>& scan_options,
|
||||
const std::shared_ptr<FileFragment>& file) const override;
|
||||
|
||||
Future<std::shared_ptr<InspectedFragment>> InspectFragment(
|
||||
const FileSource& source, const FragmentScanOptions* format_options,
|
||||
compute::ExecContext* exec_context) const override;
|
||||
|
||||
Future<std::optional<int64_t>> CountRows(
|
||||
const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
|
||||
const std::shared_ptr<ScanOptions>& options) override;
|
||||
|
||||
Result<std::shared_ptr<FileWriter>> MakeWriter(
|
||||
std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
|
||||
std::shared_ptr<FileWriteOptions> options,
|
||||
fs::FileLocator destination_locator) const override;
|
||||
|
||||
std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
|
||||
};
|
||||
|
||||
/// \brief Per-scan options for CSV fragments
|
||||
struct ARROW_DS_EXPORT CsvFragmentScanOptions : public FragmentScanOptions {
|
||||
std::string type_name() const override { return kCsvTypeName; }
|
||||
|
||||
using StreamWrapFunc = std::function<Result<std::shared_ptr<io::InputStream>>(
|
||||
std::shared_ptr<io::InputStream>)>;
|
||||
|
||||
/// CSV conversion options
|
||||
csv::ConvertOptions convert_options = csv::ConvertOptions::Defaults();
|
||||
|
||||
/// CSV reading options
|
||||
///
|
||||
/// Note that use_threads is always ignored.
|
||||
csv::ReadOptions read_options = csv::ReadOptions::Defaults();
|
||||
|
||||
/// CSV parse options
|
||||
csv::ParseOptions parse_options = csv::ParseOptions::Defaults();
|
||||
|
||||
/// Optional stream wrapping function
|
||||
///
|
||||
/// If defined, all open dataset file fragments will be passed
|
||||
/// through this function. One possible use case is to transparently
|
||||
/// transcode all input files from a given character set to utf8.
|
||||
StreamWrapFunc stream_transform_func{};
|
||||
};
|
||||
|
||||
/// \brief Write options for CSV files.
///
/// The constructor is protected and CsvFileFormat is a friend, so instances are
/// expected to come from CsvFileFormat rather than being built directly.
class ARROW_DS_EXPORT CsvFileWriteOptions : public FileWriteOptions {
 public:
  /// Options passed to csv::MakeCSVWriter.
  std::shared_ptr<csv::WriteOptions> write_options;

 protected:
  /// \brief Construct options bound to the given format.
  explicit CsvFileWriteOptions(std::shared_ptr<FileFormat> format)
      : FileWriteOptions(std::move(format)) {}

  friend class CsvFileFormat;
};
|
||||
|
||||
/// \brief A FileWriter that emits CSV.
///
/// The constructor is private and CsvFileFormat is a friend, so writers are
/// expected to be obtained through CsvFileFormat.
class ARROW_DS_EXPORT CsvFileWriter : public FileWriter {
 public:
  /// \brief Write the given batch.
  ///
  /// NOTE(review): overriding this overload hides the non-virtual
  /// FileWriter::Write(RecordBatchReader*) for calls made through a
  /// CsvFileWriter; call through a FileWriter reference/pointer to reach it.
  Status Write(const std::shared_ptr<RecordBatch>& batch) override;

 private:
  CsvFileWriter(std::shared_ptr<io::OutputStream> destination,
                std::shared_ptr<ipc::RecordBatchWriter> writer,
                std::shared_ptr<Schema> schema,
                std::shared_ptr<CsvFileWriteOptions> options,
                fs::FileLocator destination_locator);

  Future<> FinishInternal() override;

  // NOTE(review): has the same name and type as the protected
  // FileWriter::destination_ and hides it inside this class. Left as-is because
  // the out-of-line constructor that initializes it is not visible here.
  std::shared_ptr<io::OutputStream> destination_;
  std::shared_ptr<ipc::RecordBatchWriter> batch_writer_;

  friend class CsvFileFormat;
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,123 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "arrow/dataset/file_base.h"
|
||||
#include "arrow/dataset/type_fwd.h"
|
||||
#include "arrow/dataset/visibility.h"
|
||||
#include "arrow/io/type_fwd.h"
|
||||
#include "arrow/ipc/type_fwd.h"
|
||||
#include "arrow/result.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace dataset {
|
||||
|
||||
/// \addtogroup dataset-file-formats
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// Type name reported by the IPC file format and its scan options.
///
/// Declared `inline` so every translation unit including this header refers to
/// the same object; the inline type_name() bodies below odr-use it.
inline constexpr char kIpcTypeName[] = "ipc";
|
||||
|
||||
/// \brief A FileFormat implementation that reads from and writes to Ipc files
|
||||
class ARROW_DS_EXPORT IpcFileFormat : public FileFormat {
|
||||
public:
|
||||
std::string type_name() const override { return kIpcTypeName; }
|
||||
|
||||
IpcFileFormat();
|
||||
|
||||
bool Equals(const FileFormat& other) const override {
|
||||
return type_name() == other.type_name();
|
||||
}
|
||||
|
||||
Result<bool> IsSupported(const FileSource& source) const override;
|
||||
|
||||
/// \brief Return the schema of the file if possible.
|
||||
Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;
|
||||
|
||||
Result<RecordBatchGenerator> ScanBatchesAsync(
|
||||
const std::shared_ptr<ScanOptions>& options,
|
||||
const std::shared_ptr<FileFragment>& file) const override;
|
||||
|
||||
Future<std::optional<int64_t>> CountRows(
|
||||
const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
|
||||
const std::shared_ptr<ScanOptions>& options) override;
|
||||
|
||||
Result<std::shared_ptr<FileWriter>> MakeWriter(
|
||||
std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
|
||||
std::shared_ptr<FileWriteOptions> options,
|
||||
fs::FileLocator destination_locator) const override;
|
||||
|
||||
std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
|
||||
};
|
||||
|
||||
/// \brief Per-scan options for IPC fragments
|
||||
class ARROW_DS_EXPORT IpcFragmentScanOptions : public FragmentScanOptions {
|
||||
public:
|
||||
std::string type_name() const override { return kIpcTypeName; }
|
||||
|
||||
/// Options passed to the IPC file reader.
|
||||
/// included_fields, memory_pool, and use_threads are ignored.
|
||||
std::shared_ptr<ipc::IpcReadOptions> options;
|
||||
/// If present, the async scanner will enable I/O coalescing.
|
||||
/// This is ignored by the sync scanner.
|
||||
std::shared_ptr<io::CacheOptions> cache_options;
|
||||
};
|
||||
|
||||
/// \brief Write options for IPC files.
///
/// The constructor is protected and IpcFileFormat is a friend, so instances are
/// expected to come from IpcFileFormat rather than being built directly.
class ARROW_DS_EXPORT IpcFileWriteOptions : public FileWriteOptions {
 public:
  /// Options passed to ipc::MakeFileWriter. use_threads is ignored
  std::shared_ptr<ipc::IpcWriteOptions> options;

  /// custom_metadata written to the file's footer
  std::shared_ptr<const KeyValueMetadata> metadata;

 protected:
  /// \brief Construct options bound to the given format.
  explicit IpcFileWriteOptions(std::shared_ptr<FileFormat> format)
      : FileWriteOptions(std::move(format)) {}

  friend class IpcFileFormat;
};
|
||||
|
||||
/// \brief A FileWriter that emits Arrow IPC files.
///
/// The constructor is private and IpcFileFormat is a friend, so writers are
/// expected to be obtained through IpcFileFormat.
class ARROW_DS_EXPORT IpcFileWriter : public FileWriter {
 public:
  /// \brief Write the given batch.
  ///
  /// NOTE(review): overriding this overload hides the non-virtual
  /// FileWriter::Write(RecordBatchReader*) for calls made through an
  /// IpcFileWriter; call through a FileWriter reference/pointer to reach it.
  Status Write(const std::shared_ptr<RecordBatch>& batch) override;

 private:
  IpcFileWriter(std::shared_ptr<io::OutputStream> destination,
                std::shared_ptr<ipc::RecordBatchWriter> writer,
                std::shared_ptr<Schema> schema,
                std::shared_ptr<IpcFileWriteOptions> options,
                fs::FileLocator destination_locator);

  Future<> FinishInternal() override;

  // NOTE(review): has the same name and type as the protected
  // FileWriter::destination_ and hides it inside this class. Left as-is because
  // the out-of-line constructor that initializes it is not visible here.
  std::shared_ptr<io::OutputStream> destination_;
  std::shared_ptr<ipc::RecordBatchWriter> batch_writer_;

  friend class IpcFileFormat;
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user