Initial commit
This commit is contained in:
@@ -0,0 +1,33 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/result.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
template <typename InputIterator, typename OutputIterator, typename UnaryOperation>
|
||||
Status MaybeTransform(InputIterator first, InputIterator last, OutputIterator out,
|
||||
UnaryOperation unary_op) {
|
||||
for (; first != last; ++first, (void)++out) {
|
||||
ARROW_ASSIGN_OR_RAISE(*out, unary_op(*first));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,221 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "arrow/memory_pool.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
// Parameters returned by BitmapWordAlign(), describing how a bitmap range is
// split into a leading unaligned area, a word-aligned area, and a trailing
// unaligned area.
struct BitmapWordAlignParams {
  // Number of bits before the first aligned word
  int64_t leading_bits;
  // Number of bits after the last aligned word
  int64_t trailing_bits;
  // Bit offset of the trailing area, relative to the original `data` pointer
  int64_t trailing_bit_offset;
  // Pointer to the first byte of the word-aligned area
  const uint8_t* aligned_start;
  // Number of bits in the word-aligned area (= aligned_words * word size in bits)
  int64_t aligned_bits;
  // Number of whole aligned words
  int64_t aligned_words;
};
|
||||
|
||||
// Compute parameters for accessing a bitmap using aligned word instructions.
|
||||
// The returned parameters describe:
|
||||
// - a leading area of size `leading_bits` before the aligned words
|
||||
// - a word-aligned area of size `aligned_bits`
|
||||
// - a trailing area of size `trailing_bits` after the aligned words
|
||||
template <uint64_t ALIGN_IN_BYTES>
|
||||
inline BitmapWordAlignParams BitmapWordAlign(const uint8_t* data, int64_t bit_offset,
|
||||
int64_t length) {
|
||||
static_assert(bit_util::IsPowerOf2(ALIGN_IN_BYTES),
|
||||
"ALIGN_IN_BYTES should be a positive power of two");
|
||||
constexpr uint64_t ALIGN_IN_BITS = ALIGN_IN_BYTES * 8;
|
||||
|
||||
BitmapWordAlignParams p;
|
||||
|
||||
// Compute a "bit address" that we can align up to ALIGN_IN_BITS.
|
||||
// We don't care about losing the upper bits since we are only interested in the
|
||||
// difference between both addresses.
|
||||
const uint64_t bit_addr =
|
||||
reinterpret_cast<size_t>(data) * 8 + static_cast<uint64_t>(bit_offset);
|
||||
const uint64_t aligned_bit_addr = bit_util::RoundUpToPowerOf2(bit_addr, ALIGN_IN_BITS);
|
||||
|
||||
p.leading_bits = std::min<int64_t>(length, aligned_bit_addr - bit_addr);
|
||||
p.aligned_words = (length - p.leading_bits) / ALIGN_IN_BITS;
|
||||
p.aligned_bits = p.aligned_words * ALIGN_IN_BITS;
|
||||
p.trailing_bits = length - p.leading_bits - p.aligned_bits;
|
||||
p.trailing_bit_offset = bit_offset + p.leading_bits + p.aligned_bits;
|
||||
|
||||
p.aligned_start = data + (bit_offset + p.leading_bits) / 8;
|
||||
return p;
|
||||
}
|
||||
} // namespace internal
|
||||
|
||||
namespace util {
|
||||
|
||||
// Functions to check if the provided Arrow object is aligned by the specified alignment

/// \brief Special alignment value to use data type-specific alignment
///
/// If this is passed as the `alignment` in one of the CheckAlignment or EnsureAlignment
/// functions, then the function will ensure each buffer is suitably aligned
/// for the data type of the array. For example, given an int32 buffer the values
/// buffer's address must be a multiple of 4. Given a large_string buffer the offsets
/// buffer's address must be a multiple of 8.
constexpr int64_t kValueAlignment = -3;

/// \brief Calculate if the buffer's address is a multiple of `alignment`
///
/// If `alignment` is less than or equal to 0 then this method will always return true
/// \param buffer the buffer to check
/// \param alignment the alignment (in bytes) to check for
ARROW_EXPORT bool CheckAlignment(const Buffer& buffer, int64_t alignment);
/// \brief Calculate if all buffers in the array data are aligned
///
/// This will also check the buffers in the dictionary and any children
/// \param array the array data to check
/// \param alignment the alignment (in bytes) to check for
ARROW_EXPORT bool CheckAlignment(const ArrayData& array, int64_t alignment);
/// \brief Calculate if all buffers in the array are aligned
///
/// This will also check the buffers in the dictionary and any children
/// \param array the array to check
/// \param alignment the alignment (in bytes) to check for
ARROW_EXPORT bool CheckAlignment(const Array& array, int64_t alignment);

// The following functions require an additional boolean vector which stores the
// alignment check bits of the constituent objects.
// For example, the needs_alignment vector for a ChunkedArray will contain the
// check bits of the constituent Arrays.
// The boolean vector check was introduced to minimize the repetitive checks
// of the constituent objects during the EnsureAlignment function where certain
// objects can be ignored for further checking if we already know that they are
// completely aligned.
// NOTE(review): these declarations use std::vector<bool>; <vector> is presumably
// provided transitively by the arrow headers included above -- confirm.

/// \brief Calculate which (if any) chunks in a chunked array are unaligned
/// \param array the array to check
/// \param alignment the alignment (in bytes) to check for
/// \param needs_alignment an output vector that will store the results of the check
///        it must be set to a valid vector. Extra elements will be added to the end
///        of the vector for each chunk that is checked. `true` will be stored if
///        the chunk is unaligned.
/// \param offset the index of the chunk to start checking
/// \return true if all chunks (starting at `offset`) are aligned, false otherwise
ARROW_EXPORT bool CheckAlignment(const ChunkedArray& array, int64_t alignment,
                                 std::vector<bool>* needs_alignment, int offset = 0);

/// \brief Calculate which (if any) columns in a record batch are unaligned
/// \param batch the batch to check
/// \param alignment the alignment (in bytes) to check for
/// \param needs_alignment an output vector that will store the results of the
///        check. It must be set to a valid vector. Extra elements will be added
///        to the end of the vector for each column that is checked. `true` will be
///        stored if the column is unaligned.
ARROW_EXPORT bool CheckAlignment(const RecordBatch& batch, int64_t alignment,
                                 std::vector<bool>* needs_alignment);

/// \brief Calculate which (if any) columns in a table are unaligned
/// \param table the table to check
/// \param alignment the alignment (in bytes) to check for
/// \param needs_alignment an output vector that will store the results of the
///        check. It must be set to a valid vector. Extra elements will be added
///        to the end of the vector for each column that is checked. `true` will be
///        stored if the column is unaligned.
ARROW_EXPORT bool CheckAlignment(const Table& table, int64_t alignment,
                                 std::vector<bool>* needs_alignment);

/// \brief Return a buffer that has the given alignment and the same data as the input
///        buffer
///
/// If the input buffer is already aligned then this method will return the input buffer
/// If the input buffer is not already aligned then this method will allocate a new
/// buffer. The alignment of the new buffer will have at least
/// max(kDefaultBufferAlignment, alignment) bytes of alignment.
///
/// \param buffer the buffer to check
/// \param alignment the alignment (in bytes) to check for
/// \param memory_pool a memory pool that will be used to allocate a new buffer if the
///        input buffer is not sufficiently aligned
ARROW_EXPORT Result<std::shared_ptr<Buffer>> EnsureAlignment(
    std::shared_ptr<Buffer> buffer, int64_t alignment, MemoryPool* memory_pool);

/// \brief Return an array data where all buffers are aligned by the given alignment
///
/// If any input buffer is already aligned then this method will reuse that same input
/// buffer.
///
/// \param array_data the array data to check
/// \param alignment the alignment (in bytes) to check for
/// \param memory_pool a memory pool that will be used to allocate new buffers if any
///        input buffer is not sufficiently aligned
ARROW_EXPORT Result<std::shared_ptr<ArrayData>> EnsureAlignment(
    std::shared_ptr<ArrayData> array_data, int64_t alignment, MemoryPool* memory_pool);

/// \brief Return an array where all buffers are aligned by the given alignment
///
/// If any input buffer is already aligned then this method will reuse that same input
/// buffer.
///
/// \param array the array to check
/// \param alignment the alignment (in bytes) to check for
/// \param memory_pool a memory pool that will be used to allocate new buffers if any
///        input buffer is not sufficiently aligned
ARROW_EXPORT Result<std::shared_ptr<Array>> EnsureAlignment(std::shared_ptr<Array> array,
                                                            int64_t alignment,
                                                            MemoryPool* memory_pool);

/// \brief Return a chunked array where all buffers are aligned by the given alignment
///
/// If any input buffer is already aligned then this method will reuse that same input
/// buffer.
///
/// \param array the chunked array to check
/// \param alignment the alignment (in bytes) to check for
/// \param memory_pool a memory pool that will be used to allocate new buffers if any
///        input buffer is not sufficiently aligned
ARROW_EXPORT Result<std::shared_ptr<ChunkedArray>> EnsureAlignment(
    std::shared_ptr<ChunkedArray> array, int64_t alignment, MemoryPool* memory_pool);

/// \brief Return a record batch where all buffers are aligned by the given alignment
///
/// If any input buffer is already aligned then this method will reuse that same input
/// buffer.
///
/// \param batch the batch to check
/// \param alignment the alignment (in bytes) to check for
/// \param memory_pool a memory pool that will be used to allocate new buffers if any
///        input buffer is not sufficiently aligned
ARROW_EXPORT Result<std::shared_ptr<RecordBatch>> EnsureAlignment(
    std::shared_ptr<RecordBatch> batch, int64_t alignment, MemoryPool* memory_pool);

/// \brief Return a table where all buffers are aligned by the given alignment
///
/// If any input buffer is already aligned then this method will reuse that same input
/// buffer.
///
/// \param table the table to check
/// \param alignment the alignment (in bytes) to check for
/// \param memory_pool a memory pool that will be used to allocate new buffers if any
///        input buffer is not sufficiently aligned
ARROW_EXPORT Result<std::shared_ptr<Table>> EnsureAlignment(std::shared_ptr<Table> table,
                                                            int64_t alignment,
                                                            MemoryPool* memory_pool);
|
||||
|
||||
} // namespace util
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,126 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
#include <cstring>
#include <type_traits>
#include <utility>

#include "arrow/util/launder.h"
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
// A piece of raw storage with the size and alignment of a single value of type T.
//
// No T is constructed automatically: callers explicitly manage the value's
// lifetime through construct() / destroy() and the move helpers below.
template <typename T>
class AlignedStorage {
 public:
  // Trivial types can be relocated in bulk with memcpy instead of
  // per-element move construction (see the enable_if overloads below).
  static constexpr bool can_memcpy = std::is_trivial<T>::value;

  // Return a pointer to the stored value.
  // launder() is required to legally access an object that was
  // placement-constructed inside data_.
  constexpr T* get() noexcept {
    return arrow::internal::launder(reinterpret_cast<T*>(&data_));
  }

  constexpr const T* get() const noexcept {
    // Use fully qualified name to avoid ambiguities with MSVC (ARROW-14800)
    return arrow::internal::launder(reinterpret_cast<const T*>(&data_));
  }

  // Destroy the stored value.  No-op for trivially destructible types.
  void destroy() noexcept {
    if (!std::is_trivially_destructible<T>::value) {
      get()->~T();
    }
  }

  // Placement-construct a T from `args` inside this storage.
  // NOTE(review): declared noexcept; if T's constructor throws, the program
  // terminates.
  template <typename... A>
  void construct(A&&... args) noexcept {
    new (&data_) T(std::forward<A>(args)...);
  }

  // Assign to the already-constructed stored value.
  template <typename V>
  void assign(V&& v) noexcept {
    *get() = std::forward<V>(v);
  }

  // Placement-construct a T by moving from `other`'s stored value.
  void move_construct(AlignedStorage* other) noexcept {
    new (&data_) T(std::move(*other->get()));
  }

  // Move-assign from `other`'s stored value into the stored value.
  void move_assign(AlignedStorage* other) noexcept { *get() = std::move(*other->get()); }

  // Bulk move-construction, memcpy path (trivial T).
  // `n` is unused here; exactly `memcpy_length` elements are copied.
  template <bool CanMemcpy = can_memcpy>
  static typename std::enable_if<CanMemcpy>::type move_construct_several(
      AlignedStorage* ARROW_RESTRICT src, AlignedStorage* ARROW_RESTRICT dest, size_t n,
      size_t memcpy_length) noexcept {
    memcpy(dest->get(), src->get(), memcpy_length * sizeof(T));
  }

  // Bulk move + destroy, memcpy path (trivial T): destruction is a no-op,
  // so this is identical to move_construct_several.
  template <bool CanMemcpy = can_memcpy>
  static typename std::enable_if<CanMemcpy>::type
  move_construct_several_and_destroy_source(AlignedStorage* ARROW_RESTRICT src,
                                            AlignedStorage* ARROW_RESTRICT dest, size_t n,
                                            size_t memcpy_length) noexcept {
    memcpy(dest->get(), src->get(), memcpy_length * sizeof(T));
  }

  // Bulk move-construction, per-element path (non-trivial T).
  // Moves exactly `n` elements; `memcpy_length` is unused here.
  template <bool CanMemcpy = can_memcpy>
  static typename std::enable_if<!CanMemcpy>::type move_construct_several(
      AlignedStorage* ARROW_RESTRICT src, AlignedStorage* ARROW_RESTRICT dest, size_t n,
      size_t memcpy_length) noexcept {
    for (size_t i = 0; i < n; ++i) {
      new (dest[i].get()) T(std::move(*src[i].get()));
    }
  }

  // Bulk move-construction + source destruction, per-element path.
  template <bool CanMemcpy = can_memcpy>
  static typename std::enable_if<!CanMemcpy>::type
  move_construct_several_and_destroy_source(AlignedStorage* ARROW_RESTRICT src,
                                            AlignedStorage* ARROW_RESTRICT dest, size_t n,
                                            size_t memcpy_length) noexcept {
    for (size_t i = 0; i < n; ++i) {
      new (dest[i].get()) T(std::move(*src[i].get()));
      src[i].destroy();
    }
  }

  // Convenience overload: move `n` elements, copying all `n` in the memcpy path.
  static void move_construct_several(AlignedStorage* ARROW_RESTRICT src,
                                     AlignedStorage* ARROW_RESTRICT dest,
                                     size_t n) noexcept {
    move_construct_several(src, dest, n, n);
  }

  // Convenience overload: move `n` elements and destroy the sources.
  static void move_construct_several_and_destroy_source(
      AlignedStorage* ARROW_RESTRICT src, AlignedStorage* ARROW_RESTRICT dest,
      size_t n) noexcept {
    move_construct_several_and_destroy_source(src, dest, n, n);
  }

  // Destroy `n` stored values.  No-op for trivially destructible types.
  static void destroy_several(AlignedStorage* p, size_t n) noexcept {
    if (!std::is_trivially_destructible<T>::value) {
      for (size_t i = 0; i < n; ++i) {
        p[i].destroy();
      }
    }
  }

 private:
  // Raw bytes with the alignment of T; the value lives here after construct().
  alignas(T) std::byte data_[sizeof(T)];
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,69 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/type_fwd.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
// An AsyncGenerator is a callable that, on each invocation, returns a Future
// resolving to the next item of an asynchronous sequence.
template <typename T>
using AsyncGenerator = std::function<Future<T>()>;

// Forward declarations of the generator combinator templates; the definitions
// presumably live in the corresponding async generator header -- see includers.
template <typename T, typename V>
class MappingGenerator;

template <typename T, typename ComesAfter, typename IsNext>
class SequencingGenerator;

template <typename T, typename V>
class TransformingGenerator;

template <typename T>
class SerialReadaheadGenerator;

template <typename T>
class ReadaheadGenerator;

template <typename T>
class PushGenerator;

template <typename T>
class MergedGenerator;

template <typename T>
class EnumeratingGenerator;

template <typename T>
class TransferringGenerator;

template <typename T>
class BackgroundGenerator;

template <typename T>
class GeneratorIterator;

template <typename T>
struct CancellableGenerator;

template <typename T>
class DefaultIfEmptyGenerator;
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,460 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <functional>
|
||||
#include <list>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/cancel.h"
|
||||
#include "arrow/util/functional.h"
|
||||
#include "arrow/util/future.h"
|
||||
#include "arrow/util/iterator.h"
|
||||
#include "arrow/util/mutex.h"
|
||||
#include "arrow/util/thread_pool.h"
|
||||
#include "arrow/util/tracing.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using internal::FnOnce;
|
||||
|
||||
namespace util {
|
||||
|
||||
/// A utility which keeps tracks of, and schedules, asynchronous tasks
|
||||
///
|
||||
/// An asynchronous task has a synchronous component and an asynchronous component.
|
||||
/// The synchronous component typically schedules some kind of work on an external
|
||||
/// resource (e.g. the I/O thread pool or some kind of kernel-based asynchronous
|
||||
/// resource like io_uring). The asynchronous part represents the work
|
||||
/// done on that external resource. Executing the synchronous part will be referred
|
||||
/// to as "submitting the task" since this usually includes submitting the asynchronous
|
||||
/// portion to the external thread pool.
|
||||
///
|
||||
/// By default the scheduler will submit the task (execute the synchronous part) as
|
||||
/// soon as it is added, assuming the underlying thread pool hasn't terminated or the
|
||||
/// scheduler hasn't aborted. In this mode, the scheduler is simply acting as
|
||||
/// a simple task group.
|
||||
///
|
||||
/// A task scheduler starts with an initial task. That task, and all subsequent tasks
|
||||
/// are free to add subtasks. Once all submitted tasks finish the scheduler will
|
||||
/// finish. Note, it is not an error to add additional tasks after a scheduler has
|
||||
/// aborted. These tasks will be ignored and never submitted. The scheduler returns a
|
||||
/// future which will complete when all submitted tasks have finished executing. Once all
|
||||
/// tasks have been finished the scheduler is invalid and should no longer be used.
|
||||
///
|
||||
/// Task failure (either the synchronous portion or the asynchronous portion) will cause
|
||||
/// the scheduler to enter an aborted state. The first such failure will be reported in
|
||||
/// the final task future.
|
||||
class ARROW_EXPORT AsyncTaskScheduler {
 public:
  /// Destructor for AsyncTaskScheduler
  ///
  /// The lifetime of the task scheduler is managed automatically. The scheduler
  /// will remain valid while any tasks are running (and can always be safely accessed
  /// within tasks) and will be destroyed as soon as all tasks have finished.
  virtual ~AsyncTaskScheduler() = default;
  /// An interface for a task
  ///
  /// Users may want to override this, for example, to add priority
  /// information for use by a queue.
  class Task {
   public:
    virtual ~Task() = default;
    /// Submit the task
    ///
    /// This will be called by the scheduler at most once when there
    /// is space to run the task. This is expected to be a fairly quick
    /// function that simply submits the actual task work to an external
    /// resource (e.g. I/O thread pool).
    ///
    /// If this call fails then the scheduler will enter an aborted state.
    virtual Result<Future<>> operator()() = 0;
    /// The cost of the task
    ///
    /// A ThrottledAsyncTaskScheduler can be used to limit the number of concurrent tasks.
    /// A custom cost may be used, for example, if you would like to limit the number of
    /// tasks based on the total expected RAM usage of the tasks (this is done in the
    /// scanner)
    virtual int cost() const { return 1; }
    /// The name of the task
    ///
    /// This is used for debugging and traceability. The returned view must remain
    /// valid for the lifetime of the task.
    virtual std::string_view name() const = 0;

    /// A span tied to the lifetime of the task, for internal use only
    tracing::Span span;
  };

  /// Add a task to the scheduler
  ///
  /// If the scheduler is in an aborted state this call will return false and the task
  /// will never be run. This is harmless and does not need to be guarded against.
  ///
  /// The return value for this call can usually be ignored. There is little harm in
  /// attempting to add tasks to an aborted scheduler. It is only included for callers
  /// that want to avoid future task generation to save effort.
  ///
  /// \param task the task to submit
  ///
  /// A task's name must remain valid for the duration of the task. It is used for
  /// debugging (e.g. when debugging a deadlock to see which tasks still remain) and for
  /// traceability (the name will be used for spans assigned to the task)
  ///
  /// \return true if the task was submitted or queued, false if the task was ignored
  virtual bool AddTask(std::unique_ptr<Task> task) = 0;

  /// Adds an async generator to the scheduler
  ///
  /// The async generator will be visited, one item at a time. Submitting a task
  /// will consist of polling the generator for the next future. The generator's future
  /// will then represent the task itself.
  ///
  /// This visits the task serially without readahead. If readahead or parallelism
  /// is desired then it should be added in the generator itself.
  ///
  /// The generator itself will be kept alive until all tasks have been completed.
  /// However, if the scheduler is aborted, the generator will be destroyed as soon as the
  /// next item would be requested.
  ///
  /// \param generator the generator to submit to the scheduler
  /// \param visitor a function which visits each generator future as it completes
  /// \param name a name which will be used for each submitted task
  template <typename T>
  bool AddAsyncGenerator(std::function<Future<T>()> generator,
                         std::function<Status(const T&)> visitor, std::string_view name);

  /// A Task implementation wrapping a callable and a (possibly owned) name
  template <typename Callable>
  struct SimpleTask : public Task {
    // This overload only views `name`; the caller must keep the string alive.
    SimpleTask(Callable callable, std::string_view name)
        : callable(std::move(callable)), name_(name) {}
    // This overload takes ownership of `name` and points name_ at the owned
    // copy.  NOTE(review): name_ references owned_name_'s buffer, so a
    // SimpleTask must not be copied or moved after construction; AddSimpleTask
    // heap-allocates it once and never relocates it.
    SimpleTask(Callable callable, std::string name)
        : callable(std::move(callable)), owned_name_(std::move(name)) {
      name_ = *owned_name_;
    }
    Result<Future<>> operator()() override { return callable(); }
    std::string_view name() const override { return name_; }
    Callable callable;
    std::string_view name_;
    std::optional<std::string> owned_name_;
  };

  /// Add a task with cost 1 to the scheduler
  ///
  /// \param callable a "submit" function that should return a future
  /// \param name a name for the task
  ///
  /// `name` must remain valid until the task has been submitted AND the returned
  /// future completes. It is used for debugging and tracing.
  ///
  /// \see AddTask for more details
  template <typename Callable>
  bool AddSimpleTask(Callable callable, std::string_view name) {
    return AddTask(std::make_unique<SimpleTask<Callable>>(std::move(callable), name));
  }

  /// Add a task with cost 1 to the scheduler
  ///
  /// This is an overload of \see AddSimpleTask that keeps `name` alive
  /// in the task.
  template <typename Callable>
  bool AddSimpleTask(Callable callable, std::string name) {
    return AddTask(
        std::make_unique<SimpleTask<Callable>>(std::move(callable), std::move(name)));
  }

  /// Construct a scheduler
  ///
  /// \param initial_task The initial task which is responsible for adding
  ///        the first subtasks to the scheduler.
  /// \param abort_callback A callback that will be triggered immediately after a task
  ///        fails while other tasks may still be running. Nothing needs to be done here,
  ///        when a task fails the scheduler will stop accepting new tasks and eventually
  ///        return the error. However, this callback can be used to more quickly end
  ///        long running tasks that have already been submitted. Defaults to doing
  ///        nothing.
  /// \param stop_token An optional stop token that will allow cancellation of the
  ///        scheduler. This will be checked before each task is submitted and, in the
  ///        event of a cancellation, the scheduler will enter an aborted state. This is
  ///        a graceful cancellation and submitted tasks will still complete.
  /// \return A future that will be completed when the initial task and all subtasks have
  ///         finished.
  static Future<> Make(
      FnOnce<Status(AsyncTaskScheduler*)> initial_task,
      FnOnce<void(const Status&)> abort_callback = [](const Status&) {},
      StopToken stop_token = StopToken::Unstoppable());

  /// A span tracking execution of the scheduler's tasks, for internal use only
  virtual const tracing::Span& span() const = 0;
};
|
||||
|
||||
class ARROW_EXPORT ThrottledAsyncTaskScheduler : public AsyncTaskScheduler {
|
||||
public:
|
||||
/// An interface for a task queue
|
||||
///
|
||||
/// A queue's methods will not be called concurrently
|
||||
class Queue {
|
||||
public:
|
||||
virtual ~Queue() = default;
|
||||
/// Push a task to the queue
|
||||
///
|
||||
/// \param task the task to enqueue
|
||||
virtual void Push(std::unique_ptr<Task> task) = 0;
|
||||
/// Pop the next task from the queue
|
||||
virtual std::unique_ptr<Task> Pop() = 0;
|
||||
/// Peek the next task in the queue
|
||||
virtual const Task& Peek() = 0;
|
||||
/// Check if the queue is empty
|
||||
virtual bool Empty() = 0;
|
||||
/// Purge the queue of all items
|
||||
virtual void Purge() = 0;
|
||||
virtual std::size_t Size() const = 0;
|
||||
};
|
||||
|
||||
class Throttle {
|
||||
public:
|
||||
virtual ~Throttle() = default;
|
||||
/// Acquire amt permits
|
||||
///
|
||||
/// If nullopt is returned then the permits were immediately
|
||||
/// acquired and the caller can proceed. If a future is returned then the caller
|
||||
/// should wait for the future to complete first. When the returned future completes
|
||||
/// the permits have NOT been acquired and the caller must call Acquire again
|
||||
///
|
||||
/// \param amt the number of permits to acquire
|
||||
virtual std::optional<Future<>> TryAcquire(int amt) = 0;
|
||||
/// Release amt permits
|
||||
///
|
||||
/// This will possibly complete waiting futures and should probably not be
|
||||
/// called while holding locks.
|
||||
///
|
||||
/// \param amt the number of permits to release
|
||||
virtual void Release(int amt) = 0;
|
||||
|
||||
/// The size of the largest task that can run
|
||||
///
|
||||
/// Incoming tasks will have their cost latched to this value to ensure
|
||||
/// they can still run (although they will be the only thing allowed to
|
||||
/// run at that time).
|
||||
virtual int Capacity() = 0;
|
||||
|
||||
/// Pause the throttle
|
||||
///
|
||||
/// Any tasks that have been submitted already will continue. However, no new tasks
|
||||
/// will be run until the throttle is resumed.
|
||||
virtual void Pause() = 0;
|
||||
/// Resume the throttle
|
||||
///
|
||||
/// Allows task to be submitted again. If there is a max_concurrent_cost limit then
|
||||
/// it will still apply.
|
||||
virtual void Resume() = 0;
|
||||
};
|
||||
|
||||
/// Pause the throttle
|
||||
///
|
||||
/// Any tasks that have been submitted already will continue. However, no new tasks
|
||||
/// will be run until the throttle is resumed.
|
||||
virtual void Pause() = 0;
|
||||
/// Resume the throttle
|
||||
///
|
||||
/// Allows task to be submitted again. If there is a max_concurrent_cost limit then
|
||||
/// it will still apply.
|
||||
virtual void Resume() = 0;
|
||||
/// Return the number of tasks queued but not yet submitted
|
||||
virtual std::size_t QueueSize() = 0;
|
||||
|
||||
/// Create a throttled view of a scheduler
|
||||
///
|
||||
/// Tasks added via this view will be subjected to the throttle and, if the tasks cannot
|
||||
/// run immediately, will be placed into a queue.
|
||||
///
|
||||
/// Although a shared_ptr is returned it should generally be assumed that the caller
|
||||
/// is being given exclusive ownership. The shared_ptr is used to share the view with
|
||||
/// queued and submitted tasks and the lifetime of those is unpredictable. It is
|
||||
/// important the caller keep the returned pointer alive for as long as they plan to add
|
||||
/// tasks to the view.
|
||||
///
|
||||
/// \param scheduler a scheduler to submit tasks to after throttling
|
||||
///
|
||||
/// This can be the root scheduler, another throttled scheduler, or a task group. These
|
||||
/// are all composable.
|
||||
///
|
||||
/// \param max_concurrent_cost the maximum amount of cost allowed to run at any one time
|
||||
///
|
||||
/// If a task is added that has a cost greater than max_concurrent_cost then its cost
|
||||
/// will be reduced to max_concurrent_cost so that it is still possible for the task to
|
||||
/// run.
|
||||
///
|
||||
/// \param queue the queue to use when tasks cannot be submitted
|
||||
///
|
||||
/// By default a FIFO queue will be used. However, a custom queue can be provided if
|
||||
/// some tasks have higher priority than other tasks.
|
||||
static std::shared_ptr<ThrottledAsyncTaskScheduler> Make(
|
||||
AsyncTaskScheduler* scheduler, int max_concurrent_cost,
|
||||
std::unique_ptr<Queue> queue = NULLPTR);
|
||||
|
||||
/// @brief Create a ThrottledAsyncTaskScheduler using a custom throttle
|
||||
///
|
||||
/// \see Make
|
||||
static std::shared_ptr<ThrottledAsyncTaskScheduler> MakeWithCustomThrottle(
|
||||
AsyncTaskScheduler* scheduler, std::unique_ptr<Throttle> throttle,
|
||||
std::unique_ptr<Queue> queue = NULLPTR);
|
||||
};
|
||||
|
||||
/// A utility to keep track of a collection of tasks
|
||||
///
|
||||
/// Often it is useful to keep track of some state that only needs to stay alive
|
||||
/// for some small collection of tasks, or to perform some kind of final cleanup
|
||||
/// when a collection of tasks is finished.
|
||||
///
|
||||
/// For example, when scanning, we need to keep the file reader alive while all scan
|
||||
/// tasks run for a given file, and then we can gracefully close it when we finish the
|
||||
/// file.
|
||||
class ARROW_EXPORT AsyncTaskGroup : public AsyncTaskScheduler {
|
||||
public:
|
||||
/// Destructor for the task group
|
||||
///
|
||||
/// The destructor might trigger the finish callback. If the finish callback fails
|
||||
/// then the error will be reported as a task on the scheduler.
|
||||
///
|
||||
/// Failure to destroy the async task group will not prevent the scheduler from
|
||||
/// finishing. If the scheduler finishes before the async task group is done then
|
||||
/// the finish callback will be run immediately when the async task group finishes.
|
||||
///
|
||||
/// If the scheduler has aborted then the finish callback will not run.
|
||||
~AsyncTaskGroup() = default;
|
||||
/// Create an async task group
|
||||
///
|
||||
/// The finish callback will not run until the task group is destroyed and all
|
||||
/// tasks are finished so you will generally want to reset / destroy the returned
|
||||
/// unique_ptr at some point.
|
||||
///
|
||||
/// \param scheduler The underlying scheduler to submit tasks to
|
||||
/// \param finish_callback A callback that will be run only after the task group has
|
||||
/// been destroyed and all tasks added by the group have
|
||||
/// finished.
|
||||
///
|
||||
/// Note: in error scenarios the finish callback may not run. However, it will still,
|
||||
/// of course, be destroyed.
|
||||
static std::unique_ptr<AsyncTaskGroup> Make(AsyncTaskScheduler* scheduler,
|
||||
FnOnce<Status()> finish_callback);
|
||||
};
|
||||
|
||||
/// Create a task group that is also throttled
|
||||
///
|
||||
/// This is a utility factory that creates a throttled view of a scheduler and then
|
||||
/// wraps that throttled view with a task group that destroys the throttle when finished.
|
||||
///
|
||||
/// \see ThrottledAsyncTaskScheduler
|
||||
/// \see AsyncTaskGroup
|
||||
/// \param target the underlying scheduler to submit tasks to
|
||||
/// \param max_concurrent_cost the maximum amount of cost allowed to run at any one time
|
||||
/// \param queue the queue to use when tasks cannot be submitted
|
||||
/// \param finish_callback A callback that will be run only after the task group has
|
||||
/// been destroyed and all tasks added by the group have finished
|
||||
ARROW_EXPORT std::unique_ptr<ThrottledAsyncTaskScheduler> MakeThrottledAsyncTaskGroup(
|
||||
AsyncTaskScheduler* target, int max_concurrent_cost,
|
||||
std::unique_ptr<ThrottledAsyncTaskScheduler::Queue> queue,
|
||||
FnOnce<Status()> finish_callback);
|
||||
|
||||
// Defined down here to avoid circular dependency between AsyncTaskScheduler and
|
||||
// AsyncTaskGroup
|
||||
template <typename T>
|
||||
bool AsyncTaskScheduler::AddAsyncGenerator(std::function<Future<T>()> generator,
|
||||
std::function<Status(const T&)> visitor,
|
||||
std::string_view name) {
|
||||
struct State {
|
||||
State(std::function<Future<T>()> generator, std::function<Status(const T&)> visitor,
|
||||
std::unique_ptr<AsyncTaskGroup> task_group, std::string_view name)
|
||||
: generator(std::move(generator)),
|
||||
visitor(std::move(visitor)),
|
||||
task_group(std::move(task_group)),
|
||||
name(name) {}
|
||||
std::function<Future<T>()> generator;
|
||||
std::function<Status(const T&)> visitor;
|
||||
std::unique_ptr<AsyncTaskGroup> task_group;
|
||||
std::string_view name;
|
||||
};
|
||||
struct SubmitTask : public Task {
|
||||
explicit SubmitTask(std::unique_ptr<State> state_holder)
|
||||
: state_holder(std::move(state_holder)) {}
|
||||
|
||||
struct SubmitTaskCallback {
|
||||
SubmitTaskCallback(std::unique_ptr<State> state_holder, Future<> task_completion)
|
||||
: state_holder(std::move(state_holder)),
|
||||
task_completion(std::move(task_completion)) {}
|
||||
void operator()(const Result<T>& maybe_item) {
|
||||
if (!maybe_item.ok()) {
|
||||
task_completion.MarkFinished(maybe_item.status());
|
||||
return;
|
||||
}
|
||||
const auto& item = *maybe_item;
|
||||
if (IsIterationEnd(item)) {
|
||||
task_completion.MarkFinished();
|
||||
return;
|
||||
}
|
||||
Status visit_st = state_holder->visitor(item);
|
||||
if (!visit_st.ok()) {
|
||||
task_completion.MarkFinished(std::move(visit_st));
|
||||
return;
|
||||
}
|
||||
state_holder->task_group->AddTask(
|
||||
std::make_unique<SubmitTask>(std::move(state_holder)));
|
||||
task_completion.MarkFinished();
|
||||
}
|
||||
std::unique_ptr<State> state_holder;
|
||||
Future<> task_completion;
|
||||
};
|
||||
|
||||
Result<Future<>> operator()() {
|
||||
Future<> task = Future<>::Make();
|
||||
// Consume as many items as we can (those that are already finished)
|
||||
// synchronously to avoid recursion / stack overflow.
|
||||
while (true) {
|
||||
Future<T> next = state_holder->generator();
|
||||
if (next.TryAddCallback(
|
||||
[&] { return SubmitTaskCallback(std::move(state_holder), task); })) {
|
||||
return task;
|
||||
}
|
||||
ARROW_ASSIGN_OR_RAISE(T item, next.result());
|
||||
if (IsIterationEnd(item)) {
|
||||
task.MarkFinished();
|
||||
return task;
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(state_holder->visitor(item));
|
||||
}
|
||||
}
|
||||
|
||||
std::string_view name() const { return state_holder->name; }
|
||||
|
||||
std::unique_ptr<State> state_holder;
|
||||
};
|
||||
std::unique_ptr<AsyncTaskGroup> task_group =
|
||||
AsyncTaskGroup::Make(this, [] { return Status::OK(); });
|
||||
AsyncTaskGroup* task_group_view = task_group.get();
|
||||
std::unique_ptr<State> state_holder = std::make_unique<State>(
|
||||
std::move(generator), std::move(visitor), std::move(task_group), name);
|
||||
task_group_view->AddTask(std::make_unique<SubmitTask>(std::move(state_holder)));
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace util
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,35 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace util {
|
||||
|
||||
ARROW_EXPORT
|
||||
std::string base64_encode(std::string_view s);
|
||||
|
||||
ARROW_EXPORT
|
||||
std::string base64_decode(std::string_view s);
|
||||
|
||||
} // namespace util
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,887 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <climits>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
|
||||
#include "arrow/util/endian.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/type_traits.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
enum class DecimalStatus {
|
||||
kSuccess,
|
||||
kDivideByZero,
|
||||
kOverflow,
|
||||
kRescaleDataLoss,
|
||||
};
|
||||
|
||||
template <typename Derived, int BIT_WIDTH, int NWORDS = BIT_WIDTH / 64>
|
||||
class GenericBasicDecimal {
|
||||
protected:
|
||||
struct LittleEndianArrayTag {};
|
||||
|
||||
#if ARROW_LITTLE_ENDIAN
|
||||
static constexpr int kHighWordIndex = NWORDS - 1;
|
||||
static constexpr int kLowWordIndex = 0;
|
||||
#else
|
||||
static constexpr int kHighWordIndex = 0;
|
||||
static constexpr int kLowWordIndex = NWORDS - 1;
|
||||
#endif
|
||||
|
||||
public:
|
||||
static constexpr int kBitWidth = BIT_WIDTH;
|
||||
static constexpr int kByteWidth = kBitWidth / 8;
|
||||
static constexpr int kNumWords = NWORDS;
|
||||
|
||||
// A constructor tag to introduce a little-endian encoded array
|
||||
static constexpr LittleEndianArrayTag LittleEndianArray{};
|
||||
|
||||
using WordArray = std::array<uint64_t, NWORDS>;
|
||||
|
||||
/// \brief Empty constructor creates a decimal with a value of 0.
|
||||
constexpr GenericBasicDecimal() noexcept : array_({0}) {}
|
||||
|
||||
/// \brief Create a decimal from the two's complement representation.
|
||||
///
|
||||
/// Input array is assumed to be in native endianness.
|
||||
explicit constexpr GenericBasicDecimal(const WordArray& array) noexcept
|
||||
: array_(array) {}
|
||||
|
||||
/// \brief Create a decimal from the two's complement representation.
|
||||
///
|
||||
/// Input array is assumed to be in little endianness, with native endian elements.
|
||||
GenericBasicDecimal(LittleEndianArrayTag, const WordArray& array) noexcept
|
||||
: GenericBasicDecimal(bit_util::little_endian::ToNative(array)) {}
|
||||
|
||||
/// \brief Create a decimal from any integer not wider than 64 bits.
|
||||
template <typename T,
|
||||
typename = typename std::enable_if<
|
||||
std::is_integral<T>::value && (sizeof(T) <= sizeof(uint64_t)), T>::type>
|
||||
constexpr GenericBasicDecimal(T value) noexcept // NOLINT(runtime/explicit)
|
||||
: array_(WordsFromLowBits(value)) {}
|
||||
|
||||
/// \brief Create a decimal from an array of bytes.
|
||||
///
|
||||
/// Bytes are assumed to be in native-endian byte order.
|
||||
explicit GenericBasicDecimal(const uint8_t* bytes) {
|
||||
memcpy(array_.data(), bytes, sizeof(array_));
|
||||
}
|
||||
|
||||
/// \brief Get the bits of the two's complement representation of the number.
|
||||
///
|
||||
/// The elements are in native endian order. The bits within each uint64_t element
|
||||
/// are in native endian order. For example, on a little endian machine,
|
||||
/// BasicDecimal128(123).native_endian_array() = {123, 0};
|
||||
/// but on a big endian machine,
|
||||
/// BasicDecimal128(123).native_endian_array() = {0, 123};
|
||||
constexpr const WordArray& native_endian_array() const { return array_; }
|
||||
|
||||
/// \brief Get the bits of the two's complement representation of the number.
|
||||
///
|
||||
/// The elements are in little endian order. However, the bits within each
|
||||
/// uint64_t element are in native endian order.
|
||||
/// For example, BasicDecimal128(123).little_endian_array() = {123, 0};
|
||||
WordArray little_endian_array() const {
|
||||
return bit_util::little_endian::FromNative(array_);
|
||||
}
|
||||
|
||||
const uint8_t* native_endian_bytes() const {
|
||||
return reinterpret_cast<const uint8_t*>(array_.data());
|
||||
}
|
||||
|
||||
uint8_t* mutable_native_endian_bytes() {
|
||||
return reinterpret_cast<uint8_t*>(array_.data());
|
||||
}
|
||||
|
||||
/// \brief Return the raw bytes of the value in native-endian byte order.
|
||||
std::array<uint8_t, kByteWidth> ToBytes() const {
|
||||
std::array<uint8_t, kByteWidth> out{{0}};
|
||||
memcpy(out.data(), array_.data(), kByteWidth);
|
||||
return out;
|
||||
}
|
||||
|
||||
/// \brief Copy the raw bytes of the value in native-endian byte order.
|
||||
void ToBytes(uint8_t* out) const { memcpy(out, array_.data(), kByteWidth); }
|
||||
|
||||
/// Return 1 if positive or zero, -1 if strictly negative.
|
||||
int64_t Sign() const {
|
||||
return 1 | (static_cast<int64_t>(array_[kHighWordIndex]) >> 63);
|
||||
}
|
||||
|
||||
bool IsNegative() const { return static_cast<int64_t>(array_[kHighWordIndex]) < 0; }
|
||||
|
||||
explicit operator bool() const { return array_ != WordArray{}; }
|
||||
|
||||
friend bool operator==(const GenericBasicDecimal& left,
|
||||
const GenericBasicDecimal& right) {
|
||||
return left.array_ == right.array_;
|
||||
}
|
||||
|
||||
friend bool operator!=(const GenericBasicDecimal& left,
|
||||
const GenericBasicDecimal& right) {
|
||||
return left.array_ != right.array_;
|
||||
}
|
||||
|
||||
protected:
|
||||
WordArray array_;
|
||||
|
||||
template <typename T>
|
||||
static constexpr uint64_t SignExtend(T low_bits) noexcept {
|
||||
return low_bits >= T{} ? uint64_t{0} : ~uint64_t{0};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static constexpr WordArray WordsFromLowBits(T low_bits) {
|
||||
WordArray words{};
|
||||
if (low_bits < T{}) {
|
||||
for (auto& word : words) {
|
||||
word = ~uint64_t{0};
|
||||
}
|
||||
}
|
||||
words[kLowWordIndex] = static_cast<uint64_t>(low_bits);
|
||||
return words;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename DigitType>
|
||||
class ARROW_EXPORT SmallBasicDecimal {
|
||||
public:
|
||||
static_assert(
|
||||
std::is_same_v<DigitType, int32_t> || std::is_same_v<DigitType, int64_t>,
|
||||
"for bitwidths larger than 64 bits use BasicDecimal128 and BasicDecimal256");
|
||||
|
||||
static constexpr int kMaxPrecision = std::numeric_limits<DigitType>::digits10;
|
||||
static constexpr int kMaxScale = kMaxPrecision;
|
||||
static constexpr int kBitWidth = sizeof(DigitType) * CHAR_BIT;
|
||||
static constexpr int kByteWidth = sizeof(DigitType);
|
||||
|
||||
using WordArray = std::array<std::make_unsigned_t<DigitType>, 1>;
|
||||
|
||||
/// \brief Empty constructor creates a decimal with a value of 0.
|
||||
constexpr SmallBasicDecimal() noexcept : value_(0) {}
|
||||
|
||||
/// \brief Create a decimal from any integer not wider than 64 bits.
|
||||
template <typename T,
|
||||
typename = typename std::enable_if<
|
||||
std::is_integral<T>::value && (sizeof(T) <= sizeof(int64_t)), T>::type>
|
||||
constexpr SmallBasicDecimal(T value) noexcept // NOLINT(runtime/explicit)
|
||||
: value_(static_cast<DigitType>(value)) {}
|
||||
|
||||
/// \brief Create a decimal from an array of bytes.
|
||||
///
|
||||
/// Bytes are assumed to be in native-endian byte order.
|
||||
explicit SmallBasicDecimal(const uint8_t* bytes) {
|
||||
memcpy(&value_, bytes, sizeof(value_));
|
||||
}
|
||||
|
||||
constexpr const WordArray native_endian_array() const {
|
||||
return WordArray{static_cast<typename WordArray::value_type>(value_)};
|
||||
}
|
||||
|
||||
constexpr const WordArray little_endian_array() const {
|
||||
return bit_util::little_endian::FromNative(
|
||||
WordArray{static_cast<typename WordArray::value_type>(value_)});
|
||||
}
|
||||
|
||||
const uint8_t* native_endian_bytes() const {
|
||||
return reinterpret_cast<const uint8_t*>(&value_);
|
||||
}
|
||||
|
||||
uint8_t* mutable_native_endian_bytes() { return reinterpret_cast<uint8_t*>(&value_); }
|
||||
|
||||
/// \brief Return the raw bytes of the value in native-endian byte order.
|
||||
std::array<uint8_t, kByteWidth> ToBytes() const {
|
||||
std::array<uint8_t, kByteWidth> out{{0}};
|
||||
memcpy(out.data(), &value_, kByteWidth);
|
||||
return out;
|
||||
}
|
||||
|
||||
/// \brief Copy the raw bytes of the value in native-endian byte order
|
||||
void ToBytes(uint8_t* out) const { memcpy(out, &value_, kByteWidth); }
|
||||
|
||||
/// \brief Return 1 if positive or 0, -1 if strictly negative
|
||||
int64_t Sign() const { return 1 | (value_ >> (kBitWidth - 1)); }
|
||||
|
||||
bool IsNegative() const { return value_ < 0; }
|
||||
|
||||
explicit operator bool() const { return value_ != 0; }
|
||||
|
||||
friend bool operator==(const SmallBasicDecimal& left, const SmallBasicDecimal& right) {
|
||||
return left.value_ == right.value_;
|
||||
}
|
||||
|
||||
friend bool operator!=(const SmallBasicDecimal& left, const SmallBasicDecimal& right) {
|
||||
return left.value_ != right.value_;
|
||||
}
|
||||
|
||||
DigitType value() const { return value_; }
|
||||
|
||||
/// \brief count the number of leading binary zeroes.
|
||||
int32_t CountLeadingBinaryZeros() const;
|
||||
|
||||
constexpr uint64_t low_bits() const { return static_cast<uint64_t>(value_); }
|
||||
|
||||
protected:
|
||||
DigitType value_;
|
||||
};
|
||||
|
||||
class BasicDecimal32;
|
||||
class BasicDecimal64;
|
||||
|
||||
ARROW_EXPORT bool operator<(const BasicDecimal32& left, const BasicDecimal32& right);
|
||||
ARROW_EXPORT bool operator<=(const BasicDecimal32& left, const BasicDecimal32& right);
|
||||
ARROW_EXPORT bool operator>(const BasicDecimal32& left, const BasicDecimal32& right);
|
||||
ARROW_EXPORT bool operator>=(const BasicDecimal32& left, const BasicDecimal32& right);
|
||||
|
||||
ARROW_EXPORT BasicDecimal32 operator-(const BasicDecimal32& self);
|
||||
ARROW_EXPORT BasicDecimal32 operator~(const BasicDecimal32& self);
|
||||
ARROW_EXPORT BasicDecimal32 operator+(const BasicDecimal32& left,
|
||||
const BasicDecimal32& right);
|
||||
ARROW_EXPORT BasicDecimal32 operator-(const BasicDecimal32& left,
|
||||
const BasicDecimal32& right);
|
||||
ARROW_EXPORT BasicDecimal32 operator*(const BasicDecimal32& left,
|
||||
const BasicDecimal32& right);
|
||||
ARROW_EXPORT BasicDecimal32 operator/(const BasicDecimal32& left,
|
||||
const BasicDecimal32& right);
|
||||
ARROW_EXPORT BasicDecimal32 operator%(const BasicDecimal32& left,
|
||||
const BasicDecimal32& right);
|
||||
|
||||
class ARROW_EXPORT BasicDecimal32 : public SmallBasicDecimal<int32_t> {
|
||||
public:
|
||||
using SmallBasicDecimal<int32_t>::SmallBasicDecimal;
|
||||
using ValueType = int32_t;
|
||||
|
||||
/// \brief Negate the current value (in-place)
|
||||
BasicDecimal32& Negate();
|
||||
|
||||
/// \brief Absolute value (in-place)
|
||||
BasicDecimal32& Abs() { return *this < 0 ? Negate() : *this; }
|
||||
|
||||
/// \brief Absolute value
|
||||
static BasicDecimal32 Abs(const BasicDecimal32& in) {
|
||||
BasicDecimal32 result(in);
|
||||
return result.Abs();
|
||||
}
|
||||
|
||||
/// \brief Add a number to this one. The result is truncated to 32 bits.
|
||||
BasicDecimal32& operator+=(const BasicDecimal32& right) {
|
||||
value_ += right.value_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// \brief Subtract a number from this one. The result is truncated to 32 bits.
|
||||
BasicDecimal32& operator-=(const BasicDecimal32& right) {
|
||||
value_ -= right.value_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// \brief Multiply this number by another. The result is truncated to 32 bits.
|
||||
BasicDecimal32& operator*=(const BasicDecimal32& right) {
|
||||
value_ *= static_cast<uint64_t>(right.value_);
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// \brief Divide this number by the divisor and return the result.
|
||||
///
|
||||
/// This operation is not destructive.
|
||||
/// The answer rounds to zero. Signs work like:
|
||||
/// 21 / 5 -> 4, 1
|
||||
/// -21 / 5 -> -4, -1
|
||||
/// 21 / -5 -> -4, 1
|
||||
/// -21 / -5 -> 4, -1
|
||||
/// \param[in] divisor the number to divide by
|
||||
/// \param[out] result the quotient
|
||||
/// \param[out] remainder the remainder after the division
|
||||
DecimalStatus Divide(const BasicDecimal32& divisor, BasicDecimal32* result,
|
||||
BasicDecimal32* remainder) const;
|
||||
|
||||
/// \brief In-place division
|
||||
BasicDecimal32& operator/=(const BasicDecimal32& right) {
|
||||
value_ /= right.value_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// \brief Bitwise "or" between two BasicDecimal32s
|
||||
BasicDecimal32& operator|=(const BasicDecimal32& right) {
|
||||
value_ |= right.value_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// \brief Bitwise "and" between two BasicDecimal32s
|
||||
BasicDecimal32& operator&=(const BasicDecimal32& right) {
|
||||
value_ &= right.value_;
|
||||
return *this;
|
||||
}
|
||||
/// \brief Shift left by the given number of bits.
|
||||
BasicDecimal32& operator<<=(uint32_t bits);
|
||||
|
||||
BasicDecimal32 operator<<(uint32_t bits) const {
|
||||
auto res = *this;
|
||||
res <<= bits;
|
||||
return res;
|
||||
}
|
||||
|
||||
/// \brief Shift right by the given number of bits.
|
||||
///
|
||||
/// Negative values will sign-extend
|
||||
BasicDecimal32& operator>>=(uint32_t bits);
|
||||
|
||||
BasicDecimal32 operator>>(uint32_t bits) const {
|
||||
auto res = *this;
|
||||
res >>= bits;
|
||||
return res;
|
||||
}
|
||||
|
||||
/// \brief Convert BasicDecimal32 from one scale to another
|
||||
DecimalStatus Rescale(int32_t original_scale, int32_t new_scale,
|
||||
BasicDecimal32* out) const;
|
||||
|
||||
void GetWholeAndFraction(int scale, BasicDecimal32* whole,
|
||||
BasicDecimal32* fraction) const;
|
||||
|
||||
/// \brief Scale up.
|
||||
BasicDecimal32 IncreaseScaleBy(int32_t increase_by) const;
|
||||
|
||||
/// \brief Scale down.
|
||||
///
|
||||
/// - If 'round' is true, the right-most digits are dropped and the result value is
|
||||
/// rounded up (+1 for +ve, -1 for -ve) based on the value of the dropped digits
|
||||
/// (>= 10^reduce_by / 2).
|
||||
/// - If 'round' is false, the right-most digits are simply dropped.
|
||||
BasicDecimal32 ReduceScaleBy(int32_t reduce_by, bool round = true) const;
|
||||
|
||||
/// \brief Whether this number fits in the given precision
|
||||
///
|
||||
/// Return true if the number of significant digits is less or equal to 'precision'.
|
||||
bool FitsInPrecision(int32_t precision) const;
|
||||
|
||||
/// \brief Get the maximum valid unscaled decimal value.
|
||||
static const BasicDecimal32& GetMaxValue();
|
||||
/// \brief Get the maximum valid unscaled decimal value for the given precision.
|
||||
static BasicDecimal32 GetMaxValue(int32_t precision);
|
||||
|
||||
/// \brief Get the maximum decimal value (is not a valid value).
|
||||
static constexpr BasicDecimal32 GetMaxSentinel() {
|
||||
return BasicDecimal32(std::numeric_limits<int32_t>::max());
|
||||
}
|
||||
|
||||
/// \brief Get the minimum decimal value (is not a valid value).
|
||||
static constexpr BasicDecimal32 GetMinSentinel() {
|
||||
return BasicDecimal32(std::numeric_limits<int32_t>::min());
|
||||
}
|
||||
|
||||
/// \brief Scale multiplier for a given scale value.
|
||||
static const BasicDecimal32& GetScaleMultiplier(int32_t scale);
|
||||
/// \brief Half-scale multiplier for a given scale value.
|
||||
static const BasicDecimal32& GetHalfScaleMultiplier(int32_t scale);
|
||||
|
||||
explicit operator BasicDecimal64() const;
|
||||
};
|
||||
|
||||
ARROW_EXPORT bool operator<(const BasicDecimal64& left, const BasicDecimal64& right);
|
||||
ARROW_EXPORT bool operator<=(const BasicDecimal64& left, const BasicDecimal64& right);
|
||||
ARROW_EXPORT bool operator>(const BasicDecimal64& left, const BasicDecimal64& right);
|
||||
ARROW_EXPORT bool operator>=(const BasicDecimal64& left, const BasicDecimal64& right);
|
||||
|
||||
ARROW_EXPORT BasicDecimal64 operator-(const BasicDecimal64& self);
|
||||
ARROW_EXPORT BasicDecimal64 operator~(const BasicDecimal64& self);
|
||||
ARROW_EXPORT BasicDecimal64 operator+(const BasicDecimal64& left,
|
||||
const BasicDecimal64& right);
|
||||
ARROW_EXPORT BasicDecimal64 operator-(const BasicDecimal64& left,
|
||||
const BasicDecimal64& right);
|
||||
ARROW_EXPORT BasicDecimal64 operator*(const BasicDecimal64& left,
|
||||
const BasicDecimal64& right);
|
||||
ARROW_EXPORT BasicDecimal64 operator/(const BasicDecimal64& left,
|
||||
const BasicDecimal64& right);
|
||||
ARROW_EXPORT BasicDecimal64 operator%(const BasicDecimal64& left,
|
||||
const BasicDecimal64& right);
|
||||
|
||||
class ARROW_EXPORT BasicDecimal64 : public SmallBasicDecimal<int64_t> {
|
||||
public:
|
||||
using SmallBasicDecimal<int64_t>::SmallBasicDecimal;
|
||||
using ValueType = int64_t;
|
||||
|
||||
/// \brief Negate the current value (in-place)
|
||||
BasicDecimal64& Negate();
|
||||
|
||||
/// \brief Absolute value (in-place)
|
||||
BasicDecimal64& Abs() { return *this < 0 ? Negate() : *this; }
|
||||
|
||||
/// \brief Absolute value
|
||||
static BasicDecimal64 Abs(const BasicDecimal64& in) {
|
||||
BasicDecimal64 result(in);
|
||||
return result.Abs();
|
||||
}
|
||||
|
||||
/// \brief Add a number to this one. The result is truncated to 32 bits.
|
||||
BasicDecimal64& operator+=(const BasicDecimal64& right) {
|
||||
value_ += right.value_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// \brief Subtract a number from this one. The result is truncated to 32 bits.
|
||||
BasicDecimal64& operator-=(const BasicDecimal64& right) {
|
||||
value_ -= right.value_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// \brief Multiply this number by another. The result is truncated to 32 bits.
|
||||
BasicDecimal64& operator*=(const BasicDecimal64& right) {
|
||||
value_ *= static_cast<uint64_t>(right.value_);
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// \brief Divide this number by the divisor and return the result.
|
||||
///
|
||||
/// This operation is not destructive.
|
||||
/// The answer rounds to zero. Signs work like:
|
||||
/// 21 / 5 -> 4, 1
|
||||
/// -21 / 5 -> -4, -1
|
||||
/// 21 / -5 -> -4, 1
|
||||
/// -21 / -5 -> 4, -1
|
||||
/// \param[in] divisor the number to divide by
|
||||
/// \param[out] result the quotient
|
||||
/// \param[out] remainder the remainder after the division
|
||||
DecimalStatus Divide(const BasicDecimal64& divisor, BasicDecimal64* result,
|
||||
BasicDecimal64* remainder) const;
|
||||
|
||||
/// \brief In-place division
|
||||
BasicDecimal64& operator/=(const BasicDecimal64& right) {
|
||||
value_ /= right.value_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// \brief Bitwise "or" between two BasicDecimal64s
|
||||
BasicDecimal64& operator|=(const BasicDecimal64& right) {
|
||||
value_ |= right.value_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// \brief Bitwise "and" between two BasicDecimal64s
|
||||
BasicDecimal64& operator&=(const BasicDecimal64& right) {
|
||||
value_ &= right.value_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// \brief Shift left by the given number of bits.
|
||||
BasicDecimal64& operator<<=(uint32_t bits);
|
||||
|
||||
BasicDecimal64 operator<<(uint32_t bits) const {
|
||||
auto res = *this;
|
||||
res <<= bits;
|
||||
return res;
|
||||
}
|
||||
|
||||
/// \brief Shift right by the given number of bits.
|
||||
///
|
||||
/// Negative values will sign-extend
|
||||
BasicDecimal64& operator>>=(uint32_t bits);
|
||||
|
||||
BasicDecimal64 operator>>(uint32_t bits) const {
|
||||
auto res = *this;
|
||||
res >>= bits;
|
||||
return res;
|
||||
}
|
||||
|
||||
/// \brief Convert BasicDecimal64 from one scale to another
DecimalStatus Rescale(int32_t original_scale, int32_t new_scale,
                      BasicDecimal64* out) const;

/// \brief Separate the integer and fractional parts for the given scale.
// NOTE(review): the sibling BasicDecimal128/256 classes declare this with
// `int32_t scale`; confirm whether this `int` parameter should match them.
void GetWholeAndFraction(int scale, BasicDecimal64* whole,
                         BasicDecimal64* fraction) const;

/// \brief Scale up.
BasicDecimal64 IncreaseScaleBy(int32_t increase_by) const;

/// \brief Scale down.
///
/// - If 'round' is true, the right-most digits are dropped and the result value is
///   rounded up (+1 for +ve, -1 for -ve) based on the value of the dropped digits
///   (>= 10^reduce_by / 2).
/// - If 'round' is false, the right-most digits are simply dropped.
BasicDecimal64 ReduceScaleBy(int32_t reduce_by, bool round = true) const;

/// \brief Whether this number fits in the given precision
///
/// Return true if the number of significant digits is less or equal to 'precision'.
bool FitsInPrecision(int32_t precision) const;

/// \brief Get the maximum valid unscaled decimal value.
static const BasicDecimal64& GetMaxValue();
/// \brief Get the maximum valid unscaled decimal value for the given precision.
static BasicDecimal64 GetMaxValue(int32_t precision);
|
||||
|
||||
/// \brief Get the maximum decimal value (is not a valid value).
|
||||
static constexpr BasicDecimal64 GetMaxSentinel() {
|
||||
return BasicDecimal64(std::numeric_limits<int32_t>::max());
|
||||
}
|
||||
|
||||
/// \brief Get the minimum decimal value (is not a valid value).
|
||||
static constexpr BasicDecimal64 GetMinSentinel() {
|
||||
return BasicDecimal64(std::numeric_limits<int32_t>::min());
|
||||
}
|
||||
|
||||
/// \brief Scale multiplier for a given scale value.
static const BasicDecimal64& GetScaleMultiplier(int32_t scale);
/// \brief Half-scale multiplier for a given scale value.
static const BasicDecimal64& GetHalfScaleMultiplier(int32_t scale);
};
|
||||
|
||||
/// Represents a signed 128-bit integer in two's complement.
///
/// This class is also compiled into LLVM IR - so, it should not have cpp references like
/// streams and boost.
class ARROW_EXPORT BasicDecimal128 : public GenericBasicDecimal<BasicDecimal128, 128> {
 public:
  /// Maximum number of significant decimal digits representable.
  static constexpr int kMaxPrecision = 38;
  /// Maximum supported scale.
  static constexpr int kMaxScale = 38;

  using GenericBasicDecimal::GenericBasicDecimal;

  constexpr BasicDecimal128() noexcept : GenericBasicDecimal() {}

  /// \brief Create a BasicDecimal128 from the two's complement representation.
  ///
  /// The word order passed to the underlying WordArray depends on endianness.
#if ARROW_LITTLE_ENDIAN
  constexpr BasicDecimal128(int64_t high, uint64_t low) noexcept
      : BasicDecimal128(WordArray{low, static_cast<uint64_t>(high)}) {}
#else
  constexpr BasicDecimal128(int64_t high, uint64_t low) noexcept
      : BasicDecimal128(WordArray{static_cast<uint64_t>(high), low}) {}
#endif

  /// \brief Negate the current value (in-place)
  BasicDecimal128& Negate();

  /// \brief Absolute value (in-place)
  BasicDecimal128& Abs();

  /// \brief Absolute value
  static BasicDecimal128 Abs(const BasicDecimal128& left);

  /// \brief Add a number to this one. The result is truncated to 128 bits.
  BasicDecimal128& operator+=(const BasicDecimal128& right);

  /// \brief Subtract a number from this one. The result is truncated to 128 bits.
  BasicDecimal128& operator-=(const BasicDecimal128& right);

  /// \brief Multiply this number by another number. The result is truncated to 128 bits.
  BasicDecimal128& operator*=(const BasicDecimal128& right);

  /// Divide this number by right and return the result.
  ///
  /// This operation is not destructive.
  /// The answer rounds to zero. Signs work like:
  ///    21 /  5 ->  4,  1
  ///   -21 /  5 -> -4, -1
  ///    21 / -5 -> -4,  1
  ///   -21 / -5 ->  4, -1
  /// \param[in] divisor the number to divide by
  /// \param[out] result the quotient
  /// \param[out] remainder the remainder after the division
  DecimalStatus Divide(const BasicDecimal128& divisor, BasicDecimal128* result,
                       BasicDecimal128* remainder) const;

  /// \brief In-place division.
  BasicDecimal128& operator/=(const BasicDecimal128& right);

  /// \brief Bitwise "or" between two BasicDecimal128.
  BasicDecimal128& operator|=(const BasicDecimal128& right);

  /// \brief Bitwise "and" between two BasicDecimal128.
  BasicDecimal128& operator&=(const BasicDecimal128& right);

  /// \brief Shift left by the given number of bits.
  BasicDecimal128& operator<<=(uint32_t bits);

  /// \brief Return a copy of this value shifted left by the given number of bits.
  BasicDecimal128 operator<<(uint32_t bits) const {
    auto res = *this;
    res <<= bits;
    return res;
  }

  /// \brief Shift right by the given number of bits.
  ///
  /// Negative values will sign-extend.
  BasicDecimal128& operator>>=(uint32_t bits);

  /// \brief Return a copy of this value shifted right by the given number of bits.
  ///
  /// Negative values will sign-extend (delegates to operator>>=).
  BasicDecimal128 operator>>(uint32_t bits) const {
    auto res = *this;
    res >>= bits;
    return res;
  }

  /// \brief Get the high bits of the two's complement representation of the number.
  constexpr int64_t high_bits() const {
#if ARROW_LITTLE_ENDIAN
    return static_cast<int64_t>(array_[1]);
#else
    return static_cast<int64_t>(array_[0]);
#endif
  }

  /// \brief Get the low bits of the two's complement representation of the number.
  constexpr uint64_t low_bits() const {
#if ARROW_LITTLE_ENDIAN
    return array_[0];
#else
    return array_[1];
#endif
  }

  /// \brief separate the integer and fractional parts for the given scale.
  void GetWholeAndFraction(int32_t scale, BasicDecimal128* whole,
                           BasicDecimal128* fraction) const;

  /// \brief Scale multiplier for given scale value.
  static const BasicDecimal128& GetScaleMultiplier(int32_t scale);
  /// \brief Half-scale multiplier for given scale value.
  static const BasicDecimal128& GetHalfScaleMultiplier(int32_t scale);

  /// \brief Convert BasicDecimal128 from one scale to another
  DecimalStatus Rescale(int32_t original_scale, int32_t new_scale,
                        BasicDecimal128* out) const;

  /// \brief Scale up.
  BasicDecimal128 IncreaseScaleBy(int32_t increase_by) const;

  /// \brief Scale down.
  /// - If 'round' is true, the right-most digits are dropped and the result value is
  ///   rounded up (+1 for +ve, -1 for -ve) based on the value of the dropped digits
  ///   (>= 10^reduce_by / 2).
  /// - If 'round' is false, the right-most digits are simply dropped.
  BasicDecimal128 ReduceScaleBy(int32_t reduce_by, bool round = true) const;

  /// \brief Whether this number fits in the given precision
  ///
  /// Return true if the number of significant digits is less or equal to `precision`.
  bool FitsInPrecision(int32_t precision) const;

  /// \brief count the number of leading binary zeroes.
  int32_t CountLeadingBinaryZeros() const;

  /// \brief Get the maximum valid unscaled decimal value.
  static const BasicDecimal128& GetMaxValue();

  /// \brief Get the maximum valid unscaled decimal value for the given precision.
  static BasicDecimal128 GetMaxValue(int32_t precision);

  /// \brief Get the maximum decimal value (is not a valid value).
  static constexpr BasicDecimal128 GetMaxSentinel() {
    return BasicDecimal128(/*high=*/std::numeric_limits<int64_t>::max(),
                           /*low=*/std::numeric_limits<uint64_t>::max());
  }

  /// \brief Get the minimum decimal value (is not a valid value).
  static constexpr BasicDecimal128 GetMinSentinel() {
    return BasicDecimal128(/*high=*/std::numeric_limits<int64_t>::min(),
                           /*low=*/std::numeric_limits<uint64_t>::min());
  }
};
|
||||
|
||||
// Comparison operators (defined out of line).
ARROW_EXPORT bool operator<(const BasicDecimal128& left, const BasicDecimal128& right);
ARROW_EXPORT bool operator<=(const BasicDecimal128& left, const BasicDecimal128& right);
ARROW_EXPORT bool operator>(const BasicDecimal128& left, const BasicDecimal128& right);
ARROW_EXPORT bool operator>=(const BasicDecimal128& left, const BasicDecimal128& right);

// Unary negation and bitwise complement.
ARROW_EXPORT BasicDecimal128 operator-(const BasicDecimal128& operand);
ARROW_EXPORT BasicDecimal128 operator~(const BasicDecimal128& operand);
// Binary arithmetic operators (defined out of line).
ARROW_EXPORT BasicDecimal128 operator+(const BasicDecimal128& left,
                                       const BasicDecimal128& right);
ARROW_EXPORT BasicDecimal128 operator-(const BasicDecimal128& left,
                                       const BasicDecimal128& right);
ARROW_EXPORT BasicDecimal128 operator*(const BasicDecimal128& left,
                                       const BasicDecimal128& right);
ARROW_EXPORT BasicDecimal128 operator/(const BasicDecimal128& left,
                                       const BasicDecimal128& right);
ARROW_EXPORT BasicDecimal128 operator%(const BasicDecimal128& left,
                                       const BasicDecimal128& right);
|
||||
|
||||
/// Represents a signed 256-bit integer in two's complement.
class ARROW_EXPORT BasicDecimal256 : public GenericBasicDecimal<BasicDecimal256, 256> {
 public:
  using GenericBasicDecimal::GenericBasicDecimal;

  /// Maximum number of significant decimal digits representable.
  static constexpr int kMaxPrecision = 76;
  /// Maximum supported scale.
  static constexpr int kMaxScale = 76;

  constexpr BasicDecimal256() noexcept : GenericBasicDecimal() {}

  /// \brief Widen a BasicDecimal128, sign-extending the upper words.
  explicit BasicDecimal256(const BasicDecimal128& value) noexcept
      : BasicDecimal256(bit_util::little_endian::ToNative<uint64_t, 4>(
            {value.low_bits(), static_cast<uint64_t>(value.high_bits()),
             SignExtend(value.high_bits()), SignExtend(value.high_bits())})) {}

  /// \brief Widen a BasicDecimal64, sign-extending the upper words.
  explicit BasicDecimal256(const BasicDecimal64& value) noexcept
      : BasicDecimal256(bit_util::little_endian::ToNative<uint64_t, 4>(
            {value.low_bits(), SignExtend(value.value()), SignExtend(value.value()),
             SignExtend(value.value())})) {}

  /// \brief Widen a BasicDecimal32, sign-extending the upper words.
  explicit BasicDecimal256(const BasicDecimal32& value) noexcept
      : BasicDecimal256(bit_util::little_endian::ToNative<uint64_t, 4>(
            {value.low_bits(), SignExtend(value.value()), SignExtend(value.value()),
             SignExtend(value.value())})) {}

  /// \brief Negate the current value (in-place)
  BasicDecimal256& Negate();

  /// \brief Absolute value (in-place)
  BasicDecimal256& Abs();

  /// \brief Absolute value
  static BasicDecimal256 Abs(const BasicDecimal256& left);

  /// \brief Add a number to this one. The result is truncated to 256 bits.
  BasicDecimal256& operator+=(const BasicDecimal256& right);

  /// \brief Subtract a number from this one. The result is truncated to 256 bits.
  BasicDecimal256& operator-=(const BasicDecimal256& right);

  /// \brief Get the lowest bits of the two's complement representation of the number.
  uint64_t low_bits() const { return bit_util::little_endian::Make(array_)[0]; }

  /// \brief separate the integer and fractional parts for the given scale.
  void GetWholeAndFraction(int32_t scale, BasicDecimal256* whole,
                           BasicDecimal256* fraction) const;

  /// \brief Scale multiplier for given scale value.
  static const BasicDecimal256& GetScaleMultiplier(int32_t scale);
  /// \brief Half-scale multiplier for given scale value.
  static const BasicDecimal256& GetHalfScaleMultiplier(int32_t scale);

  /// \brief Convert BasicDecimal256 from one scale to another
  DecimalStatus Rescale(int32_t original_scale, int32_t new_scale,
                        BasicDecimal256* out) const;

  /// \brief Scale up.
  BasicDecimal256 IncreaseScaleBy(int32_t increase_by) const;

  /// \brief Scale down.
  /// - If 'round' is true, the right-most digits are dropped and the result value is
  ///   rounded up (+1 for positive, -1 for negative) based on the value of the
  ///   dropped digits (>= 10^reduce_by / 2).
  /// - If 'round' is false, the right-most digits are simply dropped.
  BasicDecimal256 ReduceScaleBy(int32_t reduce_by, bool round = true) const;

  /// \brief Whether this number fits in the given precision
  ///
  /// Return true if the number of significant digits is less or equal to `precision`.
  bool FitsInPrecision(int32_t precision) const;

  /// \brief Multiply this number by another number. The result is truncated to 256 bits.
  BasicDecimal256& operator*=(const BasicDecimal256& right);

  /// Divide this number by right and return the result.
  ///
  /// This operation is not destructive.
  /// The answer rounds to zero. Signs work like:
  ///    21 /  5 ->  4,  1
  ///   -21 /  5 -> -4, -1
  ///    21 / -5 -> -4,  1
  ///   -21 / -5 ->  4, -1
  /// \param[in] divisor the number to divide by
  /// \param[out] result the quotient
  /// \param[out] remainder the remainder after the division
  DecimalStatus Divide(const BasicDecimal256& divisor, BasicDecimal256* result,
                       BasicDecimal256* remainder) const;

  /// \brief Shift left by the given number of bits.
  BasicDecimal256& operator<<=(uint32_t bits);

  /// \brief Return a copy of this value shifted left by the given number of bits.
  BasicDecimal256 operator<<(uint32_t bits) const {
    auto res = *this;
    res <<= bits;
    return res;
  }

  /// \brief Shift right by the given number of bits.
  ///
  /// Negative values will sign-extend.
  BasicDecimal256& operator>>=(uint32_t bits);

  /// \brief Return a copy of this value shifted right by the given number of bits.
  ///
  /// Negative values will sign-extend (delegates to operator>>=).
  BasicDecimal256 operator>>(uint32_t bits) const {
    auto res = *this;
    res >>= bits;
    return res;
  }

  /// \brief In-place division.
  BasicDecimal256& operator/=(const BasicDecimal256& right);

  /// \brief Get the maximum valid unscaled decimal value for the given precision.
  static BasicDecimal256 GetMaxValue(int32_t precision);

  /// \brief Get the maximum decimal value (is not a valid value).
  static constexpr BasicDecimal256 GetMaxSentinel() {
#if ARROW_LITTLE_ENDIAN
    return BasicDecimal256({std::numeric_limits<uint64_t>::max(),
                            std::numeric_limits<uint64_t>::max(),
                            std::numeric_limits<uint64_t>::max(),
                            static_cast<uint64_t>(std::numeric_limits<int64_t>::max())});
#else
    return BasicDecimal256({static_cast<uint64_t>(std::numeric_limits<int64_t>::max()),
                            std::numeric_limits<uint64_t>::max(),
                            std::numeric_limits<uint64_t>::max(),
                            std::numeric_limits<uint64_t>::max()});
#endif
  }
  /// \brief Get the minimum decimal value (is not a valid value).
  static constexpr BasicDecimal256 GetMinSentinel() {
#if ARROW_LITTLE_ENDIAN
    return BasicDecimal256(
        {0, 0, 0, static_cast<uint64_t>(std::numeric_limits<int64_t>::min())});
#else
    return BasicDecimal256(
        {static_cast<uint64_t>(std::numeric_limits<int64_t>::min()), 0, 0, 0});
#endif
  }
};
|
||||
|
||||
ARROW_EXPORT bool operator<(const BasicDecimal256& left, const BasicDecimal256& right);

ARROW_EXPORT inline bool operator<=(const BasicDecimal256& left,
                                    const BasicDecimal256& right) {
  // left <= right is equivalent to !(right < left).
  return !(right < left);
}
|
||||
|
||||
ARROW_EXPORT inline bool operator>(const BasicDecimal256& left,
                                   const BasicDecimal256& right) {
  // left > right is equivalent to right < left.
  return right < left;
}
|
||||
|
||||
ARROW_EXPORT inline bool operator>=(const BasicDecimal256& left,
                                    const BasicDecimal256& right) {
  // left >= right is equivalent to !(left < right).
  return !(left < right);
}
|
||||
|
||||
// Unary negation and bitwise complement.
ARROW_EXPORT BasicDecimal256 operator-(const BasicDecimal256& operand);
ARROW_EXPORT BasicDecimal256 operator~(const BasicDecimal256& operand);
// Binary arithmetic operators (defined out of line).
ARROW_EXPORT BasicDecimal256 operator+(const BasicDecimal256& left,
                                       const BasicDecimal256& right);
ARROW_EXPORT BasicDecimal256 operator*(const BasicDecimal256& left,
                                       const BasicDecimal256& right);
ARROW_EXPORT BasicDecimal256 operator/(const BasicDecimal256& left,
                                       const BasicDecimal256& right);
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,211 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
|
||||
#include "benchmark/benchmark.h"
|
||||
|
||||
#include "arrow/memory_pool.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/cpu_info.h"
|
||||
#include "arrow/util/logging.h" // IWYU pragma: keep
|
||||
|
||||
namespace arrow {
|
||||
|
||||
// Benchmark changed its parameter type between releases from
// int to int64_t. As it doesn't have version macros, we need
// to apply C++ template magic.

// Primary template; only the partial specialization below is ever instantiated.
template <typename Func>
struct BenchmarkArgsType;

// Pattern matching that extracts the vector element type of Benchmark::Args()
template <typename Values>
struct BenchmarkArgsType<benchmark::internal::Benchmark* (
    benchmark::internal::Benchmark::*)(const std::vector<Values>&)> {
  using type = Values;
};

// The argument element type used by the linked version of Google Benchmark.
using ArgsType =
    typename BenchmarkArgsType<decltype(&benchmark::internal::Benchmark::Args)>::type;
|
||||
|
||||
using internal::CpuInfo;

static const CpuInfo* cpu_info = CpuInfo::GetInstance();

// Working-set sizes matched to the CPU cache hierarchy, plus one size that
// cannot fit in L3, so benchmarks can expose cache effects.
static const int64_t kL1Size = cpu_info->CacheSize(CpuInfo::CacheLevel::L1);
static const int64_t kL2Size = cpu_info->CacheSize(CpuInfo::CacheLevel::L2);
static const int64_t kL3Size = cpu_info->CacheSize(CpuInfo::CacheLevel::L3);
static const int64_t kCantFitInL3Size = kL3Size * 4;
static const std::vector<int64_t> kMemorySizes = {kL1Size, kL2Size, kL3Size,
                                                  kCantFitInL3Size};
// Inverse null proportions for generated data (null_proportion = 1 / value).
// 0 is treated as "no nulls"
static const std::vector<ArgsType> kInverseNullProportions = {10000, 100, 10, 2, 1, 0};
|
||||
|
||||
// RAII helper that decodes common benchmark arguments on construction and
// publishes "size"/"null_percent" counters and items-processed on destruction.
struct GenericItemsArgs {
  // number of items processed per iteration
  const int64_t size;

  // proportion of nulls in generated arrays
  double null_proportion;

  // state.range(0) is the item count; state.range(1) is the inverse null
  // proportion (0 meaning "no nulls").
  explicit GenericItemsArgs(benchmark::State& state)
      : size(state.range(0)), state_(state) {
    if (state.range(1) == 0) {
      this->null_proportion = 0.0;
    } else {
      // Clamp to 1.0 so that an inverse proportion of 1 means "all null".
      this->null_proportion = std::min(1., 1. / static_cast<double>(state.range(1)));
    }
  }

  // Report counters when the benchmark body goes out of scope.
  ~GenericItemsArgs() {
    state_.counters["size"] = static_cast<double>(size);
    state_.counters["null_percent"] = null_proportion * 100;
    state_.SetItemsProcessed(state_.iterations() * size);
  }

 private:
  benchmark::State& state_;
};
|
||||
|
||||
// Register one (size, inverse null proportion) argument pair for every
// combination of the given sizes and kInverseNullProportions.
void BenchmarkSetArgsWithSizes(benchmark::internal::Benchmark* bench,
                               const std::vector<int64_t>& sizes = kMemorySizes) {
  bench->Unit(benchmark::kMicrosecond);

  for (const int64_t size : sizes) {
    for (const ArgsType inverse_null_proportion : kInverseNullProportions) {
      bench->Args({static_cast<ArgsType>(size), inverse_null_proportion});
    }
  }
}
|
||||
|
||||
// Register the default argument grid (all cache-hierarchy sizes).
void BenchmarkSetArgs(benchmark::internal::Benchmark* bench) {
  // kMemorySizes is already the default size list.
  BenchmarkSetArgsWithSizes(bench);
}
|
||||
|
||||
void RegressionSetArgs(benchmark::internal::Benchmark* bench) {
  // Regression do not need to account for cache hierarchy, thus optimize for
  // the best case: data fitting in L1.
  const std::vector<int64_t> l1_only = {kL1Size};
  BenchmarkSetArgsWithSizes(bench, l1_only);
}
|
||||
|
||||
// RAII struct to handle some of the boilerplate in regression benchmarks
struct RegressionArgs {
  // size of memory tested (per iteration) in bytes
  int64_t size;

  // proportion of nulls in generated arrays
  double null_proportion;

  // If size_is_bytes is true, then it's a number of bytes, otherwise it's the
  // number of items processed (for reporting)
  //
  // state.range(0) is the size; state.range(1) is the inverse null proportion
  // (0 meaning "no nulls").
  explicit RegressionArgs(benchmark::State& state, bool size_is_bytes = true)
      : size(state.range(0)), state_(state), size_is_bytes_(size_is_bytes) {
    if (state.range(1) == 0) {
      this->null_proportion = 0.0;
    } else {
      // Clamp to 1.0 so that an inverse proportion of 1 means "all null".
      this->null_proportion = std::min(1., 1. / static_cast<double>(state.range(1)));
    }
  }

  // Report counters and throughput when the benchmark body goes out of scope.
  ~RegressionArgs() {
    state_.counters["size"] = static_cast<double>(size);
    state_.counters["null_percent"] = null_proportion * 100;
    if (size_is_bytes_) {
      state_.SetBytesProcessed(state_.iterations() * size);
    } else {
      state_.SetItemsProcessed(state_.iterations() * size);
    }
  }

 private:
  benchmark::State& state_;
  bool size_is_bytes_;
};
|
||||
|
||||
// Google Benchmark MemoryManager hook that tracks Arrow memory pool activity
// for each benchmark run through a fresh ProxyMemoryPool.
class MemoryPoolMemoryManager : public benchmark::MemoryManager {
  // Called by Google Benchmark before each run: create a fresh proxy pool and
  // remember the default pool's allocation count as a baseline.
  void Start() override {
    memory_pool = std::make_shared<ProxyMemoryPool>(default_memory_pool());

    MemoryPool* default_pool = default_memory_pool();
    global_allocations_start = default_pool->num_allocations();
  }

  // BENCHMARK_DONT_OPTIMIZE is used here to detect Google Benchmark
  // 1.8.0. We can remove this Stop(Result*) when we require Google
  // Benchmark 1.8.0 or later.
#ifndef BENCHMARK_DONT_OPTIMIZE
  void Stop(Result* result) override { Stop(*result); }
#endif

  // Called by Google Benchmark after each run: record the proxy pool's
  // metrics into `result` if the benchmark actually used it.
  void Stop(benchmark::MemoryManager::Result& result) override {
    // If num_allocations is still zero, we assume that the memory pool wasn't passed down
    // so we should record them.
    MemoryPool* default_pool = default_memory_pool();
    int64_t new_default_allocations =
        default_pool->num_allocations() - global_allocations_start;

    // Only record metrics if (1) there were allocations and (2) we
    // recorded at least one.
    if (new_default_allocations > 0 && memory_pool->num_allocations() > 0) {
      if (new_default_allocations > memory_pool->num_allocations()) {
        // If we missed some, let's report that.
        int64_t missed_allocations =
            new_default_allocations - memory_pool->num_allocations();
        ARROW_LOG(WARNING) << "BenchmarkMemoryTracker recorded some allocations "
                           << "for a benchmark, but missed " << missed_allocations
                           << " allocations.\n";
      }

      result.max_bytes_used = memory_pool->max_memory();
      result.total_allocated_bytes = memory_pool->total_bytes_allocated();
      result.num_allocs = memory_pool->num_allocations();
    }
  }

 public:
  // Proxy pool for the currently running benchmark (recreated in Start()).
  std::shared_ptr<::arrow::ProxyMemoryPool> memory_pool;

 protected:
  // Default pool allocation count captured at Start(), used as a baseline.
  int64_t global_allocations_start;
};
|
||||
|
||||
/// \brief Track memory pool allocations in benchmarks.
///
/// Instantiate as a global variable to register the hooks into Google Benchmark
/// to collect memory metrics. Before each benchmark, a new ProxyMemoryPool is
/// created. It can then be accessed with memory_pool(). Once the benchmark is
/// complete, the hook will record the maximum memory used, the total bytes
/// allocated, and the total number of allocations. If no allocations were seen,
/// (for example, if you forgot to pass down the memory pool), then these metrics
/// will not be saved.
///
/// Since this is used as one global variable, this will not work if multiple
/// benchmarks are run concurrently or for multi-threaded benchmarks (ones
/// that use `->ThreadRange(...)`).
class BenchmarkMemoryTracker {
 public:
  BenchmarkMemoryTracker() : manager_() { ::benchmark::RegisterMemoryManager(&manager_); }

  /// \brief The pool tracked for the currently running benchmark; pass this
  /// down to the code being benchmarked.
  ::arrow::MemoryPool* memory_pool() const { return manager_.memory_pool.get(); }

 protected:
  ::arrow::MemoryPoolMemoryManager manager_;
};
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,115 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/span.h"
|
||||
|
||||
namespace arrow::util {
|
||||
|
||||
/// \brief Build an inline binary view from `size` bytes at `data`.
///
/// `size` must be <= BinaryViewType::kInlineSize (checked only by assert).
inline BinaryViewType::c_type ToInlineBinaryView(const void* data, int32_t size) {
  assert(size <= BinaryViewType::kInlineSize);
  // Small string: inlined. Bytes beyond size are zeroed
  BinaryViewType::c_type out;
  out.inlined = {size, {}};
  memcpy(&out.inlined.data, data, size);
  return out;
}
|
||||
|
||||
/// \brief Build an inline binary view from a string_view that fits inline.
inline BinaryViewType::c_type ToInlineBinaryView(std::string_view v) {
  assert(v.size() <= BinaryViewType::kInlineSize);
  const auto size = static_cast<int32_t>(v.size());
  return ToInlineBinaryView(v.data(), size);
}
|
||||
|
||||
/// \brief Build an out-of-line binary view referencing `size` bytes at
/// `offset` within buffer `buffer_index`.
///
/// `data` must point at the value's bytes so the prefix can be copied; it must
/// hold at least sizeof(out.ref.prefix) bytes.
inline BinaryViewType::c_type ToNonInlineBinaryView(const void* data, int32_t size,
                                                    int32_t buffer_index,
                                                    int32_t offset) {
  // Large string: store index/offset.
  BinaryViewType::c_type out;
  out.ref = {size, {}, buffer_index, offset};
  memcpy(&out.ref.prefix, data, sizeof(out.ref.prefix));
  return out;
}
|
||||
|
||||
/// \brief Build a binary view, storing the value inline when it fits and by
/// buffer reference otherwise.
inline BinaryViewType::c_type ToBinaryView(const void* data, int32_t size,
                                           int32_t buffer_index, int32_t offset) {
  return size <= BinaryViewType::kInlineSize
             ? ToInlineBinaryView(data, size)
             : ToNonInlineBinaryView(data, size, buffer_index, offset);
}
|
||||
|
||||
/// \brief Build a binary view from a string_view (inline or by reference).
inline BinaryViewType::c_type ToBinaryView(std::string_view v, int32_t buffer_index,
                                           int32_t offset) {
  const auto size = static_cast<int32_t>(v.size());
  return ToBinaryView(v.data(), size, buffer_index, offset);
}
|
||||
|
||||
/// \brief Resolve a binary view to a string_view over its bytes.
///
/// For inline views the result points into `v` itself, so `v` must outlive
/// the returned view (hence the deleted rvalue overload elsewhere in this
/// file); otherwise it points into the referenced data buffer.
template <typename BufferPtr>
std::string_view FromBinaryView(const BinaryViewType::c_type& v,
                                const BufferPtr* data_buffers) {
  auto* data = v.is_inline() ? v.inlined.data.data()
                             : data_buffers[v.ref.buffer_index]->data() + v.ref.offset;
  return {reinterpret_cast<const char*>(data), static_cast<size_t>(v.size())};
}
|
||||
// Deleted: for inline views the returned string_view would point into the
// temporary argument and dangle immediately.
template <typename BufferPtr>
std::string_view FromBinaryView(BinaryViewType::c_type&&, const BufferPtr*) = delete;
|
||||
|
||||
/// \brief Compare two binary views for byte equality, resolving out-of-line
/// data through the given buffer arrays.
template <typename BufferPtr>
bool EqualBinaryView(BinaryViewType::c_type l, BinaryViewType::c_type r,
                     const BufferPtr* l_buffers, const BufferPtr* r_buffers) {
  // The leading word of a view holds the 32-bit size together with the first
  // data bytes (inline data or prefix), so a single 64-bit comparison checks
  // both at once.
  int64_t l_size_and_prefix, r_size_and_prefix;
  memcpy(&l_size_and_prefix, &l, sizeof(l_size_and_prefix));
  memcpy(&r_size_and_prefix, &r, sizeof(r_size_and_prefix));

  if (l_size_and_prefix != r_size_and_prefix) return false;

  if (l.is_inline()) {
    // The columnar spec mandates that the inlined part be zero-padded, so we can compare
    // a word at a time regardless of the exact size.
    int64_t l_inlined, r_inlined;
    memcpy(&l_inlined, l.inline_data() + BinaryViewType::kPrefixSize, sizeof(l_inlined));
    memcpy(&r_inlined, r.inline_data() + BinaryViewType::kPrefixSize, sizeof(r_inlined));
    return l_inlined == r_inlined;
  }

  // Sizes are equal and this is not inline, therefore both are out
  // of line and have kPrefixSize first in common.
  const uint8_t* l_data = l_buffers[l.ref.buffer_index]->data() + l.ref.offset;
  const uint8_t* r_data = r_buffers[r.ref.buffer_index]->data() + r.ref.offset;
  return memcmp(l_data + BinaryViewType::kPrefixSize,
                r_data + BinaryViewType::kPrefixSize,
                l.size() - BinaryViewType::kPrefixSize) == 0;
}
|
||||
|
||||
/// \brief Compute the total size of a list of binary views including null
|
||||
/// views.
|
||||
///
|
||||
/// This is useful when calculating the necessary memory to store all the string
|
||||
/// data from the views.
|
||||
inline int64_t SumOfBinaryViewSizes(const BinaryViewType::c_type* views, int64_t length) {
|
||||
int64_t total = 0;
|
||||
for (int64_t i = 0; i < length; ++i) {
|
||||
total += views[i].size();
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
} // namespace arrow::util
|
||||
@@ -0,0 +1,570 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/endian.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/ubsan.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
namespace detail {
|
||||
|
||||
// Load a 64-bit word from a (possibly unaligned) byte address, interpreting
// the bytes as little-endian.
inline uint64_t LoadWord(const uint8_t* bytes) {
  return bit_util::ToLittleEndian(util::SafeLoadAs<uint64_t>(bytes));
}
|
||||
|
||||
// Combine two adjacent 64-bit words into the word starting `shift` bits into
// `current`. `shift` is expected in [0, 64); the zero case is special-cased
// because `next << 64` would be undefined behavior.
inline uint64_t ShiftWord(uint64_t current, uint64_t next, int64_t shift) {
  if (shift != 0) {
    current = (current >> shift) | (next << (64 - shift));
  }
  return current;
}
|
||||
|
||||
// These templates are here to help with unit tests

// Generic bitwise complement.
template <typename T>
constexpr T BitNot(T x) {
  return ~x;
}

// bool specialization: `~` would promote the bool to int (~false == -1,
// ~true == -2, both of which convert back to `true`), so logical negation
// must be used instead.
template <>
constexpr bool BitNot(bool x) {
  return !x;
}
|
||||
|
||||
// Word-wise AND of two bitmap words.
struct BitBlockAnd {
  template <typename T>
  static constexpr T Call(T lhs, T rhs) {
    return static_cast<T>(lhs & rhs);
  }
};
|
||||
|
||||
// Word-wise AND of the left word with the complement of the right word.
struct BitBlockAndNot {
  template <typename T>
  static constexpr T Call(T lhs, T rhs) {
    return static_cast<T>(lhs & BitNot(rhs));
  }
};
|
||||
|
||||
// Word-wise OR of two bitmap words.
struct BitBlockOr {
  template <typename T>
  static constexpr T Call(T lhs, T rhs) {
    return static_cast<T>(lhs | rhs);
  }
};
|
||||
|
||||
// Word-wise OR of the left word with the complement of the right word.
struct BitBlockOrNot {
  template <typename T>
  static constexpr T Call(T lhs, T rhs) {
    return static_cast<T>(lhs | BitNot(rhs));
  }
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// \brief Return value from bit block counters: the total number of bits and
|
||||
/// the number of set bits.
|
||||
/// \brief Return value from bit block counters: the number of bits scanned in
/// this block (`length`) and how many of them were set (`popcount`).
struct BitBlockCount {
  int16_t length;
  int16_t popcount;

  /// True when no bit in the block is set (vacuously true for empty blocks).
  bool NoneSet() const { return popcount == 0; }
  /// True when every bit in the block is set (vacuously true for empty blocks).
  bool AllSet() const { return popcount == length; }
};
|
||||
|
||||
/// \brief A class that scans through a true/false bitmap to compute popcounts
|
||||
/// 64 or 256 bits at a time. This is used to accelerate processing of
|
||||
/// mostly-not-null array data.
|
||||
class ARROW_EXPORT BitBlockCounter {
|
||||
public:
|
||||
BitBlockCounter(const uint8_t* bitmap, int64_t start_offset, int64_t length)
|
||||
: bitmap_(util::MakeNonNull(bitmap) + start_offset / 8),
|
||||
bits_remaining_(length),
|
||||
offset_(start_offset % 8) {}
|
||||
|
||||
/// \brief The bit size of each word run
|
||||
static constexpr int64_t kWordBits = 64;
|
||||
|
||||
/// \brief The bit size of four words run
|
||||
static constexpr int64_t kFourWordsBits = kWordBits * 4;
|
||||
|
||||
/// \brief Return the next run of available bits, usually 256. The returned
|
||||
/// pair contains the size of run and the number of true values. The last
|
||||
/// block will have a length less than 256 if the bitmap length is not a
|
||||
/// multiple of 256, and will return 0-length blocks in subsequent
|
||||
/// invocations.
|
||||
BitBlockCount NextFourWords() {
|
||||
using detail::LoadWord;
|
||||
using detail::ShiftWord;
|
||||
|
||||
if (!bits_remaining_) {
|
||||
return {0, 0};
|
||||
}
|
||||
int64_t total_popcount = 0;
|
||||
if (offset_ == 0) {
|
||||
if (bits_remaining_ < kFourWordsBits) {
|
||||
return GetBlockSlow(kFourWordsBits);
|
||||
}
|
||||
total_popcount += bit_util::PopCount(LoadWord(bitmap_));
|
||||
total_popcount += bit_util::PopCount(LoadWord(bitmap_ + 8));
|
||||
total_popcount += bit_util::PopCount(LoadWord(bitmap_ + 16));
|
||||
total_popcount += bit_util::PopCount(LoadWord(bitmap_ + 24));
|
||||
} else {
|
||||
// When the offset is > 0, we need there to be a word beyond the last
|
||||
// aligned word in the bitmap for the bit shifting logic.
|
||||
if (bits_remaining_ < 5 * kFourWordsBits - offset_) {
|
||||
return GetBlockSlow(kFourWordsBits);
|
||||
}
|
||||
auto current = LoadWord(bitmap_);
|
||||
auto next = LoadWord(bitmap_ + 8);
|
||||
total_popcount += bit_util::PopCount(ShiftWord(current, next, offset_));
|
||||
current = next;
|
||||
next = LoadWord(bitmap_ + 16);
|
||||
total_popcount += bit_util::PopCount(ShiftWord(current, next, offset_));
|
||||
current = next;
|
||||
next = LoadWord(bitmap_ + 24);
|
||||
total_popcount += bit_util::PopCount(ShiftWord(current, next, offset_));
|
||||
current = next;
|
||||
next = LoadWord(bitmap_ + 32);
|
||||
total_popcount += bit_util::PopCount(ShiftWord(current, next, offset_));
|
||||
}
|
||||
bitmap_ += bit_util::BytesForBits(kFourWordsBits);
|
||||
bits_remaining_ -= kFourWordsBits;
|
||||
return {256, static_cast<int16_t>(total_popcount)};
|
||||
}
|
||||
|
||||
/// \brief Return the next run of available bits, usually 64. The returned
|
||||
/// pair contains the size of run and the number of true values. The last
|
||||
/// block will have a length less than 64 if the bitmap length is not a
|
||||
/// multiple of 64, and will return 0-length blocks in subsequent
|
||||
/// invocations.
|
||||
BitBlockCount NextWord() {
|
||||
using detail::LoadWord;
|
||||
using detail::ShiftWord;
|
||||
|
||||
if (!bits_remaining_) {
|
||||
return {0, 0};
|
||||
}
|
||||
int64_t popcount = 0;
|
||||
if (offset_ == 0) {
|
||||
if (bits_remaining_ < kWordBits) {
|
||||
return GetBlockSlow(kWordBits);
|
||||
}
|
||||
popcount = bit_util::PopCount(LoadWord(bitmap_));
|
||||
} else {
|
||||
// When the offset is > 0, we need there to be a word beyond the last
|
||||
// aligned word in the bitmap for the bit shifting logic.
|
||||
if (bits_remaining_ < 2 * kWordBits - offset_) {
|
||||
return GetBlockSlow(kWordBits);
|
||||
}
|
||||
popcount = bit_util::PopCount(
|
||||
ShiftWord(LoadWord(bitmap_), LoadWord(bitmap_ + 8), offset_));
|
||||
}
|
||||
bitmap_ += kWordBits / 8;
|
||||
bits_remaining_ -= kWordBits;
|
||||
return {64, static_cast<int16_t>(popcount)};
|
||||
}
|
||||
|
||||
private:
|
||||
/// \brief Return block with the requested size when doing word-wise
|
||||
/// computation is not possible due to inadequate bits remaining.
|
||||
BitBlockCount GetBlockSlow(int64_t block_size) noexcept;
|
||||
|
||||
const uint8_t* bitmap_;
|
||||
int64_t bits_remaining_;
|
||||
int64_t offset_;
|
||||
};
|
||||
|
||||
/// \brief A tool to iterate through a possibly nonexistent validity bitmap,
|
||||
/// to allow us to write one code path for both the with-nulls and no-nulls
|
||||
/// cases without giving up a lot of performance.
|
||||
class ARROW_EXPORT OptionalBitBlockCounter {
|
||||
public:
|
||||
// validity_bitmap may be NULLPTR
|
||||
OptionalBitBlockCounter(const uint8_t* validity_bitmap, int64_t offset, int64_t length);
|
||||
|
||||
// validity_bitmap may be null
|
||||
OptionalBitBlockCounter(const std::shared_ptr<Buffer>& validity_bitmap, int64_t offset,
|
||||
int64_t length);
|
||||
|
||||
/// Return block count for next word when the bitmap is available otherwise
|
||||
/// return a block with length up to INT16_MAX when there is no validity
|
||||
/// bitmap (so all the referenced values are not null).
|
||||
BitBlockCount NextBlock() {
|
||||
static constexpr int64_t kMaxBlockSize = std::numeric_limits<int16_t>::max();
|
||||
if (has_bitmap_) {
|
||||
BitBlockCount block = counter_.NextWord();
|
||||
position_ += block.length;
|
||||
return block;
|
||||
} else {
|
||||
int16_t block_size =
|
||||
static_cast<int16_t>(std::min(kMaxBlockSize, length_ - position_));
|
||||
position_ += block_size;
|
||||
// All values are non-null
|
||||
return {block_size, block_size};
|
||||
}
|
||||
}
|
||||
|
||||
// Like NextBlock, but returns a word-sized block even when there is no
|
||||
// validity bitmap
|
||||
BitBlockCount NextWord() {
|
||||
static constexpr int64_t kWordSize = 64;
|
||||
if (has_bitmap_) {
|
||||
BitBlockCount block = counter_.NextWord();
|
||||
position_ += block.length;
|
||||
return block;
|
||||
} else {
|
||||
int16_t block_size = static_cast<int16_t>(std::min(kWordSize, length_ - position_));
|
||||
position_ += block_size;
|
||||
// All values are non-null
|
||||
return {block_size, block_size};
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
const bool has_bitmap_;
|
||||
int64_t position_;
|
||||
int64_t length_;
|
||||
BitBlockCounter counter_;
|
||||
};
|
||||
|
||||
/// \brief A class that computes popcounts on the result of bitwise operations
|
||||
/// between two bitmaps, 64 bits at a time. A 64-bit word is loaded from each
|
||||
/// bitmap, then the popcount is computed on e.g. the bitwise-and of the two
|
||||
/// words.
|
||||
class ARROW_EXPORT BinaryBitBlockCounter {
 public:
  /// \param[in] left_bitmap first bitmap operand (must be non-null)
  /// \param[in] left_offset bit offset into left_bitmap
  /// \param[in] right_bitmap second bitmap operand (must be non-null)
  /// \param[in] right_offset bit offset into right_bitmap
  /// \param[in] length number of bit pairs to process
  BinaryBitBlockCounter(const uint8_t* left_bitmap, int64_t left_offset,
                        const uint8_t* right_bitmap, int64_t right_offset, int64_t length)
      : left_bitmap_(util::MakeNonNull(left_bitmap) + left_offset / 8),
        left_offset_(left_offset % 8),
        right_bitmap_(util::MakeNonNull(right_bitmap) + right_offset / 8),
        right_offset_(right_offset % 8),
        bits_remaining_(length) {}

  /// \brief Return the popcount of the bitwise-and of the next run of
  /// available bits, up to 64. The returned pair contains the size of run and
  /// the number of true values. The last block will have a length less than 64
  /// if the bitmap length is not a multiple of 64, and will return 0-length
  /// blocks in subsequent invocations.
  BitBlockCount NextAndWord() { return NextWord<detail::BitBlockAnd>(); }

  /// \brief Computes "x & ~y" block for each available run of bits.
  BitBlockCount NextAndNotWord() { return NextWord<detail::BitBlockAndNot>(); }

  /// \brief Computes "x | y" block for each available run of bits.
  BitBlockCount NextOrWord() { return NextWord<detail::BitBlockOr>(); }

  /// \brief Computes "x | ~y" block for each available run of bits.
  BitBlockCount NextOrNotWord() { return NextWord<detail::BitBlockOrNot>(); }

 private:
  // Core driver: combines one 64-bit word from each bitmap with Op and counts
  // the set bits of the result. Falls back to a bit-by-bit loop near the end
  // of the bitmaps, where loading a full extra word would read out of bounds.
  template <class Op>
  BitBlockCount NextWord() {
    using detail::LoadWord;
    using detail::ShiftWord;

    if (!bits_remaining_) {
      return {0, 0};
    }
    // When the offset is > 0, we need there to be a word beyond the last aligned
    // word in the bitmap for the bit shifting logic.
    constexpr int64_t kWordBits = BitBlockCounter::kWordBits;
    // For each operand: 64 bits if byte-aligned, otherwise a full word plus the
    // bits borrowed from the next word. Take the max over both operands.
    const int64_t bits_required_to_use_words =
        std::max(left_offset_ == 0 ? 64 : 64 + (64 - left_offset_),
                 right_offset_ == 0 ? 64 : 64 + (64 - right_offset_));
    if (bits_remaining_ < bits_required_to_use_words) {
      // Slow path: evaluate Op bit by bit for the remaining (< 64-bit) run.
      const int16_t run_length =
          static_cast<int16_t>(std::min(bits_remaining_, kWordBits));
      int16_t popcount = 0;
      for (int64_t i = 0; i < run_length; ++i) {
        if (Op::Call(bit_util::GetBit(left_bitmap_, left_offset_ + i),
                     bit_util::GetBit(right_bitmap_, right_offset_ + i))) {
          ++popcount;
        }
      }
      // This code path should trigger _at most_ 2 times. In the "two times"
      // case, the first time the run length will be a multiple of 8.
      left_bitmap_ += run_length / 8;
      right_bitmap_ += run_length / 8;
      bits_remaining_ -= run_length;
      return {run_length, popcount};
    }

    int64_t popcount = 0;
    if (left_offset_ == 0 && right_offset_ == 0) {
      // Both operands byte-aligned: single load per side.
      popcount =
          bit_util::PopCount(Op::Call(LoadWord(left_bitmap_), LoadWord(right_bitmap_)));
    } else {
      // Reassemble an aligned 64-bit view of each operand from two adjacent
      // words before applying Op.
      auto left_word =
          ShiftWord(LoadWord(left_bitmap_), LoadWord(left_bitmap_ + 8), left_offset_);
      auto right_word =
          ShiftWord(LoadWord(right_bitmap_), LoadWord(right_bitmap_ + 8), right_offset_);
      popcount = bit_util::PopCount(Op::Call(left_word, right_word));
    }
    left_bitmap_ += kWordBits / 8;
    right_bitmap_ += kWordBits / 8;
    bits_remaining_ -= kWordBits;
    return {64, static_cast<int16_t>(popcount)};
  }

  const uint8_t* left_bitmap_;   // advanced past each consumed block
  int64_t left_offset_;          // bit offset within *left_bitmap_, [0, 8)
  const uint8_t* right_bitmap_;  // advanced past each consumed block
  int64_t right_offset_;         // bit offset within *right_bitmap_, [0, 8)
  int64_t bits_remaining_;
};
|
||||
|
||||
class ARROW_EXPORT OptionalBinaryBitBlockCounter {
|
||||
public:
|
||||
// Any bitmap may be NULLPTR
|
||||
OptionalBinaryBitBlockCounter(const uint8_t* left_bitmap, int64_t left_offset,
|
||||
const uint8_t* right_bitmap, int64_t right_offset,
|
||||
int64_t length);
|
||||
|
||||
// Any bitmap may be null
|
||||
OptionalBinaryBitBlockCounter(const std::shared_ptr<Buffer>& left_bitmap,
|
||||
int64_t left_offset,
|
||||
const std::shared_ptr<Buffer>& right_bitmap,
|
||||
int64_t right_offset, int64_t length);
|
||||
|
||||
BitBlockCount NextAndBlock() {
|
||||
static constexpr int64_t kMaxBlockSize = std::numeric_limits<int16_t>::max();
|
||||
switch (has_bitmap_) {
|
||||
case HasBitmap::BOTH: {
|
||||
BitBlockCount block = binary_counter_.NextAndWord();
|
||||
position_ += block.length;
|
||||
return block;
|
||||
}
|
||||
case HasBitmap::ONE: {
|
||||
BitBlockCount block = unary_counter_.NextWord();
|
||||
position_ += block.length;
|
||||
return block;
|
||||
}
|
||||
case HasBitmap::NONE:
|
||||
default: {
|
||||
const int16_t block_size =
|
||||
static_cast<int16_t>(std::min(kMaxBlockSize, length_ - position_));
|
||||
position_ += block_size;
|
||||
// All values are non-null
|
||||
return {block_size, block_size};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
BitBlockCount NextOrNotBlock() {
|
||||
static constexpr int64_t kMaxBlockSize = std::numeric_limits<int16_t>::max();
|
||||
switch (has_bitmap_) {
|
||||
case HasBitmap::BOTH: {
|
||||
BitBlockCount block = binary_counter_.NextOrNotWord();
|
||||
position_ += block.length;
|
||||
return block;
|
||||
}
|
||||
case HasBitmap::ONE: {
|
||||
BitBlockCount block = unary_counter_.NextWord();
|
||||
position_ += block.length;
|
||||
return block;
|
||||
}
|
||||
case HasBitmap::NONE:
|
||||
default: {
|
||||
const int16_t block_size =
|
||||
static_cast<int16_t>(std::min(kMaxBlockSize, length_ - position_));
|
||||
position_ += block_size;
|
||||
// All values are non-null
|
||||
return {block_size, block_size};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
enum class HasBitmap : int { BOTH, ONE, NONE };
|
||||
|
||||
const HasBitmap has_bitmap_;
|
||||
int64_t position_;
|
||||
int64_t length_;
|
||||
BitBlockCounter unary_counter_;
|
||||
BinaryBitBlockCounter binary_counter_;
|
||||
|
||||
static HasBitmap HasBitmapFromBitmaps(bool has_left, bool has_right) {
|
||||
switch (static_cast<int>(has_left) + static_cast<int>(has_right)) {
|
||||
case 0:
|
||||
return HasBitmap::NONE;
|
||||
case 1:
|
||||
return HasBitmap::ONE;
|
||||
default: // 2
|
||||
return HasBitmap::BOTH;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Functional-style bit block visitors.
|
||||
|
||||
template <typename VisitNotNull, typename VisitNull>
|
||||
static Status VisitBitBlocks(const uint8_t* bitmap, int64_t offset, int64_t length,
|
||||
VisitNotNull&& visit_not_null, VisitNull&& visit_null) {
|
||||
internal::OptionalBitBlockCounter bit_counter(bitmap, offset, length);
|
||||
int64_t position = 0;
|
||||
while (position < length) {
|
||||
internal::BitBlockCount block = bit_counter.NextBlock();
|
||||
if (block.AllSet()) {
|
||||
for (int64_t i = 0; i < block.length; ++i, ++position) {
|
||||
ARROW_RETURN_NOT_OK(visit_not_null(position));
|
||||
}
|
||||
} else if (block.NoneSet()) {
|
||||
for (int64_t i = 0; i < block.length; ++i, ++position) {
|
||||
ARROW_RETURN_NOT_OK(visit_null());
|
||||
}
|
||||
} else {
|
||||
for (int64_t i = 0; i < block.length; ++i, ++position) {
|
||||
if (bit_util::GetBit(bitmap, offset + position)) {
|
||||
ARROW_RETURN_NOT_OK(visit_not_null(position));
|
||||
} else {
|
||||
ARROW_RETURN_NOT_OK(visit_null());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename VisitNotNull, typename VisitNull>
|
||||
static void VisitBitBlocksVoid(const uint8_t* bitmap, int64_t offset, int64_t length,
|
||||
VisitNotNull&& visit_not_null, VisitNull&& visit_null) {
|
||||
internal::OptionalBitBlockCounter bit_counter(bitmap, offset, length);
|
||||
int64_t position = 0;
|
||||
while (position < length) {
|
||||
internal::BitBlockCount block = bit_counter.NextBlock();
|
||||
if (block.AllSet()) {
|
||||
for (int64_t i = 0; i < block.length; ++i, ++position) {
|
||||
visit_not_null(position);
|
||||
}
|
||||
} else if (block.NoneSet()) {
|
||||
for (int64_t i = 0; i < block.length; ++i, ++position) {
|
||||
visit_null();
|
||||
}
|
||||
} else {
|
||||
for (int64_t i = 0; i < block.length; ++i, ++position) {
|
||||
if (bit_util::GetBit(bitmap, offset + position)) {
|
||||
visit_not_null(position);
|
||||
} else {
|
||||
visit_null();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename VisitNotNull, typename VisitNull>
|
||||
static Status VisitTwoBitBlocks(const uint8_t* left_bitmap, int64_t left_offset,
|
||||
const uint8_t* right_bitmap, int64_t right_offset,
|
||||
int64_t length, VisitNotNull&& visit_not_null,
|
||||
VisitNull&& visit_null) {
|
||||
if (left_bitmap == NULLPTR || right_bitmap == NULLPTR) {
|
||||
// At most one bitmap is present
|
||||
if (left_bitmap == NULLPTR) {
|
||||
return VisitBitBlocks(right_bitmap, right_offset, length,
|
||||
std::forward<VisitNotNull>(visit_not_null),
|
||||
std::forward<VisitNull>(visit_null));
|
||||
} else {
|
||||
return VisitBitBlocks(left_bitmap, left_offset, length,
|
||||
std::forward<VisitNotNull>(visit_not_null),
|
||||
std::forward<VisitNull>(visit_null));
|
||||
}
|
||||
}
|
||||
BinaryBitBlockCounter bit_counter(left_bitmap, left_offset, right_bitmap, right_offset,
|
||||
length);
|
||||
int64_t position = 0;
|
||||
while (position < length) {
|
||||
BitBlockCount block = bit_counter.NextAndWord();
|
||||
if (block.AllSet()) {
|
||||
for (int64_t i = 0; i < block.length; ++i, ++position) {
|
||||
ARROW_RETURN_NOT_OK(visit_not_null(position));
|
||||
}
|
||||
} else if (block.NoneSet()) {
|
||||
for (int64_t i = 0; i < block.length; ++i, ++position) {
|
||||
ARROW_RETURN_NOT_OK(visit_null());
|
||||
}
|
||||
} else {
|
||||
for (int64_t i = 0; i < block.length; ++i, ++position) {
|
||||
if (bit_util::GetBit(left_bitmap, left_offset + position) &&
|
||||
bit_util::GetBit(right_bitmap, right_offset + position)) {
|
||||
ARROW_RETURN_NOT_OK(visit_not_null(position));
|
||||
} else {
|
||||
ARROW_RETURN_NOT_OK(visit_null());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename VisitNotNull, typename VisitNull>
|
||||
static void VisitTwoBitBlocksVoid(const uint8_t* left_bitmap, int64_t left_offset,
|
||||
const uint8_t* right_bitmap, int64_t right_offset,
|
||||
int64_t length, VisitNotNull&& visit_not_null,
|
||||
VisitNull&& visit_null) {
|
||||
if (left_bitmap == NULLPTR || right_bitmap == NULLPTR) {
|
||||
// At most one bitmap is present
|
||||
if (left_bitmap == NULLPTR) {
|
||||
return VisitBitBlocksVoid(right_bitmap, right_offset, length,
|
||||
std::forward<VisitNotNull>(visit_not_null),
|
||||
std::forward<VisitNull>(visit_null));
|
||||
} else {
|
||||
return VisitBitBlocksVoid(left_bitmap, left_offset, length,
|
||||
std::forward<VisitNotNull>(visit_not_null),
|
||||
std::forward<VisitNull>(visit_null));
|
||||
}
|
||||
}
|
||||
BinaryBitBlockCounter bit_counter(left_bitmap, left_offset, right_bitmap, right_offset,
|
||||
length);
|
||||
int64_t position = 0;
|
||||
while (position < length) {
|
||||
BitBlockCount block = bit_counter.NextAndWord();
|
||||
if (block.AllSet()) {
|
||||
for (int64_t i = 0; i < block.length; ++i, ++position) {
|
||||
visit_not_null(position);
|
||||
}
|
||||
} else if (block.NoneSet()) {
|
||||
for (int64_t i = 0; i < block.length; ++i, ++position) {
|
||||
visit_null();
|
||||
}
|
||||
} else {
|
||||
for (int64_t i = 0; i < block.length; ++i, ++position) {
|
||||
if (bit_util::GetBit(left_bitmap, left_offset + position) &&
|
||||
bit_util::GetBit(right_bitmap, right_offset + position)) {
|
||||
visit_not_null(position);
|
||||
} else {
|
||||
visit_null();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,539 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/bitmap_reader.h"
|
||||
#include "arrow/util/endian.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
/// A run of adjacent bits sharing the same value.
struct BitRun {
  // Number of bits in the run; 0 signals end-of-bitmap from readers.
  int64_t length;
  // Whether bits are set at this point.
  bool set;

  // Human-readable representation, e.g. "{Length: 3, set=1}".
  std::string ToString() const {
    std::string repr = "{Length: ";
    repr += std::to_string(length);
    repr += ", set=";
    repr += std::to_string(set);
    repr += "}";
    return repr;
  }
};

inline bool operator==(const BitRun& lhs, const BitRun& rhs) {
  return lhs.set == rhs.set && lhs.length == rhs.length;
}

inline bool operator!=(const BitRun& lhs, const BitRun& rhs) {
  return lhs.set != rhs.set || lhs.length != rhs.length;
}
|
||||
|
||||
class BitRunReaderLinear {
|
||||
public:
|
||||
BitRunReaderLinear() = default;
|
||||
|
||||
BitRunReaderLinear(const uint8_t* bitmap, int64_t start_offset, int64_t length)
|
||||
: reader_(bitmap, start_offset, length) {}
|
||||
|
||||
BitRun NextRun() {
|
||||
BitRun rl = {/*length=*/0, reader_.IsSet()};
|
||||
// Advance while the values are equal and not at the end of list.
|
||||
while (reader_.position() < reader_.length() && reader_.IsSet() == rl.set) {
|
||||
rl.length++;
|
||||
reader_.Next();
|
||||
}
|
||||
return rl;
|
||||
}
|
||||
|
||||
private:
|
||||
BitmapReader reader_;
|
||||
};
|
||||
|
||||
#if ARROW_LITTLE_ENDIAN
|
||||
/// A convenience class for counting the number of contiguous set/unset bits
|
||||
/// in a bitmap.
|
||||
class ARROW_EXPORT BitRunReader {
 public:
  BitRunReader() = default;

  /// \brief Constructs new BitRunReader.
  ///
  /// \param[in] bitmap source data
  /// \param[in] start_offset bit offset into the source data
  /// \param[in] length number of bits to copy
  BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length);

  /// Returns a new BitRun containing the number of contiguous
  /// bits with the same value.  length == 0 indicates the
  /// end of the bitmap.
  BitRun NextRun() {
    if (ARROW_PREDICT_FALSE(position_ >= length_)) {
      return {/*length=*/0, false};
    }
    // This implementation relies on an efficient implementation of
    // CountTrailingZeros and assumes that runs are more often long than
    // not.  The logic is to incrementally find the next bit change
    // from the current position.  This is done by zeroing all
    // bits in word_ up to position_ and using CountTrailingZeros
    // to find the index of the next set bit.

    // The runs alternate on each call, so flip the bit.
    current_run_bit_set_ = !current_run_bit_set_;

    int64_t start_position = position_;
    int64_t start_bit_offset = start_position & 63;
    // Invert the word for proper use of CountTrailingZeros and
    // clear already-consumed low bits so CountTrailingZeros can do its magic.
    word_ = ~word_ & ~bit_util::LeastSignificantBitMask(start_bit_offset);

    // Go forward until the next change from unset to set.
    int64_t new_bits = bit_util::CountTrailingZeros(word_) - start_bit_offset;
    position_ += new_bits;

    if (ARROW_PREDICT_FALSE(bit_util::IsMultipleOf64(position_)) &&
        ARROW_PREDICT_TRUE(position_ < length_)) {
      // The run continues past the end of the current word: keep extending
      // position_ a full word at a time (updates position_ accordingly).
      AdvanceUntilChange();
    }

    return {/*length=*/position_ - start_position, current_run_bit_set_};
  }

 private:
  // Extend the current run across whole 64-bit words until a bit of the
  // opposite value (or the end of the bitmap) is found.
  void AdvanceUntilChange() {
    int64_t new_bits = 0;
    do {
      // Advance the position of the bitmap for loading.
      bitmap_ += sizeof(uint64_t);
      LoadNextWord();
      new_bits = bit_util::CountTrailingZeros(word_);
      // Continue calculating run length.
      position_ += new_bits;
    } while (ARROW_PREDICT_FALSE(bit_util::IsMultipleOf64(position_)) &&
             ARROW_PREDICT_TRUE(position_ < length_) && new_bits > 0);
  }

  void LoadNextWord() { return LoadWord(length_ - position_); }

  // Helper method for loading the next word. After this call, a zero bit in
  // word_ means "run continues" regardless of the run's polarity.
  void LoadWord(int64_t bits_remaining) {
    word_ = 0;
    // we need at least an extra byte in this case.
    if (ARROW_PREDICT_TRUE(bits_remaining >= 64)) {
      std::memcpy(&word_, bitmap_, 8);
    } else {
      // Partial trailing word: load only the bytes that exist.
      int64_t bytes_to_load = bit_util::BytesForBits(bits_remaining);
      auto word_ptr = reinterpret_cast<uint8_t*>(&word_);
      std::memcpy(word_ptr, bitmap_, bytes_to_load);
      // Ensure stoppage at the last bit in the bitmap by flipping the next
      // higher-order bit, guaranteeing a "change" sentinel just past the end.
      bit_util::SetBitTo(word_ptr, bits_remaining,
                         !bit_util::GetBit(word_ptr, bits_remaining - 1));
    }

    // Two cases:
    //   1. For an unset run, CountTrailingZeros works naturally so we don't
    //   invert the word.
    //   2. Otherwise invert so we can use CountTrailingZeros.
    if (current_run_bit_set_) {
      word_ = ~word_;
    }
  }
  const uint8_t* bitmap_;       // points at the 8-byte word containing position_
  int64_t position_;            // absolute bit position within [0, length_)
  int64_t length_;
  uint64_t word_;               // current (possibly inverted) word being scanned
  bool current_run_bit_set_;    // polarity of the run being emitted
};
|
||||
#else
|
||||
using BitRunReader = BitRunReaderLinear;
|
||||
#endif
|
||||
|
||||
/// A run of contiguous set bits: its starting bit position and its length.
struct SetBitRun {
  int64_t position;
  int64_t length;

  // A zero-length run marks exhaustion of the reader.
  bool AtEnd() const { return length == 0; }

  // Human-readable representation, e.g. "{pos=5, len=3}".
  std::string ToString() const {
    std::string repr = "{pos=";
    repr += std::to_string(position);
    repr += ", len=";
    repr += std::to_string(length);
    repr += "}";
    return repr;
  }

  bool operator==(const SetBitRun& other) const {
    return length == other.length && position == other.position;
  }
  bool operator!=(const SetBitRun& other) const { return !(*this == other); }
};
|
||||
|
||||
template <bool Reverse>
|
||||
class BaseSetBitRunReader {
|
||||
public:
|
||||
/// \brief Constructs new SetBitRunReader.
|
||||
///
|
||||
/// \param[in] bitmap source data
|
||||
/// \param[in] start_offset bit offset into the source data
|
||||
/// \param[in] length number of bits to copy
|
||||
ARROW_NOINLINE
|
||||
BaseSetBitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
|
||||
: bitmap_(util::MakeNonNull(bitmap)),
|
||||
length_(length),
|
||||
remaining_(length_),
|
||||
current_word_(0),
|
||||
current_num_bits_(0) {
|
||||
if (Reverse) {
|
||||
bitmap_ += (start_offset + length) / 8;
|
||||
const int8_t end_bit_offset = static_cast<int8_t>((start_offset + length) % 8);
|
||||
if (length > 0 && end_bit_offset) {
|
||||
// Get LSBs from last byte
|
||||
++bitmap_;
|
||||
current_num_bits_ =
|
||||
std::min(static_cast<int32_t>(length), static_cast<int32_t>(end_bit_offset));
|
||||
current_word_ = LoadPartialWord(8 - end_bit_offset, current_num_bits_);
|
||||
}
|
||||
} else {
|
||||
bitmap_ += start_offset / 8;
|
||||
const int8_t bit_offset = static_cast<int8_t>(start_offset % 8);
|
||||
if (length > 0 && bit_offset) {
|
||||
// Get MSBs from first byte
|
||||
current_num_bits_ =
|
||||
std::min(static_cast<int32_t>(length), static_cast<int32_t>(8 - bit_offset));
|
||||
current_word_ = LoadPartialWord(bit_offset, current_num_bits_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ARROW_NOINLINE
|
||||
SetBitRun NextRun() {
|
||||
int64_t pos = 0;
|
||||
int64_t len = 0;
|
||||
if (current_num_bits_) {
|
||||
const auto run = FindCurrentRun();
|
||||
assert(remaining_ >= 0);
|
||||
if (run.length && current_num_bits_) {
|
||||
// The run ends in current_word_
|
||||
return AdjustRun(run);
|
||||
}
|
||||
pos = run.position;
|
||||
len = run.length;
|
||||
}
|
||||
if (!len) {
|
||||
// We didn't get any ones in current_word_, so we can skip any zeros
|
||||
// in the following words
|
||||
SkipNextZeros();
|
||||
if (remaining_ == 0) {
|
||||
return {0, 0};
|
||||
}
|
||||
assert(current_num_bits_);
|
||||
pos = position();
|
||||
} else if (!current_num_bits_) {
|
||||
if (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
|
||||
current_word_ = LoadFullWord();
|
||||
current_num_bits_ = 64;
|
||||
} else if (remaining_ > 0) {
|
||||
current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
|
||||
current_num_bits_ = static_cast<int32_t>(remaining_);
|
||||
} else {
|
||||
// No bits remaining, perhaps we found a run?
|
||||
return AdjustRun({pos, len});
|
||||
}
|
||||
// If current word starts with a zero, we got a full run
|
||||
if (!(current_word_ & kFirstBit)) {
|
||||
return AdjustRun({pos, len});
|
||||
}
|
||||
}
|
||||
// Current word should now start with a set bit
|
||||
len += CountNextOnes();
|
||||
return AdjustRun({pos, len});
|
||||
}
|
||||
|
||||
protected:
|
||||
int64_t position() const {
|
||||
if (Reverse) {
|
||||
return remaining_;
|
||||
} else {
|
||||
return length_ - remaining_;
|
||||
}
|
||||
}
|
||||
|
||||
SetBitRun AdjustRun(SetBitRun run) {
|
||||
if (Reverse) {
|
||||
assert(run.position >= run.length);
|
||||
run.position -= run.length;
|
||||
}
|
||||
return run;
|
||||
}
|
||||
|
||||
uint64_t LoadFullWord() {
|
||||
uint64_t word;
|
||||
if (Reverse) {
|
||||
bitmap_ -= 8;
|
||||
}
|
||||
memcpy(&word, bitmap_, 8);
|
||||
if (!Reverse) {
|
||||
bitmap_ += 8;
|
||||
}
|
||||
return bit_util::ToLittleEndian(word);
|
||||
}
|
||||
|
||||
uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
|
||||
assert(num_bits > 0);
|
||||
uint64_t word = 0;
|
||||
const int64_t num_bytes = bit_util::BytesForBits(num_bits);
|
||||
if (Reverse) {
|
||||
// Read in the most significant bytes of the word
|
||||
bitmap_ -= num_bytes;
|
||||
memcpy(reinterpret_cast<char*>(&word) + 8 - num_bytes, bitmap_, num_bytes);
|
||||
// XXX MostSignificantBitmask
|
||||
return (bit_util::ToLittleEndian(word) << bit_offset) &
|
||||
~bit_util::LeastSignificantBitMask(64 - num_bits);
|
||||
} else {
|
||||
memcpy(&word, bitmap_, num_bytes);
|
||||
bitmap_ += num_bytes;
|
||||
return (bit_util::ToLittleEndian(word) >> bit_offset) &
|
||||
bit_util::LeastSignificantBitMask(num_bits);
|
||||
}
|
||||
}
|
||||
|
||||
void SkipNextZeros() {
|
||||
assert(current_num_bits_ == 0);
|
||||
while (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
|
||||
current_word_ = LoadFullWord();
|
||||
const auto num_zeros = CountFirstZeros(current_word_);
|
||||
if (num_zeros < 64) {
|
||||
// Run of zeros ends here
|
||||
current_word_ = ConsumeBits(current_word_, num_zeros);
|
||||
current_num_bits_ = 64 - num_zeros;
|
||||
remaining_ -= num_zeros;
|
||||
assert(remaining_ >= 0);
|
||||
assert(current_num_bits_ >= 0);
|
||||
return;
|
||||
}
|
||||
remaining_ -= 64;
|
||||
}
|
||||
// Run of zeros continues in last bitmap word
|
||||
if (remaining_ > 0) {
|
||||
current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
|
||||
current_num_bits_ = static_cast<int32_t>(remaining_);
|
||||
const auto num_zeros =
|
||||
std::min<int32_t>(current_num_bits_, CountFirstZeros(current_word_));
|
||||
current_word_ = ConsumeBits(current_word_, num_zeros);
|
||||
current_num_bits_ -= num_zeros;
|
||||
remaining_ -= num_zeros;
|
||||
assert(remaining_ >= 0);
|
||||
assert(current_num_bits_ >= 0);
|
||||
}
|
||||
}
|
||||
|
||||
// Count the length of the run of set bits starting at the current position.
// Precondition: the current (first, in iteration order) buffered bit is set.
// Advances the reader past the counted run.
int64_t CountNextOnes() {
  assert(current_word_ & kFirstBit);

  int64_t len;
  if (~current_word_) {
    // The buffered word contains at least one zero: count ones by counting
    // the leading zeros of the complement.
    const auto num_ones = CountFirstZeros(~current_word_);
    assert(num_ones <= current_num_bits_);
    assert(num_ones <= remaining_);
    remaining_ -= num_ones;
    current_word_ = ConsumeBits(current_word_, num_ones);
    current_num_bits_ -= num_ones;
    if (current_num_bits_) {
      // Run of ones ends here
      return num_ones;
    }
    len = num_ones;
  } else {
    // current_word_ is all ones
    remaining_ -= 64;
    current_num_bits_ = 0;
    len = 64;
  }

  // The run extends past the buffered word: scan subsequent full words.
  while (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
    current_word_ = LoadFullWord();
    const auto num_ones = CountFirstZeros(~current_word_);
    len += num_ones;
    remaining_ -= num_ones;
    if (num_ones < 64) {
      // Run of ones ends here
      current_word_ = ConsumeBits(current_word_, num_ones);
      current_num_bits_ = 64 - num_ones;
      return len;
    }
  }
  // Run of ones continues in last bitmap word
  if (remaining_ > 0) {
    current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
    current_num_bits_ = static_cast<int32_t>(remaining_);
    const auto num_ones = CountFirstZeros(~current_word_);
    assert(num_ones <= current_num_bits_);
    assert(num_ones <= remaining_);
    current_word_ = ConsumeBits(current_word_, num_ones);
    current_num_bits_ -= num_ones;
    remaining_ -= num_ones;
    len += num_ones;
  }
  return len;
}
|
||||
|
||||
// Find the next run of set bits within the currently buffered word only.
// Returns {0, 0} if the buffered bits hold no set bit (caller must then
// reload and retry); otherwise returns the run's absolute position/length.
SetBitRun FindCurrentRun() {
  // Skip any pending zeros
  const auto num_zeros = CountFirstZeros(current_word_);
  if (num_zeros >= current_num_bits_) {
    // Only zeros remain in the buffered word: discard it entirely.
    remaining_ -= current_num_bits_;
    current_word_ = 0;
    current_num_bits_ = 0;
    return {0, 0};
  }
  assert(num_zeros <= remaining_);
  current_word_ = ConsumeBits(current_word_, num_zeros);
  current_num_bits_ -= num_zeros;
  remaining_ -= num_zeros;
  const int64_t pos = position();
  // Count any ones
  const auto num_ones = CountFirstZeros(~current_word_);
  assert(num_ones <= current_num_bits_);
  assert(num_ones <= remaining_);
  current_word_ = ConsumeBits(current_word_, num_ones);
  current_num_bits_ -= num_ones;
  remaining_ -= num_ones;
  return {pos, num_ones};
}
|
||||
|
||||
// Count zeros from the "first" end of `word` in iteration order; the
// direction depends on the Reverse template parameter (see specializations).
inline int CountFirstZeros(uint64_t word);
// Drop `num_bits` already-visited bits from `word` (direction-specific).
inline uint64_t ConsumeBits(uint64_t word, int32_t num_bits);

const uint8_t* bitmap_;     // bitmap buffer start (not owned)
const int64_t length_;      // total number of bits to iterate
int64_t remaining_;         // bits not yet consumed
uint64_t current_word_;     // buffered bits, shifted so the next bit is first
int32_t current_num_bits_;  // number of valid bits left in current_word_

// The bit visited first within a loaded word: MSB when reading in reverse,
// LSB otherwise.
static constexpr uint64_t kFirstBit = Reverse ? 0x8000000000000000ULL : 1;
|
||||
};
|
||||
|
||||
// Forward reader: the "first" bits are the least significant ones.
template <>
inline int BaseSetBitRunReader<false>::CountFirstZeros(uint64_t word) {
  return bit_util::CountTrailingZeros(word);
}

// Reverse reader: the "first" bits are the most significant ones.
template <>
inline int BaseSetBitRunReader<true>::CountFirstZeros(uint64_t word) {
  return bit_util::CountLeadingZeros(word);
}
|
||||
|
||||
// Forward reader: discard the low (already visited) bits.
template <>
inline uint64_t BaseSetBitRunReader<false>::ConsumeBits(uint64_t word, int32_t num_bits) {
  return word >> num_bits;
}

// Reverse reader: discard the high (already visited) bits.
template <>
inline uint64_t BaseSetBitRunReader<true>::ConsumeBits(uint64_t word, int32_t num_bits) {
  return word << num_bits;
}
|
||||
|
||||
// Reads runs of set bits from lowest to highest bit index.
using SetBitRunReader = BaseSetBitRunReader</*Reverse=*/false>;
// Reads runs of set bits from highest to lowest bit index.
using ReverseSetBitRunReader = BaseSetBitRunReader</*Reverse=*/true>;
|
||||
|
||||
// Functional-style bit run visitors.
|
||||
|
||||
template <typename Visit>
|
||||
inline Status VisitBitRuns(const uint8_t* bitmap, int64_t offset, int64_t length,
|
||||
Visit&& visit) {
|
||||
if (bitmap == NULLPTR) {
|
||||
// Assuming all set (as in a null bitmap)
|
||||
return visit(static_cast<int64_t>(0), length, true);
|
||||
}
|
||||
BitRunReader reader(bitmap, offset, length);
|
||||
int64_t position = 0;
|
||||
while (true) {
|
||||
const auto run = reader.NextRun();
|
||||
if (run.length == 0) {
|
||||
break;
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(visit(position, run.length, run.set));
|
||||
position += run.length;
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// XXX: Try to make this function small so the compiler can inline and optimize
|
||||
// the `visit` function, which is normally a hot loop with vectorizable code.
|
||||
// - don't inline SetBitRunReader constructor, it doesn't hurt performance
|
||||
// - un-inline NextRun hurts 'many null' cases a bit, but improves normal cases
|
||||
template <typename Visit>
|
||||
inline Status VisitSetBitRuns(const uint8_t* bitmap, int64_t offset, int64_t length,
|
||||
Visit&& visit) {
|
||||
if (bitmap == NULLPTR) {
|
||||
// Assuming all set (as in a null bitmap)
|
||||
return visit(static_cast<int64_t>(0), static_cast<int64_t>(length));
|
||||
}
|
||||
SetBitRunReader reader(bitmap, offset, length);
|
||||
while (true) {
|
||||
const auto run = reader.NextRun();
|
||||
if (run.length == 0) {
|
||||
break;
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(visit(run.position, run.length));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename Visit>
|
||||
inline void VisitSetBitRunsVoid(const uint8_t* bitmap, int64_t offset, int64_t length,
|
||||
Visit&& visit) {
|
||||
if (bitmap == NULLPTR) {
|
||||
// Assuming all set (as in a null bitmap)
|
||||
visit(static_cast<int64_t>(0), static_cast<int64_t>(length));
|
||||
return;
|
||||
}
|
||||
SetBitRunReader reader(bitmap, offset, length);
|
||||
while (true) {
|
||||
const auto run = reader.NextRun();
|
||||
if (run.length == 0) {
|
||||
break;
|
||||
}
|
||||
visit(run.position, run.length);
|
||||
}
|
||||
}
|
||||
|
||||
// Convenience overload taking a Buffer; a null buffer is forwarded as a null
// pointer and therefore treated as all-set (see the pointer overload).
template <typename Visit>
inline Status VisitSetBitRuns(const std::shared_ptr<Buffer>& bitmap, int64_t offset,
                              int64_t length, Visit&& visit) {
  return VisitSetBitRuns(bitmap ? bitmap->data() : NULLPTR, offset, length,
                         std::forward<Visit>(visit));
}
|
||||
|
||||
// Convenience overload taking a Buffer; a null buffer is forwarded as a null
// pointer and therefore treated as all-set (see the pointer overload).
template <typename Visit>
inline void VisitSetBitRunsVoid(const std::shared_ptr<Buffer>& bitmap, int64_t offset,
                                int64_t length, Visit&& visit) {
  VisitSetBitRunsVoid(bitmap ? bitmap->data() : NULLPTR, offset, length,
                      std::forward<Visit>(visit));
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,490 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
# if defined(_M_AMD64) || defined(_M_X64)
|
||||
# include <intrin.h> // IWYU pragma: keep
|
||||
# endif
|
||||
|
||||
# pragma intrinsic(_BitScanReverse)
|
||||
# pragma intrinsic(_BitScanForward)
|
||||
# define ARROW_POPCOUNT64 __popcnt64
|
||||
# define ARROW_POPCOUNT32 __popcnt
|
||||
#else
|
||||
# define ARROW_POPCOUNT64 __builtin_popcountll
|
||||
# define ARROW_POPCOUNT32 __builtin_popcount
|
||||
#endif
|
||||
|
||||
#include <cstdint>
|
||||
#include <type_traits>
|
||||
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace detail {
|
||||
|
||||
// Reinterpret an integral value as the corresponding unsigned type
// (two's-complement wraparound for negative inputs).
template <typename Integer>
typename std::make_unsigned<Integer>::type as_unsigned(Integer x) {
  using Unsigned = typename std::make_unsigned<Integer>::type;
  return static_cast<Unsigned>(x);
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
namespace bit_util {
|
||||
|
||||
// The number of set bits in a given unsigned byte value, pre-computed
|
||||
//
|
||||
// Generated with the following Python code
|
||||
// output = 'static constexpr uint8_t kBytePopcount[] = {{{0}}};'
|
||||
// popcounts = [str(bin(i).count('1')) for i in range(0, 256)]
|
||||
// print(output.format(', '.join(popcounts)))
|
||||
// Lookup table: kBytePopcount[b] is the number of set bits in byte value b.
static constexpr uint8_t kBytePopcount[] = {
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3,
    4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4,
    4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4,
    5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5,
    4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2,
    3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5,
    5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4,
    5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6,
    4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
|
||||
|
||||
// Population count (number of set bits), delegating to a compiler intrinsic
// (__builtin_popcount* on GCC/Clang, __popcnt* on MSVC; see macros above).
static inline uint64_t PopCount(uint64_t bitmap) { return ARROW_POPCOUNT64(bitmap); }
static inline uint32_t PopCount(uint32_t bitmap) { return ARROW_POPCOUNT32(bitmap); }
|
||||
|
||||
//
|
||||
// Bit-related computations on integer values
|
||||
//
|
||||
|
||||
// Returns the ceiling of value/divisor, i.e. the division rounded up.
// CeilDiv(0, d) is 0 for any divisor.
constexpr int64_t CeilDiv(int64_t value, int64_t divisor) {
  if (value == 0) {
    return 0;
  }
  return (value - 1) / divisor + 1;
}
|
||||
|
||||
// Return the number of bytes needed to fit the given number of bits.
// Divides first and then accounts for a trailing partial byte, which avoids
// integer overflow on very large `bits` (unlike (bits + 7) / 8).
constexpr int64_t BytesForBits(int64_t bits) {
  const int64_t whole_bytes = bits >> 3;
  const int64_t trailing_byte = (bits & 7) != 0 ? 1 : 0;
  return whole_bytes + trailing_byte;
}
|
||||
|
||||
// True iff `value` is a strictly positive power of two (a power of two has
// exactly one set bit, so value & (value - 1) clears it to zero).
constexpr bool IsPowerOf2(int64_t value) {
  return value > 0 && !(value & (value - 1));
}

constexpr bool IsPowerOf2(uint64_t value) {
  return value > 0 && !(value & (value - 1));
}
|
||||
|
||||
// Returns the smallest power of two that contains v. If v is already a
// power of two, it is returned as is.
static inline int64_t NextPower2(int64_t n) {
  // Taken from
  // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
  // Decrement first so exact powers of two map to themselves, then smear the
  // highest set bit into every lower position and add one.
  n--;
  n |= n >> 1;
  n |= n >> 2;
  n |= n >> 4;
  n |= n >> 8;
  n |= n >> 16;
  n |= n >> 32;
  n++;
  return n;
}
|
||||
|
||||
// True iff n is divisible by 64 (bitwise test; valid for negative n in
// two's complement).
constexpr bool IsMultipleOf64(int64_t n) { return (n & 63) == 0; }

// True iff n is divisible by 8.
constexpr bool IsMultipleOf8(int64_t n) { return (n & 7) == 0; }
|
||||
|
||||
// Returns a mask with the `bit_index` low-order bits set to 1.
// Only valid for bit_index in the range [0, 64) — shifting by 64 would be UB.
constexpr uint64_t LeastSignificantBitMask(int64_t bit_index) {
  const uint64_t next_bit = uint64_t{1} << bit_index;
  return next_bit - 1;
}
|
||||
|
||||
// Returns 'value' rounded up to the nearest multiple of 'factor'
|
||||
constexpr int64_t RoundUp(int64_t value, int64_t factor) {
|
||||
return CeilDiv(value, factor) * factor;
|
||||
}
|
||||
|
||||
// Returns 'value' rounded down to the nearest multiple of 'factor'
// (truncating division, as in the original).
constexpr int64_t RoundDown(int64_t value, int64_t factor) {
  const int64_t chunks = value / factor;
  return chunks * factor;
}
|
||||
|
||||
// Returns 'value' rounded up to the nearest multiple of 'factor' when factor
// is a power of two.
// The result is undefined on overflow, i.e. if `value > 2**64 - factor`,
// since we cannot return the correct result which would be 2**64.
constexpr int64_t RoundUpToPowerOf2(int64_t value, int64_t factor) {
  // DCHECK(value >= 0);
  // DCHECK(IsPowerOf2(factor));
  // ~(factor - 1) masks off the bits below the factor after biasing upward.
  return (value + (factor - 1)) & ~(factor - 1);
}

constexpr uint64_t RoundUpToPowerOf2(uint64_t value, uint64_t factor) {
  // DCHECK(IsPowerOf2(factor));
  return (value + (factor - 1)) & ~(factor - 1);
}
|
||||
|
||||
// Round up to the next multiple of 8 (byte boundary).
constexpr int64_t RoundUpToMultipleOf8(int64_t num) { return RoundUpToPowerOf2(num, 8); }

// Round up to the next multiple of 64 (word boundary).
constexpr int64_t RoundUpToMultipleOf64(int64_t num) {
  return RoundUpToPowerOf2(num, 64);
}
|
||||
|
||||
// Returns the number of bytes covering a sliced bitmap. Find the length
// rounded to cover full bytes on both extremities.
//
// The following example represents a slice (offset=10, length=9)
//
// 0       8       16      24
// |-------|-------|-------|
//           [       ]        (slice)
//         [         ]        (same slice aligned to bytes bounds, length=16)
//
// The covering bytes is the length (in bytes) of this new aligned slice.
constexpr int64_t CoveringBytes(int64_t offset, int64_t length) {
  return (bit_util::RoundUp(length + offset, 8) - bit_util::RoundDown(offset, 8)) / 8;
}
|
||||
|
||||
// Returns the 'num_bits' least-significant bits of 'v'.
|
||||
static inline uint64_t TrailingBits(uint64_t v, int num_bits) {
|
||||
if (ARROW_PREDICT_FALSE(num_bits == 0)) return 0;
|
||||
if (ARROW_PREDICT_FALSE(num_bits >= 64)) return v;
|
||||
int n = 64 - num_bits;
|
||||
return (v << n) >> n;
|
||||
}
|
||||
|
||||
/// \brief Count the number of leading zeros in an unsigned integer.
/// Returns 32 when value == 0.
static inline int CountLeadingZeros(uint32_t value) {
#if defined(__clang__) || defined(__GNUC__)
  // __builtin_clz is undefined for 0, so guard explicitly.
  if (value == 0) return 32;
  return static_cast<int>(__builtin_clz(value));
#elif defined(_MSC_VER)
  unsigned long index;                                               // NOLINT
  if (_BitScanReverse(&index, static_cast<unsigned long>(value))) {  // NOLINT
    // _BitScanReverse yields the index of the highest set bit.
    return 31 - static_cast<int>(index);
  } else {
    return 32;
  }
#else
  // Portable fallback: count how many shifts empty the value.
  int bitpos = 0;
  while (value != 0) {
    value >>= 1;
    ++bitpos;
  }
  return 32 - bitpos;
#endif
}
|
||||
|
||||
/// \brief Count the number of leading zeros in a 64-bit unsigned integer.
/// Returns 64 when value == 0.
static inline int CountLeadingZeros(uint64_t value) {
#if defined(__clang__) || defined(__GNUC__)
  // __builtin_clzll is undefined for 0, so guard explicitly.
  if (value == 0) return 64;
  return static_cast<int>(__builtin_clzll(value));
#elif defined(_MSC_VER)
  unsigned long index;                     // NOLINT
  if (_BitScanReverse64(&index, value)) {  // NOLINT
    // _BitScanReverse64 yields the index of the highest set bit.
    return 63 - static_cast<int>(index);
  } else {
    return 64;
  }
#else
  // Portable fallback: count how many shifts empty the value.
  int bitpos = 0;
  while (value != 0) {
    value >>= 1;
    ++bitpos;
  }
  return 64 - bitpos;
#endif
}
|
||||
|
||||
/// \brief Count the number of trailing zeros in an unsigned integer.
/// Returns 32 when value == 0.
static inline int CountTrailingZeros(uint32_t value) {
#if defined(__clang__) || defined(__GNUC__)
  // __builtin_ctzl is undefined for 0, so guard explicitly.
  if (value == 0) return 32;
  return static_cast<int>(__builtin_ctzl(value));
#elif defined(_MSC_VER)
  unsigned long index;  // NOLINT
  if (_BitScanForward(&index, value)) {
    return static_cast<int>(index);
  } else {
    return 32;
  }
#else
  // Portable fallback: shift until the lowest set bit is reached.
  int bitpos = 0;
  if (value) {
    // BUG FIX: was `value & 1 == 0`, which parses as `value & (1 == 0)`
    // (i.e. `value & 0`, always false), so the loop never ran and the
    // function returned 0 for every nonzero input.
    while ((value & 1) == 0) {
      value >>= 1;
      ++bitpos;
    }
  } else {
    bitpos = 32;
  }
  return bitpos;
#endif
}
|
||||
|
||||
/// \brief Count the number of trailing zeros in a 64-bit unsigned integer.
/// Returns 64 when value == 0.
static inline int CountTrailingZeros(uint64_t value) {
#if defined(__clang__) || defined(__GNUC__)
  // __builtin_ctzll is undefined for 0, so guard explicitly.
  if (value == 0) return 64;
  return static_cast<int>(__builtin_ctzll(value));
#elif defined(_MSC_VER)
  unsigned long index;  // NOLINT
  if (_BitScanForward64(&index, value)) {
    return static_cast<int>(index);
  } else {
    return 64;
  }
#else
  // Portable fallback: shift until the lowest set bit is reached.
  int bitpos = 0;
  if (value) {
    // BUG FIX: was `value & 1 == 0`, which parses as `value & (1 == 0)`
    // (i.e. `value & 0`, always false), so the loop never ran and the
    // function returned 0 for every nonzero input.
    while ((value & 1) == 0) {
      value >>= 1;
      ++bitpos;
    }
  } else {
    bitpos = 64;
  }
  return bitpos;
#endif
}
|
||||
|
||||
// Returns the minimum number of bits needed to represent an unsigned value,
// e.g. NumRequiredBits(0) == 0, NumRequiredBits(255) == 8.
static inline int NumRequiredBits(uint64_t x) { return 64 - CountLeadingZeros(x); }
|
||||
|
||||
// Returns ceil(log2(x)). Requires x > 0 (x == 0 would underflow x - 1).
static inline int Log2(uint64_t x) {
  // DCHECK_GT(x, 0);
  // The `x - 1` makes exact powers of two round down to their exponent.
  return NumRequiredBits(x - 1);
}
|
||||
|
||||
//
|
||||
// Utilities for reading and writing individual bits by their index
|
||||
// in a memory area.
|
||||
//
|
||||
|
||||
// Bitmask selecting the k-th bit in a byte: kBitmask[k] == 1 << k.
static constexpr uint8_t kBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128};

// the bitwise complement version of kBitmask (all bits set except the k-th)
static constexpr uint8_t kFlippedBitmask[] = {254, 253, 251, 247, 239, 223, 191, 127};

// Bitmask selecting the (k - 1) preceding bits in a byte
static constexpr uint8_t kPrecedingBitmask[] = {0, 1, 3, 7, 15, 31, 63, 127};
// Like kPrecedingBitmask, except index 0 wraps around to all bits set (255).
static constexpr uint8_t kPrecedingWrappingBitmask[] = {255, 1, 3, 7, 15, 31, 63, 127};

// the bitwise complement version of kPrecedingBitmask
static constexpr uint8_t kTrailingBitmask[] = {255, 254, 252, 248, 240, 224, 192, 128};
|
||||
|
||||
// Read bit i from the bitmap `bits` (byte i/8, bit i%8, LSB-first).
static constexpr bool GetBit(const uint8_t* bits, uint64_t i) {
  const uint8_t byte = bits[i / 8];
  return ((byte >> (i % 8)) & 1) != 0;
}
|
||||
|
||||
// Gets the i-th bit from a byte. Should only be used with i <= 7.
static constexpr bool GetBitFromByte(uint8_t byte, uint8_t i) {
  return ((byte >> i) & 1) != 0;
}
|
||||
|
||||
// Clear (zero) bit i of the bitmap `bits`.
static inline void ClearBit(uint8_t* bits, int64_t i) {
  bits[i / 8] &= kFlippedBitmask[i % 8];
}

// Set bit i of the bitmap `bits` to 1.
static inline void SetBit(uint8_t* bits, int64_t i) { bits[i / 8] |= kBitmask[i % 8]; }
|
||||
|
||||
// Set bit i of the bitmap `bits` to the value `bit_is_set`, branch-free.
static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) {
  // https://graphics.stanford.edu/~seander/bithacks.html
  // "Conditionally set or clear bits without branching"
  // -bit_is_set is 0x00 or 0xFF; XOR-select picks the new bit under the mask.
  // NOTE: this seems to confuse Valgrind as it reads from potentially
  // uninitialized memory
  bits[i / 8] ^= static_cast<uint8_t>(-static_cast<uint8_t>(bit_is_set) ^ bits[i / 8]) &
                 kBitmask[i % 8];
}
|
||||
|
||||
/// \brief set or clear a range of bits quickly
|
||||
ARROW_EXPORT
|
||||
void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_are_set);
|
||||
|
||||
/// \brief Sets all bits in the bitmap to true
|
||||
ARROW_EXPORT
|
||||
void SetBitmap(uint8_t* data, int64_t offset, int64_t length);
|
||||
|
||||
/// \brief Clears all bits in the bitmap (set to false)
|
||||
ARROW_EXPORT
|
||||
void ClearBitmap(uint8_t* data, int64_t offset, int64_t length);
|
||||
|
||||
/// Returns a mask with lower i bits set to 1. If i >= sizeof(Word)*8, all-ones will be
/// returned
/// ex:
///   PrecedingWordBitmask<uint8_t>(4) == 0x0F, PrecedingWordBitmask<uint8_t>(8) == 0xFF
/// ref: https://stackoverflow.com/a/59523400
template <typename Word>
constexpr Word PrecedingWordBitmask(const unsigned int i) {
  // (i < width) becomes 0 when i is out of range, so the shifted operand is 0
  // and subtracting 1 wraps to all-ones; masking the shift count keeps the
  // shift amount in range (avoiding UB).
  return static_cast<Word>(static_cast<Word>(i < sizeof(Word) * 8)
                           << (i & (sizeof(Word) * 8 - 1))) -
         1;
}
static_assert(PrecedingWordBitmask<uint8_t>(0) == 0x00, "");
static_assert(PrecedingWordBitmask<uint8_t>(4) == 0x0f, "");
static_assert(PrecedingWordBitmask<uint8_t>(8) == 0xff, "");
static_assert(PrecedingWordBitmask<uint16_t>(8) == 0x00ff, "");
|
||||
|
||||
/// \brief Create a word with low `n` bits from `low` and high `sizeof(Word)-n` bits
/// from `high`.
/// Equivalent pseudocode:
///   Word ret
///   for (i = 0; i < sizeof(Word)*8; i++){
///     ret[i] = i < n ? low[i] : high[i];
///   }
template <typename Word>
constexpr Word SpliceWord(int n, Word low, Word high) {
  return (high & ~PrecedingWordBitmask<Word>(n)) | (low & PrecedingWordBitmask<Word>(n));
}
|
||||
|
||||
/// \brief Pack integers into a bitmap in batches of 8.
/// Consumes batch_size values (expected to be 0 or 1) and writes
/// batch_size / 8 output bytes, LSB-first within each byte.
template <int batch_size>
void PackBits(const uint32_t* values, uint8_t* out) {
  for (int i = 0; i < batch_size / 8; ++i) {
    uint32_t packed = 0;
    for (int bit = 0; bit < 8; ++bit) {
      packed |= values[bit] << bit;
    }
    *out++ = static_cast<uint8_t>(packed);
    values += 8;
  }
}
|
||||
|
||||
// Maximum number of LEB128 bytes needed to encode an n_bits-wide integer
// (each encoded byte carries 7 payload bits).
constexpr int64_t MaxLEB128ByteLen(int64_t n_bits) { return CeilDiv(n_bits, 7); }
|
||||
|
||||
// Maximum LEB128-encoded length, in bytes, for values of integer type `Int`.
template <typename Int>
constexpr int64_t kMaxLEB128ByteLenFor = MaxLEB128ByteLen(sizeof(Int) * 8);
|
||||
|
||||
/// Write an integer as LEB128
///
/// Write the input value as LEB128 into the output buffer and return the number of bytes
/// written.
/// If the output buffer size is insufficient, return 0 but the output may have been
/// written to.
/// The input value can be a signed integer, but must be non negative.
///
/// \see https://en.wikipedia.org/wiki/LEB128
/// \see MaxLEB128ByteLenFor
template <typename Int>
constexpr int32_t WriteLEB128(Int value, uint8_t* out, int32_t max_out_size) {
  constexpr Int kLow7Mask = Int(0x7F);
  constexpr Int kHigh7Mask = ~kLow7Mask;
  constexpr uint8_t kContinuationBit = 0x80;

  // This encoding does not work for negative values
  if constexpr (std::is_signed_v<Int>) {
    if (ARROW_PREDICT_FALSE(value < 0)) {
      return 0;
    }
  }

  const auto out_first = out;

  // Emit 7 bits at a time while more than 7 significant bits remain.
  while ((value & kHigh7Mask) != Int(0)) {
    // We do not have enough room to write the LEB128
    if (ARROW_PREDICT_FALSE(out - out_first >= max_out_size)) {
      return 0;
    }

    // Write the encoded byte with continuation bit
    *out = static_cast<uint8_t>(value & kLow7Mask) | kContinuationBit;
    ++out;
    // Shift remaining data
    value >>= 7;
  }

  // We do not have enough room to write the LEB128
  if (ARROW_PREDICT_FALSE(out - out_first >= max_out_size)) {
    return 0;
  }

  // Write last non-continuing byte
  *out = static_cast<uint8_t>(value & kLow7Mask);
  ++out;

  return static_cast<int32_t>(out - out_first);
}
|
||||
|
||||
/// Parse a leading LEB128
///
/// Take as input a data pointer and the maximum number of bytes that can be read from it
/// (typically the array size).
/// When a valid LEB128 is found at the start of the data, the function writes it to the
/// out pointer and returns the number of bytes read.
/// Otherwise, the out pointer is unmodified and zero is returned.
///
/// \see https://en.wikipedia.org/wiki/LEB128
/// \see MaxLEB128ByteLenFor
template <typename Int>
constexpr int32_t ParseLeadingLEB128(const uint8_t* data, int32_t max_data_size,
                                     Int* out) {
  constexpr auto kMaxBytes = static_cast<int32_t>(kMaxLEB128ByteLenFor<Int>);
  static_assert(kMaxBytes >= 1);
  constexpr uint8_t kLow7Mask = 0x7F;
  constexpr uint8_t kContinuationBit = 0x80;
  constexpr int32_t kSignBitCount = std::is_signed_v<Int> ? 1 : 0;
  // Number of bits allowed for encoding data on the last byte to avoid overflow
  constexpr uint8_t kHighBitCount = (8 * sizeof(Int) - kSignBitCount) % 7;
  // kHighBitCount least significant `0` bits and the rest with `1`
  constexpr uint8_t kHighForbiddenMask = ~((1 << kHighBitCount) - 1);

  // Iteratively building the value (unsigned to avoid signed-shift issues)
  std::make_unsigned_t<Int> value = 0;

  // Read all bytes except the last possible one without overflow concerns.
  for (int32_t i = 0; i < kMaxBytes - 1; i++) {
    // We have not finished reading a valid LEB128, yet we run out of data
    if (ARROW_PREDICT_FALSE(i >= max_data_size)) {
      return 0;
    }

    // Read the byte and merge its 7 payload bits into the final value
    const uint8_t byte = data[i];
    value |= static_cast<Int>(byte & kLow7Mask) << (7 * i);

    // Check for lack of continuation flag in MSB
    if ((byte & kContinuationBit) == 0) {
      *out = value;
      return i + 1;
    }
  }

  // Process the last index avoiding overflowing
  constexpr int32_t last = kMaxBytes - 1;

  // We have not finished reading a valid LEB128, yet we run out of data
  if (ARROW_PREDICT_FALSE(last >= max_data_size)) {
    return 0;
  }

  const uint8_t byte = data[last];

  // Need to check if there are bits that would overflow the output.
  // Also checks that there is no continuation.
  if (ARROW_PREDICT_FALSE((byte & kHighForbiddenMask) != 0)) {
    return 0;
  }

  // No longer need to mask since the check above ensured the high bits are 0
  value |= static_cast<Int>(byte) << (7 * last);
  *out = value;
  return last + 1;
}
|
||||
} // namespace bit_util
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,466 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <bitset>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/bitmap_ops.h"
|
||||
#include "arrow/util/bitmap_reader.h"
|
||||
#include "arrow/util/bitmap_writer.h"
|
||||
#include "arrow/util/compare.h"
|
||||
#include "arrow/util/endian.h"
|
||||
#include "arrow/util/functional.h"
|
||||
#include "arrow/util/span.h"
|
||||
#include "arrow/util/string_util.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class BooleanArray;
|
||||
|
||||
namespace internal {
|
||||
|
||||
class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
|
||||
public util::EqualityComparable<Bitmap> {
|
||||
public:
|
||||
Bitmap() = default;

// View `length` bits of `buffer` starting at bit `offset`.
// The buffer is not owned and must outlive this Bitmap.
Bitmap(const std::shared_ptr<Buffer>& buffer, int64_t offset, int64_t length)
    : data_(buffer->data()), offset_(offset), length_(length) {
  // Cache the mutable pointer only when the buffer permits mutation, so the
  // mutating members (SetBitTo, SetBitsTo, ...) can be used.
  if (buffer->is_mutable()) {
    mutable_data_ = buffer->mutable_data();
  }
}

// Read-only view over raw memory (not owned).
Bitmap(const void* data, int64_t offset, int64_t length)
    : data_(reinterpret_cast<const uint8_t*>(data)), offset_(offset), length_(length) {}

// Mutable view over raw memory (not owned).
Bitmap(void* data, int64_t offset, int64_t length)
    : data_(reinterpret_cast<const uint8_t*>(data)),
      mutable_data_(reinterpret_cast<uint8_t*>(data)),
      offset_(offset),
      length_(length) {}
|
||||
|
||||
// Return a view of this bitmap starting `offset` bits in, keeping the
// remaining length.  Mutability of the source view is preserved.
Bitmap Slice(int64_t offset) const {
  return mutable_data_ != NULLPTR
             ? Bitmap(mutable_data_, offset_ + offset, length_ - offset)
             : Bitmap(data_, offset_ + offset, length_ - offset);
}
|
||||
|
||||
// Return a view of `length` bits of this bitmap starting `offset` bits in.
// Mutability of the source view is preserved.
Bitmap Slice(int64_t offset, int64_t length) const {
  return mutable_data_ != NULLPTR ? Bitmap(mutable_data_, offset_ + offset, length)
                                  : Bitmap(data_, offset_ + offset, length);
}
|
||||
|
||||
std::string ToString() const;
|
||||
|
||||
bool Equals(const Bitmap& other) const;
|
||||
|
||||
std::string Diff(const Bitmap& other) const;
|
||||
|
||||
// Read bit `i` (relative to this bitmap's offset).
bool GetBit(int64_t i) const { return bit_util::GetBit(data_, i + offset_); }

bool operator[](int64_t i) const { return GetBit(i); }

// Set bit `i` to `v`. Requires a mutable bitmap (mutable_data_ non-null).
void SetBitTo(int64_t i, bool v) const {
  bit_util::SetBitTo(mutable_data_, i + offset_, v);
}

// Set all bits in the view to `v`. Requires a mutable bitmap.
void SetBitsTo(bool v) { bit_util::SetBitsTo(mutable_data_, offset_, length_, v); }

// Copy bits (respectively their complement) from `other` into this bitmap.
// NOTE(review): presumably requires equal lengths -- confirm in bitmap.cc.
void CopyFrom(const Bitmap& other);
void CopyFromInverted(const Bitmap& other);
|
||||
|
||||
/// \brief Visit bits from each bitmap as bitset<N>
|
||||
///
|
||||
/// All bitmaps must have identical length.
|
||||
template <size_t N, typename Visitor>
|
||||
static void VisitBits(const Bitmap (&bitmaps)[N], Visitor&& visitor) {
|
||||
int64_t bit_length = BitLength(bitmaps, N);
|
||||
std::bitset<N> bits;
|
||||
for (int64_t bit_i = 0; bit_i < bit_length; ++bit_i) {
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
bits[i] = bitmaps[i].GetBit(bit_i);
|
||||
}
|
||||
visitor(bits);
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Visit bits from each bitmap as bitset<N>
|
||||
///
|
||||
/// All bitmaps must have identical length.
|
||||
template <size_t N, typename Visitor>
|
||||
static void VisitBits(const std::array<Bitmap, N>& bitmaps, Visitor&& visitor) {
|
||||
int64_t bit_length = BitLength(bitmaps);
|
||||
std::bitset<N> bits;
|
||||
for (int64_t bit_i = 0; bit_i < bit_length; ++bit_i) {
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
bits[i] = bitmaps[i].GetBit(bit_i);
|
||||
}
|
||||
visitor(bits);
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Visit words of bits from each bitmap as array<Word, N>
///
/// All bitmaps must have identical length. The first bit in a visited bitmap
/// may be offset within the first visited word, but words will otherwise contain
/// densely packed bits loaded from the bitmap. That offset within the first word is
/// returned.
///
/// TODO(bkietz) allow for early termination
// NOTE: this function is efficient on 3+ sufficiently large bitmaps.
// It also has a large prolog / epilog overhead and should be used
// carefully in other cases.
// For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
// and BitmapUInt64Reader.
template <size_t N, typename Visitor,
          typename Word = typename std::decay<
              internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
static int64_t VisitWords(const Bitmap (&bitmaps_arg)[N], Visitor&& visitor) {
  constexpr int64_t kBitWidth = sizeof(Word) * 8;

  // local, mutable variables which will be sliced/decremented to represent consumption:
  Bitmap bitmaps[N];
  int64_t offsets[N];
  int64_t bit_length = BitLength(bitmaps_arg, N);
  util::span<const Word> words[N];
  for (size_t i = 0; i < N; ++i) {
    bitmaps[i] = bitmaps_arg[i];
    offsets[i] = bitmaps[i].template word_offset<Word>();
    assert(offsets[i] >= 0 && offsets[i] < kBitWidth);
    words[i] = bitmaps[i].template words<Word>();
  }

  // Advance every bitmap view past `consumed_bits` and refresh the cached
  // word spans/offsets accordingly.
  auto consume = [&](int64_t consumed_bits) {
    for (size_t i = 0; i < N; ++i) {
      bitmaps[i] = bitmaps[i].Slice(consumed_bits, bit_length - consumed_bits);
      offsets[i] = bitmaps[i].template word_offset<Word>();
      assert(offsets[i] >= 0 && offsets[i] < kBitWidth);
      words[i] = bitmaps[i].template words<Word>();
    }
    bit_length -= consumed_bits;
  };

  std::array<Word, N> visited_words;
  visited_words.fill(0);

  if (bit_length <= kBitWidth * 2) {
    // bitmaps fit into one or two words so don't bother with optimization
    while (bit_length > 0) {
      auto leading_bits = std::min(bit_length, kBitWidth);
      SafeLoadWords(bitmaps, 0, leading_bits, false, &visited_words);
      visitor(visited_words);
      consume(leading_bits);
    }
    return 0;
  }

  int64_t max_offset = *std::max_element(offsets, offsets + N);
  int64_t min_offset = *std::min_element(offsets, offsets + N);
  if (max_offset > 0) {
    // consume leading bits so that at least one bitmap becomes word-aligned
    auto leading_bits = kBitWidth - min_offset;
    SafeLoadWords(bitmaps, 0, leading_bits, true, &visited_words);
    visitor(visited_words);
    consume(leading_bits);
  }
  assert(*std::min_element(offsets, offsets + N) == 0);

  int64_t whole_word_count = bit_length / kBitWidth;
  assert(whole_word_count >= 1);

  if (min_offset == max_offset) {
    // all offsets were identical, all leading bits have been consumed
    assert(
        std::all_of(offsets, offsets + N, [](int64_t offset) { return offset == 0; }));

    // Aligned fast path: copy whole storage words directly.
    for (int64_t word_i = 0; word_i < whole_word_count; ++word_i) {
      for (size_t i = 0; i < N; ++i) {
        visited_words[i] = words[i][word_i];
      }
      visitor(visited_words);
    }
    consume(whole_word_count * kBitWidth);
  } else {
    // leading bits from potentially incomplete words have been consumed

    // word_i such that words[i][word_i] and words[i][word_i + 1] lie entirely
    // within the bitmap for all i
    for (int64_t word_i = 0; word_i < whole_word_count - 1; ++word_i) {
      for (size_t i = 0; i < N; ++i) {
        if (offsets[i] == 0) {
          visited_words[i] = words[i][word_i];
        } else {
          // Stitch the visited word from two adjacent storage words,
          // shifting by the per-bitmap bit offset (little-endian bit order).
          auto words0 = bit_util::ToLittleEndian(words[i][word_i]);
          auto words1 = bit_util::ToLittleEndian(words[i][word_i + 1]);
          visited_words[i] = bit_util::FromLittleEndian(
              (words0 >> offsets[i]) | (words1 << (kBitWidth - offsets[i])));
        }
      }
      visitor(visited_words);
    }
    consume((whole_word_count - 1) * kBitWidth);

    // The last whole word may straddle the end of some bitmap's storage,
    // so load it safely instead of reading words[i][word_i + 1].
    SafeLoadWords(bitmaps, 0, kBitWidth, false, &visited_words);

    visitor(visited_words);
    consume(kBitWidth);
  }

  // load remaining (trailing, partial-word) bits
  if (bit_length > 0) {
    SafeLoadWords(bitmaps, 0, bit_length, false, &visited_words);
    visitor(visited_words);
  }

  return min_offset;
}
|
||||
|
||||
// Drives one pass of VisitWordsAndWrite: pulls a word from each of the N
// readers, applies `visitor` to produce M output words, and pushes them to
// the writers; then processes the trailing (non-whole-word) bytes the same
// way at byte granularity.
template <size_t N, size_t M, typename ReaderT, typename WriterT, typename Visitor,
          typename Word = typename std::decay<
              internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
static void RunVisitWordsAndWriteLoop(int64_t bit_length,
                                      std::array<ReaderT, N>& readers,
                                      std::array<WriterT, M>& writers,
                                      Visitor&& visitor) {
  constexpr int64_t kBitWidth = sizeof(Word) * 8;

  std::array<Word, N> visited_words;
  std::array<Word, M> output_words;

  // every reader will have the same number of words, since they all have the
  // same length
  // TODO($JIRA) this will be inefficient in some cases. When there are offsets beyond
  // Word boundary, every Word would have to be created from 2 adjoining Words
  auto n_words = readers[0].words();
  bit_length -= n_words * kBitWidth;
  while (n_words--) {
    // first collect all words into the visited_words array
    for (size_t i = 0; i < N; i++) {
      visited_words[i] = readers[i].NextWord();
    }
    visitor(visited_words, &output_words);
    for (size_t i = 0; i < M; i++) {
      writers[i].PutNextWord(output_words[i]);
    }
  }

  // every reader will have the same number of trailing bytes, for the same
  // reason; the trailing portion could be more than one word!
  // (ref: BitmapWordReader constructor)
  // remaining full/ partial words to write

  if (bit_length) {
    // convert the word visitor lambda to a byte visitor: widen the incoming
    // bytes to words, invoke `visitor`, then narrow its outputs back to bytes
    auto byte_visitor = [&](const std::array<uint8_t, N>& in,
                            std::array<uint8_t, M>* out) {
      std::array<Word, N> in_words;
      std::array<Word, M> out_words;
      std::copy(in.begin(), in.end(), in_words.begin());
      visitor(in_words, &out_words);
      for (size_t i = 0; i < M; i++) {
        out->at(i) = static_cast<uint8_t>(out_words[i]);
      }
    };

    std::array<uint8_t, N> visited_bytes;
    std::array<uint8_t, M> output_bytes;
    int n_bytes = readers[0].trailing_bytes();
    while (n_bytes--) {
      visited_bytes.fill(0);
      output_bytes.fill(0);
      int valid_bits;  // number of valid bits in this trailing byte, set by the reader
      for (size_t i = 0; i < N; i++) {
        visited_bytes[i] = readers[i].NextTrailingByte(valid_bits);
      }
      byte_visitor(visited_bytes, &output_bytes);
      for (size_t i = 0; i < M; i++) {
        writers[i].PutNextTrailingByte(output_bytes[i], valid_bits);
      }
    }
  }
}
|
||||
|
||||
/// \brief Visit words of bits from each input bitmap as array<Word, N> and collects
/// outputs to an array<Word, M>, to be written into the output bitmaps accordingly.
///
/// All bitmaps must have identical length. The first bit in a visited bitmap
/// may be offset within the first visited word, but words will otherwise contain
/// densely packed bits loaded from the bitmap. That offset within the first word is
/// returned.
/// Visitor is expected to have the following signature
/// [](const std::array<Word, N>& in_words, std::array<Word, M>* out_words){...}
///
// NOTE: this function is efficient on 3+ sufficiently large bitmaps.
// It also has a large prolog / epilog overhead and should be used
// carefully in other cases.
// For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
// and BitmapUInt64Reader.
template <size_t N, size_t M, typename Visitor,
          typename Word = typename std::decay<
              internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
static void VisitWordsAndWrite(const std::array<Bitmap, N>& bitmaps_arg,
                               std::array<Bitmap, M>* out_bitmaps_arg,
                               Visitor&& visitor) {
  int64_t bit_length = BitLength(bitmaps_arg);
  assert(bit_length == BitLength(*out_bitmaps_arg));

  // if both input and output bitmaps have no byte offset, then use the
  // specialized (may_have_byte_offset=false) reader/writer templates, which
  // skip per-word offset handling
  if (std::all_of(bitmaps_arg.begin(), bitmaps_arg.end(),
                  [](const Bitmap& b) { return b.offset_ % 8 == 0; }) &&
      std::all_of(out_bitmaps_arg->begin(), out_bitmaps_arg->end(),
                  [](const Bitmap& b) { return b.offset_ % 8 == 0; })) {
    std::array<BitmapWordReader<Word, /*may_have_byte_offset=*/false>, N> readers;
    for (size_t i = 0; i < N; ++i) {
      const Bitmap& in_bitmap = bitmaps_arg[i];
      readers[i] = BitmapWordReader<Word, /*may_have_byte_offset=*/false>(
          in_bitmap.data_, in_bitmap.offset_, in_bitmap.length_);
    }

    std::array<BitmapWordWriter<Word, /*may_have_byte_offset=*/false>, M> writers;
    for (size_t i = 0; i < M; ++i) {
      const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
      writers[i] = BitmapWordWriter<Word, /*may_have_byte_offset=*/false>(
          out_bitmap.mutable_data_, out_bitmap.offset_, out_bitmap.length_);
    }

    RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor);
  } else {
    // general path: readers/writers handle arbitrary byte offsets
    std::array<BitmapWordReader<Word>, N> readers;
    for (size_t i = 0; i < N; ++i) {
      const Bitmap& in_bitmap = bitmaps_arg[i];
      readers[i] =
          BitmapWordReader<Word>(in_bitmap.data_, in_bitmap.offset_, in_bitmap.length_);
    }

    std::array<BitmapWordWriter<Word>, M> writers;
    for (size_t i = 0; i < M; ++i) {
      const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
      writers[i] = BitmapWordWriter<Word>(out_bitmap.mutable_data_, out_bitmap.offset_,
                                          out_bitmap.length_);
    }

    RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor);
  }
}
|
||||
|
||||
/// read-only pointer to the buffer backing this Bitmap
const uint8_t* data() const { return data_; }
/// mutable pointer to the buffer backing this Bitmap
uint8_t* mutable_data() { return mutable_data_; }

/// offset of first bit relative to buffer().data()
int64_t offset() const { return offset_; }

/// number of bits in this Bitmap
int64_t length() const { return length_; }

/// span of all bytes which contain any bit in this Bitmap
util::span<const uint8_t> bytes() const {
  // first byte containing bit `offset_`
  auto byte_offset = offset_ / 8;
  // bytes up to and including the one containing the last bit
  auto byte_count = bit_util::CeilDiv(offset_ + length_, 8) - byte_offset;
  return {data_ + byte_offset, static_cast<size_t>(byte_count)};
}
|
||||
|
||||
private:
|
||||
/// span of all Words which contain any bit in this Bitmap
///
/// For example, given Word=uint16_t and a bitmap spanning bits [20, 36)
/// words() would span bits [16, 48).
///
/// 0 16 32 48 64
/// |-------|-------|------|------| (buffer)
/// [ ] (bitmap)
/// |-------|------| (returned words)
///
/// \warning The words may contain bytes which lie outside the buffer or are
/// uninitialized.
template <typename Word>
util::span<const Word> words() const {
  // round the start address down to the nearest Word boundary
  auto bytes_addr = reinterpret_cast<intptr_t>(bytes().data());
  auto words_addr = bytes_addr - bytes_addr % sizeof(Word);
  // round the end address up to the nearest Word boundary, then take the
  // difference to get the byte size of the word span
  auto word_byte_count =
      bit_util::RoundUpToPowerOf2(static_cast<int64_t>(bytes_addr + bytes().size()),
                                  static_cast<int64_t>(sizeof(Word))) -
      words_addr;
  return {reinterpret_cast<const Word*>(words_addr),
          static_cast<size_t>(word_byte_count / sizeof(Word))};
}
|
||||
|
||||
/// offset of first bit relative to words<Word>().data()
///
/// This is offset_ plus the bit distance between the buffer start (data_)
/// and the word-aligned address words<Word>() rounds down to.
template <typename Word>
int64_t word_offset() const {
  return offset_ + 8 * (reinterpret_cast<intptr_t>(data_) -
                        reinterpret_cast<intptr_t>(words<Word>().data()));
}
|
||||
|
||||
/// load words from bitmaps bitwise
///
/// Copies `out_length` bits starting at `offset` from each input bitmap into
/// the corresponding element of `*out`, one bit at a time, so no bytes
/// outside the bitmaps' buffers are read. If `set_trailing_bits` is true the
/// bits are packed into the most significant end of each output word,
/// otherwise into the least significant end; all other bits are zeroed.
template <size_t N, typename Word>
static void SafeLoadWords(const Bitmap (&bitmaps)[N], int64_t offset,
                          int64_t out_length, bool set_trailing_bits,
                          std::array<Word, N>* out) {
  out->fill(0);

  int64_t out_offset = set_trailing_bits ? sizeof(Word) * 8 - out_length : 0;

  // wrap each output word in a Bitmap so individual bits can be written with
  // SetBitTo while the input bits are visited
  Bitmap slices[N], out_bitmaps[N];
  for (size_t i = 0; i < N; ++i) {
    slices[i] = bitmaps[i].Slice(offset, out_length);
    out_bitmaps[i] = Bitmap(&out->at(i), out_offset, out_length);
  }

  int64_t bit_i = 0;
  Bitmap::VisitBits(slices, [&](std::bitset<N> bits) {
    for (size_t i = 0; i < N; ++i) {
      out_bitmaps[i].SetBitTo(bit_i, bits[i]);
    }
    ++bit_i;
  });
}
|
||||
|
||||
/// assert bitmaps have identical length and return that length
|
||||
static int64_t BitLength(const Bitmap* bitmaps, size_t N);
|
||||
|
||||
template <size_t N>
|
||||
static int64_t BitLength(const std::array<Bitmap, N>& bitmaps) {
|
||||
for (size_t i = 1; i < N; ++i) {
|
||||
assert(bitmaps[i].length() == bitmaps[0].length());
|
||||
}
|
||||
return bitmaps[0].length();
|
||||
}
|
||||
|
||||
const uint8_t* data_ = NULLPTR;
|
||||
uint8_t* mutable_data_ = NULLPTR;
|
||||
int64_t offset_ = 0, length_ = 0;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,44 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/span.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
/// \brief Generate Bitmap with all position to `value` except for one found
|
||||
/// at `straggler_pos`.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> BitmapAllButOne(MemoryPool* pool, int64_t length,
|
||||
int64_t straggler_pos, bool value = true);
|
||||
|
||||
/// \brief Convert vector of bytes to bitmap buffer
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> BytesToBits(util::span<const uint8_t> bytes,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,112 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/memory_pool.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
// A std::generate() like function to write sequential bits into a bitmap area.
// Bits preceding the bitmap area are preserved, bits following the bitmap
// area may be clobbered.
//
// \param bitmap buffer to write into
// \param start_offset bit offset at which writing starts
// \param length number of bits to generate
// \param g nullary functor producing the next bit value

template <class Generator>
void GenerateBits(uint8_t* bitmap, int64_t start_offset, int64_t length, Generator&& g) {
  if (length == 0) {
    return;
  }
  uint8_t* cur = bitmap + start_offset / 8;
  uint8_t bit_mask = bit_util::kBitmask[start_offset % 8];
  // preserve the bits of the first byte that precede start_offset
  uint8_t current_byte = *cur & bit_util::kPrecedingBitmask[start_offset % 8];

  for (int64_t index = 0; index < length; ++index) {
    const bool bit = g();
    current_byte = bit ? (current_byte | bit_mask) : current_byte;
    bit_mask = static_cast<uint8_t>(bit_mask << 1);
    if (bit_mask == 0) {
      // byte complete: flush it and start a fresh one
      bit_mask = 1;
      *cur++ = current_byte;
      current_byte = 0;
    }
  }
  if (bit_mask != 1) {
    // flush the final, partially filled byte
    *cur++ = current_byte;
  }
}
|
||||
|
||||
// Like GenerateBits(), but unrolls its main loop for higher performance:
// whole bytes are produced eight generator calls at a time, with bit-by-bit
// prologue/epilogue loops for the unaligned leading and trailing bits.

template <class Generator>
void GenerateBitsUnrolled(uint8_t* bitmap, int64_t start_offset, int64_t length,
                          Generator&& g) {
  static_assert(std::is_same<decltype(std::declval<Generator>()()), bool>::value,
                "Functor passed to GenerateBitsUnrolled must return bool");

  if (length == 0) {
    return;
  }
  uint8_t current_byte;
  uint8_t* cur = bitmap + start_offset / 8;
  const uint64_t start_bit_offset = start_offset % 8;
  uint8_t bit_mask = bit_util::kBitmask[start_bit_offset];
  int64_t remaining = length;

  if (bit_mask != 0x01) {
    // prologue: fill the unaligned leading byte bit-by-bit, preserving the
    // bits that precede start_offset
    current_byte = *cur & bit_util::kPrecedingBitmask[start_bit_offset];
    while (bit_mask != 0 && remaining > 0) {
      current_byte |= g() * bit_mask;
      bit_mask = static_cast<uint8_t>(bit_mask << 1);
      --remaining;
    }
    *cur++ = current_byte;
  }

  // main loop: generate 8 bits at a time and pack them into one byte
  int64_t remaining_bytes = remaining / 8;
  uint8_t out_results[8];
  while (remaining_bytes-- > 0) {
    for (int i = 0; i < 8; ++i) {
      out_results[i] = g();
    }
    *cur++ = static_cast<uint8_t>(out_results[0] | out_results[1] << 1 |
                                  out_results[2] << 2 | out_results[3] << 3 |
                                  out_results[4] << 4 | out_results[5] << 5 |
                                  out_results[6] << 6 | out_results[7] << 7);
  }

  // epilogue: fill the trailing partial byte bit-by-bit
  int64_t remaining_bits = remaining % 8;
  if (remaining_bits) {
    current_byte = 0;
    bit_mask = 0x01;
    while (remaining_bits-- > 0) {
      current_byte |= g() * bit_mask;
      bit_mask = static_cast<uint8_t>(bit_mask << 1);
    }
    *cur++ = current_byte;
  }
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,246 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Buffer;
|
||||
class MemoryPool;
|
||||
|
||||
namespace internal {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Bitmap utilities
|
||||
|
||||
/// Copy a bit range of an existing bitmap
|
||||
///
|
||||
/// \param[in] pool memory pool to allocate memory from
|
||||
/// \param[in] bitmap source data
|
||||
/// \param[in] offset bit offset into the source data
|
||||
/// \param[in] length number of bits to copy
|
||||
/// \param[in] out_offset bit offset into the output buffer
|
||||
///
|
||||
/// \return a new buffer containing the copied bitmap region
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> CopyBitmap(MemoryPool* pool, const uint8_t* bitmap,
|
||||
int64_t offset, int64_t length,
|
||||
int64_t out_offset = 0);
|
||||
|
||||
/// Copy a bit range of an existing bitmap into an existing bitmap
|
||||
///
|
||||
/// \param[in] bitmap source data
|
||||
/// \param[in] offset bit offset into the source data
|
||||
/// \param[in] length number of bits to copy
|
||||
/// \param[in] dest_offset bit offset into the destination
|
||||
/// \param[out] dest the destination buffer, must have at least space for
|
||||
/// (offset + length) bits
|
||||
ARROW_EXPORT
|
||||
void CopyBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest,
|
||||
int64_t dest_offset);
|
||||
|
||||
/// Invert a bit range of an existing bitmap into an existing bitmap
|
||||
///
|
||||
/// \param[in] bitmap source data
|
||||
/// \param[in] offset bit offset into the source data
|
||||
/// \param[in] length number of bits to copy
|
||||
/// \param[in] dest_offset bit offset into the destination
|
||||
/// \param[out] dest the destination buffer, must have at least space for
|
||||
/// (offset + length) bits
|
||||
ARROW_EXPORT
|
||||
void InvertBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest,
|
||||
int64_t dest_offset);
|
||||
|
||||
/// Invert a bit range of an existing bitmap
|
||||
///
|
||||
/// \param[in] pool memory pool to allocate memory from
|
||||
/// \param[in] bitmap source data
|
||||
/// \param[in] offset bit offset into the source data
|
||||
/// \param[in] length number of bits to copy
|
||||
///
|
||||
/// \return a new buffer containing the inverted bitmap region
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> InvertBitmap(MemoryPool* pool, const uint8_t* bitmap,
|
||||
int64_t offset, int64_t length);
|
||||
|
||||
/// Reverse a bit range of an existing bitmap into an existing bitmap
|
||||
///
|
||||
/// \param[in] bitmap source data
|
||||
/// \param[in] offset bit offset into the source data
|
||||
/// \param[in] length number of bits to reverse
|
||||
/// \param[in] dest_offset bit offset into the destination
|
||||
/// \param[out] dest the destination buffer, must have at least space for
|
||||
/// (offset + length) bits
|
||||
ARROW_EXPORT
|
||||
void ReverseBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest,
|
||||
int64_t dest_offset);
|
||||
|
||||
/// Reverse a bit range of an existing bitmap
|
||||
///
|
||||
/// \param[in] pool memory pool to allocate memory from
|
||||
/// \param[in] bitmap source data
|
||||
/// \param[in] offset bit offset into the source data
|
||||
/// \param[in] length number of bits to reverse
|
||||
///
|
||||
/// \return a new buffer containing the reversed bitmap region
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> ReverseBitmap(MemoryPool* pool, const uint8_t* bitmap,
|
||||
int64_t offset, int64_t length);
|
||||
|
||||
/// Compute the number of 1's in the given data array
|
||||
///
|
||||
/// \param[in] data a packed LSB-ordered bitmap as a byte array
|
||||
/// \param[in] bit_offset a bitwise offset into the bitmap
|
||||
/// \param[in] length the number of bits to inspect in the bitmap relative to
|
||||
/// the offset
|
||||
///
|
||||
/// \return The number of set (1) bits in the range
|
||||
ARROW_EXPORT
|
||||
int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length);
|
||||
|
||||
/// Compute the number of 1's in the result of an "and" (&) of two bitmaps
|
||||
///
|
||||
/// \param[in] left_bitmap a packed LSB-ordered bitmap as a byte array
|
||||
/// \param[in] left_offset a bitwise offset into the left bitmap
|
||||
/// \param[in] right_bitmap a packed LSB-ordered bitmap as a byte array
|
||||
/// \param[in] right_offset a bitwise offset into the right bitmap
|
||||
/// \param[in] length the length of the bitmaps (must be the same)
|
||||
///
|
||||
/// \return The number of set (1) bits in the "and" of the two bitmaps
|
||||
ARROW_EXPORT
|
||||
int64_t CountAndSetBits(const uint8_t* left_bitmap, int64_t left_offset,
|
||||
const uint8_t* right_bitmap, int64_t right_offset,
|
||||
int64_t length);
|
||||
|
||||
ARROW_EXPORT
|
||||
bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
|
||||
int64_t right_offset, int64_t length);
|
||||
|
||||
// Same as BitmapEquals, but considers a NULL bitmap pointer the same as an
|
||||
// all-ones bitmap.
|
||||
ARROW_EXPORT
|
||||
bool OptionalBitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
|
||||
int64_t right_offset, int64_t length);
|
||||
|
||||
ARROW_EXPORT
|
||||
bool OptionalBitmapEquals(const std::shared_ptr<Buffer>& left, int64_t left_offset,
|
||||
const std::shared_ptr<Buffer>& right, int64_t right_offset,
|
||||
int64_t length);
|
||||
|
||||
/// \brief Do a "bitmap and" on right and left buffers starting at
|
||||
/// their respective bit-offsets for the given bit-length and put
|
||||
/// the results in out_buffer starting at the given bit-offset.
|
||||
///
|
||||
/// out_buffer will be allocated and initialized to zeros using pool before
|
||||
/// the operation.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> BitmapAnd(MemoryPool* pool, const uint8_t* left,
|
||||
int64_t left_offset, const uint8_t* right,
|
||||
int64_t right_offset, int64_t length,
|
||||
int64_t out_offset);
|
||||
|
||||
/// \brief Do a "bitmap and" on right and left buffers starting at
|
||||
/// their respective bit-offsets for the given bit-length and put
|
||||
/// the results in out starting at the given bit-offset.
|
||||
ARROW_EXPORT
|
||||
void BitmapAnd(const uint8_t* left, int64_t left_offset, const uint8_t* right,
|
||||
int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
|
||||
|
||||
/// \brief Do a "bitmap or" for the given bit length on right and left buffers
|
||||
/// starting at their respective bit-offsets and put the results in out_buffer
|
||||
/// starting at the given bit-offset.
|
||||
///
|
||||
/// out_buffer will be allocated and initialized to zeros using pool before
|
||||
/// the operation.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> BitmapOr(MemoryPool* pool, const uint8_t* left,
|
||||
int64_t left_offset, const uint8_t* right,
|
||||
int64_t right_offset, int64_t length,
|
||||
int64_t out_offset);
|
||||
|
||||
/// \brief Do a "bitmap or" for the given bit length on right and left buffers
|
||||
/// starting at their respective bit-offsets and put the results in out
|
||||
/// starting at the given bit-offset.
|
||||
ARROW_EXPORT
|
||||
void BitmapOr(const uint8_t* left, int64_t left_offset, const uint8_t* right,
|
||||
int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
|
||||
|
||||
/// \brief Do a "bitmap xor" for the given bit-length on right and left
|
||||
/// buffers starting at their respective bit-offsets and put the results in
|
||||
/// out_buffer starting at the given bit offset.
|
||||
///
|
||||
/// out_buffer will be allocated and initialized to zeros using pool before
|
||||
/// the operation.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> BitmapXor(MemoryPool* pool, const uint8_t* left,
|
||||
int64_t left_offset, const uint8_t* right,
|
||||
int64_t right_offset, int64_t length,
|
||||
int64_t out_offset);
|
||||
|
||||
/// \brief Do a "bitmap xor" for the given bit-length on right and left
|
||||
/// buffers starting at their respective bit-offsets and put the results in
|
||||
/// out starting at the given bit offset.
|
||||
ARROW_EXPORT
|
||||
void BitmapXor(const uint8_t* left, int64_t left_offset, const uint8_t* right,
|
||||
int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
|
||||
|
||||
/// \brief Do a "bitmap and not" on right and left buffers starting at
|
||||
/// their respective bit-offsets for the given bit-length and put
|
||||
/// the results in out_buffer starting at the given bit-offset.
|
||||
///
|
||||
/// out_buffer will be allocated and initialized to zeros using pool before
|
||||
/// the operation.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> BitmapAndNot(MemoryPool* pool, const uint8_t* left,
|
||||
int64_t left_offset, const uint8_t* right,
|
||||
int64_t right_offset, int64_t length,
|
||||
int64_t out_offset);
|
||||
|
||||
/// \brief Do a "bitmap and not" on right and left buffers starting at
|
||||
/// their respective bit-offsets for the given bit-length and put
|
||||
/// the results in out starting at the given bit-offset.
|
||||
ARROW_EXPORT
|
||||
void BitmapAndNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
|
||||
int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
|
||||
|
||||
/// \brief Do a "bitmap or not" on right and left buffers starting at
|
||||
/// their respective bit-offsets for the given bit-length and put
|
||||
/// the results in out_buffer starting at the given bit-offset.
|
||||
///
|
||||
/// out_buffer will be allocated and initialized to zeros using pool before
|
||||
/// the operation.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> BitmapOrNot(MemoryPool* pool, const uint8_t* left,
|
||||
int64_t left_offset, const uint8_t* right,
|
||||
int64_t right_offset, int64_t length,
|
||||
int64_t out_offset);
|
||||
|
||||
/// \brief Do a "bitmap or not" on right and left buffers starting at
|
||||
/// their respective bit-offsets for the given bit-length and put
|
||||
/// the results in out starting at the given bit-offset.
|
||||
ARROW_EXPORT
|
||||
void BitmapOrNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
|
||||
int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,275 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/endian.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
class BitmapReader {
|
||||
public:
|
||||
BitmapReader() = default;
|
||||
|
||||
BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
|
||||
: bitmap_(bitmap), position_(0), length_(length) {
|
||||
current_byte_ = 0;
|
||||
byte_offset_ = start_offset / 8;
|
||||
bit_offset_ = start_offset % 8;
|
||||
if (length > 0) {
|
||||
current_byte_ = bitmap[byte_offset_];
|
||||
}
|
||||
}
|
||||
|
||||
bool IsSet() const { return (current_byte_ & (1 << bit_offset_)) != 0; }
|
||||
|
||||
bool IsNotSet() const { return (current_byte_ & (1 << bit_offset_)) == 0; }
|
||||
|
||||
void Next() {
|
||||
++bit_offset_;
|
||||
++position_;
|
||||
if (ARROW_PREDICT_FALSE(bit_offset_ == 8)) {
|
||||
bit_offset_ = 0;
|
||||
++byte_offset_;
|
||||
if (ARROW_PREDICT_TRUE(position_ < length_)) {
|
||||
current_byte_ = bitmap_[byte_offset_];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int64_t position() const { return position_; }
|
||||
|
||||
int64_t length() const { return length_; }
|
||||
|
||||
private:
|
||||
const uint8_t* bitmap_;
|
||||
int64_t position_;
|
||||
int64_t length_;
|
||||
|
||||
uint8_t current_byte_;
|
||||
int64_t byte_offset_;
|
||||
int64_t bit_offset_;
|
||||
};
|
||||
|
||||
// XXX Cannot name it BitmapWordReader because the name is already used
|
||||
// in bitmap_ops.cc
|
||||
|
||||
/// \brief Reader yielding 64 bits of a bitmap per call, supporting arbitrary
/// (non-byte-aligned) start offsets by carrying leftover bits between reads.
class BitmapUInt64Reader {
 public:
  BitmapUInt64Reader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
      : bitmap_(util::MakeNonNull(bitmap) + start_offset / 8),
        num_carry_bits_(8 - start_offset % 8),
        length_(length),
        remaining_length_(length_),
        carry_bits_(0) {
    if (length_ > 0) {
      // Load carry bits from the first byte's MSBs
      if (length_ >= num_carry_bits_) {
        carry_bits_ =
            LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), num_carry_bits_);
      } else {
        carry_bits_ = LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), length_);
      }
    }
  }

  /// \brief Return the next 64 bits of the bitmap, packed LSB-first; bits
  /// beyond the bitmap's length are zero (masked out by LoadPartialWord).
  uint64_t NextWord() {
    if (ARROW_PREDICT_TRUE(remaining_length_ >= 64 + num_carry_bits_)) {
      // We can load a full word
      uint64_t next_word = LoadFullWord();
      // Carry bits come first, then the (64 - num_carry_bits_) LSBs from next_word
      uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
      carry_bits_ = next_word >> (64 - num_carry_bits_);
      remaining_length_ -= 64;
      return word;
    } else if (remaining_length_ > num_carry_bits_) {
      // We can load a partial word
      uint64_t next_word =
          LoadPartialWord(/*bit_offset=*/0, remaining_length_ - num_carry_bits_);
      uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
      carry_bits_ = next_word >> (64 - num_carry_bits_);
      remaining_length_ = std::max<int64_t>(remaining_length_ - 64, 0);
      return word;
    } else {
      // Only the carry bits (if any) are left
      remaining_length_ = 0;
      return carry_bits_;
    }
  }

  /// number of bits consumed so far
  int64_t position() const { return length_ - remaining_length_; }

  /// total number of bits in the bitmap
  int64_t length() const { return length_; }

 private:
  /// Load the next 8 bytes as a little-endian 64-bit word and advance.
  uint64_t LoadFullWord() {
    uint64_t word;
    memcpy(&word, bitmap_, 8);
    bitmap_ += 8;
    return bit_util::ToLittleEndian(word);
  }

  /// Load `num_bits` bits starting at `bit_offset` within the next bytes,
  /// advancing past the bytes they occupy; unused high bits are zeroed.
  uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
    uint64_t word = 0;
    const int64_t num_bytes = bit_util::BytesForBits(num_bits);
    memcpy(&word, bitmap_, num_bytes);
    bitmap_ += num_bytes;
    return (bit_util::ToLittleEndian(word) >> bit_offset) &
           bit_util::LeastSignificantBitMask(num_bits);
  }

  const uint8_t* bitmap_;
  const int64_t num_carry_bits_;  // in [1, 8]
  const int64_t length_;
  int64_t remaining_length_;
  uint64_t carry_bits_;
};
|
||||
|
||||
// BitmapWordReader here is faster than BitmapUInt64Reader (in bitmap_reader.h)
|
||||
// on sufficiently large inputs. However, it has a larger prolog / epilog overhead
|
||||
// and should probably not be used for small bitmaps.
|
||||
|
||||
// Reads a bitmap as whole `Word`s plus up to one word's worth of trailing
// bytes.  When `may_have_byte_offset` is false, the bit offset within the
// first byte is statically assumed to be zero, removing the splicing branch.
template <typename Word, bool may_have_byte_offset = true>
class BitmapWordReader {
 public:
  BitmapWordReader() = default;
  // \param bitmap the bitmap buffer
  // \param offset bit offset of the first bit to read
  // \param length number of bits to read
  BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length)
      : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
        bitmap_(bitmap + offset / 8),
        bitmap_end_(bitmap_ + bit_util::BytesForBits(offset_ + length)) {
    // decrement word count by one as we may touch two adjacent words in one iteration
    nwords_ = length / (sizeof(Word) * 8) - 1;
    if (nwords_ < 0) {
      nwords_ = 0;
    }
    trailing_bits_ = static_cast<int>(length - nwords_ * sizeof(Word) * 8);
    trailing_bytes_ = static_cast<int>(bit_util::BytesForBits(trailing_bits_));

    if (nwords_ > 0) {
      current_data.word_ = load<Word>(bitmap_);
    } else if (length > 0) {
      current_data.epi.byte_ = load<uint8_t>(bitmap_);
    }
  }

  // Return the next full word of the bitmap; call at most words() times.
  Word NextWord() {
    bitmap_ += sizeof(Word);
    const Word next_word = load<Word>(bitmap_);
    Word word = current_data.word_;
    if (may_have_byte_offset && offset_) {
      // combine two adjacent words into one word
      // |<------ next ----->|<---- current ---->|
      // +-------------+-----+-------------+-----+
      // |     ---     |  A  |      B      | --- |
      // +-------------+-----+-------------+-----+
      //                  |         |       offset
      //                  v         v
      //               +-----+-------------+
      //               |  A  |      B      |
      //               +-----+-------------+
      //               |<------ word ----->|
      word >>= offset_;
      word |= next_word << (sizeof(Word) * 8 - offset_);
    }
    current_data.word_ = next_word;
    return word;
  }

  // Return the next byte of trailing bits; `valid_bits` is set to the
  // number of valid LSBs in the returned byte.  Call only after all full
  // words have been consumed, at most trailing_bytes() times.
  uint8_t NextTrailingByte(int& valid_bits) {
    uint8_t byte;
    assert(trailing_bits_ > 0);

    if (trailing_bits_ <= 8) {
      // last byte
      valid_bits = trailing_bits_;
      trailing_bits_ = 0;
      byte = 0;
      // Gather the final (possibly partial) byte bit by bit
      internal::BitmapReader reader(bitmap_, offset_, valid_bits);
      for (int i = 0; i < valid_bits; ++i) {
        byte >>= 1;
        if (reader.IsSet()) {
          byte |= 0x80;
        }
        reader.Next();
      }
      byte >>= (8 - valid_bits);
    } else {
      ++bitmap_;
      const uint8_t next_byte = load<uint8_t>(bitmap_);
      byte = current_data.epi.byte_;
      if (may_have_byte_offset && offset_) {
        // combine two adjacent bytes, analogous to NextWord() above
        byte >>= offset_;
        byte |= next_byte << (8 - offset_);
      }
      current_data.epi.byte_ = next_byte;
      trailing_bits_ -= 8;
      trailing_bytes_--;
      valid_bits = 8;
    }
    return byte;
  }

  // Number of whole words NextWord() will return.
  int64_t words() const { return nwords_; }
  // Number of bytes NextTrailingByte() will return.
  int trailing_bytes() const { return trailing_bytes_; }

 private:
  int64_t offset_;
  const uint8_t* bitmap_;

  const uint8_t* bitmap_end_;
  int64_t nwords_;
  int trailing_bits_;
  int trailing_bytes_;
  // Overlay of the current word and its logically-first byte
  union {
    Word word_;
    struct {
#if ARROW_LITTLE_ENDIAN == 0
      uint8_t padding_bytes_[sizeof(Word) - 1];
#endif
      uint8_t byte_;
    } epi;
  } current_data;

  // Bounds-checked little-endian load
  template <typename DType>
  DType load(const uint8_t* bitmap) {
    assert(bitmap + sizeof(DType) <= bitmap_end_);
    return bit_util::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
  }
};
|
||||
|
||||
/// \brief Index into a possibly nonexistent bitmap
|
||||
struct OptionalBitIndexer {
|
||||
const uint8_t* bitmap;
|
||||
const int64_t offset;
|
||||
|
||||
explicit OptionalBitIndexer(const uint8_t* buffer = NULLPTR, int64_t offset = 0)
|
||||
: bitmap(buffer), offset(offset) {}
|
||||
|
||||
bool operator[](int64_t i) const {
|
||||
return bitmap == NULLPTR || bit_util::GetBit(bitmap, offset + i);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,88 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/bitmap_reader.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
// A function that visits each bit in a bitmap and calls a visitor function with a
|
||||
// boolean representation of that bit. This is intended to be analogous to
|
||||
// GenerateBits.
|
||||
template <class Visitor>
|
||||
void VisitBits(const uint8_t* bitmap, int64_t start_offset, int64_t length,
|
||||
Visitor&& visit) {
|
||||
BitmapReader reader(bitmap, start_offset, length);
|
||||
for (int64_t index = 0; index < length; ++index) {
|
||||
visit(reader.IsSet());
|
||||
reader.Next();
|
||||
}
|
||||
}
|
||||
|
||||
// Like VisitBits(), but unrolls its main loop for better performance.
|
||||
// \param bitmap the bitmap buffer
// \param start_offset bit offset of the first bit to visit
// \param length number of bits to visit
// \param visit callable invoked with each bit's boolean value, in order
template <class Visitor>
void VisitBitsUnrolled(const uint8_t* bitmap, int64_t start_offset, int64_t length,
                       Visitor&& visit) {
  if (length == 0) {
    return;
  }

  // Start by visiting any bits preceding the first full byte.
  int64_t num_bits_before_full_bytes =
      bit_util::RoundUpToMultipleOf8(start_offset) - start_offset;
  // Truncate num_bits_before_full_bytes if it is greater than length.
  if (num_bits_before_full_bytes > length) {
    num_bits_before_full_bytes = length;
  }
  // Use the non loop-unrolled VisitBits since we don't want to add branches
  VisitBits<Visitor>(bitmap, start_offset, num_bits_before_full_bytes, visit);

  // Shift the start pointer to the first full byte and compute the
  // number of full bytes to be read.
  const uint8_t* first_full_byte = bitmap + bit_util::CeilDiv(start_offset, 8);
  const int64_t num_full_bytes = (length - num_bits_before_full_bytes) / 8;

  // Iterate over each full byte of the input bitmap and call the visitor in
  // a loop-unrolled manner.
  for (int64_t byte_index = 0; byte_index < num_full_bytes; ++byte_index) {
    // Get the current bit-packed byte value from the bitmap.
    const uint8_t byte = *(first_full_byte + byte_index);

    // Execute the visitor function on each bit of the current byte.
    visit(bit_util::GetBitFromByte(byte, 0));
    visit(bit_util::GetBitFromByte(byte, 1));
    visit(bit_util::GetBitFromByte(byte, 2));
    visit(bit_util::GetBitFromByte(byte, 3));
    visit(bit_util::GetBitFromByte(byte, 4));
    visit(bit_util::GetBitFromByte(byte, 5));
    visit(bit_util::GetBitFromByte(byte, 6));
    visit(bit_util::GetBitFromByte(byte, 7));
  }

  // Write any leftover bits in the last byte.
  const int64_t num_bits_after_full_bytes = (length - num_bits_before_full_bytes) % 8;
  VisitBits<Visitor>(first_full_byte + num_full_bytes, 0, num_bits_after_full_bytes,
                     visit);
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,286 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/endian.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
class BitmapWriter {
  // A sequential bitwise writer that preserves surrounding bit values.

 public:
  // \param bitmap the buffer to write into
  // \param start_offset bit offset of the first bit to write
  // \param length number of bits that will be written
  BitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length)
      : bitmap_(bitmap), position_(0), length_(length) {
    byte_offset_ = start_offset / 8;
    bit_mask_ = bit_util::kBitmask[start_offset % 8];
    if (length > 0) {
      // Preload the first byte so that its surrounding bits are preserved
      current_byte_ = bitmap[byte_offset_];
    } else {
      current_byte_ = 0;
    }
  }

  // Set the bit at the current position (buffered until the byte is flushed)
  void Set() { current_byte_ |= bit_mask_; }

  // Clear the bit at the current position
  void Clear() { current_byte_ &= bit_mask_ ^ 0xFF; }

  // Advance to the next bit, flushing the buffered byte on byte boundaries
  void Next() {
    bit_mask_ = static_cast<uint8_t>(bit_mask_ << 1);
    ++position_;
    if (bit_mask_ == 0) {
      // Finished this byte, need advancing
      bit_mask_ = 0x01;
      bitmap_[byte_offset_++] = current_byte_;
      if (ARROW_PREDICT_TRUE(position_ < length_)) {
        // Preload the next byte so its surrounding bits are preserved
        current_byte_ = bitmap_[byte_offset_];
      }
    }
  }

  // Flush any buffered bits; must be called after the last Next()
  void Finish() {
    // Store current byte if we didn't go past bitmap storage
    if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) {
      bitmap_[byte_offset_] = current_byte_;
    }
  }

  // Number of bits written so far.
  int64_t position() const { return position_; }

 private:
  uint8_t* bitmap_;
  int64_t position_;
  int64_t length_;

  uint8_t current_byte_;  // buffered byte currently being filled
  uint8_t bit_mask_;      // single-bit mask for the current position
  int64_t byte_offset_;   // index of the buffered byte within bitmap_
};
|
||||
|
||||
class FirstTimeBitmapWriter {
  // Like BitmapWriter, but any bit values *following* the bits written
  // might be clobbered. It is hence faster than BitmapWriter, and can
  // also avoid false positives with Valgrind.

 public:
  // \param bitmap the buffer to write into
  // \param start_offset bit offset of the first bit to write
  // \param length number of bits that will be written
  FirstTimeBitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length)
      : bitmap_(bitmap), position_(0), length_(length) {
    current_byte_ = 0;
    byte_offset_ = start_offset / 8;
    bit_mask_ = bit_util::kBitmask[start_offset % 8];
    if (length > 0) {
      // Keep only the bits preceding start_offset; later bits may be clobbered
      current_byte_ =
          bitmap[byte_offset_] & bit_util::kPrecedingBitmask[start_offset % 8];
    } else {
      current_byte_ = 0;
    }
  }

  /// Appends number_of_bits from word to valid_bits and valid_bits_offset.
  ///
  /// \param[in] word The LSB bitmap to append. Any bits past number_of_bits are assumed
  /// to be unset (i.e. 0).
  /// \param[in] number_of_bits The number of bits to append from word.
  void AppendWord(uint64_t word, int64_t number_of_bits) {
    if (ARROW_PREDICT_FALSE(number_of_bits == 0)) {
      return;
    }

    // Location that the first byte needs to be written to.
    uint8_t* append_position = bitmap_ + byte_offset_;

    // Update state variables except for current_byte_ here.
    position_ += number_of_bits;
    int64_t bit_offset = bit_util::CountTrailingZeros(static_cast<uint32_t>(bit_mask_));
    bit_mask_ = bit_util::kBitmask[(bit_offset + number_of_bits) % 8];
    byte_offset_ += (bit_offset + number_of_bits) / 8;

    if (bit_offset != 0) {
      // We are in the middle of the byte. This code updates the byte and shifts
      // bits appropriately within word so it can be memcpy'd below.
      int64_t bits_to_carry = 8 - bit_offset;
      // Carry over bits from word to current_byte_. We assume any extra bits in word
      // unset so no additional accounting is needed for when number_of_bits <
      // bits_to_carry.
      current_byte_ |= (word & bit_util::kPrecedingBitmask[bits_to_carry]) << bit_offset;
      // Check if everything is transferred into current_byte_.
      if (ARROW_PREDICT_FALSE(number_of_bits < bits_to_carry)) {
        return;
      }
      *append_position = current_byte_;
      append_position++;
      // Move the carry bits off of word.
      word = word >> bits_to_carry;
      number_of_bits -= bits_to_carry;
    }
    word = bit_util::ToLittleEndian(word);
    int64_t bytes_for_word = ::arrow::bit_util::BytesForBits(number_of_bits);
    std::memcpy(append_position, &word, bytes_for_word);
    // At this point, the previous current_byte_ has been written to bitmap_.
    // The new current_byte_ is either the last relevant byte in 'word'
    // or cleared if the new position is byte aligned (i.e. a fresh byte).
    if (bit_mask_ == 0x1) {
      current_byte_ = 0;
    } else {
      current_byte_ = *(append_position + bytes_for_word - 1);
    }
  }

  // Set the bit at the current position (buffered until the byte is flushed)
  void Set() { current_byte_ |= bit_mask_; }

  // No-op: current_byte_ starts out zeroed, so unset bits are already clear
  void Clear() {}

  // Advance to the next bit, flushing the buffered byte on byte boundaries
  void Next() {
    bit_mask_ = static_cast<uint8_t>(bit_mask_ << 1);
    ++position_;
    if (bit_mask_ == 0) {
      // Finished this byte, need advancing
      bit_mask_ = 0x01;
      bitmap_[byte_offset_++] = current_byte_;
      current_byte_ = 0;
    }
  }

  // Flush any buffered bits; must be called after the last write
  void Finish() {
    // Store current byte if we didn't go past bitmap storage
    if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) {
      bitmap_[byte_offset_] = current_byte_;
    }
  }

  // Number of bits written so far.
  int64_t position() const { return position_; }

 private:
  uint8_t* bitmap_;
  int64_t position_;
  int64_t length_;

  uint8_t current_byte_;  // buffered byte currently being filled
  uint8_t bit_mask_;      // single-bit mask for the current position
  int64_t byte_offset_;   // index of the buffered byte within bitmap_
};
|
||||
|
||||
// Writes a bitmap as whole `Word`s plus trailing bytes, preserving the bits
// outside the written range.  When `may_have_byte_offset` is false, the bit
// offset within the first byte is statically assumed to be zero.
template <typename Word, bool may_have_byte_offset = true>
class BitmapWordWriter {
 public:
  BitmapWordWriter() = default;
  // \param bitmap the buffer to write into
  // \param offset bit offset of the first bit to write
  // \param length number of bits that will be written
  BitmapWordWriter(uint8_t* bitmap, int64_t offset, int64_t length)
      : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
        bitmap_(bitmap + offset / 8),
        bitmap_end_(bitmap_ + bit_util::BytesForBits(offset_ + length)),
        mask_((1U << offset_) - 1) {
    if (offset_) {
      // Preload the current word/byte so its offset_ low bits are preserved
      if (length >= static_cast<int>(sizeof(Word) * 8)) {
        current_data.word_ = load<Word>(bitmap_);
      } else if (length > 0) {
        current_data.epi.byte_ = load<uint8_t>(bitmap_);
      }
    }
  }

  // Write the next full word of the bitmap.
  void PutNextWord(Word word) {
    if (may_have_byte_offset && offset_) {
      // split one word into two adjacent words, don't touch unused bits
      // |<------ word ----->|
      // +-----+-------------+
      // |  A  |      B      |
      // +-----+-------------+
      //    |         |
      //    v         v       offset
      // +-------------+-----+-------------+-----+
      // |     ---     |  A  |      B      | --- |
      // +-------------+-----+-------------+-----+
      // |<------ next ----->|<---- current ---->|
      word = (word << offset_) | (word >> (sizeof(Word) * 8 - offset_));
      Word next_word = load<Word>(bitmap_ + sizeof(Word));
      current_data.word_ = (current_data.word_ & mask_) | (word & ~mask_);
      next_word = (next_word & ~mask_) | (word & mask_);
      store<Word>(bitmap_, current_data.word_);
      store<Word>(bitmap_ + sizeof(Word), next_word);
      current_data.word_ = next_word;
    } else {
      store<Word>(bitmap_, word);
    }
    bitmap_ += sizeof(Word);
  }

  // Write the next trailing byte; `valid_bits` is the number of valid LSBs.
  void PutNextTrailingByte(uint8_t byte, int valid_bits) {
    if (valid_bits == 8) {
      if (may_have_byte_offset && offset_) {
        // same two-way split as PutNextWord, on a single byte
        byte = (byte << offset_) | (byte >> (8 - offset_));
        uint8_t next_byte = load<uint8_t>(bitmap_ + 1);
        current_data.epi.byte_ = (current_data.epi.byte_ & mask_) | (byte & ~mask_);
        next_byte = (next_byte & ~mask_) | (byte & mask_);
        store<uint8_t>(bitmap_, current_data.epi.byte_);
        store<uint8_t>(bitmap_ + 1, next_byte);
        current_data.epi.byte_ = next_byte;
      } else {
        store<uint8_t>(bitmap_, byte);
      }
      ++bitmap_;
    } else {
      // Partial last byte: fall back to a bit-by-bit writer that
      // preserves the surrounding bits
      assert(valid_bits > 0);
      assert(valid_bits < 8);
      assert(bitmap_ + bit_util::BytesForBits(offset_ + valid_bits) <= bitmap_end_);
      internal::BitmapWriter writer(bitmap_, offset_, valid_bits);
      for (int i = 0; i < valid_bits; ++i) {
        (byte & 0x01) ? writer.Set() : writer.Clear();
        writer.Next();
        byte >>= 1;
      }
      writer.Finish();
    }
  }

 private:
  int64_t offset_;
  uint8_t* bitmap_;

  const uint8_t* bitmap_end_;
  uint64_t mask_;  // masks the offset_ least-significant bits
  // Overlay of the current word and its logically-first byte
  union {
    Word word_;
    struct {
#if ARROW_LITTLE_ENDIAN == 0
      uint8_t padding_bytes_[sizeof(Word) - 1];
#endif
      uint8_t byte_;
    } epi;
  } current_data;

  // Bounds-checked little-endian load
  template <typename DType>
  DType load(const uint8_t* bitmap) {
    assert(bitmap + sizeof(DType) <= bitmap_end_);
    return bit_util::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
  }

  // Bounds-checked little-endian store
  template <typename DType>
  void store(uint8_t* bitmap, DType data) {
    assert(bitmap + sizeof(DType) <= bitmap_end_);
    util::SafeStore(bitmap, bit_util::FromLittleEndian(data));
  }
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,88 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "arrow/type_fwd.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace util {
|
||||
|
||||
/// \brief The sum of bytes in each buffer referenced by the array
|
||||
///
|
||||
/// Note: An array may only reference a portion of a buffer.
|
||||
/// This method will overestimate in this case and return the
|
||||
/// byte size of the entire buffer.
|
||||
/// Note: If a buffer is referenced multiple times then it will
|
||||
/// only be counted once.
|
||||
ARROW_EXPORT int64_t TotalBufferSize(const ArrayData& array_data);
|
||||
/// \brief The sum of bytes in each buffer referenced by the array
|
||||
/// \see TotalBufferSize(const ArrayData& array_data) for details
|
||||
ARROW_EXPORT int64_t TotalBufferSize(const Array& array);
|
||||
/// \brief The sum of bytes in each buffer referenced by the array
|
||||
/// \see TotalBufferSize(const ArrayData& array_data) for details
|
||||
ARROW_EXPORT int64_t TotalBufferSize(const ChunkedArray& chunked_array);
|
||||
/// \brief The sum of bytes in each buffer referenced by the batch
|
||||
/// \see TotalBufferSize(const ArrayData& array_data) for details
|
||||
ARROW_EXPORT int64_t TotalBufferSize(const RecordBatch& record_batch);
|
||||
/// \brief The sum of bytes in each buffer referenced by the table
|
||||
/// \see TotalBufferSize(const ArrayData& array_data) for details
|
||||
ARROW_EXPORT int64_t TotalBufferSize(const Table& table);
|
||||
|
||||
/// \brief Calculate the buffer ranges referenced by the array
|
||||
///
|
||||
/// These ranges will take into account array offsets
|
||||
///
|
||||
/// The ranges may contain duplicates
|
||||
///
|
||||
/// Dictionary arrays will ignore the offset of their containing array
|
||||
///
|
||||
/// The return value will be a struct array corresponding to the schema:
|
||||
/// schema({field("start", uint64()), field("offset", uint64()), field("length",
|
||||
/// uint64())})
|
||||
ARROW_EXPORT Result<std::shared_ptr<Array>> ReferencedRanges(const ArrayData& array_data);
|
||||
|
||||
/// \brief Returns the sum of bytes from all buffer ranges referenced
|
||||
///
|
||||
/// Unlike TotalBufferSize this method will account for array
|
||||
/// offsets.
|
||||
///
|
||||
/// If buffers are shared between arrays then the shared
|
||||
/// portion will be counted multiple times.
|
||||
///
|
||||
/// Dictionary arrays will always be counted in their entirety
|
||||
/// even if the array only references a portion of the dictionary.
|
||||
ARROW_EXPORT Result<int64_t> ReferencedBufferSize(const ArrayData& array_data);
|
||||
/// \brief Returns the sum of bytes from all buffer ranges referenced
|
||||
/// \see ReferencedBufferSize(const ArrayData& array_data) for details
|
||||
ARROW_EXPORT Result<int64_t> ReferencedBufferSize(const Array& array_data);
|
||||
/// \brief Returns the sum of bytes from all buffer ranges referenced
|
||||
/// \see ReferencedBufferSize(const ArrayData& array_data) for details
|
||||
ARROW_EXPORT Result<int64_t> ReferencedBufferSize(const ChunkedArray& array_data);
|
||||
/// \brief Returns the sum of bytes from all buffer ranges referenced
|
||||
/// \see ReferencedBufferSize(const ArrayData& array_data) for details
|
||||
ARROW_EXPORT Result<int64_t> ReferencedBufferSize(const RecordBatch& array_data);
|
||||
/// \brief Returns the sum of bytes from all buffer ranges referenced
|
||||
/// \see ReferencedBufferSize(const ArrayData& array_data) for details
|
||||
ARROW_EXPORT Result<int64_t> ReferencedBufferSize(const Table& array_data);
|
||||
|
||||
} // namespace util
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,118 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class StopToken;
|
||||
|
||||
struct StopSourceImpl;
|
||||
|
||||
/// EXPERIMENTAL
|
||||
// A source of cancellation requests.  The consumer side requests a stop;
// producers observe it through the StopToken obtained from token().
class ARROW_EXPORT StopSource {
 public:
  StopSource();
  ~StopSource();

  // Consumer API (the side that stops)
  void RequestStop();
  // Request a stop carrying the given error status.
  void RequestStop(Status error);
  // Async-signal-safe. TODO Deprecate this?
  void RequestStopFromSignal(int signum);

  // Obtain a token that producers can poll for stop requests.
  StopToken token();

  // For internal use only
  void Reset();

 protected:
  std::shared_ptr<StopSourceImpl> impl_;
};
|
||||
|
||||
/// EXPERIMENTAL
|
||||
// Producer-side handle to a StopSource; default-constructed tokens never
// report a stop request.
class ARROW_EXPORT StopToken {
 public:
  // Public for Cython
  StopToken() = default;

  explicit StopToken(std::shared_ptr<StopSourceImpl> impl) : impl_(std::move(impl)) {}

  // A trivial token that never propagates any stop request
  static StopToken Unstoppable() { return StopToken(); }

  /// \brief Check if the stop source has been cancelled.
  ///
  /// Producers should call this method, whenever convenient, to check and
  /// see if they should stop producing early (i.e. have been cancelled).
  /// Failure to call this method often enough will lead to an unresponsive
  /// cancellation.
  ///
  /// This is part of the producer API (the side that gets asked to stop)
  /// This method is thread-safe
  ///
  /// \return An OK status if the stop source has not been cancelled or a
  /// cancel error if the source has been cancelled.
  Status Poll() const;
  // Boolean counterpart of Poll(): true if a stop has been requested.
  bool IsStopRequested() const;

 protected:
  std::shared_ptr<StopSourceImpl> impl_;
};
|
||||
|
||||
/// EXPERIMENTAL: Set a global StopSource that can receive signals
|
||||
///
|
||||
/// The only allowed order of calls is the following:
|
||||
/// - SetSignalStopSource()
|
||||
/// - any number of pairs of (RegisterCancellingSignalHandler,
|
||||
/// UnregisterCancellingSignalHandler) calls
|
||||
/// - ResetSignalStopSource()
|
||||
///
|
||||
/// Beware that these settings are process-wide. Typically, only one
|
||||
/// thread should call these APIs, even in a multithreaded setting.
|
||||
ARROW_EXPORT
|
||||
Result<StopSource*> SetSignalStopSource();
|
||||
|
||||
/// EXPERIMENTAL: Reset the global signal-receiving StopSource
|
||||
///
|
||||
/// This will invalidate the pointer returned by SetSignalStopSource.
|
||||
ARROW_EXPORT
|
||||
void ResetSignalStopSource();
|
||||
|
||||
/// EXPERIMENTAL: Register signal handler triggering the signal-receiving StopSource
|
||||
///
|
||||
/// Note that those handlers are automatically un-registered in a fork()ed process,
|
||||
/// therefore the child process will need to call RegisterCancellingSignalHandler()
|
||||
/// if desired.
|
||||
ARROW_EXPORT
|
||||
Status RegisterCancellingSignalHandler(const std::vector<int>& signals);
|
||||
|
||||
/// EXPERIMENTAL: Unregister signal handler set up by RegisterCancellingSignalHandler
|
||||
ARROW_EXPORT
|
||||
void UnregisterCancellingSignalHandler();
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,61 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
/// Cast `value` (a class pointer or reference) to OutputType.
///
/// In release builds (NDEBUG defined) this is an unchecked static_cast;
/// otherwise a dynamic_cast is performed so invalid downcasts are caught.
template <typename OutputType, typename InputType>
inline OutputType checked_cast(InputType&& value) {
  // Strip reference/pointer qualifiers to validate the underlying types.
  using RawInput = typename std::remove_pointer<
      typename std::remove_reference<InputType>::type>::type;
  using RawOutput = typename std::remove_pointer<
      typename std::remove_reference<OutputType>::type>::type;
  static_assert(std::is_class<RawInput>::value,
                "checked_cast input type must be a class");
  static_assert(std::is_class<RawOutput>::value,
                "checked_cast output type must be a class");
#ifdef NDEBUG
  return static_cast<OutputType>(value);
#else
  return dynamic_cast<OutputType>(value);
#endif
}
|
||||
|
||||
/// Cast a shared_ptr<U> to shared_ptr<T>: unchecked (static) in release
/// builds, dynamic (null on mismatch) otherwise.
template <class T, class U>
std::shared_ptr<T> checked_pointer_cast(std::shared_ptr<U> r) noexcept {
#ifdef NDEBUG
  auto result = std::static_pointer_cast<T>(std::move(r));
#else
  auto result = std::dynamic_pointer_cast<T>(std::move(r));
#endif
  return result;
}
|
||||
|
||||
/// Cast a unique_ptr<U> to unique_ptr<T>, transferring ownership.
/// Release builds use an unchecked static_cast; debug builds use
/// dynamic_cast (yielding a null result if the dynamic type mismatches).
template <class T, class U>
std::unique_ptr<T> checked_pointer_cast(std::unique_ptr<U> r) noexcept {
#ifdef NDEBUG
  T* const casted = static_cast<T*>(r.release());
#else
  T* const casted = dynamic_cast<T*>(r.release());
#endif
  return std::unique_ptr<T>(casted);
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,62 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace util {
|
||||
|
||||
/// CRTP helper for declaring equality comparison. Defines operator== and operator!=
|
||||
template <typename T>
|
||||
class EqualityComparable {
|
||||
public:
|
||||
~EqualityComparable() {
|
||||
static_assert(
|
||||
std::is_same<decltype(std::declval<const T>().Equals(std::declval<const T>())),
|
||||
bool>::value,
|
||||
"EqualityComparable depends on the method T::Equals(const T&) const");
|
||||
}
|
||||
|
||||
template <typename... Extra>
|
||||
bool Equals(const std::shared_ptr<T>& other, Extra&&... extra) const {
|
||||
if (other == NULLPTR) {
|
||||
return false;
|
||||
}
|
||||
return cast().Equals(*other, std::forward<Extra>(extra)...);
|
||||
}
|
||||
|
||||
struct PtrsEqual {
|
||||
bool operator()(const std::shared_ptr<T>& l, const std::shared_ptr<T>& r) const {
|
||||
return l->Equals(*r);
|
||||
}
|
||||
};
|
||||
|
||||
friend bool operator==(T const& a, T const& b) { return a.Equals(b); }
|
||||
friend bool operator!=(T const& a, T const& b) { return !(a == b); }
|
||||
|
||||
private:
|
||||
const T& cast() const { return static_cast<const T&>(*this); }
|
||||
};
|
||||
|
||||
} // namespace util
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,241 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace util {
|
||||
|
||||
constexpr int kUseDefaultCompressionLevel = std::numeric_limits<int>::min();
|
||||
|
||||
/// \brief Streaming compressor interface
///
/// Represents one stateful compression session: feed input incrementally via
/// Compress(), then finalize with Flush() and/or End().
class ARROW_EXPORT Compressor {
 public:
  virtual ~Compressor() = default;

  /// Result of a Compress() call.
  struct CompressResult {
    int64_t bytes_read;     ///< number of input bytes consumed
    int64_t bytes_written;  ///< number of output bytes produced
  };
  /// Result of a Flush() call.
  struct FlushResult {
    int64_t bytes_written;  ///< number of output bytes produced
    bool should_retry;      ///< if true, call Flush() again with a larger buffer
  };
  /// Result of an End() call.
  struct EndResult {
    int64_t bytes_written;  ///< number of output bytes produced
    bool should_retry;      ///< if true, call End() again with a larger buffer
  };

  /// \brief Compress some input.
  ///
  /// If bytes_read is 0 on return, then a larger output buffer should be supplied.
  virtual Result<CompressResult> Compress(int64_t input_len, const uint8_t* input,
                                          int64_t output_len, uint8_t* output) = 0;

  /// \brief Flush part of the compressed output.
  ///
  /// If should_retry is true on return, Flush() should be called again
  /// with a larger buffer.
  virtual Result<FlushResult> Flush(int64_t output_len, uint8_t* output) = 0;

  /// \brief End compressing, doing whatever is necessary to end the stream.
  ///
  /// If should_retry is true on return, End() should be called again
  /// with a larger buffer. Otherwise, the Compressor should not be used anymore.
  ///
  /// End() implies Flush().
  virtual Result<EndResult> End(int64_t output_len, uint8_t* output) = 0;

  // XXX add methods for buffer size heuristics?
};
|
||||
|
||||
/// \brief Streaming decompressor interface
///
/// Stateful counterpart to Compressor: feed compressed input incrementally via
/// Decompress() until IsFinished() reports the end of the stream.
class ARROW_EXPORT Decompressor {
 public:
  virtual ~Decompressor() = default;

  /// Result of a Decompress() call.
  struct DecompressResult {
    // XXX is need_more_output necessary? (Brotli?)
    int64_t bytes_read;     ///< number of input bytes consumed
    int64_t bytes_written;  ///< number of output bytes produced
    bool need_more_output;  ///< if true, call again with a larger output buffer
  };

  /// \brief Decompress some input.
  ///
  /// If need_more_output is true on return, a larger output buffer needs
  /// to be supplied.
  virtual Result<DecompressResult> Decompress(int64_t input_len, const uint8_t* input,
                                              int64_t output_len, uint8_t* output) = 0;

  /// \brief Return whether the compressed stream is finished.
  ///
  /// This is a heuristic. If true is returned, then it is guaranteed
  /// that the stream is finished. If false is returned, however, it may
  /// simply be that the underlying library isn't able to provide the information.
  virtual bool IsFinished() = 0;

  /// \brief Reinitialize decompressor, making it ready for a new compressed stream.
  virtual Status Reset() = 0;

  // XXX add methods for buffer size heuristics?
};
|
||||
|
||||
/// \brief Compression codec options
class ARROW_EXPORT CodecOptions {
 public:
  /// \param compression_level requested level on the codec's own scale;
  /// defaults to the sentinel kUseDefaultCompressionLevel, which lets the
  /// codec pick its own default
  explicit CodecOptions(int compression_level = kUseDefaultCompressionLevel)
      : compression_level(compression_level) {}

  virtual ~CodecOptions() = default;

  // Requested compression level, or kUseDefaultCompressionLevel.
  int compression_level;
};
|
||||
|
||||
// ----------------------------------------------------------------------
// GZip codec options implementation

/// Stream framing used by the zlib family of codecs.
enum class GZipFormat {
  ZLIB,
  DEFLATE,
  GZIP,
};

class ARROW_EXPORT GZipCodecOptions : public CodecOptions {
 public:
  // Which framing to emit; gzip framing by default.
  GZipFormat gzip_format = GZipFormat::GZIP;
  // Compression window size parameter; when unset, the implementation's
  // default is used (presumably zlib's windowBits default — confirm in the
  // codec implementation).
  std::optional<int> window_bits;
};
|
||||
|
||||
// ----------------------------------------------------------------------
// brotli codec options implementation

class ARROW_EXPORT BrotliCodecOptions : public CodecOptions {
 public:
  // Brotli sliding-window size parameter; when unset, the implementation's
  // default is used (presumably BROTLI_DEFAULT_WINDOW — confirm in the codec
  // implementation).
  std::optional<int> window_bits;
};
|
||||
|
||||
/// \brief Compression codec
///
/// Factory and one-shot/streaming entry point for a compression algorithm.
/// Concrete codecs are obtained through the static Create() overloads.
class ARROW_EXPORT Codec {
 public:
  virtual ~Codec() = default;

  /// \brief Return special value to indicate that a codec implementation
  /// should use its default compression level
  static int UseDefaultCompressionLevel();

  /// \brief Return a string name for compression type
  static const std::string& GetCodecAsString(Compression::type t);

  /// \brief Return compression type for name (all lower case)
  static Result<Compression::type> GetCompressionType(const std::string& name);

  /// \brief Create a codec for the given compression algorithm with CodecOptions
  static Result<std::unique_ptr<Codec>> Create(
      Compression::type codec, const CodecOptions& codec_options = CodecOptions{});

  /// \brief Create a codec for the given compression algorithm
  static Result<std::unique_ptr<Codec>> Create(Compression::type codec,
                                               int compression_level);

  /// \brief Return true if support for indicated codec has been enabled
  static bool IsAvailable(Compression::type codec);

  /// \brief Return true if indicated codec supports setting a compression level
  static bool SupportsCompressionLevel(Compression::type codec);

  /// \brief Return the smallest supported compression level for the codec
  /// Note: This function creates a temporary Codec instance
  static Result<int> MinimumCompressionLevel(Compression::type codec);

  /// \brief Return the largest supported compression level for the codec
  /// Note: This function creates a temporary Codec instance
  static Result<int> MaximumCompressionLevel(Compression::type codec);

  /// \brief Return the default compression level
  /// Note: This function creates a temporary Codec instance
  static Result<int> DefaultCompressionLevel(Compression::type codec);

  /// \brief Return the smallest supported compression level
  virtual int minimum_compression_level() const = 0;

  /// \brief Return the largest supported compression level
  virtual int maximum_compression_level() const = 0;

  /// \brief Return the default compression level
  virtual int default_compression_level() const = 0;

  /// \brief One-shot decompression function
  ///
  /// output_buffer_len must be correct and therefore be obtained in advance.
  /// The actual decompressed length is returned.
  ///
  /// \note One-shot decompression is not always compatible with streaming
  /// compression. Depending on the codec (e.g. LZ4), different formats may
  /// be used.
  virtual Result<int64_t> Decompress(int64_t input_len, const uint8_t* input,
                                     int64_t output_buffer_len,
                                     uint8_t* output_buffer) = 0;

  /// \brief One-shot compression function
  ///
  /// output_buffer_len must first have been computed using MaxCompressedLen().
  /// The actual compressed length is returned.
  ///
  /// \note One-shot compression is not always compatible with streaming
  /// decompression. Depending on the codec (e.g. LZ4), different formats may
  /// be used.
  virtual Result<int64_t> Compress(int64_t input_len, const uint8_t* input,
                                   int64_t output_buffer_len, uint8_t* output_buffer) = 0;

  /// \brief Return an upper bound on the compressed size of the given input,
  /// used to size the output buffer passed to Compress()
  virtual int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) = 0;

  /// \brief Create a streaming compressor instance
  virtual Result<std::shared_ptr<Compressor>> MakeCompressor() = 0;

  /// \brief Create a streaming compressor instance
  virtual Result<std::shared_ptr<Decompressor>> MakeDecompressor() = 0;

  /// \brief This Codec's compression type
  virtual Compression::type compression_type() const = 0;

  /// \brief The name of this Codec's compression type
  const std::string& name() const { return GetCodecAsString(compression_type()); }

  /// \brief This Codec's compression level, if applicable
  virtual int compression_level() const { return UseDefaultCompressionLevel(); }

 private:
  /// \brief Initializes the codec's resources.
  virtual Status Init();
};
|
||||
|
||||
} // namespace util
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,68 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/util/mutex.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace util {
|
||||
|
||||
template <typename K, typename V>
|
||||
class ConcurrentMap {
|
||||
public:
|
||||
void Insert(const K& key, const V& value) {
|
||||
auto lock = mutex_.Lock();
|
||||
map_.insert({key, value});
|
||||
}
|
||||
|
||||
template <typename ValueFunc>
|
||||
V GetOrInsert(const K& key, ValueFunc&& compute_value_func) {
|
||||
auto lock = mutex_.Lock();
|
||||
auto it = map_.find(key);
|
||||
if (it == map_.end()) {
|
||||
auto pair = map_.emplace(key, compute_value_func());
|
||||
it = pair.first;
|
||||
}
|
||||
return it->second;
|
||||
}
|
||||
|
||||
void Erase(const K& key) {
|
||||
auto lock = mutex_.Lock();
|
||||
map_.erase(key);
|
||||
}
|
||||
|
||||
void Clear() {
|
||||
auto lock = mutex_.Lock();
|
||||
map_.clear();
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
auto lock = mutex_.Lock();
|
||||
return map_.size();
|
||||
}
|
||||
|
||||
private:
|
||||
std::unordered_map<K, V> map_;
|
||||
mutable arrow::util::Mutex mutex_;
|
||||
};
|
||||
|
||||
} // namespace util
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,69 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#define ARROW_VERSION_MAJOR 22
#define ARROW_VERSION_MINOR 0
#define ARROW_VERSION_PATCH 0
// Fully parenthesized so ARROW_VERSION expands safely inside larger
// expressions (e.g. `x % ARROW_VERSION` or `-ARROW_VERSION`); the
// unparenthesized form mis-associates with surrounding operators.
#define ARROW_VERSION \
  (((ARROW_VERSION_MAJOR * 1000) + ARROW_VERSION_MINOR) * 1000 + ARROW_VERSION_PATCH)
|
||||
|
||||
#define ARROW_VERSION_STRING "22.0.0"
|
||||
|
||||
#define ARROW_SO_VERSION "2200"
|
||||
#define ARROW_FULL_SO_VERSION "2200.0.0"
|
||||
|
||||
// Toolchain that produced this build.
#define ARROW_CXX_COMPILER_ID "GNU"
#define ARROW_CXX_COMPILER_VERSION "14.2.1"
#define ARROW_CXX_COMPILER_FLAGS " -Wno-noexcept-type -Wno-self-move -Wno-subobject-linkage -fdiagnostics-color=always -Wall -fno-semantic-interposition -msse4.2 "

#define ARROW_BUILD_TYPE "RELEASE"

#define ARROW_PACKAGE_KIND "python-wheel-manylinux228"

// Arrow components compiled into this build; the /* #undef */ entries were
// disabled at configure time.
#define ARROW_COMPUTE
#define ARROW_CSV
/* #undef ARROW_CUDA */
#define ARROW_DATASET
#define ARROW_FILESYSTEM
#define ARROW_FLIGHT
/* #undef ARROW_FLIGHT_SQL */
#define ARROW_IPC
#define ARROW_JEMALLOC
#define ARROW_JEMALLOC_VENDORED
#define ARROW_JSON
#define ARROW_MIMALLOC
#define ARROW_ORC
#define ARROW_PARQUET
#define ARROW_SUBSTRAIT

// Filesystem backends and runtime capabilities.
#define ARROW_AZURE
#define ARROW_ENABLE_THREADING
#define ARROW_GCS
#define ARROW_HDFS
#define ARROW_S3
/* #undef ARROW_USE_GLOG */
#define ARROW_USE_NATIVE_INT128

// Third-party libraries linked into this build.
#define ARROW_WITH_BROTLI
#define ARROW_WITH_BZ2
#define ARROW_WITH_LZ4
/* #undef ARROW_WITH_MUSL */
/* #undef ARROW_WITH_OPENTELEMETRY */
#define ARROW_WITH_RE2
#define ARROW_WITH_SNAPPY
#define ARROW_WITH_UTF8PROC
#define ARROW_WITH_ZLIB
#define ARROW_WITH_ZSTD
#define PARQUET_REQUIRE_ENCRYPTION
|
||||
@@ -0,0 +1,411 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array.h"
|
||||
#include "arrow/chunked_array.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_traits.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/visit_type_inline.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
// Forward declaration (defined below): builds the appropriate converter
// (primitive / list / struct / dictionary) for `type` via type visitation.
template <typename BaseConverter, template <typename...> class ConverterTrait>
static Result<std::unique_ptr<BaseConverter>> MakeConverter(
    std::shared_ptr<DataType> type, typename BaseConverter::OptionsType options,
    MemoryPool* pool);
|
||||
|
||||
/// Base class for value converters: adapts input values of type `Input` into
/// an Arrow ArrayBuilder. Subclasses override Init() to set up `builder_` and
/// at least one of Append/Extend/ExtendMasked.
template <typename Input, typename Options>
class Converter {
 public:
  using Self = Converter<Input, Options>;
  using InputType = Input;
  using OptionsType = Options;

  virtual ~Converter() = default;

  // Two-phase initialization: stores the type and options, then calls the
  // subclass's Init() hook to create the builder.
  Status Construct(std::shared_ptr<DataType> type, OptionsType options,
                   MemoryPool* pool) {
    type_ = std::move(type);
    options_ = std::move(options);
    return Init(pool);
  }

  // Append a single value; NotImplemented unless overridden.
  virtual Status Append(InputType value) { return Status::NotImplemented("Append"); }

  // Append `size - offset` values starting at `offset`; NotImplemented unless
  // overridden.
  virtual Status Extend(InputType values, int64_t size, int64_t offset = 0) {
    return Status::NotImplemented("Extend");
  }

  // Like Extend, but with an accompanying validity mask; NotImplemented unless
  // overridden.
  virtual Status ExtendMasked(InputType values, InputType mask, int64_t size,
                              int64_t offset = 0) {
    return Status::NotImplemented("ExtendMasked");
  }

  const std::shared_ptr<ArrayBuilder>& builder() const { return builder_; }

  const std::shared_ptr<DataType>& type() const { return type_; }

  OptionsType options() const { return options_; }

  // True when the target type's offsets may overflow (narrow binary-like or
  // list-like types); consumers such as Chunker use this to chunk the output.
  bool may_overflow() const { return may_overflow_; }

  // True when a capacity overflow may leave one extra element in the builder
  // that has to be sliced off by the chunking layer.
  bool rewind_on_overflow() const { return rewind_on_overflow_; }

  virtual Status Reserve(int64_t additional_capacity) {
    return builder_->Reserve(additional_capacity);
  }

  Status AppendNull() { return builder_->AppendNull(); }

  // Finish the builder into a single Array.
  virtual Result<std::shared_ptr<Array>> ToArray() { return builder_->Finish(); }

  // Finish the builder, keeping only the first `length` elements.
  virtual Result<std::shared_ptr<Array>> ToArray(int64_t length) {
    ARROW_ASSIGN_OR_RAISE(auto arr, this->ToArray());
    return arr->Slice(0, length);
  }

  // Finish the builder into a single-chunk ChunkedArray.
  virtual Result<std::shared_ptr<ChunkedArray>> ToChunkedArray() {
    ARROW_ASSIGN_OR_RAISE(auto array, ToArray());
    std::vector<std::shared_ptr<Array>> chunks = {std::move(array)};
    return std::make_shared<ChunkedArray>(chunks);
  }

 protected:
  // Subclass hook invoked by Construct(); expected to populate builder_.
  virtual Status Init(MemoryPool* pool) { return Status::OK(); }

  std::shared_ptr<DataType> type_;
  std::shared_ptr<ArrayBuilder> builder_;
  OptionsType options_;
  bool may_overflow_ = false;
  bool rewind_on_overflow_ = false;
};
|
||||
|
||||
/// Converter for primitive (fixed-width and binary-like) types: owns a
/// typed builder and caches raw pointers to the concrete type and builder
/// for fast access in the hot append path.
template <typename ArrowType, typename BaseConverter>
class PrimitiveConverter : public BaseConverter {
 public:
  using BuilderType = typename TypeTraits<ArrowType>::BuilderType;

 protected:
  Status Init(MemoryPool* pool) override {
    this->builder_ = std::make_shared<BuilderType>(this->type_, pool);
    // Narrow variable-sized binary types may overflow
    this->may_overflow_ = is_binary_like(this->type_->id());
    // Cached raw views into type_/builder_; lifetime is tied to the shared_ptr
    // members above.
    primitive_type_ = checked_cast<const ArrowType*>(this->type_.get());
    primitive_builder_ = checked_cast<BuilderType*>(this->builder_.get());
    return Status::OK();
  }

  const ArrowType* primitive_type_;
  BuilderType* primitive_builder_;
};
|
||||
|
||||
/// Converter for list-like types: recursively creates a converter for the
/// value type and wires its builder into the list builder.
template <typename ArrowType, typename BaseConverter,
          template <typename...> class ConverterTrait>
class ListConverter : public BaseConverter {
 public:
  using BuilderType = typename TypeTraits<ArrowType>::BuilderType;
  using ConverterType = typename ConverterTrait<ArrowType>::type;

 protected:
  Status Init(MemoryPool* pool) override {
    list_type_ = checked_cast<const ArrowType*>(this->type_.get());
    // Child converter for the list's value type; its builder is shared with
    // the list builder constructed below.
    ARROW_ASSIGN_OR_RAISE(value_converter_,
                          (MakeConverter<BaseConverter, ConverterTrait>(
                              list_type_->value_type(), this->options_, pool)));
    this->builder_ =
        std::make_shared<BuilderType>(pool, value_converter_->builder(), this->type_);
    list_builder_ = checked_cast<BuilderType*>(this->builder_.get());
    // Narrow list types may overflow
    this->may_overflow_ = this->rewind_on_overflow_ =
        sizeof(typename ArrowType::offset_type) < sizeof(int64_t);
    return Status::OK();
  }

  const ArrowType* list_type_;
  BuilderType* list_builder_;
  std::unique_ptr<BaseConverter> value_converter_;
};
|
||||
|
||||
/// Converter for struct types: creates one child converter per field and
/// forwards capacity reservations to every child.
template <typename BaseConverter, template <typename...> class ConverterTrait>
class StructConverter : public BaseConverter {
 public:
  using ConverterType = typename ConverterTrait<StructType>::type;

  // Reserve capacity on the struct builder and on every child converter, so
  // all columns can hold the same number of slots.
  Status Reserve(int64_t additional_capacity) override {
    ARROW_RETURN_NOT_OK(this->builder_->Reserve(additional_capacity));
    for (const auto& child : children_) {
      ARROW_RETURN_NOT_OK(child->Reserve(additional_capacity));
    }
    return Status::OK();
  }

 protected:
  Status Init(MemoryPool* pool) override {
    std::unique_ptr<BaseConverter> child_converter;
    std::vector<std::shared_ptr<ArrayBuilder>> child_builders;

    struct_type_ = checked_cast<const StructType*>(this->type_.get());
    for (const auto& field : struct_type_->fields()) {
      ARROW_ASSIGN_OR_RAISE(child_converter,
                            (MakeConverter<BaseConverter, ConverterTrait>(
                                field->type(), this->options_, pool)));
      // A struct may overflow (and must rewind) as soon as any child can.
      this->may_overflow_ |= child_converter->may_overflow();
      this->rewind_on_overflow_ = this->may_overflow_;
      child_builders.push_back(child_converter->builder());
      children_.push_back(std::move(child_converter));
    }

    this->builder_ =
        std::make_shared<StructBuilder>(this->type_, pool, std::move(child_builders));
    struct_builder_ = checked_cast<StructBuilder*>(this->builder_.get());

    return Status::OK();
  }

  const StructType* struct_type_;
  StructBuilder* struct_builder_;
  std::vector<std::unique_ptr<BaseConverter>> children_;
};
|
||||
|
||||
/// Converter for dictionary-encoded types, parameterized on the dictionary's
/// value type.
template <typename ValueType, typename BaseConverter>
class DictionaryConverter : public BaseConverter {
 public:
  using BuilderType = DictionaryBuilder<ValueType>;

 protected:
  Status Init(MemoryPool* pool) override {
    std::unique_ptr<ArrayBuilder> builder;
    ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, this->type_, NULLPTR, &builder));
    this->builder_ = std::move(builder);
    this->may_overflow_ = false;
    // Cached raw views into type_/builder_; lifetime is tied to the shared
    // pointers held by the base class.
    dict_type_ = checked_cast<const DictionaryType*>(this->type_.get());
    value_type_ = checked_cast<const ValueType*>(dict_type_->value_type().get());
    value_builder_ = checked_cast<BuilderType*>(this->builder_.get());
    return Status::OK();
  }

  const DictionaryType* dict_type_;
  const ValueType* value_type_;
  BuilderType* value_builder_;
};
|
||||
|
||||
/// Type visitor that instantiates the concrete converter for a given DataType
/// and calls Construct() on it. Used by MakeConverter() below.
template <typename BaseConverter, template <typename...> class ConverterTrait>
struct MakeConverterImpl {
  // Generic case: the trait maps the visited type to its converter class.
  template <typename T, typename ConverterType = typename ConverterTrait<T>::type>
  Status Visit(const T&) {
    out.reset(new ConverterType());
    return out->Construct(std::move(type), std::move(options), pool);
  }

  // Dictionary case: dispatch on the dictionary's *value* type, since the
  // converter is parameterized on it.
  Status Visit(const DictionaryType& t) {
    switch (t.value_type()->id()) {
#define DICTIONARY_CASE(TYPE)                                                 \
  case TYPE::type_id:                                                         \
    out = std::make_unique<                                                   \
        typename ConverterTrait<DictionaryType>::template dictionary_type<TYPE>>(); \
    break;
      DICTIONARY_CASE(BooleanType);
      DICTIONARY_CASE(Int8Type);
      DICTIONARY_CASE(Int16Type);
      DICTIONARY_CASE(Int32Type);
      DICTIONARY_CASE(Int64Type);
      DICTIONARY_CASE(UInt8Type);
      DICTIONARY_CASE(UInt16Type);
      DICTIONARY_CASE(UInt32Type);
      DICTIONARY_CASE(UInt64Type);
      DICTIONARY_CASE(FloatType);
      DICTIONARY_CASE(DoubleType);
      DICTIONARY_CASE(BinaryType);
      DICTIONARY_CASE(StringType);
      DICTIONARY_CASE(FixedSizeBinaryType);
#undef DICTIONARY_CASE
      default:
        return Status::NotImplemented("DictionaryArray converter for type ", t.ToString(),
                                      " not implemented");
    }
    return out->Construct(std::move(type), std::move(options), pool);
  }

  // Fallback for types with no converter mapping.
  Status Visit(const DataType& t) { return Status::NotImplemented(t.name()); }

  std::shared_ptr<DataType> type;
  typename BaseConverter::OptionsType options;
  MemoryPool* pool;
  std::unique_ptr<BaseConverter> out;  // the constructed converter (output)
};
|
||||
|
||||
/// Build and initialize the converter matching `type` by visiting the type
/// with MakeConverterImpl.
template <typename BaseConverter, template <typename...> class ConverterTrait>
static Result<std::unique_ptr<BaseConverter>> MakeConverter(
    std::shared_ptr<DataType> type, typename BaseConverter::OptionsType options,
    MemoryPool* pool) {
  MakeConverterImpl<BaseConverter, ConverterTrait> visitor{
      std::move(type), std::move(options), pool, NULLPTR};
  ARROW_RETURN_NOT_OK(VisitTypeInline(*visitor.type, &visitor));
  return std::move(visitor.out);
}
|
||||
|
||||
template <typename Converter>
|
||||
class Chunker {
|
||||
public:
|
||||
using InputType = typename Converter::InputType;
|
||||
|
||||
explicit Chunker(std::unique_ptr<Converter> converter)
|
||||
: converter_(std::move(converter)) {}
|
||||
|
||||
Status Reserve(int64_t additional_capacity) {
|
||||
ARROW_RETURN_NOT_OK(converter_->Reserve(additional_capacity));
|
||||
reserved_ += additional_capacity;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendNull() {
|
||||
auto status = converter_->AppendNull();
|
||||
if (ARROW_PREDICT_FALSE(status.IsCapacityError())) {
|
||||
if (converter_->builder()->length() == 0) {
|
||||
// Builder length == 0 means the individual element is too large to append.
|
||||
// In this case, no need to try again.
|
||||
return status;
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(FinishChunk());
|
||||
return converter_->AppendNull();
|
||||
}
|
||||
++length_;
|
||||
return status;
|
||||
}
|
||||
|
||||
Status Append(InputType value) {
|
||||
auto status = converter_->Append(value);
|
||||
if (ARROW_PREDICT_FALSE(status.IsCapacityError())) {
|
||||
if (converter_->builder()->length() == 0) {
|
||||
return status;
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(FinishChunk());
|
||||
return Append(value);
|
||||
}
|
||||
++length_;
|
||||
return status;
|
||||
}
|
||||
|
||||
Status Extend(InputType values, int64_t size, int64_t offset = 0) {
|
||||
while (offset < size) {
|
||||
auto length_before = converter_->builder()->length();
|
||||
auto status = converter_->Extend(values, size, offset);
|
||||
auto length_after = converter_->builder()->length();
|
||||
auto num_converted = length_after - length_before;
|
||||
|
||||
offset += num_converted;
|
||||
length_ += num_converted;
|
||||
|
||||
if (status.IsCapacityError()) {
|
||||
if (converter_->builder()->length() == 0) {
|
||||
// Builder length == 0 means the individual element is too large to append.
|
||||
// In this case, no need to try again.
|
||||
return status;
|
||||
} else if (converter_->rewind_on_overflow()) {
|
||||
// The list-like and binary-like conversion paths may raise a capacity error,
|
||||
// we need to handle them differently. While the binary-like converters check
|
||||
// the capacity before append/extend the list-like converters just check after
|
||||
// append/extend. Thus depending on the implementation semantics we may need
|
||||
// to rewind (slice) the output chunk by one.
|
||||
length_ -= 1;
|
||||
offset -= 1;
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(FinishChunk());
|
||||
} else if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status ExtendMasked(InputType values, InputType mask, int64_t size,
|
||||
int64_t offset = 0) {
|
||||
while (offset < size) {
|
||||
auto length_before = converter_->builder()->length();
|
||||
auto status = converter_->ExtendMasked(values, mask, size, offset);
|
||||
auto length_after = converter_->builder()->length();
|
||||
auto num_converted = length_after - length_before;
|
||||
|
||||
offset += num_converted;
|
||||
length_ += num_converted;
|
||||
|
||||
if (status.IsCapacityError()) {
|
||||
if (converter_->builder()->length() == 0) {
|
||||
// Builder length == 0 means the individual element is too large to append.
|
||||
// In this case, no need to try again.
|
||||
return status;
|
||||
} else if (converter_->rewind_on_overflow()) {
|
||||
// The list-like and binary-like conversion paths may raise a capacity error,
|
||||
// we need to handle them differently. While the binary-like converters check
|
||||
// the capacity before append/extend the list-like converters just check after
|
||||
// append/extend. Thus depending on the implementation semantics we may need
|
||||
// to rewind (slice) the output chunk by one.
|
||||
length_ -= 1;
|
||||
offset -= 1;
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(FinishChunk());
|
||||
} else if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status FinishChunk() {
|
||||
ARROW_ASSIGN_OR_RAISE(auto chunk, converter_->ToArray(length_));
|
||||
chunks_.push_back(chunk);
|
||||
// Reserve space for the remaining items.
|
||||
// Besides being an optimization, it is also required if the converter's
|
||||
// implementation relies on unsafe builder methods in converter->Append().
|
||||
auto remaining = reserved_ - length_;
|
||||
Reset();
|
||||
return Reserve(remaining);
|
||||
}
|
||||
|
||||
Result<std::shared_ptr<ChunkedArray>> ToChunkedArray() {
|
||||
ARROW_RETURN_NOT_OK(FinishChunk());
|
||||
return std::make_shared<ChunkedArray>(chunks_);
|
||||
}
|
||||
|
||||
protected:
|
||||
void Reset() {
|
||||
converter_->builder()->Reset();
|
||||
length_ = 0;
|
||||
reserved_ = 0;
|
||||
}
|
||||
|
||||
int64_t length_ = 0;
|
||||
int64_t reserved_ = 0;
|
||||
std::unique_ptr<Converter> converter_;
|
||||
std::vector<std::shared_ptr<Array>> chunks_;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
static Result<std::unique_ptr<Chunker<T>>> MakeChunker(std::unique_ptr<T> converter) {
|
||||
return std::make_unique<Chunker<T>>(std::move(converter));
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,114 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// From Apache Impala (incubating) as of 2016-01-29. Pared down to a minimal
|
||||
// set of functions needed for Apache Arrow / Apache parquet-cpp
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
/// CpuInfo is an interface to query for cpu information at runtime. The caller can
|
||||
/// ask for the sizes of the caches and what hardware features are supported.
|
||||
/// On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and
|
||||
/// /sys/devices)
|
||||
class ARROW_EXPORT CpuInfo {
 public:
  ~CpuInfo();

  /// x86 features (bit flags, combinable with |)
  static constexpr int64_t SSSE3 = (1LL << 0);
  static constexpr int64_t SSE4_1 = (1LL << 1);
  static constexpr int64_t SSE4_2 = (1LL << 2);
  static constexpr int64_t POPCNT = (1LL << 3);
  static constexpr int64_t AVX = (1LL << 4);
  static constexpr int64_t AVX2 = (1LL << 5);
  static constexpr int64_t AVX512F = (1LL << 6);
  static constexpr int64_t AVX512CD = (1LL << 7);
  static constexpr int64_t AVX512VL = (1LL << 8);
  static constexpr int64_t AVX512DQ = (1LL << 9);
  static constexpr int64_t AVX512BW = (1LL << 10);
  // Convenience mask for the AVX512 subsets required together.
  static constexpr int64_t AVX512 = AVX512F | AVX512CD | AVX512VL | AVX512DQ | AVX512BW;
  static constexpr int64_t BMI1 = (1LL << 11);
  static constexpr int64_t BMI2 = (1LL << 12);

  /// Arm features
  static constexpr int64_t ASIMD = (1LL << 32);

  /// Cache enums for L1 (data), L2 and L3
  enum class CacheLevel { L1 = 0, L2, L3, Last = L3 };

  /// CPU vendors
  enum class Vendor { Unknown, Intel, AMD };

  /// Returns the global CpuInfo instance (callers do not own the pointer).
  static const CpuInfo* GetInstance();

  /// Returns all the flags for this cpu
  int64_t hardware_flags() const;

  /// Returns the number of cores (including hyper-threaded) on this machine.
  int num_cores() const;

  /// Returns the vendor of the cpu.
  Vendor vendor() const;

  /// Returns the model name of the cpu (e.g. Intel i7-2600)
  const std::string& model_name() const;

  /// Returns the size of the cache in KB at this cache level
  int64_t CacheSize(CacheLevel level) const;

  /// \brief Returns whether or not the given feature is enabled.
  ///
  /// IsSupported() is true iff IsDetected() is also true and the feature
  /// wasn't disabled by the user (for example by setting the ARROW_USER_SIMD_LEVEL
  /// environment variable).
  bool IsSupported(int64_t flags) const;

  /// Returns whether or not the given feature is available on the CPU.
  bool IsDetected(int64_t flags) const;

  /// Determine if the CPU meets the minimum CPU requirements and if not, issue an error
  /// and terminate.
  void VerifyCpuRequirements() const;

  /// Toggle a hardware feature on and off. It is not valid to turn on a feature
  /// that the underlying hardware cannot support. This is useful for testing.
  void EnableFeature(int64_t flag, bool enable);

  /// \brief Whether BMI2 instructions are both supported and fast on this CPU.
  bool HasEfficientBmi2() const {
    // BMI2 (pext, pdep) is only efficient on Intel X86 processors.
    return vendor() == Vendor::Intel && IsSupported(BMI2);
  }

 private:
  // Construction goes through GetInstance().
  CpuInfo();

  // PIMPL: platform-specific detection state lives in the .cc file.
  struct Impl;
  std::unique_ptr<Impl> impl_;
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,36 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once

#include <cstddef>
#include <cstdint>

#include "arrow/util/visibility.h"

namespace arrow {
namespace internal {

/// \brief Compute the CRC32 checksum of the given data
///
/// This function computes CRC32 with the polynomial 0x04C11DB7,
/// as used in zlib and others (note this is different from CRC32C).
/// To compute a running CRC32, pass the previous value in `prev`,
/// otherwise `prev` should be 0.
ARROW_EXPORT
uint32_t crc32(uint32_t prev, const void* data, size_t length);

}  // namespace internal
}  // namespace arrow
|
||||
@@ -0,0 +1,29 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once

#include "arrow/util/visibility.h"

namespace arrow {
namespace internal {

/// \brief Break into the debugger.
///
/// NOTE(review): behavior inferred from the name; the implementation is
/// platform-specific -- confirm against the corresponding .cc file.
ARROW_EXPORT
void DebugTrap();

}  // namespace internal
}  // namespace arrow
|
||||
@@ -0,0 +1,523 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <iosfwd>
|
||||
#include <limits>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/basic_decimal.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Decimal64;
|
||||
|
||||
namespace internal {
|
||||
|
||||
ARROW_EXPORT
|
||||
Status ToArrowStatus(DecimalStatus);
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \brief Adapter converting a DecimalStatus error code into an arrow::Status
/// (delegates to internal::ToArrowStatus), so decimal operations can be used
/// with the status-propagation macros.
template <>
struct IntoStatus<DecimalStatus> {
  static inline Status ToStatus(DecimalStatus st) { return internal::ToArrowStatus(st); }
};
|
||||
|
||||
/// Represents a signed 32-bit decimal value in two's complement.
|
||||
/// Calculations wrap around and overflow is ignored.
|
||||
/// The max decimal precision that can be safely represented is
|
||||
/// 9 significant digits.
|
||||
///
|
||||
/// The implementation is split into two parts :
|
||||
///
|
||||
/// 1. BasicDecimal32
|
||||
/// - can be safely compiled to IR without references to libstdc++
|
||||
/// 2. Decimal32
|
||||
/// - has additional functionality on top of BasicDecimal32 to deal with
|
||||
/// strings and streams
|
||||
class ARROW_EXPORT Decimal32 : public BasicDecimal32 {
 public:
  /// \cond FALSE
  // (need to avoid a duplicate definition in sphinx)
  using BasicDecimal32::BasicDecimal32;
  /// \endcond

  /// \brief constructor creates a Decimal32 from a BasicDecimal32
  constexpr Decimal32(const BasicDecimal32& value) noexcept  // NOLINT runtime/explicit
      : BasicDecimal32(value) {}

  /// \brief Parse the number from a base 10 string representation
  explicit Decimal32(const std::string& value);

  /// \brief Empty constructor creates a Decimal32 with a value of 0
  /// this is required for some older compilers
  constexpr Decimal32() noexcept : BasicDecimal32() {}

  /// \brief Divide this number by right and return the result.
  ///
  /// This operation is not destructive.
  /// The answer rounds to zero. Signs work like:
  ///   21 /  5 ->  4,  1
  ///  -21 /  5 -> -4, -1
  ///   21 / -5 -> -4,  1
  ///  -21 / -5 ->  4, -1
  /// \param[in] divisor the number to divide by
  /// \return the pair of the quotient and the remainder
  Result<std::pair<Decimal32, Decimal32>> Divide(const Decimal32& divisor) const {
    std::pair<Decimal32, Decimal32> result;
    ARROW_RETURN_NOT_OK(BasicDecimal32::Divide(divisor, &result.first, &result.second));
    return result;
  }

  /// \brief Convert the Decimal32 value to a base 10 decimal string with the given scale
  std::string ToString(int32_t scale) const;

  /// \brief Convert the value to an integer string
  std::string ToIntegerString() const;

  /// \brief Cast this value to an int64_t
  explicit operator int64_t() const;

  /// \brief Cast this value to a Decimal64
  explicit operator Decimal64() const;

  /// \brief Convert a decimal string to a Decimal value, optionally including
  /// precision and scale if they're passed in and not null.
  static Status FromString(std::string_view s, Decimal32* out, int32_t* precision,
                           int32_t* scale = NULLPTR);
  static Status FromString(const std::string& s, Decimal32* out, int32_t* precision,
                           int32_t* scale = NULLPTR);
  static Status FromString(const char* s, Decimal32* out, int32_t* precision,
                           int32_t* scale = NULLPTR);
  static Result<Decimal32> FromString(std::string_view s);
  static Result<Decimal32> FromString(const std::string& s);
  static Result<Decimal32> FromString(const char* s);

  /// \brief Convert a floating-point number to a Decimal32 with the given
  /// precision and scale.
  static Result<Decimal32> FromReal(double real, int32_t precision, int32_t scale);
  static Result<Decimal32> FromReal(float real, int32_t precision, int32_t scale);

  /// \brief Convert from a big-endian byte representation. The length must be
  /// between 1 and 4
  /// \return error status if the length is an invalid value
  static Result<Decimal32> FromBigEndian(const uint8_t* data, int32_t length);

  /// \brief Convert Decimal32 from one scale to another
  Result<Decimal32> Rescale(int32_t original_scale, int32_t new_scale) const {
    Decimal32 out;
    ARROW_RETURN_NOT_OK(BasicDecimal32::Rescale(original_scale, new_scale, &out));
    return out;
  }

  /// \brief Convert to a signed integer
  template <typename T, typename = internal::EnableIfIsOneOf<T, int32_t, int64_t>>
  Result<T> ToInteger() const {
    return static_cast<T>(value_);
  }

  /// \brief Convert to a signed integer
  template <typename T, typename = internal::EnableIfIsOneOf<T, int32_t, int64_t>>
  Status ToInteger(T* out) const {
    return ToInteger<T>().Value(out);
  }

  /// \brief Convert to a floating-point number (scaled)
  float ToFloat(int32_t scale) const;
  /// \brief Convert to a floating-point number (scaled)
  double ToDouble(int32_t scale) const;

  /// \brief Convert to a floating-point number (scaled)
  template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
  T ToReal(int32_t scale) const {
    static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
                  "Unexpected floating-point type");
    if constexpr (std::is_same_v<T, float>) {
      return ToFloat(scale);
    } else {
      return ToDouble(scale);
    }
  }

  ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os,
                                                      const Decimal32& decimal);
};
|
||||
|
||||
class ARROW_EXPORT Decimal64 : public BasicDecimal64 {
 public:
  /// \cond FALSE
  // (need to avoid a duplicate definition in sphinx)
  using BasicDecimal64::BasicDecimal64;
  /// \endcond

  /// \brief constructor creates a Decimal64 from a BasicDecimal64
  constexpr Decimal64(const BasicDecimal64& value) noexcept  // NOLINT runtime/explicit
      : BasicDecimal64(value) {}

  /// \brief Widening conversion from a 32-bit decimal value
  explicit Decimal64(const BasicDecimal32& value) noexcept
      : BasicDecimal64(static_cast<int64_t>(value.value())) {}

  /// \brief Parse the number from a base 10 string representation
  explicit Decimal64(const std::string& value);

  /// \brief Empty constructor creates a Decimal64 with a value of 0
  /// this is required for some older compilers
  constexpr Decimal64() noexcept : BasicDecimal64() {}

  /// \brief Divide this number by right and return the result.
  ///
  /// This operation is not destructive.
  /// The answer rounds to zero. Signs work like:
  ///   21 /  5 ->  4,  1
  ///  -21 /  5 -> -4, -1
  ///   21 / -5 -> -4,  1
  ///  -21 / -5 ->  4, -1
  /// \param[in] divisor the number to divide by
  /// \return the pair of the quotient and the remainder
  Result<std::pair<Decimal64, Decimal64>> Divide(const Decimal64& divisor) const {
    std::pair<Decimal64, Decimal64> result;
    ARROW_RETURN_NOT_OK(BasicDecimal64::Divide(divisor, &result.first, &result.second));
    return result;
  }

  /// \brief Convert the Decimal64 value to a base 10 decimal string with the given scale
  std::string ToString(int32_t scale) const;

  /// \brief Convert the value to an integer string
  std::string ToIntegerString() const;

  /// \brief Cast this value to an int64_t
  explicit operator int64_t() const;

  /// \brief Convert a decimal string to a Decimal value, optionally including
  /// precision and scale if they're passed in and not null.
  static Status FromString(std::string_view s, Decimal64* out, int32_t* precision,
                           int32_t* scale = NULLPTR);
  static Status FromString(const std::string& s, Decimal64* out, int32_t* precision,
                           int32_t* scale = NULLPTR);
  static Status FromString(const char* s, Decimal64* out, int32_t* precision,
                           int32_t* scale = NULLPTR);
  static Result<Decimal64> FromString(std::string_view s);
  static Result<Decimal64> FromString(const std::string& s);
  static Result<Decimal64> FromString(const char* s);

  /// \brief Convert a floating-point number to a Decimal64 with the given
  /// precision and scale.
  static Result<Decimal64> FromReal(double real, int32_t precision, int32_t scale);
  static Result<Decimal64> FromReal(float real, int32_t precision, int32_t scale);

  /// \brief Convert from a big-endian byte representation. The length must be
  /// between 1 and 8 (a Decimal64 is 8 bytes wide; "1 and 4" in the original
  /// comment was copy-pasted from Decimal32)
  /// \return error status if the length is an invalid value
  static Result<Decimal64> FromBigEndian(const uint8_t* data, int32_t length);

  /// \brief Convert Decimal64 from one scale to another
  Result<Decimal64> Rescale(int32_t original_scale, int32_t new_scale) const {
    Decimal64 out;
    ARROW_RETURN_NOT_OK(BasicDecimal64::Rescale(original_scale, new_scale, &out));
    return out;
  }

  /// \brief Convert to a signed integer
  template <typename T, typename = internal::EnableIfIsOneOf<T, int32_t, int64_t>>
  Result<T> ToInteger() const {
    return static_cast<T>(value_);
  }

  /// \brief Convert to a signed integer
  template <typename T, typename = internal::EnableIfIsOneOf<T, int32_t, int64_t>>
  Status ToInteger(T* out) const {
    return ToInteger<T>().Value(out);
  }

  /// \brief Convert to a floating-point number (scaled)
  float ToFloat(int32_t scale) const;
  /// \brief Convert to a floating-point number (scaled)
  double ToDouble(int32_t scale) const;

  /// \brief Convert to a floating-point number (scaled)
  template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
  T ToReal(int32_t scale) const {
    static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
                  "Unexpected floating-point type");
    if constexpr (std::is_same_v<T, float>) {
      return ToFloat(scale);
    } else {
      return ToDouble(scale);
    }
  }

  ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os,
                                                      const Decimal64& decimal);
};
|
||||
|
||||
/// Represents a signed 128-bit integer in two's complement.
|
||||
/// Calculations wrap around and overflow is ignored.
|
||||
/// The max decimal precision that can be safely represented is
|
||||
/// 38 significant digits.
|
||||
///
|
||||
/// For a discussion of the algorithms, look at Knuth's volume 2,
|
||||
/// Semi-numerical Algorithms section 4.3.1.
|
||||
///
|
||||
/// Adapted from the Apache ORC C++ implementation
|
||||
///
|
||||
/// The implementation is split into two parts :
|
||||
///
|
||||
/// 1. BasicDecimal128
|
||||
/// - can be safely compiled to IR without references to libstdc++.
|
||||
/// 2. Decimal128
|
||||
/// - has additional functionality on top of BasicDecimal128 to deal with
|
||||
/// strings and streams.
|
||||
class ARROW_EXPORT Decimal128 : public BasicDecimal128 {
 public:
  /// \cond FALSE
  // (need to avoid a duplicate definition in Sphinx)
  using BasicDecimal128::BasicDecimal128;
  /// \endcond

  /// \brief constructor creates a Decimal128 from a BasicDecimal128.
  constexpr Decimal128(const BasicDecimal128& value) noexcept  // NOLINT runtime/explicit
      : BasicDecimal128(value) {}

  /// \brief Parse the number from a base 10 string representation.
  explicit Decimal128(const std::string& value);

  /// \brief Empty constructor creates a Decimal128 with a value of 0.
  // This is required on some older compilers.
  constexpr Decimal128() noexcept : BasicDecimal128() {}

  /// Divide this number by right and return the result.
  ///
  /// This operation is not destructive.
  /// The answer rounds to zero. Signs work like:
  ///   21 /  5 ->  4,  1
  ///  -21 /  5 -> -4, -1
  ///   21 / -5 -> -4,  1
  ///  -21 / -5 ->  4, -1
  /// \param[in] divisor the number to divide by
  /// \return the pair of the quotient and the remainder
  Result<std::pair<Decimal128, Decimal128>> Divide(const Decimal128& divisor) const {
    std::pair<Decimal128, Decimal128> result;
    ARROW_RETURN_NOT_OK(BasicDecimal128::Divide(divisor, &result.first, &result.second));
    return result;
  }

  /// \brief Convert the Decimal128 value to a base 10 decimal string with the given
  /// scale.
  std::string ToString(int32_t scale) const;

  /// \brief Convert the value to an integer string
  std::string ToIntegerString() const;

  /// \brief Cast this value to an int64_t.
  explicit operator int64_t() const;

  /// \brief Convert a decimal string to a Decimal128 value, optionally including
  /// precision and scale if they're passed in and not null.
  static Status FromString(std::string_view s, Decimal128* out, int32_t* precision,
                           int32_t* scale = NULLPTR);
  static Status FromString(const std::string& s, Decimal128* out, int32_t* precision,
                           int32_t* scale = NULLPTR);
  static Status FromString(const char* s, Decimal128* out, int32_t* precision,
                           int32_t* scale = NULLPTR);
  static Result<Decimal128> FromString(std::string_view s);
  static Result<Decimal128> FromString(const std::string& s);
  static Result<Decimal128> FromString(const char* s);

  /// \brief Convert a floating-point number to a Decimal128 with the given
  /// precision and scale.
  static Result<Decimal128> FromReal(double real, int32_t precision, int32_t scale);
  static Result<Decimal128> FromReal(float real, int32_t precision, int32_t scale);

  /// \brief Convert from a big-endian byte representation. The length must be
  /// between 1 and 16.
  /// \return error status if the length is an invalid value
  static Result<Decimal128> FromBigEndian(const uint8_t* data, int32_t length);

  /// \brief Convert Decimal128 from one scale to another
  Result<Decimal128> Rescale(int32_t original_scale, int32_t new_scale) const {
    Decimal128 out;
    ARROW_RETURN_NOT_OK(BasicDecimal128::Rescale(original_scale, new_scale, &out));
    return out;
  }

  /// \brief Convert to a signed integer, failing if the value does not fit in T.
  template <typename T, typename = internal::EnableIfIsOneOf<T, int32_t, int64_t>>
  Result<T> ToInteger() const {
    constexpr auto min_value = std::numeric_limits<T>::min();
    constexpr auto max_value = std::numeric_limits<T>::max();
    const auto& self = *this;
    // Range check against T's limits; out-of-range values cannot be narrowed.
    if (self < min_value || self > max_value) {
      return Status::Invalid("Invalid cast from Decimal128 to ", sizeof(T),
                             " byte integer");
    }
    // In range, so the value is fully contained in the low 64 bits.
    return static_cast<T>(low_bits());
  }

  /// \brief Convert to a signed integer
  template <typename T, typename = internal::EnableIfIsOneOf<T, int32_t, int64_t>>
  Status ToInteger(T* out) const {
    return ToInteger<T>().Value(out);
  }

  /// \brief Convert to a floating-point number (scaled)
  float ToFloat(int32_t scale) const;
  /// \brief Convert to a floating-point number (scaled)
  double ToDouble(int32_t scale) const;

  /// \brief Convert to a floating-point number (scaled)
  template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
  T ToReal(int32_t scale) const {
    static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
                  "Unexpected floating-point type");
    if constexpr (std::is_same_v<T, float>) {
      return ToFloat(scale);
    } else {
      return ToDouble(scale);
    }
  }

  ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os,
                                                      const Decimal128& decimal);
};
|
||||
|
||||
/// Represents a signed 256-bit integer in two's complement.
|
||||
/// The max decimal precision that can be safely represented is
|
||||
/// 76 significant digits.
|
||||
///
|
||||
/// The implementation is split into two parts :
|
||||
///
|
||||
/// 1. BasicDecimal256
|
||||
/// - can be safely compiled to IR without references to libstdc++.
|
||||
/// 2. Decimal256
|
||||
/// - (TODO) has additional functionality on top of BasicDecimal256 to deal with
|
||||
/// strings and streams.
|
||||
class ARROW_EXPORT Decimal256 : public BasicDecimal256 {
 public:
  /// \cond FALSE
  // (need to avoid a duplicate definition in Sphinx)
  using BasicDecimal256::BasicDecimal256;
  /// \endcond

  /// \brief constructor creates a Decimal256 from a BasicDecimal256.
  constexpr Decimal256(const BasicDecimal256& value) noexcept  // NOLINT(runtime/explicit)
      : BasicDecimal256(value) {}

  /// \brief Parse the number from a base 10 string representation.
  explicit Decimal256(const std::string& value);

  /// \brief Empty constructor creates a Decimal256 with a value of 0.
  // This is required on some older compilers.
  constexpr Decimal256() noexcept : BasicDecimal256() {}

  /// \brief Convert the Decimal256 value to a base 10 decimal string with the given
  /// scale.
  std::string ToString(int32_t scale) const;

  /// \brief Convert the value to an integer string
  std::string ToIntegerString() const;

  /// \brief Convert a decimal string to a Decimal256 value, optionally including
  /// precision and scale if they're passed in and not null.
  static Status FromString(std::string_view s, Decimal256* out, int32_t* precision,
                           int32_t* scale = NULLPTR);
  static Status FromString(const std::string& s, Decimal256* out, int32_t* precision,
                           int32_t* scale = NULLPTR);
  static Status FromString(const char* s, Decimal256* out, int32_t* precision,
                           int32_t* scale = NULLPTR);
  static Result<Decimal256> FromString(std::string_view s);
  static Result<Decimal256> FromString(const std::string& s);
  static Result<Decimal256> FromString(const char* s);

  /// \brief Convert Decimal256 from one scale to another
  Result<Decimal256> Rescale(int32_t original_scale, int32_t new_scale) const {
    Decimal256 out;
    ARROW_RETURN_NOT_OK(BasicDecimal256::Rescale(original_scale, new_scale, &out));
    return out;
  }

  /// Divide this number by right and return the result.
  ///
  /// This operation is not destructive.
  /// The answer rounds to zero. Signs work like:
  ///   21 /  5 ->  4,  1
  ///  -21 /  5 -> -4, -1
  ///   21 / -5 -> -4,  1
  ///  -21 / -5 ->  4, -1
  /// \param[in] divisor the number to divide by
  /// \return the pair of the quotient and the remainder
  Result<std::pair<Decimal256, Decimal256>> Divide(const Decimal256& divisor) const {
    std::pair<Decimal256, Decimal256> result;
    ARROW_RETURN_NOT_OK(BasicDecimal256::Divide(divisor, &result.first, &result.second));
    return result;
  }

  /// \brief Convert from a big-endian byte representation. The length must be
  /// between 1 and 32.
  /// \return error status if the length is an invalid value
  static Result<Decimal256> FromBigEndian(const uint8_t* data, int32_t length);

  /// \brief Convert a floating-point number to a Decimal256 with the given
  /// precision and scale.
  static Result<Decimal256> FromReal(double real, int32_t precision, int32_t scale);
  static Result<Decimal256> FromReal(float real, int32_t precision, int32_t scale);

  /// \brief Convert to a floating-point number (scaled).
  /// May return infinity in case of overflow.
  float ToFloat(int32_t scale) const;
  /// \brief Convert to a floating-point number (scaled)
  double ToDouble(int32_t scale) const;

  /// \brief Convert to a floating-point number (scaled)
  template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
  T ToReal(int32_t scale) const {
    static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
                  "Unexpected floating-point type");
    if constexpr (std::is_same_v<T, float>) {
      return ToFloat(scale);
    } else {
      return ToDouble(scale);
    }
  }

  ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os,
                                                      const Decimal256& decimal);
};
|
||||
|
||||
/// For an integer type, return the max number of decimal digits
|
||||
/// (=minimal decimal precision) it can represent.
|
||||
/// \brief For an integer type, return the max number of decimal digits
/// (= minimal decimal precision) it can represent, or an Invalid status for
/// non-integer types.
inline Result<int32_t> MaxDecimalDigitsForInteger(Type::type type_id) {
  switch (type_id) {
    case Type::INT8:
    case Type::UINT8:
      return 3;  // up to 255 / 127
    case Type::INT16:
    case Type::UINT16:
      return 5;  // up to 65535 / 32767
    case Type::INT32:
    case Type::UINT32:
      return 10;  // up to 4294967295 / 2147483647
    case Type::INT64:
      return 19;  // up to 9223372036854775807
    case Type::UINT64:
      return 20;  // up to 18446744073709551615
    default:
      // Non-integer types have no meaningful decimal digit count.
      return Status::Invalid("Not an integer type: ", type_id);
  }
}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,181 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string_view>
|
||||
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Buffer;
|
||||
|
||||
class ARROW_EXPORT BoundaryFinder {
 public:
  BoundaryFinder() = default;

  virtual ~BoundaryFinder();

  /// \brief Find the position of the first delimiter inside block
  ///
  /// `partial` is taken to be the beginning of the block, and `block`
  /// its continuation.  Also, `partial` doesn't contain a delimiter.
  ///
  /// The returned `out_pos` is relative to `block`'s start and should point
  /// to the first character after the first delimiter.
  /// `out_pos` will be -1 if no delimiter is found.
  virtual Status FindFirst(std::string_view partial, std::string_view block,
                           int64_t* out_pos) = 0;

  /// \brief Find the position of the last delimiter inside block
  ///
  /// The returned `out_pos` is relative to `block`'s start and should point
  /// to the first character after the last delimiter.
  /// `out_pos` will be -1 if no delimiter is found.
  virtual Status FindLast(std::string_view block, int64_t* out_pos) = 0;

  /// \brief Find the position of the Nth delimiter inside the block
  ///
  /// `partial` is taken to be the beginning of the block, and `block`
  /// its continuation.  Also, `partial` doesn't contain a delimiter.
  ///
  /// The returned `out_pos` is relative to `block`'s start and should point
  /// to the first character after the Nth delimiter (the original comment said
  /// "first", but `count` selects the Nth -- see `num_found` below).
  /// `out_pos` will be -1 if no delimiter is found.
  ///
  /// The returned `num_found` is the number of delimiters actually found
  virtual Status FindNth(std::string_view partial, std::string_view block, int64_t count,
                         int64_t* out_pos, int64_t* num_found) = 0;

  // Sentinel value stored in `out_pos` when no delimiter was found.
  static constexpr int64_t kNoDelimiterFound = -1;

 protected:
  ARROW_DISALLOW_COPY_AND_ASSIGN(BoundaryFinder);
};
|
||||
|
||||
/// \brief Create a BoundaryFinder for newline-delimited data
ARROW_EXPORT
std::shared_ptr<BoundaryFinder> MakeNewlineBoundaryFinder();
|
||||
|
||||
/// \brief A reusable block-based chunker for delimited data
///
/// The chunker takes a block of delimited data and helps carve a sub-block
/// which begins and ends on delimiters (suitable for consumption by parsers
/// which can only parse whole objects).
class ARROW_EXPORT Chunker {
 public:
  /// \param delimiter the BoundaryFinder implementing the delimiter logic
  explicit Chunker(std::shared_ptr<BoundaryFinder> delimiter);
  ~Chunker();

  /// \brief Carve up a chunk in a block of data to contain only whole objects
  ///
  /// Pre-conditions:
  /// - `block` is the start of a valid block of delimited data
  ///   (i.e. starts just after a delimiter)
  ///
  /// Post-conditions:
  /// - block == whole + partial
  /// - `whole` is a valid block of delimited data
  ///   (i.e. starts just after a delimiter and ends with a delimiter)
  /// - `partial` doesn't contain an entire delimited object
  ///   (IOW: `partial` is generally small)
  ///
  /// This method will look for the last delimiter in `block` and may
  /// therefore be costly.
  ///
  /// \param[in] block data to be chunked
  /// \param[out] whole subrange of block containing whole delimited objects
  /// \param[out] partial subrange of block starting with a partial delimited object
  Status Process(std::shared_ptr<Buffer> block, std::shared_ptr<Buffer>* whole,
                 std::shared_ptr<Buffer>* partial);

  /// \brief Carve the completion of a partial object out of a block
  ///
  /// Pre-conditions:
  /// - `partial` is the start of a valid block of delimited data
  ///   (i.e. starts just after a delimiter)
  /// - `block` follows `partial` in file order
  ///
  /// Post-conditions:
  /// - block == completion + rest
  /// - `partial + completion` is a valid block of delimited data
  ///   (i.e. starts just after a delimiter and ends with a delimiter)
  /// - `completion` doesn't contain an entire delimited object
  ///   (IOW: `completion` is generally small)
  ///
  /// This method will look for the first delimiter in `block` and should
  /// therefore be reasonably cheap.
  ///
  /// \param[in] partial incomplete delimited data
  /// \param[in] block delimited data following partial
  /// \param[out] completion subrange of block containing the completion of partial
  /// \param[out] rest subrange of block containing what completion does not cover
  Status ProcessWithPartial(std::shared_ptr<Buffer> partial,
                            std::shared_ptr<Buffer> block,
                            std::shared_ptr<Buffer>* completion,
                            std::shared_ptr<Buffer>* rest);

  /// \brief Like ProcessWithPartial, but for the last block of a file
  ///
  /// This method allows for a final delimited object without a trailing delimiter
  /// (ProcessWithPartial would return an error in that case).
  ///
  /// Pre-conditions:
  /// - `partial` is the start of a valid block of delimited data
  /// - `block` follows `partial` in file order and is the last data block
  ///
  /// Post-conditions:
  /// - block == completion + rest
  /// - `partial + completion` is a valid block of delimited data
  /// - `completion` doesn't contain an entire delimited object
  ///   (IOW: `completion` is generally small)
  ///
  Status ProcessFinal(std::shared_ptr<Buffer> partial, std::shared_ptr<Buffer> block,
                      std::shared_ptr<Buffer>* completion, std::shared_ptr<Buffer>* rest);

  /// \brief Skip count number of rows
  /// Pre-conditions:
  /// - `partial` is the start of a valid block of delimited data
  ///   (i.e. starts just after a delimiter)
  /// - `block` follows `partial` in file order
  ///
  /// Post-conditions:
  /// - `count` is updated to indicate the number of rows that still need to be skipped
  /// - If `count` is > 0 then `rest` is an incomplete block that should be a future
  ///   `partial`
  /// - Else `rest` could be one or more valid blocks of delimited data which need to be
  ///   parsed
  ///
  /// \param[in] partial incomplete delimited data
  /// \param[in] block delimited data following partial
  /// \param[in] final whether this is the final chunk
  /// \param[in,out] count number of rows that need to be skipped
  /// \param[out] rest subrange of block containing what was not skipped
  Status ProcessSkip(std::shared_ptr<Buffer> partial, std::shared_ptr<Buffer> block,
                     bool final, int64_t* count, std::shared_ptr<Buffer>* rest);

 protected:
  ARROW_DISALLOW_COPY_AND_ASSIGN(Chunker);

  // Delimiter-finding strategy injected at construction
  std::shared_ptr<BoundaryFinder> boundary_finder_;
};
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,245 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef _WIN32
|
||||
# define ARROW_LITTLE_ENDIAN 1
|
||||
#else
|
||||
# if defined(__APPLE__) || defined(__FreeBSD__)
|
||||
# include <machine/endian.h> // IWYU pragma: keep
|
||||
# elif defined(sun) || defined(__sun)
|
||||
# include <sys/byteorder.h> // IWYU pragma: keep
|
||||
# elif !defined(_AIX)
|
||||
# include <endian.h> // IWYU pragma: keep
|
||||
# endif
|
||||
#
|
||||
# ifndef __BYTE_ORDER__
|
||||
# error "__BYTE_ORDER__ not defined"
|
||||
# endif
|
||||
#
|
||||
# ifndef __ORDER_LITTLE_ENDIAN__
|
||||
# error "__ORDER_LITTLE_ENDIAN__ not defined"
|
||||
# endif
|
||||
#
|
||||
# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
||||
# define ARROW_LITTLE_ENDIAN 1
|
||||
# else
|
||||
# define ARROW_LITTLE_ENDIAN 0
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
# include <intrin.h> // IWYU pragma: keep
|
||||
# define ARROW_BYTE_SWAP64 _byteswap_uint64
|
||||
# define ARROW_BYTE_SWAP32 _byteswap_ulong
|
||||
#else
|
||||
# define ARROW_BYTE_SWAP64 __builtin_bswap64
|
||||
# define ARROW_BYTE_SWAP32 __builtin_bswap32
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
|
||||
#include "arrow/util/type_traits.h"
|
||||
#include "arrow/util/ubsan.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace bit_util {
|
||||
|
||||
//
|
||||
// Byte-swap 16-bit, 32-bit and 64-bit values
|
||||
//
|
||||
|
||||
// Swap the byte order (i.e. endianness)
|
||||
static inline int64_t ByteSwap(int64_t value) { return ARROW_BYTE_SWAP64(value); }
|
||||
static inline uint64_t ByteSwap(uint64_t value) {
|
||||
return static_cast<uint64_t>(ARROW_BYTE_SWAP64(value));
|
||||
}
|
||||
static inline int32_t ByteSwap(int32_t value) { return ARROW_BYTE_SWAP32(value); }
|
||||
static inline uint32_t ByteSwap(uint32_t value) {
|
||||
return static_cast<uint32_t>(ARROW_BYTE_SWAP32(value));
|
||||
}
|
||||
// Swap the byte order of a 16-bit integer by rotating its two bytes.
static inline int16_t ByteSwap(int16_t value) {
  const auto bits = static_cast<uint16_t>(value);
  const auto swapped = static_cast<uint16_t>((bits << 8) | (bits >> 8));
  return static_cast<int16_t>(swapped);
}
static inline uint16_t ByteSwap(uint16_t value) {
  return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value)));
}
// Single bytes have no byte order; these overloads exist for generic code.
static inline uint8_t ByteSwap(uint8_t value) { return value; }
static inline int8_t ByteSwap(int8_t value) { return value; }
|
||||
// Swap the byte order of a 64-bit float by reinterpreting its bits as an
// unsigned integer.  util::SafeCopy (declared in arrow/util/ubsan.h) is
// presumably a memcpy-style bit copy used to avoid pointer-based type
// punning — TODO confirm against its definition.
static inline double ByteSwap(double value) {
  const uint64_t swapped = ARROW_BYTE_SWAP64(util::SafeCopy<uint64_t>(value));
  return util::SafeCopy<double>(swapped);
}
// Same as above for a 32-bit float.
static inline float ByteSwap(float value) {
  const uint32_t swapped = ARROW_BYTE_SWAP32(util::SafeCopy<uint32_t>(value));
  return util::SafeCopy<float>(swapped);
}
|
||||
|
||||
// Write the swapped bytes into dst. Src and dst cannot overlap.
|
||||
static inline void ByteSwap(void* dst, const void* src, int len) {
|
||||
switch (len) {
|
||||
case 1:
|
||||
*reinterpret_cast<int8_t*>(dst) = *reinterpret_cast<const int8_t*>(src);
|
||||
return;
|
||||
case 2:
|
||||
*reinterpret_cast<int16_t*>(dst) = ByteSwap(*reinterpret_cast<const int16_t*>(src));
|
||||
return;
|
||||
case 4:
|
||||
*reinterpret_cast<int32_t*>(dst) = ByteSwap(*reinterpret_cast<const int32_t*>(src));
|
||||
return;
|
||||
case 8:
|
||||
*reinterpret_cast<int64_t*>(dst) = ByteSwap(*reinterpret_cast<const int64_t*>(src));
|
||||
return;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
auto d = reinterpret_cast<uint8_t*>(dst);
|
||||
auto s = reinterpret_cast<const uint8_t*>(src);
|
||||
for (int i = 0; i < len; ++i) {
|
||||
d[i] = s[len - i - 1];
|
||||
}
|
||||
}
|
||||
|
||||
// Convert to little/big endian format from the machine's native endian format.
#if ARROW_LITTLE_ENDIAN
// Native order is little endian: conversion to big endian swaps bytes,
// conversion to little endian is the identity.
template <typename T, typename = internal::EnableIfIsOneOf<
                          T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
                          uint8_t, int8_t, float, double, bool>>
static inline T ToBigEndian(T value) {
  return ByteSwap(value);
}

template <typename T, typename = internal::EnableIfIsOneOf<
                          T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
                          uint8_t, int8_t, float, double, bool>>
static inline T ToLittleEndian(T value) {
  return value;
}
#else
// Native order is big endian: the mirror image of the branch above.
template <typename T, typename = internal::EnableIfIsOneOf<
                          T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
                          uint8_t, int8_t, float, double, bool>>
static inline T ToBigEndian(T value) {
  return value;
}

template <typename T, typename = internal::EnableIfIsOneOf<
                          T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
                          uint8_t, int8_t, float, double, bool>>
static inline T ToLittleEndian(T value) {
  return ByteSwap(value);
}
#endif
|
||||
|
||||
// Convert from big/little endian format to the machine's native endian format.
// Byte swapping is an involution, so these bodies are identical to the
// To{Big,Little}Endian functions above; the separate names document intent
// at call sites.
#if ARROW_LITTLE_ENDIAN
template <typename T, typename = internal::EnableIfIsOneOf<
                          T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
                          uint8_t, int8_t, float, double, bool>>
static inline T FromBigEndian(T value) {
  return ByteSwap(value);
}

template <typename T, typename = internal::EnableIfIsOneOf<
                          T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
                          uint8_t, int8_t, float, double, bool>>
static inline T FromLittleEndian(T value) {
  return value;
}
#else
template <typename T, typename = internal::EnableIfIsOneOf<
                          T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
                          uint8_t, int8_t, float, double, bool>>
static inline T FromBigEndian(T value) {
  return value;
}

template <typename T, typename = internal::EnableIfIsOneOf<
                          T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
                          uint8_t, int8_t, float, double, bool>>
static inline T FromLittleEndian(T value) {
  return ByteSwap(value);
}
#endif
|
||||
|
||||
// Handle endianness in *word* granularity (keep individual array element untouched)
|
||||
namespace little_endian {
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Read a native endian array as little endian
template <typename T, size_t N>
struct Reader {
  // Non-owning reference: the wrapped array must outlive this reader.
  const std::array<T, N>& native_array;

  explicit Reader(const std::array<T, N>& native_array) : native_array(native_array) {}

  // Index i is interpreted in little-endian *word* order: on big-endian
  // hosts the index is mirrored; individual elements are left untouched.
  const T& operator[](size_t i) const {
    return native_array[ARROW_LITTLE_ENDIAN ? i : N - 1 - i];
  }
};
|
||||
|
||||
// Read/write a native endian array as little endian
template <typename T, size_t N>
struct Writer {
  // Non-owning pointer: the wrapped array must outlive this writer.
  std::array<T, N>* native_array;

  explicit Writer(std::array<T, N>* native_array) : native_array(native_array) {}

  // Index i is interpreted in little-endian *word* order: mirrored on
  // big-endian hosts, as in Reader.
  const T& operator[](size_t i) const {
    return (*native_array)[ARROW_LITTLE_ENDIAN ? i : N - 1 - i];
  }
  T& operator[](size_t i) { return (*native_array)[ARROW_LITTLE_ENDIAN ? i : N - 1 - i]; }
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Construct an array reader, deducing the template arguments
template <typename T, size_t N>
static inline detail::Reader<T, N> Make(const std::array<T, N>& native_array) {
  return detail::Reader<T, N>(native_array);
}

// Construct an array writer, deducing the template arguments
template <typename T, size_t N>
static inline detail::Writer<T, N> Make(std::array<T, N>* native_array) {
  return detail::Writer<T, N>(native_array);
}
|
||||
|
||||
// Convert little endian array to native endian
template <typename T, size_t N>
static inline std::array<T, N> ToNative(std::array<T, N> array) {
  // Word-granularity conversion: only the element order is reversed on
  // big-endian hosts; element bytes are left untouched.
  if (!ARROW_LITTLE_ENDIAN) {
    std::reverse(array.begin(), array.end());
  }
  return array;
}

// Convert native endian array to little endian
template <typename T, size_t N>
static inline std::array<T, N> FromNative(std::array<T, N> array) {
  // Reversal is an involution, so both directions are the same operation.
  return ToNative(array);
}
|
||||
|
||||
} // namespace little_endian
|
||||
|
||||
} // namespace bit_util
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,206 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <iosfwd>
|
||||
#include <limits>
|
||||
#include <type_traits>
|
||||
|
||||
#include "arrow/util/endian.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/ubsan.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace util {
|
||||
|
||||
/// \brief Class representing an IEEE half-precision float, encoded as a `uint16_t`
///
/// The exact format is as follows (from LSB to MSB):
/// - bits 0-9:   mantissa
/// - bits 10-14: exponent
/// - bit  15:    sign
///
class ARROW_EXPORT Float16 {
 public:
  // Default construction leaves the bits uninitialized (trivial type)
  Float16() = default;
  explicit Float16(float f) : Float16(FromFloat(f)) {}
  explicit Float16(double d) : Float16(FromDouble(d)) {}
  // Accept any type convertible to double, going through the double
  // conversion (may lose precision)
  template <typename T,
            typename std::enable_if_t<std::is_convertible_v<T, double>>* = NULLPTR>
  explicit Float16(T v) : Float16(static_cast<double>(v)) {}

  /// \brief Create a `Float16` from its exact binary representation
  constexpr static Float16 FromBits(uint16_t bits) { return Float16{bits, bool{}}; }
  /// \brief Create a `Float16` from a 32-bit float (may lose precision)
  static Float16 FromFloat(float f);
  /// \brief Create a `Float16` from a 64-bit float (may lose precision)
  static Float16 FromDouble(double d);

  /// \brief Read a `Float16` from memory in native-endian byte order
  static Float16 FromBytes(const uint8_t* src) {
    return FromBits(SafeLoadAs<uint16_t>(src));
  }

  /// \brief Read a `Float16` from memory in little-endian byte order
  static Float16 FromLittleEndian(const uint8_t* src) {
    return FromBits(::arrow::bit_util::FromLittleEndian(SafeLoadAs<uint16_t>(src)));
  }

  /// \brief Read a `Float16` from memory in big-endian byte order
  static Float16 FromBigEndian(const uint8_t* src) {
    return FromBits(::arrow::bit_util::FromBigEndian(SafeLoadAs<uint16_t>(src)));
  }

  /// \brief Return the value's binary representation as a `uint16_t`
  constexpr uint16_t bits() const { return bits_; }

  /// \brief Return true if the value is negative (sign bit is set)
  constexpr bool signbit() const { return (bits_ & 0x8000) != 0; }

  /// \brief Return true if the value is NaN
  ///
  /// NaN payloads (sign bit ignored) compare strictly greater than the
  /// infinity pattern 0x7c00 (exponent all ones, non-zero mantissa).
  constexpr bool is_nan() const { return (bits_ & 0x7fff) > 0x7c00; }
  /// \brief Return true if the value is positive/negative infinity
  constexpr bool is_infinity() const { return (bits_ & 0x7fff) == 0x7c00; }
  /// \brief Return true if the value is finite and not NaN
  constexpr bool is_finite() const { return (bits_ & 0x7c00) != 0x7c00; }
  /// \brief Return true if the value is positive/negative zero
  constexpr bool is_zero() const { return (bits_ & 0x7fff) == 0; }

  /// \brief Convert to a 32-bit float
  float ToFloat() const;
  /// \brief Convert to a 64-bit float
  double ToDouble() const;

  explicit operator float() const { return ToFloat(); }
  explicit operator double() const { return ToDouble(); }

  /// \brief Copy the value's bytes in native-endian byte order
  void ToBytes(uint8_t* dest) const { std::memcpy(dest, &bits_, sizeof(bits_)); }
  /// \brief Return the value's bytes in native-endian byte order
  constexpr std::array<uint8_t, 2> ToBytes() const {
#if ARROW_LITTLE_ENDIAN
    return ToLittleEndian();
#else
    return ToBigEndian();
#endif
  }

  /// \brief Copy the value's bytes in little-endian byte order
  void ToLittleEndian(uint8_t* dest) const {
    const auto bytes = ToLittleEndian();
    std::memcpy(dest, bytes.data(), bytes.size());
  }
  /// \brief Return the value's bytes in little-endian byte order
  constexpr std::array<uint8_t, 2> ToLittleEndian() const {
    return {uint8_t(bits_ & 0xff), uint8_t(bits_ >> 8)};
  }

  /// \brief Copy the value's bytes in big-endian byte order
  void ToBigEndian(uint8_t* dest) const {
    const auto bytes = ToBigEndian();
    std::memcpy(dest, bytes.data(), bytes.size());
  }
  /// \brief Return the value's bytes in big-endian byte order
  constexpr std::array<uint8_t, 2> ToBigEndian() const {
    return {uint8_t(bits_ >> 8), uint8_t(bits_ & 0xff)};
  }

  // Negation just flips the sign bit (works for zeros, infinities and NaNs)
  constexpr Float16 operator-() const { return FromBits(bits_ ^ 0x8000); }
  constexpr Float16 operator+() const { return FromBits(bits_); }

  // Comparisons: any comparison involving a NaN operand is false,
  // except !=, which is defined as the negation of ==
  friend constexpr bool operator==(Float16 lhs, Float16 rhs) {
    if (lhs.is_nan() || rhs.is_nan()) return false;
    return Float16::CompareEq(lhs, rhs);
  }
  friend constexpr bool operator!=(Float16 lhs, Float16 rhs) { return !(lhs == rhs); }

  friend constexpr bool operator<(Float16 lhs, Float16 rhs) {
    if (lhs.is_nan() || rhs.is_nan()) return false;
    return Float16::CompareLt(lhs, rhs);
  }
  friend constexpr bool operator>(Float16 lhs, Float16 rhs) { return rhs < lhs; }

  friend constexpr bool operator<=(Float16 lhs, Float16 rhs) {
    if (lhs.is_nan() || rhs.is_nan()) return false;
    return !Float16::CompareLt(rhs, lhs);
  }
  friend constexpr bool operator>=(Float16 lhs, Float16 rhs) { return rhs <= lhs; }

  ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os, Float16 arg);

  static constexpr Float16 zero() { return FromBits(0); }
  static constexpr Float16 one() { return FromBits(0x3c00); }

 protected:
  uint16_t bits_;

 private:
  // Tagged private constructor: the unused bool disambiguates it from the
  // public numeric constructors.  Use FromBits() instead.
  constexpr Float16(uint16_t bits, bool) : bits_(bits) {}

  // Comparison helpers that assume neither operand is NaN
  static constexpr bool CompareEq(Float16 lhs, Float16 rhs) {
    // Bitwise equality, except +0 and -0 compare equal
    return (lhs.bits() == rhs.bits()) || (lhs.is_zero() && rhs.is_zero());
  }
  static constexpr bool CompareLt(Float16 lhs, Float16 rhs) {
    // Sign-magnitude ordering: two negative values order in reverse of their
    // raw bit patterns; mixed signs are decided by the sign bit, except that
    // +0 and -0 must compare equal
    if (lhs.signbit()) {
      if (rhs.signbit()) {
        // Both are negative
        return lhs.bits() > rhs.bits();
      } else {
        // Handle +/-0
        return !lhs.is_zero() || rhs.bits() != 0;
      }
    } else if (rhs.signbit()) {
      return false;
    } else {
      // Both are positive
      return lhs.bits() < rhs.bits();
    }
  }
};

// Compile-time guarantees that Float16 stays a trivial, standard-layout,
// 2-byte wrapper around its bit pattern
static_assert(std::is_standard_layout_v<Float16>);
static_assert(std::is_trivial_v<Float16>);
static_assert(sizeof(Float16) == sizeof(uint16_t));
|
||||
|
||||
} // namespace util
|
||||
} // namespace arrow
|
||||
|
||||
// TODO: Not complete (many members of a full numeric_limits specialization
// are still missing, e.g. epsilon(), digits, denorm_min())
template <>
class std::numeric_limits<arrow::util::Float16> {
  using T = arrow::util::Float16;

 public:
  static constexpr bool is_specialized = true;
  static constexpr bool is_signed = true;
  static constexpr bool has_infinity = true;
  static constexpr bool has_quiet_NaN = true;

  // Smallest positive normal binary16 value (exponent 1, mantissa 0)
  static constexpr T min() { return T::FromBits(0b0000010000000000); }
  // Largest finite binary16 value (maximum exponent, mantissa all ones)
  static constexpr T max() { return T::FromBits(0b0111101111111111); }
  // Most negative finite value: negation of max()
  static constexpr T lowest() { return -max(); }

  // Exponent all ones, mantissa zero
  static constexpr T infinity() { return T::FromBits(0b0111110000000000); }

  // Exponent all ones, non-zero mantissa
  static constexpr T quiet_NaN() { return T::FromBits(0b0111111111111111); }
};
|
||||
@@ -0,0 +1,667 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This is a private header for number-to-string formatting utilities
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
#include <chrono>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/type_traits.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/string.h"
|
||||
#include "arrow/util/time.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
#include "arrow/vendored/datetime.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
/// \brief The entry point for conversion to strings.
///
/// The primary template is left undefined here; only the specializations
/// below provide implementations.
template <typename ARROW_TYPE, typename Enable = void>
class StringFormatter;
|
||||
|
||||
// Detects (via SFINAE) whether StringFormatter<T> has a usable
// specialization, keyed on the presence of its nested `value_type` alias.
template <typename T>
struct is_formattable {
  // Selected when StringFormatter<U>::value_type is well-formed
  template <typename U, typename = typename StringFormatter<U>::value_type>
  static std::true_type Test(U*);

  // Fallback overload
  template <typename U>
  static std::false_type Test(...);

  static constexpr bool value = decltype(Test<T>(NULLPTR))::value;
};
|
||||
|
||||
// SFINAE helper: enabled only when StringFormatter<T> is defined
template <typename T, typename R = void>
using enable_if_formattable = enable_if_t<is_formattable<T>::value, R>;

// The result type of invoking an Appender with a string_view
template <typename Appender>
using Return = decltype(std::declval<Appender>()(std::string_view{}));
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// Boolean formatting
|
||||
|
||||
template <>
|
||||
class StringFormatter<BooleanType> {
|
||||
public:
|
||||
explicit StringFormatter(const DataType* = NULLPTR) {}
|
||||
|
||||
using value_type = bool;
|
||||
|
||||
template <typename Appender>
|
||||
Return<Appender> operator()(bool value, Appender&& append) {
|
||||
if (value) {
|
||||
const char string[] = "true";
|
||||
return append(std::string_view(string));
|
||||
} else {
|
||||
const char string[] = "false";
|
||||
return append(std::string_view(string));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// Decimals formatting
|
||||
|
||||
// Formats decimal values by delegating to the decimal CType's own
// ToString(scale) method.
template <typename ARROW_TYPE>
class DecimalToStringFormatterMixin {
 public:
  explicit DecimalToStringFormatterMixin(const DataType* type)
      : scale_(static_cast<const ARROW_TYPE*>(type)->scale()) {}

  using value_type = typename TypeTraits<ARROW_TYPE>::CType;

  template <typename Appender>
  Return<Appender> operator()(const value_type& value, Appender&& append) {
    return append(value.ToString(scale_));
  }

 private:
  // Decimal scale captured from the DataType at construction
  int32_t scale_;
};
|
||||
|
||||
// Each decimal width reuses the mixin implementation unchanged; only the
// type tag differs.
template <>
class StringFormatter<Decimal32Type>
    : public DecimalToStringFormatterMixin<Decimal32Type> {
  using DecimalToStringFormatterMixin::DecimalToStringFormatterMixin;
};

template <>
class StringFormatter<Decimal64Type>
    : public DecimalToStringFormatterMixin<Decimal64Type> {
  using DecimalToStringFormatterMixin::DecimalToStringFormatterMixin;
};

template <>
class StringFormatter<Decimal128Type>
    : public DecimalToStringFormatterMixin<Decimal128Type> {
  using DecimalToStringFormatterMixin::DecimalToStringFormatterMixin;
};

template <>
class StringFormatter<Decimal256Type>
    : public DecimalToStringFormatterMixin<Decimal256Type> {
  using DecimalToStringFormatterMixin::DecimalToStringFormatterMixin;
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// Integer formatting
|
||||
|
||||
namespace detail {
|
||||
|
||||
// A 2x100 direct table mapping integers in [0..99] to their decimal representations.
// (The table itself is defined in the corresponding .cc file.)
ARROW_EXPORT extern const char digit_pairs[];
|
||||
|
||||
// Based on fmtlib's format_int class:
// Write digits from right to left into a stack allocated buffer.
// \pre *cursor points to the byte after the one that will be written.
// \post *cursor points to the byte that was written.
inline void FormatOneChar(char c, char** cursor) {
  char*& position = *cursor;
  --position;
  *position = c;
}
|
||||
|
||||
// Write a single decimal digit; value must be in [0, 9].
template <typename Int>
void FormatOneDigit(Int value, char** cursor) {
  assert(value >= 0 && value <= 9);
  FormatOneChar(static_cast<char>('0' + value), cursor);
}
|
||||
|
||||
// GH-35662: I don't know why but the following combination causes SEGV:
// * template implementation without inline
// * MinGW
// * Release build
// Write exactly two decimal digits via the digit-pair lookup table;
// value must be in [0, 99].  Keep the explicit `inline` (see GH-35662).
template <typename Int>
inline void FormatTwoDigits(Int value, char** cursor) {
  assert(value >= 0 && value <= 99);
  auto digit_pair = &digit_pairs[value * 2];
  // Right-to-left: low digit first
  FormatOneChar(digit_pair[1], cursor);
  FormatOneChar(digit_pair[0], cursor);
}
|
||||
|
||||
// Write the full decimal representation of a non-negative value,
// right-to-left, two digits at a time.
template <typename Int>
void FormatAllDigits(Int value, char** cursor) {
  assert(value >= 0);
  // Emit digit pairs until at most two digits remain.
  for (; value >= 100; value /= 100) {
    FormatTwoDigits(value % 100, cursor);
  }
  // Emit the final one or two digits.
  if (value < 10) {
    FormatOneDigit(value, cursor);
  } else {
    FormatTwoDigits(value, cursor);
  }
}
|
||||
|
||||
template <typename Int>
|
||||
void FormatAllDigitsLeftPadded(Int value, size_t pad, char pad_char, char** cursor) {
|
||||
auto end = *cursor - pad;
|
||||
FormatAllDigits(value, cursor);
|
||||
while (*cursor > end) {
|
||||
FormatOneChar(pad_char, cursor);
|
||||
}
|
||||
}
|
||||
|
||||
// View the formatted digits, which occupy [cursor, end-of-buffer).
// The returned view aliases `buffer` and must not outlive it.
template <size_t BUFFER_SIZE>
std::string_view ViewDigitBuffer(const std::array<char, BUFFER_SIZE>& buffer,
                                 char* cursor) {
  const char* const buffer_end = buffer.data() + BUFFER_SIZE;
  return std::string_view(cursor, static_cast<size_t>(buffer_end - cursor));
}
|
||||
|
||||
// Absolute value mapped into the unsigned counterpart type.
// Unsigned negation is well-defined (mod 2^N), so this handles the most
// negative value (e.g. INT_MIN) without signed overflow.
template <typename Int, typename UInt = typename std::make_unsigned<Int>::type>
constexpr UInt Abs(Int value) {
  const UInt unsigned_value = static_cast<UInt>(value);
  return value < 0 ? static_cast<UInt>(UInt(0) - unsigned_value) : unsigned_value;
}
|
||||
|
||||
// Number of decimal digits needed to print `value` (1 for values <= 9).
// constexpr so it can size stack buffers at compile time.
template <typename Int>
constexpr size_t Digits10(Int value) {
  size_t digits = 1;
  while (value > 9) {
    value /= 10;
    ++digits;
  }
  return digits;
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Formats any primitive integer type by writing digits right-to-left into a
// fixed-size stack buffer.
template <typename ARROW_TYPE>
class IntToStringFormatterMixin {
 public:
  explicit IntToStringFormatterMixin(const DataType* = NULLPTR) {}

  using value_type = typename ARROW_TYPE::c_type;

  template <typename Appender>
  Return<Appender> operator()(value_type value, Appender&& append) {
    // Room for the digits of the largest representable value plus one byte
    // for a potential '-' sign.
    constexpr size_t buffer_size =
        detail::Digits10(std::numeric_limits<value_type>::max()) + 1;

    std::array<char, buffer_size> buffer;
    char* cursor = buffer.data() + buffer_size;
    // Abs() maps the most negative value safely into the unsigned domain.
    detail::FormatAllDigits(detail::Abs(value), &cursor);
    if (value < 0) {
      detail::FormatOneChar('-', &cursor);
    }
    return append(detail::ViewDigitBuffer(buffer, cursor));
  }
};
|
||||
|
||||
// StringFormatter specializations for all signed and unsigned integer types;
// each simply reuses the shared integer formatting mixin above.
template <>
class StringFormatter<Int8Type> : public IntToStringFormatterMixin<Int8Type> {
  using IntToStringFormatterMixin::IntToStringFormatterMixin;
};

template <>
class StringFormatter<Int16Type> : public IntToStringFormatterMixin<Int16Type> {
  using IntToStringFormatterMixin::IntToStringFormatterMixin;
};

template <>
class StringFormatter<Int32Type> : public IntToStringFormatterMixin<Int32Type> {
  using IntToStringFormatterMixin::IntToStringFormatterMixin;
};

template <>
class StringFormatter<Int64Type> : public IntToStringFormatterMixin<Int64Type> {
  using IntToStringFormatterMixin::IntToStringFormatterMixin;
};

template <>
class StringFormatter<UInt8Type> : public IntToStringFormatterMixin<UInt8Type> {
  using IntToStringFormatterMixin::IntToStringFormatterMixin;
};

template <>
class StringFormatter<UInt16Type> : public IntToStringFormatterMixin<UInt16Type> {
  using IntToStringFormatterMixin::IntToStringFormatterMixin;
};

template <>
class StringFormatter<UInt32Type> : public IntToStringFormatterMixin<UInt32Type> {
  using IntToStringFormatterMixin::IntToStringFormatterMixin;
};

template <>
class StringFormatter<UInt64Type> : public IntToStringFormatterMixin<UInt64Type> {
  using IntToStringFormatterMixin::IntToStringFormatterMixin;
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// Floating-point formatting
|
||||
|
||||
/// Formats floating-point values to text.  The implementation is hidden
/// behind the pimpl `Impl`; the detailed constructor's parameters mirror
/// typical double-conversion-style formatting options.
class ARROW_EXPORT FloatToStringFormatter {
 public:
  FloatToStringFormatter();
  FloatToStringFormatter(int flags, const char* inf_symbol, const char* nan_symbol,
                         char exp_character, int decimal_in_shortest_low,
                         int decimal_in_shortest_high,
                         int max_leading_padding_zeroes_in_precision_mode,
                         int max_trailing_padding_zeroes_in_precision_mode);
  ~FloatToStringFormatter();

  // Returns the number of characters written
  int FormatFloat(float v, char* out_buffer, int out_size);
  int FormatFloat(double v, char* out_buffer, int out_size);
  // NOTE(review): the uint16_t overload presumably takes a half-float bit
  // pattern (it backs the HalfFloatType formatter below) — confirm against
  // the implementation.
  int FormatFloat(uint16_t v, char* out_buffer, int out_size);

 protected:
  struct Impl;
  std::unique_ptr<Impl> impl_;
};
|
||||
|
||||
/// Formatter mixin for floating-point Arrow types: formats into a fixed-size
/// stack buffer and hands the result to `append` as a std::string_view
/// (valid only for the duration of the call).
template <typename ARROW_TYPE>
class FloatToStringFormatterMixin : public FloatToStringFormatter {
 public:
  using value_type = typename ARROW_TYPE::c_type;

  // Fixed upper bound on the formatted length (sign, digits, exponent...).
  static constexpr int buffer_size = 50;

  explicit FloatToStringFormatterMixin(const DataType* = NULLPTR) {}

  // Forwards detailed formatting options to the base formatter.
  FloatToStringFormatterMixin(int flags, const char* inf_symbol, const char* nan_symbol,
                              char exp_character, int decimal_in_shortest_low,
                              int decimal_in_shortest_high,
                              int max_leading_padding_zeroes_in_precision_mode,
                              int max_trailing_padding_zeroes_in_precision_mode)
      : FloatToStringFormatter(flags, inf_symbol, nan_symbol, exp_character,
                               decimal_in_shortest_low, decimal_in_shortest_high,
                               max_leading_padding_zeroes_in_precision_mode,
                               max_trailing_padding_zeroes_in_precision_mode) {}

  template <typename Appender>
  Return<Appender> operator()(value_type value, Appender&& append) {
    char buffer[buffer_size];
    // FormatFloat returns the number of characters written.
    int size = FormatFloat(value, buffer, buffer_size);
    return append(std::string_view(buffer, size));
  }
};
|
||||
|
||||
// StringFormatter specializations for the floating-point types, all reusing
// the shared float formatting mixin above.
template <>
class StringFormatter<HalfFloatType> : public FloatToStringFormatterMixin<HalfFloatType> {
 public:
  using FloatToStringFormatterMixin::FloatToStringFormatterMixin;
};

template <>
class StringFormatter<FloatType> : public FloatToStringFormatterMixin<FloatType> {
 public:
  using FloatToStringFormatterMixin::FloatToStringFormatterMixin;
};

template <>
class StringFormatter<DoubleType> : public FloatToStringFormatterMixin<DoubleType> {
 public:
  using FloatToStringFormatterMixin::FloatToStringFormatterMixin;
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// Temporal formatting
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Maximum buffer size needed for a date string:
// optional sign + up to 5 year digits + '-' + 2 month digits + '-' + 2 day digits.
constexpr size_t BufferSizeYYYY_MM_DD() {
  // "-"? "99999-12-31"
  return 1 + detail::Digits10(99999) + 1 + detail::Digits10(12) + 1 +
         detail::Digits10(31);
}
|
||||
|
||||
// Write `ymd` as "[-]YYYY[Y]-MM-DD".  Like all helpers here, characters are
// emitted in reverse order (the cursor walks backwards through the buffer),
// so the day comes first and the year's sign last.
inline void FormatYYYY_MM_DD(arrow_vendored::date::year_month_day ymd, char** cursor) {
  FormatTwoDigits(static_cast<unsigned>(ymd.day()), cursor);
  FormatOneChar('-', cursor);
  FormatTwoDigits(static_cast<unsigned>(ymd.month()), cursor);
  FormatOneChar('-', cursor);
  auto year = static_cast<int>(ymd.year());
  const auto is_neg_year = year < 0;
  year = std::abs(year);
  assert(year <= 99999);
  // Always emit at least four year digits, plus a fifth when required.
  FormatTwoDigits(year % 100, cursor);
  year /= 100;
  FormatTwoDigits(year % 100, cursor);
  if (year >= 100) {
    FormatOneDigit(year / 100, cursor);
  }
  if (is_neg_year) {
    FormatOneChar('-', cursor);
  }
}
|
||||
|
||||
// Maximum buffer size needed for a "HH:MM:SS[.f...]" string at the given
// Duration resolution; the fractional part has Digits10(period::den) - 1
// digits (zero for seconds resolution).
template <typename Duration>
constexpr size_t BufferSizeHH_MM_SS() {
  // "23:59:59" ("." "9"+)?
  return detail::Digits10(23) + 1 + detail::Digits10(59) + 1 + detail::Digits10(59) + 1 +
         detail::Digits10(Duration::period::den) - 1;
}
|
||||
|
||||
// Write `hms` as "HH:MM:SS[.subseconds]".  Characters are emitted backwards
// (subseconds first, hours last) since the cursor walks towards the buffer
// start.
template <typename Duration>
void FormatHH_MM_SS(arrow_vendored::date::hh_mm_ss<Duration> hms, char** cursor) {
  constexpr size_t subsecond_digits = Digits10(Duration::period::den) - 1;
  if (subsecond_digits != 0) {
    // Zero-pad so the fractional part always has its full width.
    FormatAllDigitsLeftPadded(hms.subseconds().count(), subsecond_digits, '0', cursor);
    FormatOneChar('.', cursor);
  }
  FormatTwoDigits(hms.seconds().count(), cursor);
  FormatOneChar(':', cursor);
  FormatTwoDigits(hms.minutes().count(), cursor);
  FormatOneChar(':', cursor);
  FormatTwoDigits(hms.hours().count(), cursor);
}
|
||||
|
||||
// Some out-of-bound datetime values would result in erroneous printing
// because of silent integer wraparound in the `arrow_vendored::date` library.
//
// To avoid such misprinting, we must therefore check the bounds explicitly.
// The bounds correspond to start of year -32767 and end of year 32767,
// respectively (-32768 is an invalid year value in `arrow_vendored::date`).
//
// Note these values are the same as documented for C++20:
// https://en.cppreference.com/w/cpp/chrono/year_month_day/operator_days
template <typename Unit>
bool IsDateTimeInRange(Unit duration) {
  // Day counts for the year bounds described above.
  constexpr Unit kMinIncl =
      std::chrono::duration_cast<Unit>(arrow_vendored::date::days{-12687428});
  constexpr Unit kMaxExcl =
      std::chrono::duration_cast<Unit>(arrow_vendored::date::days{11248738});
  return duration >= kMinIncl && duration < kMaxExcl;
}

// IsDateTimeInRange() specialization for nanoseconds: a 64-bit number of
// nanoseconds cannot represent years outside of the [-32767, 32767]
// range, and the {kMinIncl, kMaxExcl} constants above would overflow.
constexpr bool IsDateTimeInRange(std::chrono::nanoseconds duration) { return true; }
|
||||
|
||||
// Check that `duration` falls within a single day, i.e. inside [0h, 24h).
template <typename Unit>
bool IsTimeInRange(Unit duration) {
  constexpr Unit kLowerIncl = std::chrono::duration_cast<Unit>(std::chrono::seconds{0});
  constexpr Unit kUpperExcl =
      std::chrono::duration_cast<Unit>(std::chrono::seconds{86400});
  return !(duration < kLowerIncl) && duration < kUpperExcl;
}
|
||||
|
||||
// Produce a placeholder string for raw values that cannot be printed as a
// valid datetime (see the range checks above) and pass it to `append`.
template <typename RawValue, typename Appender>
Return<Appender> FormatOutOfRange(RawValue&& raw_value, Appender&& append) {
  // XXX locale-sensitive but good enough for now
  std::string formatted = "<value out of range: " + ToChars(raw_value) + ">";
  return append(std::move(formatted));
}
|
||||
|
||||
// The Unix epoch (1970-01-01) as a `sys_days` time point.
const auto kEpoch = arrow_vendored::date::sys_days{arrow_vendored::date::jan / 1 / 1970};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Durations are formatted as their plain integer tick count.
template <>
class StringFormatter<DurationType> : public IntToStringFormatterMixin<DurationType> {
  using IntToStringFormatterMixin::IntToStringFormatterMixin;
};
|
||||
|
||||
/// Shared implementation for Date32/Date64 formatting: renders a count of
/// days since the Unix epoch as "YYYY-MM-DD".
class DateToStringFormatterMixin {
 public:
  explicit DateToStringFormatterMixin(const DataType* = NULLPTR) {}

 protected:
  template <typename Appender>
  Return<Appender> FormatDays(arrow_vendored::date::days since_epoch, Appender&& append) {
    arrow_vendored::date::sys_days timepoint_days{since_epoch};

    constexpr size_t buffer_size = detail::BufferSizeYYYY_MM_DD();

    std::array<char, buffer_size> buffer;
    // Formatting proceeds backwards from the end of the buffer.
    char* cursor = buffer.data() + buffer_size;

    detail::FormatYYYY_MM_DD(arrow_vendored::date::year_month_day{timepoint_days},
                             &cursor);
    return append(detail::ViewDigitBuffer(buffer, cursor));
  }
};
|
||||
|
||||
template <>
class StringFormatter<Date32Type> : public DateToStringFormatterMixin {
 public:
  using value_type = typename Date32Type::c_type;

  using DateToStringFormatterMixin::DateToStringFormatterMixin;

  // Format a days-since-epoch value as "YYYY-MM-DD", or emit an
  // "<value out of range: ...>" placeholder when it is unprintable.
  template <typename Appender>
  Return<Appender> operator()(value_type value, Appender&& append) {
    const auto since_epoch = arrow_vendored::date::days{value};
    if (!ARROW_PREDICT_TRUE(detail::IsDateTimeInRange(since_epoch))) {
      return detail::FormatOutOfRange(value, append);
    }
    return FormatDays(since_epoch, std::forward<Appender>(append));
  }
};
|
||||
|
||||
template <>
class StringFormatter<Date64Type> : public DateToStringFormatterMixin {
 public:
  using value_type = typename Date64Type::c_type;

  using DateToStringFormatterMixin::DateToStringFormatterMixin;

  // Format a milliseconds-since-epoch value as "YYYY-MM-DD".
  // NOTE(review): duration_cast truncates towards zero, so this assumes the
  // value is an exact multiple of 86'400'000 ms — otherwise pre-epoch values
  // would land on the wrong day.  Confirm callers guarantee this.
  template <typename Appender>
  Return<Appender> operator()(value_type value, Appender&& append) {
    const auto since_epoch = std::chrono::milliseconds{value};
    if (!ARROW_PREDICT_TRUE(detail::IsDateTimeInRange(since_epoch))) {
      return detail::FormatOutOfRange(value, append);
    }
    return FormatDays(std::chrono::duration_cast<arrow_vendored::date::days>(since_epoch),
                      std::forward<Appender>(append));
  }
};
|
||||
|
||||
/// Formats timestamps as "YYYY-MM-DD HH:MM:SS[.subseconds][Z]".
/// The 'Z' suffix is appended whenever the type carries a non-empty timezone
/// string (regardless of which zone it names).
template <>
class StringFormatter<TimestampType> {
 public:
  using value_type = int64_t;

  explicit StringFormatter(const DataType* type)
      : unit_(checked_cast<const TimestampType&>(*type).unit()),
        timezone_(checked_cast<const TimestampType&>(*type).timezone()) {}

  // Duration-typed overload: the first (unused) parameter selects the
  // resolution; `value` is the tick count since the Unix epoch.
  template <typename Duration, typename Appender>
  Return<Appender> operator()(Duration, value_type value, Appender&& append) {
    using arrow_vendored::date::days;

    const Duration since_epoch{value};
    if (!ARROW_PREDICT_TRUE(detail::IsDateTimeInRange(since_epoch))) {
      return detail::FormatOutOfRange(value, append);
    }

    const auto timepoint = detail::kEpoch + since_epoch;
    // Round days towards zero
    // (the naive approach of using arrow_vendored::date::floor() would
    // result in UB for very large negative timestamps, similarly as
    // https://github.com/HowardHinnant/date/issues/696)
    auto timepoint_days = std::chrono::time_point_cast<days>(timepoint);
    Duration since_midnight;
    if (timepoint_days <= timepoint) {
      // Year >= 1970
      since_midnight = timepoint - timepoint_days;
    } else {
      // Year < 1970: the cast rounded towards zero (up), so step back one day
      // and compute the positive offset from that midnight.
      since_midnight = days(1) - (timepoint_days - timepoint);
      timepoint_days -= days(1);
    }

    // YYYY_MM_DD " " HH_MM_SS "Z"?
    constexpr size_t buffer_size =
        detail::BufferSizeYYYY_MM_DD() + 1 + detail::BufferSizeHH_MM_SS<Duration>() + 1;

    std::array<char, buffer_size> buffer;
    char* cursor = buffer.data() + buffer_size;

    // Characters are emitted backwards, hence the reverse order below:
    // suffix, time, separator, date.
    if (timezone_.size() > 0) {
      detail::FormatOneChar('Z', &cursor);
    }
    detail::FormatHH_MM_SS(arrow_vendored::date::make_time(since_midnight), &cursor);
    detail::FormatOneChar(' ', &cursor);
    detail::FormatYYYY_MM_DD(timepoint_days, &cursor);
    return append(detail::ViewDigitBuffer(buffer, cursor));
  }

  template <typename Appender>
  Return<Appender> operator()(value_type value, Appender&& append) {
    // Dispatch to the Duration-typed overload matching this type's unit.
    return util::VisitDuration(unit_, *this, value, std::forward<Appender>(append));
  }

 private:
  TimeUnit::type unit_;
  std::string timezone_;
};
|
||||
|
||||
/// Formats time-of-day types (Time32/Time64) as "HH:MM:SS[.subseconds]".
template <typename T>
class StringFormatter<T, enable_if_time<T>> {
 public:
  using value_type = typename T::c_type;

  explicit StringFormatter(const DataType* type)
      : unit_(checked_cast<const T&>(*type).unit()) {}

  // Duration-typed overload: the first (unused) parameter selects the
  // resolution; `count` is the tick count since midnight.
  template <typename Duration, typename Appender>
  Return<Appender> operator()(Duration, value_type count, Appender&& append) {
    const Duration since_midnight{count};
    // Times must fall within a single day.
    if (!ARROW_PREDICT_TRUE(detail::IsTimeInRange(since_midnight))) {
      return detail::FormatOutOfRange(count, append);
    }

    constexpr size_t buffer_size = detail::BufferSizeHH_MM_SS<Duration>();

    std::array<char, buffer_size> buffer;
    // Formatting proceeds backwards from the end of the buffer.
    char* cursor = buffer.data() + buffer_size;

    detail::FormatHH_MM_SS(arrow_vendored::date::make_time(since_midnight), &cursor);
    return append(detail::ViewDigitBuffer(buffer, cursor));
  }

  template <typename Appender>
  Return<Appender> operator()(value_type value, Appender&& append) {
    // Dispatch to the Duration-typed overload matching this type's unit.
    return util::VisitDuration(unit_, *this, value, std::forward<Appender>(append));
  }

 private:
  TimeUnit::type unit_;
};
|
||||
|
||||
/// Formats month intervals as "<months>M" (e.g. "-2M").
template <>
class StringFormatter<MonthIntervalType> {
 public:
  using value_type = MonthIntervalType::c_type;

  explicit StringFormatter(const DataType*) {}

  template <typename Appender>
  Return<Appender> operator()(value_type interval, Appender&& append) {
    constexpr size_t buffer_size =
        /*'m'*/ 3 + /*negative signs*/ 1 +
        /*months*/ detail::Digits10(std::numeric_limits<value_type>::max());
    std::array<char, buffer_size> buffer;
    char* cursor = buffer.data() + buffer_size;

    // Emitted backwards: unit suffix first, sign last.
    detail::FormatOneChar('M', &cursor);
    detail::FormatAllDigits(detail::Abs(interval), &cursor);
    if (interval < 0) detail::FormatOneChar('-', &cursor);

    return append(detail::ViewDigitBuffer(buffer, cursor));
  }
};
|
||||
|
||||
/// Formats day-time intervals as "<days>d<milliseconds>ms"
/// (e.g. "1d-30ms"), with each component signed independently.
template <>
class StringFormatter<DayTimeIntervalType> {
 public:
  using value_type = DayTimeIntervalType::DayMilliseconds;

  explicit StringFormatter(const DataType*) {}

  template <typename Appender>
  Return<Appender> operator()(value_type interval, Appender&& append) {
    constexpr size_t buffer_size =
        /*d, ms*/ 3 + /*negative signs*/ 2 +
        /*days/milliseconds*/ 2 * detail::Digits10(std::numeric_limits<int32_t>::max());
    std::array<char, buffer_size> buffer;
    char* cursor = buffer.data() + buffer_size;

    // Emitted backwards: the "ms" component first, the "d" component last.
    detail::FormatOneChar('s', &cursor);
    detail::FormatOneChar('m', &cursor);
    detail::FormatAllDigits(detail::Abs(interval.milliseconds), &cursor);
    if (interval.milliseconds < 0) detail::FormatOneChar('-', &cursor);

    detail::FormatOneChar('d', &cursor);
    detail::FormatAllDigits(detail::Abs(interval.days), &cursor);
    if (interval.days < 0) detail::FormatOneChar('-', &cursor);

    return append(detail::ViewDigitBuffer(buffer, cursor));
  }
};
|
||||
|
||||
/// Formats month-day-nano intervals as "<months>M<days>d<nanoseconds>ns",
/// with each component signed independently.
template <>
class StringFormatter<MonthDayNanoIntervalType> {
 public:
  using value_type = MonthDayNanoIntervalType::MonthDayNanos;

  explicit StringFormatter(const DataType*) {}

  template <typename Appender>
  Return<Appender> operator()(value_type interval, Appender&& append) {
    constexpr size_t buffer_size =
        /*m, d, ns*/ 4 + /*negative signs*/ 3 +
        /*months/days*/ 2 * detail::Digits10(std::numeric_limits<int32_t>::max()) +
        /*nanoseconds*/ detail::Digits10(std::numeric_limits<int64_t>::max());
    std::array<char, buffer_size> buffer;
    char* cursor = buffer.data() + buffer_size;

    // Emitted backwards: "ns" component first, then "d", then "M".
    detail::FormatOneChar('s', &cursor);
    detail::FormatOneChar('n', &cursor);
    detail::FormatAllDigits(detail::Abs(interval.nanoseconds), &cursor);
    if (interval.nanoseconds < 0) detail::FormatOneChar('-', &cursor);

    detail::FormatOneChar('d', &cursor);
    detail::FormatAllDigits(detail::Abs(interval.days), &cursor);
    if (interval.days < 0) detail::FormatOneChar('-', &cursor);

    detail::FormatOneChar('M', &cursor);
    detail::FormatAllDigits(detail::Abs(interval.months), &cursor);
    if (interval.months < 0) detail::FormatOneChar('-', &cursor);

    return append(detail::ViewDigitBuffer(buffer, cursor));
  }
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,160 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <tuple>
|
||||
#include <type_traits>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
/// Unit type used as the value of an empty Future<>; converting a Status into
/// Result<Empty> lets empty and non-empty futures share one code path.
struct Empty {
  static Result<Empty> ToResult(Status s) {
    if (ARROW_PREDICT_TRUE(s.ok())) {
      return Empty{};
    }
    return s;
  }
};
|
||||
|
||||
/// Helper struct for examining lambdas and other callables.
/// TODO(ARROW-12655) support function pointers
struct call_traits {
 public:
  // Overload-resolution probes: a callable counts as "overloaded" when
  // `&F::operator()` cannot be taken unambiguously (multiple operator()
  // overloads, or a template operator()), in which case only the variadic
  // catch-all below is viable.
  template <typename R, typename... A>
  static std::false_type is_overloaded_impl(R(A...));

  template <typename F>
  static std::false_type is_overloaded_impl(decltype(&F::operator())*);

  template <typename F>
  static std::true_type is_overloaded_impl(...);

  // Probes extracting the return type from a member call operator
  // (non-const and const qualified forms).
  template <typename F, typename R, typename... A>
  static R return_type_impl(R (F::*)(A...));

  template <typename F, typename R, typename... A>
  static R return_type_impl(R (F::*)(A...) const);

  // Probes extracting the I-th parameter type of the call operator.
  template <std::size_t I, typename F, typename R, typename... A>
  static typename std::tuple_element<I, std::tuple<A...>>::type argument_type_impl(
      R (F::*)(A...));

  template <std::size_t I, typename F, typename R, typename... A>
  static typename std::tuple_element<I, std::tuple<A...>>::type argument_type_impl(
      R (F::*)(A...) const);

  template <std::size_t I, typename F, typename R, typename... A>
  static typename std::tuple_element<I, std::tuple<A...>>::type argument_type_impl(
      R (F::*)(A...) &&);

  // Probes counting the call operator's parameters.
  template <typename F, typename R, typename... A>
  static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...));

  template <typename F, typename R, typename... A>
  static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...)
                                                                           const);

  template <typename F, typename R, typename... A>
  static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...) &&);

  /// bool constant indicating whether F is a callable with more than one possible
  /// signature. Will be true_type for objects which define multiple operator() or which
  /// define a template operator()
  template <typename F>
  using is_overloaded =
      decltype(is_overloaded_impl<typename std::decay<F>::type>(NULLPTR));

  template <typename F, typename T = void>
  using enable_if_overloaded = typename std::enable_if<is_overloaded<F>::value, T>::type;

  template <typename F, typename T = void>
  using disable_if_overloaded =
      typename std::enable_if<!is_overloaded<F>::value, T>::type;

  /// If F is not overloaded, the argument types of its call operator can be
  /// extracted via call_traits::argument_type<Index, F>
  template <std::size_t I, typename F>
  using argument_type = decltype(argument_type_impl<I>(&std::decay<F>::type::operator()));

  template <typename F>
  using argument_count = decltype(argument_count_impl(&std::decay<F>::type::operator()));

  template <typename F>
  using return_type = decltype(return_type_impl(&std::decay<F>::type::operator()));

  // NOTE(review): unlike the aliases above, this aliases the enable_if struct
  // itself (no trailing `::type`), so users must spell
  // `enable_if_return<F, T>::type` — confirm the asymmetry is intentional.
  template <typename F, typename T, typename RT = T>
  using enable_if_return =
      typename std::enable_if<std::is_same<return_type<F>, T>::value, RT>;

  template <typename T, typename R = void>
  using enable_if_empty = typename std::enable_if<std::is_same<T, Empty>::value, R>::type;

  template <typename T, typename R = void>
  using enable_if_not_empty =
      typename std::enable_if<!std::is_same<T, Empty>::value, R>::type;
};
|
||||
|
||||
/// A type erased callable object which may only be invoked once.
/// It can be constructed from any lambda which matches the provided call signature.
/// Invoking it results in destruction of the lambda, freeing any state/references
/// immediately. Invoking a default constructed FnOnce or one which has already been
/// invoked will segfault.
template <typename Signature>
class FnOnce;

template <typename R, typename... A>
class FnOnce<R(A...)> {
 public:
  FnOnce() = default;

  // Implicit conversion from any callable whose result is convertible to R.
  template <typename Fn,
            typename = typename std::enable_if<std::is_convertible<
                decltype(std::declval<Fn&&>()(std::declval<A>()...)), R>::value>::type>
  FnOnce(Fn fn) : impl_(new FnImpl<Fn>(std::move(fn))) {  // NOLINT runtime/explicit
  }

  // True when this FnOnce still holds a callable (not default-constructed,
  // not yet invoked).
  explicit operator bool() const { return impl_ != NULLPTR; }

  // Rvalue-qualified: invocation consumes the callable.  impl_ is moved to a
  // local so the wrapped lambda (and its captures) is destroyed on return.
  R operator()(A... a) && {
    auto bye = std::move(impl_);
    return bye->invoke(std::forward<A&&>(a)...);
  }

 private:
  // Type-erasure interface.
  struct Impl {
    virtual ~Impl() = default;
    virtual R invoke(A&&... a) = 0;
  };

  template <typename Fn>
  struct FnImpl : Impl {
    explicit FnImpl(Fn fn) : fn_(std::move(fn)) {}
    // Invoke the stored callable as an rvalue (single-use semantics).
    R invoke(A&&... a) override { return std::move(fn_)(std::forward<A&&>(a)...); }
    Fn fn_;
  };

  std::unique_ptr<Impl> impl_;
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,882 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/type_traits.h"
|
||||
#include "arrow/util/config.h"
|
||||
#include "arrow/util/functional.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/tracing.h"
|
||||
#include "arrow/util/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
// Metafunction mapping a type to its Future wrapper (defined elsewhere).
template <typename>
struct EnsureFuture;

namespace detail {

// Trait: true iff T is some Future<...> instantiation.
template <typename>
struct is_future : std::false_type {};

template <typename T>
struct is_future<Future<T>> : std::true_type {};

// SFINAE-friendly result-of: `result_of<Fn(Args...)>::type` exists only when
// the call expression Fn(Args...) is well-formed.
template <typename Signature, typename Enable = void>
struct result_of;

template <typename Fn, typename... A>
struct result_of<Fn(A...),
                 internal::void_t<decltype(std::declval<Fn>()(std::declval<A>()...))>> {
  using type = decltype(std::declval<Fn>()(std::declval<A>()...));
};

template <typename Signature>
using result_of_t = typename result_of<Signature>::type;
|
||||
|
||||
// Helper to find the synchronous counterpart for a Future
template <typename T>
struct SyncType {
  using type = Result<T>;
};

// An empty future's synchronous counterpart is a plain Status.
template <>
struct SyncType<internal::Empty> {
  using type = Status;
};

// Whether Fn's first parameter (after decay) is a Status.
template <typename Fn>
using first_arg_is_status =
    std::is_same<typename std::decay<internal::call_traits::argument_type<0, Fn>>::type,
                 Status>;

// Selects `Then` when Fn takes no arguments, `Else` otherwise.
template <typename Fn, typename Then, typename Else,
          typename Count = internal::call_traits::argument_count<Fn>>
using if_has_no_args = typename std::conditional<Count::value == 0, Then, Else>::type;
|
||||
|
||||
/// Creates a callback that can be added to a future to mark a `dest` future finished
// (operator() is &&-qualified: each callback is invoked at most once).
template <typename Source, typename Dest, bool SourceEmpty = Source::is_empty,
          bool DestEmpty = Dest::is_empty>
struct MarkNextFinished {};

/// If the source and dest are both empty we can pass on the status
template <typename Source, typename Dest>
struct MarkNextFinished<Source, Dest, true, true> {
  void operator()(const Status& status) && { next.MarkFinished(status); }
  Dest next;
};

/// If the source is not empty but the dest is then we can take the
/// status out of the result
template <typename Source, typename Dest>
struct MarkNextFinished<Source, Dest, false, true> {
  void operator()(const Result<typename Source::ValueType>& res) && {
    next.MarkFinished(internal::Empty::ToResult(res.status()));
  }
  Dest next;
};

/// If neither are empty we pass on the result
template <typename Source, typename Dest>
struct MarkNextFinished<Source, Dest, false, false> {
  void operator()(const Result<typename Source::ValueType>& res) && {
    next.MarkFinished(res);
  }
  Dest next;
};
|
||||
|
||||
/// Helper that contains information about how to apply a continuation
// The operator() overloads below are mutually exclusive via enable_if,
// selected on the continuation's return type (void, plain value/Status,
// Result, or Future).
struct ContinueFuture {
  template <typename Return>
  struct ForReturnImpl;

  template <typename Return>
  using ForReturn = typename ForReturnImpl<Return>::type;

  template <typename Signature>
  using ForSignature = ForReturn<result_of_t<Signature>>;

  // If the callback returns void then we return Future<> that always finishes OK.
  template <typename ContinueFunc, typename... Args,
            typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
            typename NextFuture = ForReturn<ContinueResult>>
  typename std::enable_if<std::is_void<ContinueResult>::value>::type operator()(
      NextFuture next, ContinueFunc&& f, Args&&... a) const {
    std::forward<ContinueFunc>(f)(std::forward<Args>(a)...);
    next.MarkFinished();
  }

  /// If the callback returns a non-future then we return Future<T>
  /// and mark the future finished with the callback result. It will get promoted
  /// to Result<T> as part of MarkFinished if it isn't already.
  ///
  /// If the callback returns Status and we return Future<> then also send the callback
  /// result as-is to the destination future.
  template <typename ContinueFunc, typename... Args,
            typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
            typename NextFuture = ForReturn<ContinueResult>>
  typename std::enable_if<
      !std::is_void<ContinueResult>::value && !is_future<ContinueResult>::value &&
      (!NextFuture::is_empty || std::is_same<ContinueResult, Status>::value)>::type
  operator()(NextFuture next, ContinueFunc&& f, Args&&... a) const {
    next.MarkFinished(std::forward<ContinueFunc>(f)(std::forward<Args>(a)...));
  }

  /// If the callback returns a Result and the next future is Future<> then we mark
  /// the future finished with the callback result.
  ///
  /// It may seem odd that the next future is Future<> when the callback returns a
  /// result but this can occur if the OnFailure callback returns a result while the
  /// OnSuccess callback is void/Status (e.g. you would get this calling the one-arg
  /// version of Then with an OnSuccess callback that returns void)
  template <typename ContinueFunc, typename... Args,
            typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
            typename NextFuture = ForReturn<ContinueResult>>
  typename std::enable_if<!std::is_void<ContinueResult>::value &&
                          !is_future<ContinueResult>::value && NextFuture::is_empty &&
                          !std::is_same<ContinueResult, Status>::value>::type
  operator()(NextFuture next, ContinueFunc&& f, Args&&... a) const {
    // Only the error (if any) survives; the Result's value part is dropped.
    next.MarkFinished(std::forward<ContinueFunc>(f)(std::forward<Args>(a)...).status());
  }

  /// If the callback returns a Future<T> then we return Future<T>. We create a new
  /// future and add a callback to the future given to us by the user that forwards the
  /// result to the future we just created
  template <typename ContinueFunc, typename... Args,
            typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
            typename NextFuture = ForReturn<ContinueResult>>
  typename std::enable_if<is_future<ContinueResult>::value>::type operator()(
      NextFuture next, ContinueFunc&& f, Args&&... a) const {
    ContinueResult signal_to_complete_next =
        std::forward<ContinueFunc>(f)(std::forward<Args>(a)...);
    MarkNextFinished<ContinueResult, NextFuture> callback{std::move(next)};
    signal_to_complete_next.AddCallback(std::move(callback));
  }

  /// Helpers to conditionally ignore arguments to ContinueFunc
  template <typename ContinueFunc, typename NextFuture, typename... Args>
  void IgnoringArgsIf(std::true_type, NextFuture&& next, ContinueFunc&& f,
                      Args&&...) const {
    operator()(std::forward<NextFuture>(next), std::forward<ContinueFunc>(f));
  }
  template <typename ContinueFunc, typename NextFuture, typename... Args>
  void IgnoringArgsIf(std::false_type, NextFuture&& next, ContinueFunc&& f,
                      Args&&... a) const {
    operator()(std::forward<NextFuture>(next), std::forward<ContinueFunc>(f),
               std::forward<Args>(a)...);
  }
};
|
||||
|
||||
/// Helper struct which tells us what kind of Future gets returned from `Then` based on
/// the return type of the OnSuccess callback

// void -> Future<> (completion carries only a Status)
template <>
struct ContinueFuture::ForReturnImpl<void> {
  using type = Future<>;
};

// Status -> Future<> (the Status becomes the future's completion status)
template <>
struct ContinueFuture::ForReturnImpl<Status> {
  using type = Future<>;
};

// Primary template: any other plain value R -> Future<R>
template <typename R>
struct ContinueFuture::ForReturnImpl {
  using type = Future<R>;
};

// Result<T> -> Future<T> (an error Result fails the future)
template <typename T>
struct ContinueFuture::ForReturnImpl<Result<T>> {
  using type = Future<T>;
};

// Future<T> -> Future<T> (flattened; never Future<Future<T>>)
template <typename T>
struct ContinueFuture::ForReturnImpl<Future<T>> {
  using type = Future<T>;
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// A Future's execution or completion status
enum class FutureState : int8_t { PENDING, SUCCESS, FAILURE };

/// \brief Whether a FutureState is terminal (SUCCESS or FAILURE).
///
/// PENDING is the only non-terminal state, so anything else counts as finished.
inline bool IsFutureFinished(FutureState state) {
  return state != FutureState::PENDING;
}
|
||||
|
||||
/// \brief Describe whether the callback should be scheduled or run synchronously
enum class ShouldSchedule {
  /// Always run the callback synchronously (the default)
  Never = 0,
  /// Schedule a new task only if the future is not finished when the
  /// callback is added
  IfUnfinished = 1,
  /// Always schedule the callback as a new task
  Always = 2,
  /// Schedule a new task only if it would run on an executor other than
  /// the specified executor (see CallbackOptions::executor).
  IfDifferentExecutor = 3,
};
|
||||
|
||||
/// \brief Options that control how a continuation is run
struct CallbackOptions {
  /// Describe whether the callback should be run synchronously or scheduled
  ShouldSchedule should_schedule = ShouldSchedule::Never;
  /// If the callback is scheduled then this is the executor it should be scheduled
  /// on. If this is NULL then should_schedule must be Never
  internal::Executor* executor = NULLPTR;

  /// Default options: run the callback synchronously, no executor
  static CallbackOptions Defaults() { return {}; }
};
|
||||
|
||||
// Untyped private implementation shared by all Future<T> instantiations.
// Holds the state machine, the type-erased result storage, and the callback list.
class ARROW_EXPORT FutureImpl : public std::enable_shared_from_this<FutureImpl> {
 public:
  FutureImpl();
  virtual ~FutureImpl() = default;

  // Current state; PENDING is only indicative since the future may complete
  // concurrently (the load is atomic)
  FutureState state() { return state_.load(); }

  // Create a new pending impl
  static std::unique_ptr<FutureImpl> Make();
  // Create an impl already in the given terminal state
  static std::unique_ptr<FutureImpl> MakeFinished(FutureState state);

#ifdef ARROW_WITH_OPENTELEMETRY
  void SetSpan(util::tracing::Span* span) { span_ = span; }
#endif

  // Future API
  void MarkFinished();
  void MarkFailed();
  void Wait();
  bool Wait(double seconds);
  // Reinterpret the type-erased result storage as a Result<ValueType>.
  // The caller must know the true stored type; no runtime check is performed.
  template <typename ValueType>
  Result<ValueType>* CastResult() const {
    return static_cast<Result<ValueType>*>(result_.get());
  }

  using Callback = internal::FnOnce<void(const FutureImpl& impl)>;
  // Register a callback; may run it immediately if already finished
  void AddCallback(Callback callback, CallbackOptions opts);
  // Register a callback only if the future is not yet finished; returns false
  // (without invoking callback_factory) when it is already finished
  bool TryAddCallback(const std::function<Callback()>& callback_factory,
                      CallbackOptions opts);

  std::atomic<FutureState> state_{FutureState::PENDING};

  // Type erased storage for arbitrary results
  // XXX small objects could be stored inline instead of boxed in a pointer
  using Storage = std::unique_ptr<void, void (*)(void*)>;
  Storage result_{NULLPTR, NULLPTR};

  // A registered callback together with the options it was registered with
  struct CallbackRecord {
    Callback callback;
    CallbackOptions options;
  };
  std::vector<CallbackRecord> callbacks_;
#ifdef ARROW_WITH_OPENTELEMETRY
  util::tracing::Span* span_ = NULLPTR;
#endif
};
|
||||
|
||||
// ---------------------------------------------------------------------
|
||||
// Public API
|
||||
|
||||
/// \brief EXPERIMENTAL A std::future-like class with more functionality.
|
||||
///
|
||||
/// A Future represents the results of a past or future computation.
|
||||
/// The Future API has two sides: a producer side and a consumer side.
|
||||
///
|
||||
/// The producer API allows creating a Future and setting its result or
|
||||
/// status, possibly after running a computation function.
|
||||
///
|
||||
/// The consumer API allows querying a Future's current state, wait for it
|
||||
/// to complete, and composing futures with callbacks.
|
||||
template <typename T>
class [[nodiscard]] Future {
 public:
  using ValueType = T;
  using SyncType = typename detail::SyncType<T>::type;
  static constexpr bool is_empty = std::is_same<T, internal::Empty>::value;
  // The default constructor creates an invalid Future. Use Future::Make()
  // for a valid Future. This constructor is mostly for the convenience
  // of being able to presize a vector of Futures.
  Future() = default;

#ifdef ARROW_WITH_OPENTELEMETRY
  void SetSpan(util::tracing::Span* span) { impl_->SetSpan(span); }
#endif

  // Consumer API

  // Whether this Future is backed by an implementation (i.e. not default-constructed)
  bool is_valid() const { return impl_ != NULLPTR; }

  /// \brief Return the Future's current state
  ///
  /// A return value of PENDING is only indicative, as the Future can complete
  /// concurrently. A return value of FAILURE or SUCCESS is definitive, though.
  FutureState state() const {
    CheckValid();
    return impl_->state();
  }

  /// \brief Whether the Future is finished
  ///
  /// A false return value is only indicative, as the Future can complete
  /// concurrently. A true return value is definitive, though.
  bool is_finished() const {
    CheckValid();
    return IsFutureFinished(impl_->state());
  }

  /// \brief Wait for the Future to complete and return its Result
  const Result<ValueType>& result() const& {
    Wait();
    return *GetResult();
  }

  /// \brief Returns an rvalue to the result. This method is potentially unsafe
  ///
  /// The future is not the unique owner of the result, copies of a future will
  /// also point to the same result. You must make sure that no other copies
  /// of the future exist. Attempts to add callbacks after you move the result
  /// will result in undefined behavior.
  Result<ValueType>&& MoveResult() {
    Wait();
    return std::move(*GetResult());
  }

  /// \brief Wait for the Future to complete and return its Status
  const Status& status() const { return result().status(); }

  /// \brief Future<T> is convertible to Future<>, which views only the
  /// Status of the original. Marking the returned Future Finished is not supported.
  explicit operator Future<>() const {
    // Both futures share the same impl; only the typed view differs
    Future<> status_future;
    status_future.impl_ = impl_;
    return status_future;
  }

  /// \brief Wait for the Future to complete
  void Wait() const {
    CheckValid();
    impl_->Wait();
  }

  /// \brief Wait for the Future to complete, or for the timeout to expire
  ///
  /// `true` is returned if the Future completed, `false` if the timeout expired.
  /// Note a `false` value is only indicative, as the Future can complete
  /// concurrently.
  bool Wait(double seconds) const {
    CheckValid();
    return impl_->Wait(seconds);
  }

  // Producer API

  /// \brief Producer API: mark Future finished
  ///
  /// The Future's result is set to `res`.
  void MarkFinished(Result<ValueType> res) { DoMarkFinished(std::move(res)); }

  /// \brief Mark a Future<> completed with the provided Status.
  template <typename E = ValueType, typename = typename std::enable_if<
                                        std::is_same<E, internal::Empty>::value>::type>
  void MarkFinished(Status s = Status::OK()) {
    return DoMarkFinished(E::ToResult(std::move(s)));
  }

  /// \brief Producer API: instantiate a valid Future
  ///
  /// The Future's state is initialized with PENDING. If you are creating a future with
  /// this method you must ensure that future is eventually completed (with success or
  /// failure). Creating a future, returning it, and never completing the future can lead
  /// to memory leaks (for example, see Loop).
  static Future Make() {
    Future fut;
    fut.impl_ = FutureImpl::Make();
    return fut;
  }

  /// \brief Producer API: instantiate a finished Future
  static Future<ValueType> MakeFinished(Result<ValueType> res) {
    Future<ValueType> fut;
    fut.InitializeFromResult(std::move(res));
    return fut;
  }

  /// \brief Make a finished Future<> with the provided Status.
  template <typename E = ValueType, typename = typename std::enable_if<
                                        std::is_same<E, internal::Empty>::value>::type>
  static Future<> MakeFinished(Status s = Status::OK()) {
    return MakeFinished(E::ToResult(std::move(s)));
  }

  // Adapter invoking an OnComplete callback with the full Result<ValueType>
  struct WrapResultOnComplete {
    template <typename OnComplete>
    struct Callback {
      void operator()(const FutureImpl& impl) && {
        std::move(on_complete)(*impl.CastResult<ValueType>());
      }
      OnComplete on_complete;
    };
  };

  // Adapter invoking an OnComplete callback with only the Status (Future<> only)
  struct WrapStatusyOnComplete {
    template <typename OnComplete>
    struct Callback {
      static_assert(std::is_same<internal::Empty, ValueType>::value,
                    "Only callbacks for Future<> should accept Status and not Result");

      void operator()(const FutureImpl& impl) && {
        std::move(on_complete)(impl.CastResult<ValueType>()->status());
      }
      OnComplete on_complete;
    };
  };

  // Select the adapter above based on whether OnComplete's first argument is a Status
  template <typename OnComplete>
  using WrapOnComplete = typename std::conditional<
      detail::first_arg_is_status<OnComplete>::value, WrapStatusyOnComplete,
      WrapResultOnComplete>::type::template Callback<OnComplete>;

  /// \brief Consumer API: Register a callback to run when this future completes
  ///
  /// The callback should receive the result of the future (const Result<T>&)
  /// For a void or statusy future this should be (const Status&)
  ///
  /// There is no guarantee to the order in which callbacks will run. In
  /// particular, callbacks added while the future is being marked complete
  /// may be executed immediately, ahead of, or even the same time as, other
  /// callbacks that have been previously added.
  ///
  /// WARNING: callbacks may hold arbitrary references, including cyclic references.
  /// Since callbacks will only be destroyed after they are invoked, this can lead to
  /// memory leaks if a Future is never marked finished (abandoned):
  ///
  /// {
  ///     auto fut = Future<>::Make();
  ///     fut.AddCallback([fut]() {});
  /// }
  ///
  /// In this example `fut` falls out of scope but is not destroyed because it holds a
  /// cyclic reference to itself through the callback.
  template <typename OnComplete, typename Callback = WrapOnComplete<OnComplete>>
  void AddCallback(OnComplete on_complete,
                   CallbackOptions opts = CallbackOptions::Defaults()) const {
    // We know impl_ will not be dangling when invoking callbacks because at least one
    // thread will be waiting for MarkFinished to return. Thus it's safe to keep a
    // weak reference to impl_ here
    impl_->AddCallback(Callback{std::move(on_complete)}, opts);
  }

  /// \brief Overload of AddCallback that will return false instead of running
  /// synchronously
  ///
  /// This overload will guarantee the callback is never run synchronously. If the future
  /// is already finished then it will simply return false. This can be useful to avoid
  /// stack overflow in a situation where you have recursive Futures. For an example
  /// see the Loop function
  ///
  /// Takes in a callback factory function to allow moving callbacks (the factory function
  /// will only be called if the callback can successfully be added)
  ///
  /// Returns true if a callback was actually added and false if the callback failed
  /// to add because the future was marked complete.
  template <typename CallbackFactory,
            typename OnComplete = detail::result_of_t<CallbackFactory()>,
            typename Callback = WrapOnComplete<OnComplete>>
  bool TryAddCallback(CallbackFactory callback_factory,
                      CallbackOptions opts = CallbackOptions::Defaults()) const {
    return impl_->TryAddCallback([&]() { return Callback{callback_factory()}; }, opts);
  }

  // Completion handler used by Then: dispatches to OnSuccess or OnFailure and
  // forwards their return into `next` via detail::ContinueFuture
  template <typename OnSuccess, typename OnFailure>
  struct ThenOnComplete {
    static constexpr bool has_no_args =
        internal::call_traits::argument_count<OnSuccess>::value == 0;

    using ContinuedFuture = detail::ContinueFuture::ForSignature<
        detail::if_has_no_args<OnSuccess, OnSuccess && (), OnSuccess && (const T&)>>;

    static_assert(
        std::is_same<detail::ContinueFuture::ForSignature<OnFailure && (const Status&)>,
                     ContinuedFuture>::value,
        "OnSuccess and OnFailure must continue with the same future type");

    struct DummyOnSuccess {
      void operator()(const T&);
    };
    using OnSuccessArg = typename std::decay<internal::call_traits::argument_type<
        0, detail::if_has_no_args<OnSuccess, DummyOnSuccess, OnSuccess>>>::type;

    static_assert(
        !std::is_same<OnSuccessArg, typename EnsureResult<OnSuccessArg>::type>::value,
        "OnSuccess' argument should not be a Result");

    void operator()(const Result<T>& result) && {
      detail::ContinueFuture continue_future;
      if (ARROW_PREDICT_TRUE(result.ok())) {
        // move on_failure to a(n immediately destroyed) temporary to free its resources
        ARROW_UNUSED(OnFailure(std::move(on_failure)));
        continue_future.IgnoringArgsIf(
            detail::if_has_no_args<OnSuccess, std::true_type, std::false_type>{},
            std::move(next), std::move(on_success), result.ValueOrDie());
      } else {
        // Likewise free on_success's resources before running the failure path
        ARROW_UNUSED(OnSuccess(std::move(on_success)));
        continue_future(std::move(next), std::move(on_failure), result.status());
      }
    }

    OnSuccess on_success;
    OnFailure on_failure;
    ContinuedFuture next;
  };

  // Default OnFailure for the one-callback Then overload: propagate the error
  template <typename OnSuccess>
  struct PassthruOnFailure {
    using ContinuedFuture = detail::ContinueFuture::ForSignature<
        detail::if_has_no_args<OnSuccess, OnSuccess && (), OnSuccess && (const T&)>>;

    Result<typename ContinuedFuture::ValueType> operator()(const Status& s) { return s; }
  };

  /// \brief Consumer API: Register a continuation to run when this future completes
  ///
  /// The continuation will run in the same thread that called MarkFinished (whatever
  /// callback is registered with this function will run before MarkFinished returns).
  /// Avoid long-running callbacks in favor of submitting a task to an Executor and
  /// returning the future.
  ///
  /// Two callbacks are supported:
  /// - OnSuccess, called with the result (const ValueType&) on successful completion.
  ///              for an empty future this will be called with nothing ()
  /// - OnFailure, called with the error (const Status&) on failed completion.
  ///              This callback is optional and defaults to a passthru of any errors.
  ///
  /// Then() returns a Future whose ValueType is derived from the return type of the
  /// callbacks. If a callback returns:
  /// - void, a Future<> will be returned which will completes successfully as soon
  ///   as the callback runs.
  /// - Status, a Future<> will be returned which will complete with the returned Status
  ///   as soon as the callback runs.
  /// - V or Result<V>, a Future<V> will be returned which will complete with the result
  ///   of invoking the callback as soon as the callback runs.
  /// - Future<V>, a Future<V> will be returned which will be marked complete when the
  ///   future returned by the callback completes (and will complete with the same
  ///   result).
  ///
  /// The continued Future type must be the same for both callbacks.
  ///
  /// Note that OnFailure can swallow errors, allowing continued Futures to successfully
  /// complete even if this Future fails.
  ///
  /// If this future is already completed then the callback will be run immediately
  /// and the returned future may already be marked complete.
  ///
  /// See AddCallback for general considerations when writing callbacks.
  template <typename OnSuccess, typename OnFailure = PassthruOnFailure<OnSuccess>,
            typename OnComplete = ThenOnComplete<OnSuccess, OnFailure>,
            typename ContinuedFuture = typename OnComplete::ContinuedFuture>
  ContinuedFuture Then(OnSuccess on_success, OnFailure on_failure = {},
                       CallbackOptions options = CallbackOptions::Defaults()) const {
    auto next = ContinuedFuture::Make();
    AddCallback(OnComplete{std::forward<OnSuccess>(on_success),
                           std::forward<OnFailure>(on_failure), next},
                options);
    return next;
  }

  /// \brief Implicit constructor to create a finished future from a value
  Future(ValueType val) : Future() {  // NOLINT runtime/explicit
    impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
    SetResult(std::move(val));
  }

  /// \brief Implicit constructor to create a future from a Result, enabling use
  ///     of macros like ARROW_ASSIGN_OR_RAISE.
  Future(Result<ValueType> res) : Future() {  // NOLINT runtime/explicit
    if (ARROW_PREDICT_TRUE(res.ok())) {
      impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
    } else {
      impl_ = FutureImpl::MakeFinished(FutureState::FAILURE);
    }
    SetResult(std::move(res));
  }

  /// \brief Implicit constructor to create a future from a Status, enabling use
  ///     of macros like ARROW_RETURN_NOT_OK.
  Future(Status s)  // NOLINT runtime/explicit
      : Future(Result<ValueType>(std::move(s))) {}

 protected:
  // Install a result into a fresh impl whose terminal state matches res.ok()
  void InitializeFromResult(Result<ValueType> res) {
    if (ARROW_PREDICT_TRUE(res.ok())) {
      impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
    } else {
      impl_ = FutureImpl::MakeFinished(FutureState::FAILURE);
    }
    SetResult(std::move(res));
  }

  // Allocate a fresh pending impl
  void Initialize() { impl_ = FutureImpl::Make(); }

  // Typed view of the impl's type-erased result storage
  Result<ValueType>* GetResult() const { return impl_->CastResult<ValueType>(); }

  // Box the Result into the impl's type-erased storage with a typed deleter
  void SetResult(Result<ValueType> res) {
    impl_->result_ = {new Result<ValueType>(std::move(res)),
                      [](void* p) { delete static_cast<Result<ValueType>*>(p); }};
  }

  // Store the result, then flip the state machine (which also runs callbacks).
  // The result must be set before the state changes so callbacks can read it.
  void DoMarkFinished(Result<ValueType> res) {
    SetResult(std::move(res));

    if (ARROW_PREDICT_TRUE(GetResult()->ok())) {
      impl_->MarkFinished();
    } else {
      impl_->MarkFailed();
    }
  }

  // Abort (debug builds only) when called on a default-constructed Future
  void CheckValid() const {
#ifndef NDEBUG
    if (!is_valid()) {
      Status::Invalid("Invalid Future (default-initialized?)").Abort();
    }
#endif
  }

  explicit Future(std::shared_ptr<FutureImpl> impl) : impl_(std::move(impl)) {}

  // Shared untyped implementation; NULLPTR for an invalid Future
  std::shared_ptr<FutureImpl> impl_;

  friend struct detail::ContinueFuture;

  template <typename U>
  friend class Future;
  friend class WeakFuture<T>;

  FRIEND_TEST(FutureRefTest, ChainRemoved);
  FRIEND_TEST(FutureRefTest, TailRemoved);
  FRIEND_TEST(FutureRefTest, HeadRemoved);
};
|
||||
|
||||
/// \brief Block until the future completes and return its synchronous
/// representation (SyncType)
template <typename T>
typename Future<T>::SyncType FutureToSync(const Future<T>& fut) {
  return fut.result();
}

/// Specialization for Future<>: the synchronous representation is just the Status
template <>
inline typename Future<internal::Empty>::SyncType FutureToSync<internal::Empty>(
    const Future<internal::Empty>& fut) {
  return fut.status();
}

// Out-of-class definition of Future<>'s Status constructor: wrap the Status in
// an Empty Result so the Result constructor path is reused
template <>
inline Future<>::Future(Status s) : Future(internal::Empty::ToResult(std::move(s))) {}
|
||||
|
||||
/// \brief A non-owning handle to a Future
///
/// Holds a weak reference to the future's implementation, so it does not keep
/// an abandoned future (or its callbacks) alive.
template <typename T>
class WeakFuture {
 public:
  explicit WeakFuture(const Future<T>& future) : impl_(future.impl_) {}

  // Returns the original future, or an invalid Future if the implementation
  // has already been destroyed (lock() returns null in that case)
  Future<T> get() { return Future<T>{impl_.lock()}; }

 private:
  std::weak_ptr<FutureImpl> impl_;
};
|
||||
|
||||
/// \defgroup future-utilities Functions for working with Futures
|
||||
/// @{
|
||||
|
||||
/// If a Result<Future> holds an error instead of a Future, construct a finished Future
|
||||
/// holding that error.
|
||||
template <typename T>
|
||||
static Future<T> DeferNotOk(Result<Future<T>> maybe_future) {
|
||||
if (ARROW_PREDICT_FALSE(!maybe_future.ok())) {
|
||||
return Future<T>::MakeFinished(std::move(maybe_future).status());
|
||||
}
|
||||
return std::move(maybe_future).MoveValueUnsafe();
|
||||
}
|
||||
|
||||
/// \brief Create a Future which completes when all of `futures` complete.
|
||||
///
|
||||
/// The future's result is a vector of the results of `futures`.
|
||||
/// Note that this future will never be marked "failed"; failed results
|
||||
/// will be stored in the result vector alongside successful results.
|
||||
/// \brief Create a Future which completes when all of `futures` complete.
///
/// The future's result is a vector of the results of `futures`.
/// Note that this future will never be marked "failed"; failed results
/// will be stored in the result vector alongside successful results.
template <typename T>
Future<std::vector<Result<T>>> All(std::vector<Future<T>> futures) {
  // Shared between all per-future callbacks; the atomic counter decides which
  // callback is the last one and therefore collects the results
  struct State {
    explicit State(std::vector<Future<T>> f)
        : futures(std::move(f)), n_remaining(futures.size()) {}

    std::vector<Future<T>> futures;
    std::atomic<size_t> n_remaining;
  };

  // Edge case: no futures means an immediately-finished empty result vector
  if (futures.size() == 0) {
    return {std::vector<Result<T>>{}};
  }

  auto state = std::make_shared<State>(std::move(futures));

  auto out = Future<std::vector<Result<T>>>::Make();
  for (const Future<T>& future : state->futures) {
    future.AddCallback([state, out](const Result<T>&) mutable {
      // Only the callback that decrements the counter from 1 to 0 proceeds;
      // every other invocation returns immediately
      if (state->n_remaining.fetch_sub(1) != 1) return;

      // All futures are finished at this point, so result() will not block
      std::vector<Result<T>> results(state->futures.size());
      for (size_t i = 0; i < results.size(); ++i) {
        results[i] = state->futures[i].result();
      }
      out.MarkFinished(std::move(results));
    });
  }
  return out;
}
|
||||
|
||||
/// \brief Create a Future which completes when all of `futures` complete.
///
/// The future will be marked complete if all `futures` complete
/// successfully. Otherwise, it will be marked failed with the status of
/// the first failing future (and may complete before the remaining
/// futures have finished).
ARROW_EXPORT
Future<> AllComplete(const std::vector<Future<>>& futures);

/// \brief Create a Future which completes when all of `futures` complete.
///
/// The future will finish with an ok status if all `futures` finish with
/// an ok status. Otherwise, it will be marked failed with the status of
/// one of the failing futures.
///
/// Unlike AllComplete this Future will not complete immediately when a
/// failure occurs. It will wait until all futures have finished.
ARROW_EXPORT
Future<> AllFinished(const std::vector<Future<>>& futures);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \brief Sentinel convertible to any ControlFlow, signalling Loop to iterate again
///
/// Converts to an empty std::optional<T>, i.e. the "keep going" state.
struct Continue {
  template <typename T>
  operator std::optional<T>() && {  // NOLINT explicit
    return {};
  }
};
|
||||
|
||||
/// \brief Terminate a Loop, carrying `break_value` as the loop's final result
template <typename T = internal::Empty>
std::optional<T> Break(T break_value = {}) {
  // An engaged optional is the "stop" state of ControlFlow
  std::optional<T> control{std::move(break_value)};
  return control;
}
|
||||
|
||||
/// \brief The control value consumed by Loop: disengaged (Continue) keeps
/// iterating, engaged (Break) terminates the loop with the contained value
template <typename T = internal::Empty>
using ControlFlow = std::optional<T>;
|
||||
|
||||
/// \brief Loop through an asynchronous sequence
|
||||
///
|
||||
/// \param[in] iterate A generator of Future<ControlFlow<BreakValue>>. On completion
|
||||
/// of each yielded future the resulting ControlFlow will be examined. A Break will
|
||||
/// terminate the loop, while a Continue will re-invoke `iterate`.
|
||||
///
|
||||
/// \return A future which will complete when a Future returned by iterate completes with
|
||||
/// a Break
|
||||
template <typename Iterate,
          typename Control = typename detail::result_of_t<Iterate()>::ValueType,
          typename BreakValueType = typename Control::value_type>
Future<BreakValueType> Loop(Iterate iterate) {
  struct Callback {
    // Inspect one iteration's result; finish break_fut and return true if the
    // loop is over (error or Break), return false to keep iterating
    bool CheckForTermination(const Result<Control>& control_res) {
      if (!control_res.ok()) {
        // Iteration failed: propagate the error as the loop's result
        break_fut.MarkFinished(control_res.status());
        return true;
      }
      if (control_res->has_value()) {
        // Break: unwrap the optional's value as the loop's final result
        break_fut.MarkFinished(**control_res);
        return true;
      }
      // Continue
      return false;
    }

    void operator()(const Result<Control>& maybe_control) && {
      if (CheckForTermination(maybe_control)) return;

      auto control_fut = iterate();
      while (true) {
        if (control_fut.TryAddCallback([this]() { return *this; })) {
          // Adding a callback succeeded; control_fut was not finished
          // and we must wait to CheckForTermination.
          return;
        }
        // Adding a callback failed; control_fut was finished and we
        // can CheckForTermination immediately. This also avoids recursion and potential
        // stack overflow.
        if (CheckForTermination(control_fut.result())) return;

        control_fut = iterate();
      }
    }

    Iterate iterate;

    // If the future returned by control_fut is never completed then we will be hanging on
    // to break_fut forever even if the listener has given up listening on it. Instead we
    // rely on the fact that a producer (the caller of Future<>::Make) is always
    // responsible for completing the futures they create.
    // TODO: Could avoid this kind of situation with "future abandonment" similar to mesos
    Future<BreakValueType> break_fut;
  };

  auto break_fut = Future<BreakValueType>::Make();
  auto control_fut = iterate();
  control_fut.AddCallback(Callback{std::move(iterate), break_fut});

  return break_fut;
}
|
||||
|
||||
/// \brief Wrap a Status in an already-finished Future<>
inline Future<> ToFuture(Status status) {
  return Future<>::MakeFinished(std::move(status));
}

/// \brief Wrap a plain value in an already-finished Future<T>
template <typename T>
Future<T> ToFuture(T value) {
  return Future<T>::MakeFinished(std::move(value));
}

/// \brief Wrap a Result<T> in an already-finished Future<T>
template <typename T>
Future<T> ToFuture(Result<T> maybe_value) {
  return Future<T>::MakeFinished(std::move(maybe_value));
}

/// \brief Passthru overload: a Future is returned unchanged
template <typename T>
Future<T> ToFuture(Future<T> fut) {
  return fut;
}
|
||||
|
||||
/// \brief Trait mapping T (Status, a value type, Result<V>, or Future<V>) to the
/// Future type the corresponding ToFuture overload would produce
template <typename T>
struct EnsureFuture {
  using type = decltype(ToFuture(std::declval<T>()));
};
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,66 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// BEGIN Hash utilities from Boost
|
||||
|
||||
namespace detail {
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
# define ARROW_HASH_ROTL32(x, r) _rotl(x, r)
|
||||
#else
|
||||
# define ARROW_HASH_ROTL32(x, r) (x << r) | (x >> (32 - r))
|
||||
#endif
|
||||
|
||||
/// Fold `value` into `seed` using the golden-ratio mixing scheme from
/// boost::hash_combine (generic integer-width overload).
template <typename SizeT>
inline void hash_combine_impl(SizeT& seed, SizeT value) {
  const SizeT mixed = value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
  seed ^= mixed;
}
|
||||
|
||||
// 32-bit overload: Murmur3-style mixing (the constants and rotation amounts below
// match the MurmurHash3 x86_32 inner loop, as used by Boost's 32-bit hash_combine)
inline void hash_combine_impl(uint32_t& h1, uint32_t k1) {
  const uint32_t c1 = 0xcc9e2d51;
  const uint32_t c2 = 0x1b873593;

  // Pre-mix the incoming value
  k1 *= c1;
  k1 = ARROW_HASH_ROTL32(k1, 15);
  k1 *= c2;

  // Fold it into the accumulator
  h1 ^= k1;
  h1 = ARROW_HASH_ROTL32(h1, 13);
  h1 = h1 * 5 + 0xe6546b64;
}
|
||||
|
||||
#undef ARROW_HASH_ROTL32
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <class T>
|
||||
inline void hash_combine(std::size_t& seed, T const& v) {
|
||||
std::hash<T> hasher;
|
||||
return ::arrow::internal::detail::hash_combine_impl(seed, hasher(v));
|
||||
}
|
||||
|
||||
// END Hash utilities from Boost
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,984 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Private header, not to be exported
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/builder_binary.h"
|
||||
#include "arrow/buffer_builder.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/type_traits.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/bitmap_builders.h"
|
||||
#include "arrow/util/endian.h"
|
||||
#include "arrow/util/float16.h"
|
||||
#include "arrow/util/logging.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/ubsan.h"
|
||||
|
||||
#define XXH_INLINE_ALL
|
||||
|
||||
#include "arrow/vendored/xxhash.h" // IWYU pragma: keep
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
// XXX would it help to have a 32-bit hash value on large datasets?
|
||||
typedef uint64_t hash_t;
|
||||
|
||||
// Notes about the choice of a hash function.
|
||||
// - XXH3 is extremely fast on most data sizes, from small to huge;
|
||||
// faster even than HW CRC-based hashing schemes
|
||||
// - our custom hash function for tiny values (< 16 bytes) is still
|
||||
// significantly faster (~30%), at least on this machine and compiler
|
||||
|
||||
template <uint64_t AlgNum>
|
||||
inline hash_t ComputeStringHash(const void* data, int64_t length);
|
||||
|
||||
/// \brief A hash function for bitmaps that can handle offsets and lengths in
|
||||
/// terms of number of bits. The hash only depends on the bits actually hashed.
|
||||
///
|
||||
/// It's the caller's responsibility to ensure that bits_offset + num_bits are
|
||||
/// readable from the bitmap.
|
||||
///
|
||||
/// \pre bits_offset >= 0
|
||||
/// \pre num_bits >= 0
|
||||
/// \pre (bits_offset + num_bits + 7) / 8 <= readable length in bytes from bitmap
|
||||
///
|
||||
/// \param bitmap The pointer to the bitmap.
|
||||
/// \param seed The seed for the hash function (useful when chaining hash functions).
|
||||
/// \param bits_offset The offset in bits relative to the start of the bitmap.
|
||||
/// \param num_bits The number of bits after the offset to be hashed.
|
||||
ARROW_EXPORT hash_t ComputeBitmapHash(const uint8_t* bitmap, hash_t seed,
|
||||
int64_t bits_offset, int64_t num_bits);
|
||||
|
||||
// Fallback hashing / equality policy for scalar values: hash the raw bit
// representation, compare with operator==.  Specializations of ScalarHelper
// (below) override these for types where something faster or semantically
// different is needed.
template <typename Scalar, uint64_t AlgNum>
struct ScalarHelperBase {
  // Default equality: rely on the type's operator==.
  static bool CompareScalars(Scalar u, Scalar v) { return u == v; }

  static hash_t ComputeHash(const Scalar& value) {
    // Generic hash computation for scalars.  Simply apply the string hash
    // to the bit representation of the value.

    // XXX in the case of FP values, we'd like equal values to have the same hash,
    // even if they have different bit representations...
    return ComputeStringHash<AlgNum>(&value, sizeof(value));
  }
};

// Primary template: inherits the generic byte-wise implementation above.
// Partial specializations below handle integers, std::string_view,
// floating-point values and Float16.  AlgNum selects one of two independent
// hash families (used to combine two hashes of the same datum).
template <typename Scalar, uint64_t AlgNum = 0, typename Enable = void>
struct ScalarHelper : public ScalarHelperBase<Scalar, AlgNum> {};
|
||||
|
||||
template <typename Scalar, uint64_t AlgNum>
|
||||
struct ScalarHelper<Scalar, AlgNum, enable_if_t<std::is_integral<Scalar>::value>>
|
||||
: public ScalarHelperBase<Scalar, AlgNum> {
|
||||
// ScalarHelper specialization for integers
|
||||
|
||||
static hash_t ComputeHash(const Scalar& value) {
|
||||
// Faster hash computation for integers.
|
||||
|
||||
// Two of xxhash's prime multipliers (which are chosen for their
|
||||
// bit dispersion properties)
|
||||
static constexpr uint64_t multipliers[] = {11400714785074694791ULL,
|
||||
14029467366897019727ULL};
|
||||
|
||||
// Multiplying by the prime number mixes the low bits into the high bits,
|
||||
// then byte-swapping (which is a single CPU instruction) allows the
|
||||
// combined high and low bits to participate in the initial hash table index.
|
||||
auto h = static_cast<hash_t>(value);
|
||||
return bit_util::ByteSwap(multipliers[AlgNum] * h);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Scalar, uint64_t AlgNum>
|
||||
struct ScalarHelper<Scalar, AlgNum,
|
||||
enable_if_t<std::is_same<std::string_view, Scalar>::value>>
|
||||
: public ScalarHelperBase<Scalar, AlgNum> {
|
||||
// ScalarHelper specialization for std::string_view
|
||||
|
||||
static hash_t ComputeHash(std::string_view value) {
|
||||
return ComputeStringHash<AlgNum>(value.data(), static_cast<int64_t>(value.size()));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Scalar, uint64_t AlgNum>
|
||||
struct ScalarHelper<Scalar, AlgNum, enable_if_t<std::is_floating_point<Scalar>::value>>
|
||||
: public ScalarHelperBase<Scalar, AlgNum> {
|
||||
// ScalarHelper specialization for reals
|
||||
|
||||
static bool CompareScalars(Scalar u, Scalar v) {
|
||||
if (std::isnan(u)) {
|
||||
// XXX should we do a bit-precise comparison?
|
||||
return std::isnan(v);
|
||||
}
|
||||
return u == v;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Scalar, uint64_t AlgNum>
|
||||
struct ScalarHelper<Scalar, AlgNum,
|
||||
enable_if_t<std::is_same_v<Scalar, ::arrow::util::Float16>>>
|
||||
: public ScalarHelperBase<Scalar, AlgNum> {
|
||||
// ScalarHelper specialization for Float16
|
||||
|
||||
static bool CompareScalars(Scalar u, Scalar v) {
|
||||
if (u.is_nan()) {
|
||||
// XXX should we do a bit-precise comparison?
|
||||
return v.is_nan();
|
||||
}
|
||||
return u == v;
|
||||
}
|
||||
};
|
||||
|
||||
// Compute a 64-bit hash of the `length` bytes starting at `data`.
//
// Strings of at most 16 bytes use a custom scheme built on the integer
// ScalarHelper hash (one or two overlapping word loads); longer strings are
// delegated to XXH3 with a hard-coded secret.  AlgNum (0 or 1) selects one
// of two independent hash families.
template <uint64_t AlgNum = 0>
hash_t ComputeStringHash(const void* data, int64_t length) {
  if (ARROW_PREDICT_TRUE(length <= 16)) {
    // Specialize for small hash strings, as they are quite common as
    // hash table keys.  Even XXH3 isn't quite as fast.
    auto p = reinterpret_cast<const uint8_t*>(data);
    auto n = static_cast<uint32_t>(length);
    if (n <= 8) {
      if (n <= 3) {
        if (n == 0) {
          // Empty string: return an arbitrary non-trivial constant.
          return 1U;
        }
        // Pack the length plus the first, middle and last bytes into a
        // single 32-bit word, then hash that word.
        uint32_t x = (n << 24) ^ (p[0] << 16) ^ (p[n / 2] << 8) ^ p[n - 1];
        return ScalarHelper<uint32_t, AlgNum>::ComputeHash(x);
      }
      // 4 <= length <= 8
      // We can read the string as two overlapping 32-bit ints, apply
      // different hash functions to each of them in parallel, then XOR
      // the results
      uint32_t x, y;
      hash_t hx, hy;
      x = util::SafeLoadAs<uint32_t>(p + n - 4);
      y = util::SafeLoadAs<uint32_t>(p);
      hx = ScalarHelper<uint32_t, AlgNum>::ComputeHash(x);
      hy = ScalarHelper<uint32_t, AlgNum ^ 1>::ComputeHash(y);
      return n ^ hx ^ hy;
    }
    // 8 <= length <= 16
    // Apply the same principle as above
    uint64_t x, y;
    hash_t hx, hy;
    x = util::SafeLoadAs<uint64_t>(p + n - 8);
    y = util::SafeLoadAs<uint64_t>(p);
    hx = ScalarHelper<uint64_t, AlgNum>::ComputeHash(x);
    hy = ScalarHelper<uint64_t, AlgNum ^ 1>::ComputeHash(y);
    return n ^ hx ^ hy;
  }

#if XXH3_SECRET_SIZE_MIN != 136
#  error XXH3_SECRET_SIZE_MIN changed, please fix kXxh3Secrets
#endif

  // XXH3_64bits_withSeed generates a secret based on the seed, which is too slow.
  // Instead, we use hard-coded random secrets.  To maximize cache efficiency,
  // they reuse the same memory area.
  static constexpr unsigned char kXxh3Secrets[XXH3_SECRET_SIZE_MIN + 1] = {
      0xe7, 0x8b, 0x13, 0xf9, 0xfc, 0xb5, 0x8e, 0xef, 0x81, 0x48, 0x2c, 0xbf, 0xf9, 0x9f,
      0xc1, 0x1e, 0x43, 0x6d, 0xbf, 0xa6, 0x6d, 0xb5, 0x72, 0xbc, 0x97, 0xd8, 0x61, 0x24,
      0x0f, 0x12, 0xe3, 0x05, 0x21, 0xf7, 0x5c, 0x66, 0x67, 0xa5, 0x65, 0x03, 0x96, 0x26,
      0x69, 0xd8, 0x29, 0x20, 0xf8, 0xc7, 0xb0, 0x3d, 0xdd, 0x7d, 0x18, 0xa0, 0x60, 0x75,
      0x92, 0xa4, 0xce, 0xba, 0xc0, 0x77, 0xf4, 0xac, 0xb7, 0x03, 0x53, 0xf0, 0x98, 0xce,
      0xe6, 0x2b, 0x20, 0xc7, 0x82, 0x91, 0xab, 0xbf, 0x68, 0x5c, 0x62, 0x4d, 0x33, 0xa3,
      0xe1, 0xb3, 0xff, 0x97, 0x54, 0x4c, 0x44, 0x34, 0xb5, 0xb9, 0x32, 0x4c, 0x75, 0x42,
      0x89, 0x53, 0x94, 0xd4, 0x9f, 0x2b, 0x76, 0x4d, 0x4e, 0xe6, 0xfa, 0x15, 0x3e, 0xc1,
      0xdb, 0x71, 0x4b, 0x2c, 0x94, 0xf5, 0xfc, 0x8c, 0x89, 0x4b, 0xfb, 0xc1, 0x82, 0xa5,
      0x6a, 0x53, 0xf9, 0x4a, 0xba, 0xce, 0x1f, 0xc0, 0x97, 0x1a, 0x87};

  static_assert(AlgNum < 2, "AlgNum too large");
  // The two secrets are the same byte area shifted by one (cache-friendly).
  static constexpr auto secret = kXxh3Secrets + AlgNum;
  return XXH3_64bits_withSecret(data, static_cast<size_t>(length), secret,
                                XXH3_SECRET_SIZE_MIN);
}
|
||||
|
||||
// XXX add a HashEq<ArrowType> struct with both hash and compare functions?
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// An open-addressing insert-only hash table (no deletes)
|
||||
|
||||
// An open-addressing, insert-only hash table (no deletes) mapping a 64-bit
// hash value to a caller-defined Payload.  Capacity is always a power of
// two; the load factor is kept <= 1/2.  A stored hash of 0 (kSentinel)
// marks an empty slot, so incoming hashes are remapped by FixHash() first.
template <typename Payload>
class HashTable {
 public:
  // Sentinel hash value marking an empty slot.
  static constexpr hash_t kSentinel = 0ULL;
  // Inverse of the maximum load factor (table grows when size * 2 >= capacity).
  static constexpr int64_t kLoadFactor = 2UL;

  struct Entry {
    hash_t h;
    Payload payload;

    // An entry is valid if the hash is different from the sentinel value
    operator bool() const { return h != kSentinel; }
  };

  // `capacity` is a hint; the table starts at the next power of two >= 32.
  HashTable(MemoryPool* pool, uint64_t capacity) : entries_builder_(pool) {
    ARROW_DCHECK_NE(pool, nullptr);
    // Minimum of 32 elements
    capacity = std::max<uint64_t>(capacity, 32UL);
    capacity_ = bit_util::NextPower2(capacity);
    capacity_mask_ = capacity_ - 1;
    size_ = 0;

    ARROW_DCHECK_OK(UpsizeBuffer(capacity_));
  }

  // Lookup with non-linear probing
  // cmp_func should have signature bool(const Payload*).
  // Return a (Entry*, found) pair.  When not found, the returned entry is
  // the empty slot where an Insert() with the same hash would go.
  template <typename CmpFunc>
  std::pair<Entry*, bool> Lookup(hash_t h, CmpFunc&& cmp_func) {
    auto p = Lookup<DoCompare, CmpFunc>(h, entries_, capacity_mask_,
                                        std::forward<CmpFunc>(cmp_func));
    return {&entries_[p.first], p.second};
  }

  template <typename CmpFunc>
  std::pair<const Entry*, bool> Lookup(hash_t h, CmpFunc&& cmp_func) const {
    auto p = Lookup<DoCompare, CmpFunc>(h, entries_, capacity_mask_,
                                        std::forward<CmpFunc>(cmp_func));
    return {&entries_[p.first], p.second};
  }

  // Insert `payload` into the empty `entry` previously returned by Lookup().
  // May grow and rehash the table, invalidating all Entry pointers.
  Status Insert(Entry* entry, hash_t h, const Payload& payload) {
    // Ensure entry is empty before inserting
    assert(!*entry);
    entry->h = FixHash(h);
    entry->payload = payload;
    ++size_;

    if (ARROW_PREDICT_FALSE(NeedUpsizing())) {
      // Resize less frequently since it is expensive
      return Upsize(capacity_ * kLoadFactor * 2);
    }
    return Status::OK();
  }

  uint64_t size() const { return size_; }

  // Visit all non-empty entries in the table
  // The visit_func should have signature void(const Entry*)
  template <typename VisitFunc>
  void VisitEntries(VisitFunc&& visit_func) const {
    for (uint64_t i = 0; i < capacity_; i++) {
      const auto& entry = entries_[i];
      if (entry) {
        visit_func(&entry);
      }
    }
  }

 protected:
  // NoCompare is for when the value is known not to exist in the table
  enum CompareKind { DoCompare, NoCompare };

  // The workhorse lookup function
  template <CompareKind CKind, typename CmpFunc>
  std::pair<uint64_t, bool> Lookup(hash_t h, const Entry* entries, uint64_t size_mask,
                                   CmpFunc&& cmp_func) const {
    static constexpr uint8_t perturb_shift = 5;

    uint64_t index, perturb;
    const Entry* entry;

    h = FixHash(h);
    index = h & size_mask;
    perturb = (h >> perturb_shift) + 1U;

    while (true) {
      entry = &entries[index];
      if (CompareEntry<CKind, CmpFunc>(h, entry, std::forward<CmpFunc>(cmp_func))) {
        // Found
        return {index, true};
      }
      if (entry->h == kSentinel) {
        // Empty slot
        return {index, false};
      }

      // Perturbation logic inspired from CPython's set / dict object.
      // The goal is that all 64 bits of the unmasked hash value eventually
      // participate in the probing sequence, to minimize clustering.
      index = (index + perturb) & size_mask;
      perturb = (perturb >> perturb_shift) + 1U;
    }
  }

  template <CompareKind CKind, typename CmpFunc>
  bool CompareEntry(hash_t h, const Entry* entry, CmpFunc&& cmp_func) const {
    if (CKind == NoCompare) {
      return false;
    } else {
      // Cheap hash comparison first, full payload comparison only on match.
      return entry->h == h && cmp_func(&entry->payload);
    }
  }

  bool NeedUpsizing() const {
    // Keep the load factor <= 1/2
    return size_ * kLoadFactor >= capacity_;
  }

  // (Re)allocate the entries array and zero it (all slots become empty).
  Status UpsizeBuffer(uint64_t capacity) {
    RETURN_NOT_OK(entries_builder_.Resize(capacity));
    entries_ = entries_builder_.mutable_data();
    memset(static_cast<void*>(entries_), 0, capacity * sizeof(Entry));

    return Status::OK();
  }

  // Grow to `new_capacity` (a power of two > capacity_) and rehash all
  // existing entries into the new array.
  Status Upsize(uint64_t new_capacity) {
    assert(new_capacity > capacity_);
    uint64_t new_mask = new_capacity - 1;
    assert((new_capacity & new_mask) == 0);  // it's a power of two

    // Stash old entries and seal builder, effectively resetting the Buffer
    const Entry* old_entries = entries_;
    ARROW_ASSIGN_OR_RAISE(auto previous, entries_builder_.FinishWithLength(capacity_));
    // Allocate new buffer
    RETURN_NOT_OK(UpsizeBuffer(new_capacity));

    for (uint64_t i = 0; i < capacity_; i++) {
      const auto& entry = old_entries[i];
      if (entry) {
        // Dummy compare function will not be called
        auto p = Lookup<NoCompare>(entry.h, entries_, new_mask,
                                   [](const Payload*) { return false; });
        // Lookup<NoCompare> (and CompareEntry<NoCompare>) ensure that an
        // empty slots is always returned
        assert(!p.second);
        entries_[p.first] = entry;
      }
    }
    capacity_ = new_capacity;
    capacity_mask_ = new_mask;

    return Status::OK();
  }

  // Remap the sentinel hash value to an arbitrary non-sentinel one.
  hash_t FixHash(hash_t h) const { return (h == kSentinel) ? 42U : h; }

  // The number of slots available in the hash table array.
  uint64_t capacity_;
  uint64_t capacity_mask_;
  // The number of used slots in the hash table array.
  uint64_t size_;

  Entry* entries_;
  TypedBufferBuilder<Entry> entries_builder_;
};
|
||||
|
||||
// XXX typedef memo_index_t int32_t ?
|
||||
|
||||
constexpr int32_t kKeyNotFound = -1;
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// A base class for memoization table.
|
||||
|
||||
// Abstract base class common to all memoization tables, exposing the number
// of memoized entries independently of the key type.
class MemoTable {
 public:
  virtual ~MemoTable() = default;

  // The number of entries in the table (1 + the largest memo index).
  virtual int32_t size() const = 0;
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// A memoization table for memory-cheap scalar values.
|
||||
|
||||
// The memoization table remembers and allows to look up the insertion
|
||||
// index for each key.
|
||||
|
||||
// A memoization table for memory-cheap scalar values.  Each distinct key is
// assigned a dense "memo index" in insertion order; the (optional) null
// value gets its own index.
template <typename Scalar, template <class> class HashTableTemplateType = HashTable>
class ScalarMemoTable : public MemoTable {
 public:
  explicit ScalarMemoTable(MemoryPool* pool, int64_t entries = 0)
      : hash_table_(pool, static_cast<uint64_t>(entries)) {}

  // Return the memo index of `v`, or kKeyNotFound if it was never inserted.
  template <typename Value>
  int32_t Get(Value&& v) const {
    const Scalar value(std::forward<Value>(v));
    auto cmp_func = [value](const Payload* payload) -> bool {
      return ScalarHelper<Scalar, 0>::CompareScalars(payload->value, value);
    };
    hash_t h = ComputeHash(value);
    auto p = hash_table_.Lookup(h, cmp_func);
    if (p.second) {
      return p.first->payload.memo_index;
    } else {
      return kKeyNotFound;
    }
  }

  // Look up `v`, inserting it if absent.  Exactly one of `on_found` /
  // `on_not_found` is called with the memo index, which is also stored
  // into `*out_memo_index`.
  template <typename Value, typename Func1, typename Func2>
  Status GetOrInsert(Value&& v, Func1&& on_found, Func2&& on_not_found,
                     int32_t* out_memo_index) {
    const Scalar value(std::forward<Value>(v));
    auto cmp_func = [value](const Payload* payload) -> bool {
      return ScalarHelper<Scalar, 0>::CompareScalars(value, payload->value);
    };
    hash_t h = ComputeHash(value);
    auto p = hash_table_.Lookup(h, cmp_func);
    int32_t memo_index;
    if (p.second) {
      memo_index = p.first->payload.memo_index;
      on_found(memo_index);
    } else {
      // New key: it receives the next dense index.
      memo_index = size();
      RETURN_NOT_OK(hash_table_.Insert(p.first, h, {value, memo_index}));
      on_not_found(memo_index);
    }
    *out_memo_index = memo_index;
    return Status::OK();
  }

  template <typename Value>
  Status GetOrInsert(Value&& value, int32_t* out_memo_index) {
    return GetOrInsert(
        value, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
  }

  // Return the memo index of the null value, or kKeyNotFound.
  int32_t GetNull() const { return null_index_; }

  // Look up null, registering it on first use.  Exactly one of the two
  // callbacks is invoked with the resulting memo index.
  template <typename Func1, typename Func2>
  int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
    int32_t memo_index = GetNull();
    if (memo_index != kKeyNotFound) {
      on_found(memo_index);
    } else {
      null_index_ = memo_index = size();
      on_not_found(memo_index);
    }
    return memo_index;
  }

  int32_t GetOrInsertNull() {
    return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
  }

  // The number of entries in the memo table +1 if null was added.
  // (which is also 1 + the largest memo index)
  int32_t size() const override {
    return static_cast<int32_t>(hash_table_.size()) + (GetNull() != kKeyNotFound);
  }

  // Copy values starting from index `start` into `out_data`.  `out_data`
  // must have room for size() - start elements.
  template <typename Value>
  void CopyValues(int32_t start, Value* out_data) const {
    // So that both uint16_t and Float16 are allowed
    static_assert(sizeof(Value) == sizeof(Scalar));
    Scalar* out = reinterpret_cast<Scalar*>(out_data);
    hash_table_.VisitEntries([=](const HashTableEntry* entry) {
      int32_t index = entry->payload.memo_index - start;
      if (index >= 0) {
        out[index] = entry->payload.value;
      }
    });
    // Zero-initialize the null entry
    if (null_index_ != kKeyNotFound) {
      int32_t index = null_index_ - start;
      if (index >= 0) {
        out[index] = Scalar{};
      }
    }
  }

  template <typename Value>
  void CopyValues(Value* out_data) const {
    CopyValues(0, out_data);
  }

 protected:
  // A key value together with its insertion-order memo index.
  struct Payload {
    Scalar value;
    int32_t memo_index;
  };

  using HashTableType = HashTableTemplateType<Payload>;
  using HashTableEntry = typename HashTableType::Entry;
  HashTableType hash_table_;
  // Memo index of the null value; kKeyNotFound until null is inserted.
  int32_t null_index_ = kKeyNotFound;

  hash_t ComputeHash(const Scalar& value) const {
    return ScalarHelper<Scalar, 0>::ComputeHash(value);
  }

 public:
  // defined here so that `HashTableType` is visible
  // Merge entries from `other_table` into `this->hash_table_`.
  Status MergeTable(const ScalarMemoTable& other_table) {
    const HashTableType& other_hashtable = other_table.hash_table_;

    other_hashtable.VisitEntries([this](const HashTableEntry* other_entry) {
      int32_t unused;
      ARROW_DCHECK_OK(this->GetOrInsert(other_entry->payload.value, &unused));
    });
    // TODO: ARROW-17074 - implement proper error handling
    return Status::OK();
  }
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// A memoization table for small scalar values, using direct indexing
|
||||
|
||||
template <typename Scalar, typename Enable = void>
|
||||
struct SmallScalarTraits {};
|
||||
|
||||
template <>
|
||||
struct SmallScalarTraits<bool> {
|
||||
static constexpr int32_t cardinality = 2;
|
||||
|
||||
static uint32_t AsIndex(bool value) { return value ? 1 : 0; }
|
||||
};
|
||||
|
||||
template <typename Scalar>
|
||||
struct SmallScalarTraits<Scalar, enable_if_t<std::is_integral<Scalar>::value>> {
|
||||
using Unsigned = typename std::make_unsigned<Scalar>::type;
|
||||
|
||||
static constexpr int32_t cardinality = 1U + std::numeric_limits<Unsigned>::max();
|
||||
|
||||
static uint32_t AsIndex(Scalar value) { return static_cast<Unsigned>(value); }
|
||||
};
|
||||
|
||||
template <typename Scalar, template <class> class HashTableTemplateType = HashTable>
|
||||
class SmallScalarMemoTable : public MemoTable {
|
||||
public:
|
||||
explicit SmallScalarMemoTable(MemoryPool* pool, int64_t entries = 0) {
|
||||
std::fill(value_to_index_, value_to_index_ + cardinality + 1, kKeyNotFound);
|
||||
index_to_value_.reserve(cardinality);
|
||||
}
|
||||
|
||||
int32_t Get(const Scalar value) const {
|
||||
auto value_index = AsIndex(value);
|
||||
return value_to_index_[value_index];
|
||||
}
|
||||
|
||||
template <typename Func1, typename Func2>
|
||||
Status GetOrInsert(const Scalar value, Func1&& on_found, Func2&& on_not_found,
|
||||
int32_t* out_memo_index) {
|
||||
auto value_index = AsIndex(value);
|
||||
auto memo_index = value_to_index_[value_index];
|
||||
if (memo_index == kKeyNotFound) {
|
||||
memo_index = static_cast<int32_t>(index_to_value_.size());
|
||||
index_to_value_.push_back(value);
|
||||
value_to_index_[value_index] = memo_index;
|
||||
ARROW_DCHECK_LT(memo_index, cardinality + 1);
|
||||
on_not_found(memo_index);
|
||||
} else {
|
||||
on_found(memo_index);
|
||||
}
|
||||
*out_memo_index = memo_index;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status GetOrInsert(const Scalar value, int32_t* out_memo_index) {
|
||||
return GetOrInsert(
|
||||
value, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
|
||||
}
|
||||
|
||||
int32_t GetNull() const { return value_to_index_[cardinality]; }
|
||||
|
||||
template <typename Func1, typename Func2>
|
||||
int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
|
||||
auto memo_index = GetNull();
|
||||
if (memo_index == kKeyNotFound) {
|
||||
memo_index = value_to_index_[cardinality] = size();
|
||||
index_to_value_.push_back(0);
|
||||
on_not_found(memo_index);
|
||||
} else {
|
||||
on_found(memo_index);
|
||||
}
|
||||
return memo_index;
|
||||
}
|
||||
|
||||
int32_t GetOrInsertNull() {
|
||||
return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
|
||||
}
|
||||
|
||||
// The number of entries in the memo table
|
||||
// (which is also 1 + the largest memo index)
|
||||
int32_t size() const override { return static_cast<int32_t>(index_to_value_.size()); }
|
||||
|
||||
// Merge entries from `other_table` into `this`.
|
||||
Status MergeTable(const SmallScalarMemoTable& other_table) {
|
||||
for (const Scalar& other_val : other_table.index_to_value_) {
|
||||
int32_t unused;
|
||||
RETURN_NOT_OK(this->GetOrInsert(other_val, &unused));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Copy values starting from index `start` into `out_data`
|
||||
void CopyValues(int32_t start, Scalar* out_data) const {
|
||||
ARROW_DCHECK_GE(start, 0);
|
||||
ARROW_DCHECK_LE(static_cast<size_t>(start), index_to_value_.size());
|
||||
int64_t offset = start * static_cast<int32_t>(sizeof(Scalar));
|
||||
memcpy(out_data, index_to_value_.data() + offset, (size() - start) * sizeof(Scalar));
|
||||
}
|
||||
|
||||
void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); }
|
||||
|
||||
const std::vector<Scalar>& values() const { return index_to_value_; }
|
||||
|
||||
protected:
|
||||
static constexpr auto cardinality = SmallScalarTraits<Scalar>::cardinality;
|
||||
static_assert(cardinality <= 256, "cardinality too large for direct-addressed table");
|
||||
|
||||
uint32_t AsIndex(Scalar value) const {
|
||||
return SmallScalarTraits<Scalar>::AsIndex(value);
|
||||
}
|
||||
|
||||
// The last index is reserved for the null element.
|
||||
int32_t value_to_index_[cardinality + 1];
|
||||
std::vector<Scalar> index_to_value_;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// A memoization table for variable-sized binary data.
|
||||
|
||||
template <typename BinaryBuilderT>
|
||||
class BinaryMemoTable : public MemoTable {
|
||||
public:
|
||||
using builder_offset_type = typename BinaryBuilderT::offset_type;
|
||||
explicit BinaryMemoTable(MemoryPool* pool, int64_t entries = 0,
|
||||
int64_t values_size = -1)
|
||||
: hash_table_(pool, static_cast<uint64_t>(entries)), binary_builder_(pool) {
|
||||
const int64_t data_size = (values_size < 0) ? entries * 4 : values_size;
|
||||
ARROW_DCHECK_OK(binary_builder_.Resize(entries));
|
||||
ARROW_DCHECK_OK(binary_builder_.ReserveData(data_size));
|
||||
}
|
||||
|
||||
int32_t Get(const void* data, builder_offset_type length) const {
|
||||
hash_t h = ComputeStringHash<0>(data, length);
|
||||
auto p = Lookup(h, data, length);
|
||||
if (p.second) {
|
||||
return p.first->payload.memo_index;
|
||||
} else {
|
||||
return kKeyNotFound;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t Get(std::string_view value) const {
|
||||
return Get(value.data(), static_cast<builder_offset_type>(value.length()));
|
||||
}
|
||||
|
||||
template <typename Func1, typename Func2>
|
||||
Status GetOrInsert(const void* data, builder_offset_type length, Func1&& on_found,
|
||||
Func2&& on_not_found, int32_t* out_memo_index) {
|
||||
hash_t h = ComputeStringHash<0>(data, length);
|
||||
auto p = Lookup(h, data, length);
|
||||
int32_t memo_index;
|
||||
if (p.second) {
|
||||
memo_index = p.first->payload.memo_index;
|
||||
on_found(memo_index);
|
||||
} else {
|
||||
memo_index = size();
|
||||
// Insert string value
|
||||
RETURN_NOT_OK(binary_builder_.Append(static_cast<const char*>(data), length));
|
||||
// Insert hash entry
|
||||
RETURN_NOT_OK(
|
||||
hash_table_.Insert(const_cast<HashTableEntry*>(p.first), h, {memo_index}));
|
||||
|
||||
on_not_found(memo_index);
|
||||
}
|
||||
*out_memo_index = memo_index;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename Func1, typename Func2>
|
||||
Status GetOrInsert(std::string_view value, Func1&& on_found, Func2&& on_not_found,
|
||||
int32_t* out_memo_index) {
|
||||
return GetOrInsert(value.data(), static_cast<builder_offset_type>(value.length()),
|
||||
std::forward<Func1>(on_found), std::forward<Func2>(on_not_found),
|
||||
out_memo_index);
|
||||
}
|
||||
|
||||
Status GetOrInsert(const void* data, builder_offset_type length,
|
||||
int32_t* out_memo_index) {
|
||||
return GetOrInsert(
|
||||
data, length, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
|
||||
}
|
||||
|
||||
Status GetOrInsert(std::string_view value, int32_t* out_memo_index) {
|
||||
return GetOrInsert(value.data(), static_cast<builder_offset_type>(value.length()),
|
||||
out_memo_index);
|
||||
}
|
||||
|
||||
int32_t GetNull() const { return null_index_; }
|
||||
|
||||
template <typename Func1, typename Func2>
|
||||
int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
|
||||
int32_t memo_index = GetNull();
|
||||
if (memo_index == kKeyNotFound) {
|
||||
memo_index = null_index_ = size();
|
||||
ARROW_DCHECK_OK(binary_builder_.AppendNull());
|
||||
on_not_found(memo_index);
|
||||
} else {
|
||||
on_found(memo_index);
|
||||
}
|
||||
return memo_index;
|
||||
}
|
||||
|
||||
int32_t GetOrInsertNull() {
|
||||
return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
|
||||
}
|
||||
|
||||
// The number of entries in the memo table
|
||||
// (which is also 1 + the largest memo index)
|
||||
int32_t size() const override {
|
||||
return static_cast<int32_t>(hash_table_.size() + (GetNull() != kKeyNotFound));
|
||||
}
|
||||
|
||||
int64_t values_size() const { return binary_builder_.value_data_length(); }
|
||||
|
||||
// Copy (n + 1) offsets starting from index `start` into `out_data`
|
||||
template <class Offset>
|
||||
void CopyOffsets(int32_t start, Offset* out_data) const {
|
||||
ARROW_DCHECK_LE(start, size());
|
||||
|
||||
const builder_offset_type* offsets = binary_builder_.offsets_data();
|
||||
const builder_offset_type delta =
|
||||
start < binary_builder_.length() ? offsets[start] : 0;
|
||||
for (int32_t i = start; i < size(); ++i) {
|
||||
const builder_offset_type adjusted_offset = offsets[i] - delta;
|
||||
Offset cast_offset = static_cast<Offset>(adjusted_offset);
|
||||
assert(static_cast<builder_offset_type>(cast_offset) ==
|
||||
adjusted_offset); // avoid truncation
|
||||
*out_data++ = cast_offset;
|
||||
}
|
||||
|
||||
// Copy last value since BinaryBuilder only materializes it on in Finish()
|
||||
*out_data = static_cast<Offset>(binary_builder_.value_data_length() - delta);
|
||||
}
|
||||
|
||||
template <class Offset>
|
||||
void CopyOffsets(Offset* out_data) const {
|
||||
CopyOffsets(0, out_data);
|
||||
}
|
||||
|
||||
// Copy values starting from index `start` into `out_data`
|
||||
void CopyValues(int32_t start, uint8_t* out_data) const {
|
||||
CopyValues(start, -1, out_data);
|
||||
}
|
||||
|
||||
// Same as above, but check output size in debug mode
|
||||
void CopyValues(int32_t start, int64_t out_size, uint8_t* out_data) const {
|
||||
ARROW_DCHECK_LE(start, size());
|
||||
|
||||
// The absolute byte offset of `start` value in the binary buffer.
|
||||
const builder_offset_type offset = binary_builder_.offset(start);
|
||||
const auto length = binary_builder_.value_data_length() - static_cast<size_t>(offset);
|
||||
|
||||
if (out_size != -1) {
|
||||
assert(static_cast<int64_t>(length) <= out_size);
|
||||
}
|
||||
|
||||
auto view = binary_builder_.GetView(start);
|
||||
memcpy(out_data, view.data(), length);
|
||||
}
|
||||
|
||||
void CopyValues(uint8_t* out_data) const { CopyValues(0, -1, out_data); }
|
||||
|
||||
void CopyValues(int64_t out_size, uint8_t* out_data) const {
|
||||
CopyValues(0, out_size, out_data);
|
||||
}
|
||||
|
||||
void CopyFixedWidthValues(int32_t start, int32_t width_size, int64_t out_size,
|
||||
uint8_t* out_data) const {
|
||||
// This method exists to cope with the fact that the BinaryMemoTable does
|
||||
// not know the fixed width when inserting the null value. The data
|
||||
// buffer hold a zero length string for the null value (if found).
|
||||
//
|
||||
// Thus, the method will properly inject an empty value of the proper width
|
||||
// in the output buffer.
|
||||
//
|
||||
if (start >= size()) {
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t null_index = GetNull();
|
||||
if (null_index < start) {
|
||||
// Nothing to skip, proceed as usual.
|
||||
CopyValues(start, out_size, out_data);
|
||||
return;
|
||||
}
|
||||
|
||||
builder_offset_type left_offset = binary_builder_.offset(start);
|
||||
|
||||
// Ensure that the data length is exactly missing width_size bytes to fit
|
||||
// in the expected output (n_values * width_size).
|
||||
#ifndef NDEBUG
|
||||
int64_t data_length = values_size() - static_cast<size_t>(left_offset);
|
||||
assert(data_length + width_size == out_size);
|
||||
ARROW_UNUSED(data_length);
|
||||
#endif
|
||||
|
||||
auto in_data = binary_builder_.value_data() + left_offset;
|
||||
// The null use 0-length in the data, slice the data in 2 and skip by
|
||||
// width_size in out_data. [part_1][width_size][part_2]
|
||||
auto null_data_offset = binary_builder_.offset(null_index);
|
||||
auto left_size = null_data_offset - left_offset;
|
||||
if (left_size > 0) {
|
||||
memcpy(out_data, in_data + left_offset, left_size);
|
||||
}
|
||||
// Zero-initialize the null entry
|
||||
memset(out_data + left_size, 0, width_size);
|
||||
|
||||
auto right_size = values_size() - static_cast<size_t>(null_data_offset);
|
||||
if (right_size > 0) {
|
||||
// skip the null fixed size value.
|
||||
auto out_offset = left_size + width_size;
|
||||
assert(out_data + out_offset + right_size == out_data + out_size);
|
||||
memcpy(out_data + out_offset, in_data + null_data_offset, right_size);
|
||||
}
|
||||
}
|
||||
|
||||
// Visit the stored values in insertion order.
|
||||
// The visitor function should have the signature `void(std::string_view)`
|
||||
// or `void(const std::string_view&)`.
|
||||
template <typename VisitFunc>
|
||||
void VisitValues(int32_t start, VisitFunc&& visit) const {
|
||||
for (int32_t i = start; i < size(); ++i) {
|
||||
visit(binary_builder_.GetView(i));
|
||||
}
|
||||
}
|
||||
|
||||
// Visit the stored value at a specific index in insertion order.
|
||||
// The visitor function should have the signature `void(std::string_view)`
|
||||
// or `void(const std::string_view&)`.
|
||||
template <typename VisitFunc>
|
||||
void VisitValue(int32_t idx, VisitFunc&& visit) const {
|
||||
visit(binary_builder_.GetView(idx));
|
||||
}
|
||||
|
||||
protected:
// A hash table entry only stores the memo index; the value's actual bytes
// live in binary_builder_ at that same index.
struct Payload {
  int32_t memo_index;
};

using HashTableType = HashTable<Payload>;
using HashTableEntry = typename HashTable<Payload>::Entry;
// Maps value hashes to memo indices (Payload).
HashTableType hash_table_;
// Owns the value bytes, appended in insertion (memo index) order.
BinaryBuilderT binary_builder_;

// Memo index of the null value, or kKeyNotFound if no null was inserted.
int32_t null_index_ = kKeyNotFound;
|
||||
|
||||
// Probe the hash table for an entry whose stored bytes equal
// `data[0..length)` under hash `h`.
std::pair<const HashTableEntry*, bool> Lookup(hash_t h, const void* data,
                                              builder_offset_type length) const {
  const std::string_view probe(static_cast<const char*>(data), length);
  auto is_equal = [&](const Payload* payload) {
    return binary_builder_.GetView(payload->memo_index) == probe;
  };
  return hash_table_.Lookup(h, is_equal);
}
|
||||
|
||||
public:
// Merge every value of `other_table` into this table, in the other
// table's insertion order. Values already present keep their existing
// memo index. Insertion failures are only checked in debug builds.
Status MergeTable(const BinaryMemoTable& other_table) {
  other_table.VisitValues(0, [this](std::string_view value) {
    int32_t ignored_index;
    ARROW_DCHECK_OK(this->GetOrInsert(value, &ignored_index));
  });
  return Status::OK();
}
|
||||
};
|
||||
|
||||
// Maps an Arrow DataType to the memo table implementation used to
// hash / dictionary-encode values of that type.
template <typename T, typename Enable = void>
struct HashTraits {};

// Booleans: tiny value domain, use the small-scalar table.
template <>
struct HashTraits<BooleanType> {
  using MemoTableType = SmallScalarMemoTable<bool>;
};

// 8-bit integers: value domain fits a small direct-mapped table.
template <typename T>
struct HashTraits<T, enable_if_8bit_int<T>> {
  using c_type = typename T::c_type;
  using MemoTableType = SmallScalarMemoTable<typename T::c_type>;
};

// Wider primitive (C-type-backed) types use the general scalar table.
template <typename T>
struct HashTraits<T, enable_if_t<has_c_type<T>::value && !is_8bit_int<T>::value>> {
  using c_type = typename T::c_type;
  using MemoTableType = ScalarMemoTable<c_type, HashTable>;
};

// Half floats have no native c_type; memoize via the Float16 wrapper.
template <>
struct HashTraits<HalfFloatType> {
  using MemoTableType = ScalarMemoTable<::arrow::util::Float16>;
};

// Variable-length binary/string types (32-bit offsets).
template <typename T>
struct HashTraits<T, enable_if_t<has_string_view<T>::value &&
                                 !std::is_base_of<LargeBinaryType, T>::value>> {
  using MemoTableType = BinaryMemoTable<BinaryBuilder>;
};

// Decimals are memoized by their fixed-width byte representation.
template <typename T>
struct HashTraits<T, enable_if_decimal<T>> {
  using MemoTableType = BinaryMemoTable<BinaryBuilder>;
};

// Large binary/string types need 64-bit offsets in the builder.
template <typename T>
struct HashTraits<T, enable_if_t<std::is_base_of<LargeBinaryType, T>::value>> {
  using MemoTableType = BinaryMemoTable<LargeBinaryBuilder>;
};
|
||||
|
||||
template <typename MemoTableType>
|
||||
static inline Status ComputeNullBitmap(MemoryPool* pool, const MemoTableType& memo_table,
|
||||
int64_t start_offset, int64_t* null_count,
|
||||
std::shared_ptr<Buffer>* null_bitmap) {
|
||||
int64_t dict_length = static_cast<int64_t>(memo_table.size()) - start_offset;
|
||||
int64_t null_index = memo_table.GetNull();
|
||||
|
||||
*null_count = 0;
|
||||
*null_bitmap = nullptr;
|
||||
|
||||
if (null_index != kKeyNotFound && null_index >= start_offset) {
|
||||
null_index -= start_offset;
|
||||
*null_count = 1;
|
||||
ARROW_ASSIGN_OR_RAISE(*null_bitmap,
|
||||
internal::BitmapAllButOne(pool, dict_length, null_index));
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
struct StringViewHash {
|
||||
// std::hash compatible hasher for use with std::unordered_*
|
||||
// (the std::hash specialization provided by nonstd constructs std::string
|
||||
// temporaries then invokes std::hash<std::string> against those)
|
||||
hash_t operator()(std::string_view value) const {
|
||||
return ComputeStringHash<0>(value.data(), static_cast<int64_t>(value.size()));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,137 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <type_traits>
|
||||
|
||||
#include "arrow/status.h"
|
||||
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class DataType;
|
||||
struct ArraySpan;
|
||||
struct Scalar;
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Detect the smallest byte width (at least `min_width`) able to represent
// every value in `values`. Implementation in int_util.cc.
ARROW_EXPORT
uint8_t DetectUIntWidth(const uint64_t* values, int64_t length, uint8_t min_width = 1);

// Same, with a validity byte per value — entries whose valid byte is zero
// are presumably ignored (null slots); confirm in int_util.cc.
ARROW_EXPORT
uint8_t DetectUIntWidth(const uint64_t* values, const uint8_t* valid_bytes,
                        int64_t length, uint8_t min_width = 1);

// Signed counterparts of DetectUIntWidth.
ARROW_EXPORT
uint8_t DetectIntWidth(const int64_t* values, int64_t length, uint8_t min_width = 1);

ARROW_EXPORT
uint8_t DetectIntWidth(const int64_t* values, const uint8_t* valid_bytes, int64_t length,
                       uint8_t min_width = 1);

// Copy `length` 64-bit integers into narrower (or same-width) destination
// storage. Values are assumed to fit the destination type.
ARROW_EXPORT
void DowncastInts(const int64_t* source, int8_t* dest, int64_t length);

ARROW_EXPORT
void DowncastInts(const int64_t* source, int16_t* dest, int64_t length);

ARROW_EXPORT
void DowncastInts(const int64_t* source, int32_t* dest, int64_t length);

ARROW_EXPORT
void DowncastInts(const int64_t* source, int64_t* dest, int64_t length);

// Unsigned counterparts of DowncastInts.
ARROW_EXPORT
void DowncastUInts(const uint64_t* source, uint8_t* dest, int64_t length);

ARROW_EXPORT
void DowncastUInts(const uint64_t* source, uint16_t* dest, int64_t length);

ARROW_EXPORT
void DowncastUInts(const uint64_t* source, uint32_t* dest, int64_t length);

ARROW_EXPORT
void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length);

// Widen `length` 32-bit integers into 64-bit destination storage.
ARROW_EXPORT
void UpcastInts(const int32_t* source, int64_t* dest, int64_t length);
|
||||
|
||||
// Copy `length` integers between buffers of possibly different widths.
// This overload is selected when the destination is not wider than the
// source and dispatches to DowncastInts (values must fit the destination).
template <typename InputInt, typename OutputInt>
inline typename std::enable_if<(sizeof(InputInt) >= sizeof(OutputInt))>::type CastInts(
    const InputInt* source, OutputInt* dest, int64_t length) {
  DowncastInts(source, dest, length);
}

// Overload selected when the destination is strictly wider than the
// source; dispatches to UpcastInts (lossless widening).
template <typename InputInt, typename OutputInt>
inline typename std::enable_if<(sizeof(InputInt) < sizeof(OutputInt))>::type CastInts(
    const InputInt* source, OutputInt* dest, int64_t length) {
  UpcastInts(source, dest, length);
}
|
||||
|
||||
template <typename InputInt, typename OutputInt>
|
||||
ARROW_EXPORT void TransposeInts(const InputInt* source, OutputInt* dest, int64_t length,
|
||||
const int32_t* transpose_map);
|
||||
|
||||
ARROW_EXPORT
|
||||
Status TransposeInts(const DataType& src_type, const DataType& dest_type,
|
||||
const uint8_t* src, uint8_t* dest, int64_t src_offset,
|
||||
int64_t dest_offset, int64_t length, const int32_t* transpose_map);
|
||||
|
||||
/// \brief Do vectorized boundschecking of integer-type array indices. The
|
||||
/// indices must be nonnegative and strictly less than the passed upper
|
||||
/// limit (which is usually the length of an array that is being indexed-into).
|
||||
ARROW_EXPORT
|
||||
Status CheckIndexBounds(const ArraySpan& values, uint64_t upper_limit);
|
||||
|
||||
/// \brief Boundscheck integer values to determine if they are all between the
|
||||
/// passed upper and lower limits (inclusive). Upper and lower bounds must be
|
||||
/// the same type as the data and are not currently casted.
|
||||
ARROW_EXPORT
|
||||
Status CheckIntegersInRange(const ArraySpan& values, const Scalar& bound_lower,
|
||||
const Scalar& bound_upper);
|
||||
|
||||
/// \brief Use CheckIntegersInRange to determine whether the passed integers
|
||||
/// can fit safely in the passed integer type. This helps quickly determine if
|
||||
/// integer narrowing (e.g. int64->int32) is safe to do.
|
||||
ARROW_EXPORT
|
||||
Status IntegersCanFit(const ArraySpan& values, const DataType& target_type);
|
||||
|
||||
/// \brief Convenience for boundschecking a single Scalar value
|
||||
ARROW_EXPORT
|
||||
Status IntegersCanFit(const Scalar& value, const DataType& target_type);
|
||||
|
||||
/// Upcast an integer to the largest possible width (currently 64 bits)
|
||||
|
||||
// Widen any signed integral value to int64_t (the largest supported
// signed width); the numeric value is preserved exactly.
template <typename Integer>
typename std::enable_if<
    std::is_integral<Integer>::value && std::is_signed<Integer>::value, int64_t>::type
UpcastInt(Integer v) {
  return static_cast<int64_t>(v);
}
|
||||
|
||||
// Widen any unsigned integral value to uint64_t (the largest supported
// unsigned width); the numeric value is preserved exactly.
template <typename Integer>
typename std::enable_if<
    std::is_integral<Integer>::value && std::is_unsigned<Integer>::value, uint64_t>::type
UpcastInt(Integer v) {
  return static_cast<uint64_t>(v);
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,118 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <type_traits>
|
||||
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
// "safe-math.h" includes <intsafe.h> from the Windows headers.
|
||||
#include "arrow/util/windows_compatibility.h"
|
||||
#include "arrow/vendored/portable-snippets/safe-math.h"
|
||||
// clang-format off (avoid include reordering)
|
||||
#include "arrow/util/windows_fixup.h"
|
||||
// clang-format on
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
// Define functions AddWithOverflow, SubtractWithOverflow, MultiplyWithOverflow
// with the signature `bool(T u, T v, T* out)` where T is an integer type.
// On overflow, these functions return true. Otherwise, false is returned
// and `out` is updated with the result of the operation.

// The vendored psnip_safe_* helpers return a truthy value on success,
// hence the negation to obtain "true on overflow".
#define OP_WITH_OVERFLOW(_func_name, _psnip_op, _type, _psnip_type)           \
  [[nodiscard]] static inline bool _func_name(_type u, _type v, _type* out) { \
    return !psnip_safe_##_psnip_type##_##_psnip_op(out, u, v);                \
  }

// Instantiate one overload per fixed-width integer type.
#define OPS_WITH_OVERFLOW(_func_name, _psnip_op)            \
  OP_WITH_OVERFLOW(_func_name, _psnip_op, int8_t, int8)     \
  OP_WITH_OVERFLOW(_func_name, _psnip_op, int16_t, int16)   \
  OP_WITH_OVERFLOW(_func_name, _psnip_op, int32_t, int32)   \
  OP_WITH_OVERFLOW(_func_name, _psnip_op, int64_t, int64)   \
  OP_WITH_OVERFLOW(_func_name, _psnip_op, uint8_t, uint8)   \
  OP_WITH_OVERFLOW(_func_name, _psnip_op, uint16_t, uint16) \
  OP_WITH_OVERFLOW(_func_name, _psnip_op, uint32_t, uint32) \
  OP_WITH_OVERFLOW(_func_name, _psnip_op, uint64_t, uint64)

OPS_WITH_OVERFLOW(AddWithOverflow, add)
OPS_WITH_OVERFLOW(SubtractWithOverflow, sub)
OPS_WITH_OVERFLOW(MultiplyWithOverflow, mul)
OPS_WITH_OVERFLOW(DivideWithOverflow, div)

#undef OP_WITH_OVERFLOW
#undef OPS_WITH_OVERFLOW

// Define function NegateWithOverflow with the signature `bool(T u, T* out)`
// where T is a signed integer type. On overflow, these functions return true.
// Otherwise, false is returned and `out` is updated with the result of the
// operation.

#define UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, _type, _psnip_type) \
  [[nodiscard]] static inline bool _func_name(_type u, _type* out) {      \
    return !psnip_safe_##_psnip_type##_##_psnip_op(out, u);               \
  }

// Negation can only overflow for signed types (e.g. negating INT32_MIN),
// so only the signed widths are instantiated.
#define SIGNED_UNARY_OPS_WITH_OVERFLOW(_func_name, _psnip_op)   \
  UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int8_t, int8)   \
  UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int16_t, int16) \
  UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int32_t, int32) \
  UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int64_t, int64)

SIGNED_UNARY_OPS_WITH_OVERFLOW(NegateWithOverflow, neg)

#undef UNARY_OP_WITH_OVERFLOW
#undef SIGNED_UNARY_OPS_WITH_OVERFLOW
|
||||
|
||||
/// Signed addition whose overflow behaviour is well defined: the operands
/// are added in the corresponding unsigned type (wrapping modulo 2^N)
/// and the result converted back, avoiding signed-overflow UB.
template <typename SignedInt>
SignedInt SafeSignedAdd(SignedInt u, SignedInt v) {
  using UnsignedInt = typename std::make_unsigned<SignedInt>::type;
  const auto lhs = static_cast<UnsignedInt>(u);
  const auto rhs = static_cast<UnsignedInt>(v);
  return static_cast<SignedInt>(lhs + rhs);
}
|
||||
|
||||
/// Signed subtraction whose overflow behaviour is well defined: performed
/// in the corresponding unsigned type (wrapping modulo 2^N), then
/// converted back, avoiding signed-overflow UB.
template <typename SignedInt>
SignedInt SafeSignedSubtract(SignedInt u, SignedInt v) {
  using UnsignedInt = typename std::make_unsigned<SignedInt>::type;
  const auto lhs = static_cast<UnsignedInt>(u);
  const auto rhs = static_cast<UnsignedInt>(v);
  return static_cast<SignedInt>(lhs - rhs);
}
|
||||
|
||||
/// Signed negation whose overflow behaviour is well defined: computed as
/// an unsigned subtraction from zero (wrapping modulo 2^N), so negating
/// the minimum value yields the minimum value instead of UB.
template <typename SignedInt>
SignedInt SafeSignedNegate(SignedInt u) {
  using UnsignedInt = typename std::make_unsigned<SignedInt>::type;
  const auto magnitude = static_cast<UnsignedInt>(u);
  return static_cast<SignedInt>(static_cast<UnsignedInt>(0) - magnitude);
}
|
||||
|
||||
/// Signed left shift whose behaviour is well defined for negative values
/// and on overflow: the shift is performed in the corresponding unsigned
/// type and the result converted back.
template <typename SignedInt, typename Shift>
SignedInt SafeLeftShift(SignedInt u, Shift shift) {
  using UnsignedInt = typename std::make_unsigned<SignedInt>::type;
  const auto bits = static_cast<UnsignedInt>(u);
  return static_cast<SignedInt>(bits << shift);
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,458 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef _WIN32
|
||||
# define ARROW_HAVE_SIGACTION 1
|
||||
#endif
|
||||
|
||||
#include <atomic>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#if ARROW_HAVE_SIGACTION
|
||||
# include <csignal> // Needed for struct sigaction
|
||||
#endif
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/windows_fixup.h"
|
||||
|
||||
namespace arrow::internal {
|
||||
|
||||
// NOTE: 8-bit path strings on Windows are encoded using UTF-8.
|
||||
// Using MBCS would fail encoding some paths.
|
||||
|
||||
#if defined(_WIN32)
|
||||
using NativePathString = std::wstring;
|
||||
#else
|
||||
using NativePathString = std::string;
|
||||
#endif
|
||||
|
||||
// A filename in the platform's native path encoding (wide string on
// Windows, 8-bit string elsewhere; see the note above on UTF-8).
// The pimpl idiom keeps platform details out of this header.
class ARROW_EXPORT PlatformFilename {
 public:
  struct Impl;

  ~PlatformFilename();
  PlatformFilename();
  PlatformFilename(const PlatformFilename&);
  PlatformFilename(PlatformFilename&&);
  PlatformFilename& operator=(const PlatformFilename&);
  PlatformFilename& operator=(PlatformFilename&&);
  explicit PlatformFilename(NativePathString path);
  explicit PlatformFilename(const NativePathString::value_type* path);

  // The path in native encoding.
  const NativePathString& ToNative() const;
  // The path as a (UTF-8 on Windows) std::string.
  std::string ToString() const;

  // The parent directory of this path.
  PlatformFilename Parent() const;
  // Resolve the path to a canonical form (may fail, e.g. if it
  // does not exist — confirm in io_util.cc).
  Result<PlatformFilename> Real() const;

  // These functions can fail for character encoding reasons.
  static Result<PlatformFilename> FromString(std::string_view file_name);
  Result<PlatformFilename> Join(std::string_view child_name) const;

  PlatformFilename Join(const PlatformFilename& child_name) const;

  bool operator==(const PlatformFilename& other) const;
  bool operator!=(const PlatformFilename& other) const;

  // Made public to avoid the proliferation of friend declarations.
  const Impl* impl() const { return impl_.get(); }

 private:
  std::unique_ptr<Impl> impl_;

  explicit PlatformFilename(Impl impl);
};
|
||||
|
||||
/// Create a directory if it doesn't exist.
|
||||
///
|
||||
/// Return whether the directory was created.
|
||||
ARROW_EXPORT
|
||||
Result<bool> CreateDir(const PlatformFilename& dir_path);
|
||||
|
||||
/// Create a directory and its parents if it doesn't exist.
|
||||
///
|
||||
/// Return whether the directory was created.
|
||||
ARROW_EXPORT
|
||||
Result<bool> CreateDirTree(const PlatformFilename& dir_path);
|
||||
|
||||
/// Delete a directory's contents (but not the directory itself) if it exists.
|
||||
///
|
||||
/// Return whether the directory existed.
|
||||
ARROW_EXPORT
|
||||
Result<bool> DeleteDirContents(const PlatformFilename& dir_path,
|
||||
bool allow_not_found = true);
|
||||
|
||||
/// Delete a directory tree if it exists.
|
||||
///
|
||||
/// Return whether the directory existed.
|
||||
ARROW_EXPORT
|
||||
Result<bool> DeleteDirTree(const PlatformFilename& dir_path, bool allow_not_found = true);
|
||||
|
||||
// Non-recursively list the contents of the given directory.
|
||||
// The returned names are the children's base names, not including dir_path.
|
||||
ARROW_EXPORT
|
||||
Result<std::vector<PlatformFilename>> ListDir(const PlatformFilename& dir_path);
|
||||
|
||||
/// Delete a file if it exists.
|
||||
///
|
||||
/// Return whether the file existed.
|
||||
ARROW_EXPORT
|
||||
Result<bool> DeleteFile(const PlatformFilename& file_path, bool allow_not_found = true);
|
||||
|
||||
/// Return whether a file exists.
|
||||
ARROW_EXPORT
|
||||
Result<bool> FileExists(const PlatformFilename& path);
|
||||
|
||||
// TODO expose this more publicly to make it available from io/file.h?
|
||||
/// A RAII wrapper for a file descriptor.
|
||||
///
|
||||
/// The underlying file descriptor is automatically closed on destruction.
|
||||
/// Moving is supported with well-defined semantics.
|
||||
/// Furthermore, closing is idempotent.
|
||||
class ARROW_EXPORT FileDescriptor {
 public:
  // Default-constructed: owns no descriptor (fd() == -1).
  FileDescriptor() = default;
  // Take ownership of an already-open descriptor.
  explicit FileDescriptor(int fd) : fd_(fd) {}
  FileDescriptor(FileDescriptor&&);
  FileDescriptor& operator=(FileDescriptor&&);

  ~FileDescriptor();

  // Close the descriptor if still open (idempotent, per class docs above).
  Status Close();

  /// May return -1 if closed or default-initialized
  int fd() const { return fd_.load(); }

  /// Detach and return the underlying file descriptor
  int Detach();

  // Whether the descriptor was closed, detached or never set.
  bool closed() const { return fd_.load() == -1; }

 protected:
  static void CloseFromDestructor(int fd);

  // Atomic so concurrent readers of fd()/closed() observe a consistent
  // value; -1 is the "no descriptor" sentinel.
  std::atomic<int> fd_{-1};
};
|
||||
|
||||
/// Open a file for reading and return a file descriptor.
|
||||
ARROW_EXPORT
|
||||
Result<FileDescriptor> FileOpenReadable(const PlatformFilename& file_name);
|
||||
|
||||
/// Open a file for writing and return a file descriptor.
|
||||
ARROW_EXPORT
|
||||
Result<FileDescriptor> FileOpenWritable(const PlatformFilename& file_name,
|
||||
bool write_only = true, bool truncate = true,
|
||||
bool append = false);
|
||||
|
||||
/// Read from current file position. Return number of bytes read.
|
||||
ARROW_EXPORT
|
||||
Result<int64_t> FileRead(int fd, uint8_t* buffer, int64_t nbytes);
|
||||
/// Read from given file position. Return number of bytes read.
|
||||
ARROW_EXPORT
|
||||
Result<int64_t> FileReadAt(int fd, uint8_t* buffer, int64_t position, int64_t nbytes);
|
||||
|
||||
ARROW_EXPORT
|
||||
Status FileWrite(int fd, const uint8_t* buffer, const int64_t nbytes);
|
||||
ARROW_EXPORT
|
||||
Status FileTruncate(int fd, const int64_t size);
|
||||
|
||||
ARROW_EXPORT
|
||||
Status FileSeek(int fd, int64_t pos);
|
||||
ARROW_EXPORT
|
||||
Status FileSeek(int fd, int64_t pos, int whence);
|
||||
ARROW_EXPORT
|
||||
Result<int64_t> FileTell(int fd);
|
||||
ARROW_EXPORT
|
||||
Result<int64_t> FileGetSize(int fd);
|
||||
|
||||
ARROW_EXPORT
|
||||
Status FileClose(int fd);
|
||||
|
||||
// A pair of pipe endpoints: `rfd` is the read end, `wfd` the write end.
struct Pipe {
  FileDescriptor rfd;
  FileDescriptor wfd;

  // Close both ends; the two statuses are combined via Status::operator&
  // (NOTE(review): presumably yields the first non-OK status — confirm in
  // status.h).
  Status Close() { return rfd.Close() & wfd.Close(); }
};
|
||||
|
||||
ARROW_EXPORT
|
||||
Result<Pipe> CreatePipe();
|
||||
|
||||
ARROW_EXPORT
|
||||
Status SetPipeFileDescriptorNonBlocking(int fd);
|
||||
|
||||
class ARROW_EXPORT SelfPipe {
|
||||
public:
|
||||
static Result<std::shared_ptr<SelfPipe>> Make(bool signal_safe);
|
||||
virtual ~SelfPipe();
|
||||
|
||||
/// \brief Wait for a wakeup.
|
||||
///
|
||||
/// Status::Invalid is returned if the pipe has been shutdown.
|
||||
/// Otherwise the next sent payload is returned.
|
||||
virtual Result<uint64_t> Wait() = 0;
|
||||
|
||||
/// \brief Wake up the pipe by sending a payload.
|
||||
///
|
||||
/// This method is async-signal-safe if `signal_safe` was set to true.
|
||||
virtual void Send(uint64_t payload) = 0;
|
||||
|
||||
/// \brief Wake up the pipe and shut it down.
|
||||
virtual Status Shutdown() = 0;
|
||||
};
|
||||
|
||||
ARROW_EXPORT
|
||||
int64_t GetPageSize();
|
||||
|
||||
struct MemoryRegion {
|
||||
void* addr;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
ARROW_EXPORT
|
||||
Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes,
|
||||
void** new_addr);
|
||||
ARROW_EXPORT
|
||||
Status MemoryAdviseWillNeed(const std::vector<MemoryRegion>& regions);
|
||||
|
||||
ARROW_EXPORT
|
||||
Result<std::string> GetEnvVar(const char* name);
|
||||
ARROW_EXPORT
|
||||
Result<std::string> GetEnvVar(const std::string& name);
|
||||
ARROW_EXPORT
|
||||
Result<NativePathString> GetEnvVarNative(const char* name);
|
||||
ARROW_EXPORT
|
||||
Result<NativePathString> GetEnvVarNative(const std::string& name);
|
||||
|
||||
ARROW_EXPORT
|
||||
Status SetEnvVar(const char* name, const char* value);
|
||||
ARROW_EXPORT
|
||||
Status SetEnvVar(const std::string& name, const std::string& value);
|
||||
ARROW_EXPORT
|
||||
Status DelEnvVar(const char* name);
|
||||
ARROW_EXPORT
|
||||
Status DelEnvVar(const std::string& name);
|
||||
|
||||
ARROW_EXPORT
|
||||
std::string ErrnoMessage(int errnum);
|
||||
#if _WIN32
|
||||
ARROW_EXPORT
|
||||
std::string WinErrorMessage(int errnum);
|
||||
#endif
|
||||
|
||||
ARROW_EXPORT
|
||||
std::shared_ptr<StatusDetail> StatusDetailFromErrno(int errnum);
|
||||
ARROW_EXPORT
|
||||
std::optional<int> ErrnoFromStatusDetail(const StatusDetail& detail);
|
||||
#if _WIN32
|
||||
ARROW_EXPORT
|
||||
std::shared_ptr<StatusDetail> StatusDetailFromWinError(int errnum);
|
||||
#endif
|
||||
ARROW_EXPORT
|
||||
std::shared_ptr<StatusDetail> StatusDetailFromSignal(int signum);
|
||||
|
||||
// Build a Status with the given code carrying an errno-based StatusDetail,
// so the original errno can later be recovered with ErrnoFromStatus().
// Extra arguments are forwarded into the status message.
template <typename... Args>
Status StatusFromErrno(int errnum, StatusCode code, Args&&... args) {
  return Status::FromDetailAndArgs(code, StatusDetailFromErrno(errnum),
                                   std::forward<Args>(args)...);
}

// Convenience wrapper: an IOError status from an errno value.
template <typename... Args>
Status IOErrorFromErrno(int errnum, Args&&... args) {
  return StatusFromErrno(errnum, StatusCode::IOError, std::forward<Args>(args)...);
}
|
||||
|
||||
#if _WIN32
|
||||
template <typename... Args>
|
||||
Status StatusFromWinError(int errnum, StatusCode code, Args&&... args) {
|
||||
return Status::FromDetailAndArgs(code, StatusDetailFromWinError(errnum),
|
||||
std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
Status IOErrorFromWinError(int errnum, Args&&... args) {
|
||||
return StatusFromWinError(errnum, StatusCode::IOError, std::forward<Args>(args)...);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Build a Status with the given code carrying a signal-based StatusDetail,
// so the signal number can later be recovered with SignalFromStatus().
// Extra arguments are forwarded into the status message.
template <typename... Args>
Status StatusFromSignal(int signum, StatusCode code, Args&&... args) {
  return Status::FromDetailAndArgs(code, StatusDetailFromSignal(signum),
                                   std::forward<Args>(args)...);
}

// Convenience wrapper: a Cancelled status from a signal number.
template <typename... Args>
Status CancelledFromSignal(int signum, Args&&... args) {
  return StatusFromSignal(signum, StatusCode::Cancelled, std::forward<Args>(args)...);
}
|
||||
|
||||
ARROW_EXPORT
|
||||
int ErrnoFromStatus(const Status&);
|
||||
|
||||
// Always returns 0 on non-Windows platforms (for Python).
|
||||
ARROW_EXPORT
|
||||
int WinErrorFromStatus(const Status&);
|
||||
|
||||
ARROW_EXPORT
|
||||
int SignalFromStatus(const Status&);
|
||||
|
||||
class ARROW_EXPORT TemporaryDir {
|
||||
public:
|
||||
~TemporaryDir();
|
||||
|
||||
/// '/'-terminated path to the temporary dir
|
||||
const PlatformFilename& path() { return path_; }
|
||||
|
||||
/// Create a temporary subdirectory in the system temporary dir,
|
||||
/// named starting with `prefix`.
|
||||
static Result<std::unique_ptr<TemporaryDir>> Make(const std::string& prefix);
|
||||
|
||||
private:
|
||||
PlatformFilename path_;
|
||||
|
||||
explicit TemporaryDir(PlatformFilename&&);
|
||||
};
|
||||
|
||||
class ARROW_EXPORT SignalHandler {
|
||||
public:
|
||||
using Callback = void (*)(int);
|
||||
|
||||
SignalHandler();
|
||||
explicit SignalHandler(Callback cb);
|
||||
#if ARROW_HAVE_SIGACTION
|
||||
explicit SignalHandler(const struct sigaction& sa);
|
||||
#endif
|
||||
|
||||
Callback callback() const;
|
||||
#if ARROW_HAVE_SIGACTION
|
||||
const struct sigaction& action() const;
|
||||
#endif
|
||||
|
||||
protected:
|
||||
#if ARROW_HAVE_SIGACTION
|
||||
// Storing the full sigaction allows to restore the entire signal handling
|
||||
// configuration.
|
||||
struct sigaction sa_;
|
||||
#else
|
||||
Callback cb_;
|
||||
#endif
|
||||
};
|
||||
|
||||
/// \brief Return the current handler for the given signal number.
|
||||
ARROW_EXPORT
|
||||
Result<SignalHandler> GetSignalHandler(int signum);
|
||||
|
||||
/// \brief Set a new handler for the given signal number.
|
||||
///
|
||||
/// The old signal handler is returned.
|
||||
ARROW_EXPORT
|
||||
Result<SignalHandler> SetSignalHandler(int signum, const SignalHandler& handler);
|
||||
|
||||
/// \brief Reinstate the signal handler
|
||||
///
|
||||
/// For use in signal handlers. This is needed on platforms without sigaction()
|
||||
/// such as Windows, as the default signal handler is restored there as
|
||||
/// soon as a signal is raised.
|
||||
ARROW_EXPORT
|
||||
void ReinstateSignalHandler(int signum, SignalHandler::Callback handler);
|
||||
|
||||
/// \brief Send a signal to the current process
|
||||
///
|
||||
/// The thread which will receive the signal is unspecified.
|
||||
ARROW_EXPORT
|
||||
Status SendSignal(int signum);
|
||||
|
||||
/// \brief Send a signal to the given thread
|
||||
///
|
||||
/// This function isn't supported on Windows.
|
||||
ARROW_EXPORT
|
||||
Status SendSignalToThread(int signum, uint64_t thread_id);
|
||||
|
||||
/// \brief Get an unpredictable random seed
|
||||
///
|
||||
/// This function may be slightly costly, so should only be used to initialize
|
||||
/// a PRNG, not to generate a large amount of random numbers.
|
||||
/// It is better to use this function rather than std::random_device, unless
|
||||
/// absolutely necessary (e.g. to generate a cryptographic secret).
|
||||
ARROW_EXPORT
|
||||
int64_t GetRandomSeed();
|
||||
|
||||
/// \brief Get the current thread id
|
||||
///
|
||||
/// In addition to having the same properties as std::thread, the returned value
|
||||
/// is a regular integer value, which is more convenient than an opaque type.
|
||||
ARROW_EXPORT
|
||||
uint64_t GetThreadId();
|
||||
|
||||
/// \brief Get the current memory used by the current process in bytes
|
||||
///
|
||||
/// This function supports Windows, Linux, and Mac and will return 0 otherwise
|
||||
ARROW_EXPORT
|
||||
int64_t GetCurrentRSS();
|
||||
|
||||
/// \brief Get the total memory available to the system in bytes
|
||||
///
|
||||
/// This function supports Windows, Linux, and Mac and will return 0 otherwise
|
||||
ARROW_EXPORT
|
||||
int64_t GetTotalMemoryBytes();
|
||||
|
||||
/// \brief Get the number of affinity core on the system.
|
||||
///
|
||||
/// This is only implemented on Linux.
|
||||
/// If a value is returned, it is guaranteed to be greater or equal to one.
|
||||
ARROW_EXPORT Result<int32_t> GetNumAffinityCores();
|
||||
|
||||
/// \brief Load a dynamic library
|
||||
///
|
||||
/// This wraps dlopen() except on Windows, where LoadLibrary() is called.
|
||||
/// These two platforms handle absolute paths consistently; relative paths
|
||||
/// or the library's bare name may be handled but inconsistently.
|
||||
///
|
||||
/// \return An opaque handle for the dynamic library, which can be used for
|
||||
/// subsequent symbol lookup. Nullptr will never be returned; instead
|
||||
/// an error will be raised.
|
||||
ARROW_EXPORT Result<void*> LoadDynamicLibrary(const PlatformFilename& path);
|
||||
|
||||
/// \brief Load a dynamic library
|
||||
///
|
||||
/// An overload taking null terminated string.
|
||||
ARROW_EXPORT Result<void*> LoadDynamicLibrary(const char* path);
|
||||
|
||||
/// \brief Retrieve a symbol by name from a library handle.
|
||||
///
|
||||
/// This wraps dlsym() except on Windows, where GetProcAddress() is called.
|
||||
///
|
||||
/// \return The address associated with the named symbol. Nullptr will never be
|
||||
/// returned; instead an error will be raised.
|
||||
ARROW_EXPORT Result<void*> GetSymbol(void* handle, const char* name);
|
||||
|
||||
template <typename T>
|
||||
Result<T*> GetSymbolAs(void* handle, const char* name) {
|
||||
ARROW_ASSIGN_OR_RAISE(void* sym, GetSymbol(handle, name));
|
||||
return reinterpret_cast<T*>(sym);
|
||||
}
|
||||
|
||||
} // namespace arrow::internal
|
||||
@@ -0,0 +1,582 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <tuple>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/compare.h"
|
||||
#include "arrow/util/functional.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
// Forward declaration so the IterationTraits machinery below can name
// Iterator<T> before its definition.
template <typename T>
class Iterator;
|
||||
|
||||
template <typename T>
|
||||
struct IterationTraits {
|
||||
/// \brief a reserved value which indicates the end of iteration. By
|
||||
/// default this is NULLPTR since most iterators yield pointer types.
|
||||
/// Specialize IterationTraits if different end semantics are required.
|
||||
///
|
||||
/// Note: This should not be used to determine if a given value is a
|
||||
/// terminal value. Use IsIterationEnd (which uses IsEnd) instead. This
|
||||
/// is only for returning terminal values.
|
||||
static T End() { return T(NULLPTR); }
|
||||
|
||||
/// \brief Checks to see if the value is a terminal value.
|
||||
/// A method is used here since T is not necessarily comparable in many
|
||||
/// cases even though it has a distinct final value
|
||||
static bool IsEnd(const T& val) { return val == End(); }
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
T IterationEnd() {
|
||||
return IterationTraits<T>::End();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool IsIterationEnd(const T& val) {
|
||||
return IterationTraits<T>::IsEnd(val);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct IterationTraits<std::optional<T>> {
|
||||
/// \brief by default when iterating through a sequence of optional,
|
||||
/// nullopt indicates the end of iteration.
|
||||
/// Specialize IterationTraits if different end semantics are required.
|
||||
static std::optional<T> End() { return std::nullopt; }
|
||||
|
||||
/// \brief by default when iterating through a sequence of optional,
|
||||
/// nullopt (!has_value()) indicates the end of iteration.
|
||||
/// Specialize IterationTraits if different end semantics are required.
|
||||
static bool IsEnd(const std::optional<T>& val) { return !val.has_value(); }
|
||||
|
||||
// TODO(bkietz) The range-for loop over Iterator<optional<T>> yields
|
||||
// Result<optional<T>> which is unnecessary (since only the unyielded end optional
|
||||
// is nullopt. Add IterationTraits::GetRangeElement() to handle this case
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct IterationTraits<Enumerated<T>> {
|
||||
static Enumerated<T> End() { return Enumerated<T>{IterationEnd<T>(), -1, false}; }
|
||||
static bool IsEnd(const Enumerated<T>& val) { return val.index < 0; }
|
||||
};
|
||||
|
||||
/// \brief A generic Iterator that can return errors
template <typename T>
class Iterator : public util::EqualityComparable<Iterator<T>> {
 public:
  /// \brief Iterator may be constructed from any type which has a member function
  /// with signature Result<T> Next();
  /// End of iterator is signalled by returning IteratorTraits<T>::End();
  ///
  /// The argument is moved or copied to the heap and kept in a unique_ptr<void>. Only
  /// its destructor and its Next method (which are stored in function pointers) are
  /// referenced after construction.
  ///
  /// This approach is used to dodge MSVC linkage hell (ARROW-6244, ARROW-6558) when using
  /// an abstract template base class: instead of being inlined as usual for a template
  /// function the base's virtual destructor will be exported, leading to multiple
  /// definition errors when linking to any other TU where the base is instantiated.
  template <typename Wrapped>
  explicit Iterator(Wrapped has_next)
      : ptr_(new Wrapped(std::move(has_next)), Delete<Wrapped>), next_(Next<Wrapped>) {}

  // A default-constructed Iterator is null (the no-op deleter matches the
  // null payload); its Next() immediately reports end-of-iteration.
  Iterator() : ptr_(NULLPTR, [](void*) {}) {}

  /// \brief Return the next element of the sequence, IterationTraits<T>::End() when the
  /// iteration is completed.
  Result<T> Next() {
    if (ptr_) {
      auto next_result = next_(ptr_.get());
      // Release the wrapped iterator as soon as it reports end, so its
      // resources are freed eagerly and later calls short-circuit below.
      if (next_result.ok() && IsIterationEnd(next_result.ValueUnsafe())) {
        ptr_.reset(NULLPTR);
      }
      return next_result;
    } else {
      return IterationTraits<T>::End();
    }
  }

  /// Pass each element of the sequence to a visitor. Will return any error status
  /// returned by the visitor, terminating iteration.
  template <typename Visitor>
  Status Visit(Visitor&& visitor) {
    for (;;) {
      ARROW_ASSIGN_OR_RAISE(auto value, Next());

      if (IsIterationEnd(value)) break;

      ARROW_RETURN_NOT_OK(visitor(std::move(value)));
    }

    return Status::OK();
  }

  /// Iterators will only compare equal if they are both null.
  /// Equality comparability is required to make an Iterator of Iterators
  /// (to check for the end condition).
  bool Equals(const Iterator& other) const { return ptr_ == other.ptr_; }

  // True when the iterator is non-null (i.e. not exhausted/default-built).
  explicit operator bool() const { return ptr_ != NULLPTR; }

  /// \brief Input-iterator adapter enabling range-for loops over an Iterator.
  ///
  /// Each dereference yields a Result<T>; errors are yielded once and then
  /// the range terminates (see Next() below).
  class RangeIterator {
   public:
    RangeIterator() : value_(IterationTraits<T>::End()) {}

    // The underlying Iterator is kept on the heap so that copies of this
    // RangeIterator (required by the iterator concept) share the same state.
    explicit RangeIterator(Iterator i)
        : value_(IterationTraits<T>::End()),
          iterator_(std::make_shared<Iterator>(std::move(i))) {
      Next();
    }

    bool operator!=(const RangeIterator& other) const { return value_ != other.value_; }

    RangeIterator& operator++() {
      Next();
      return *this;
    }

    // Yields the current Result (moving it out); the slot is reset to the
    // end sentinel so the value is only handed out once.
    Result<T> operator*() {
      ARROW_RETURN_NOT_OK(value_);

      auto value = std::move(value_);
      value_ = IterationTraits<T>::End();
      return value;
    }

   private:
    // Advance to the next Result. Once an error has been observed, it is
    // replaced with the end sentinel so the range-for loop terminates right
    // after yielding the error.
    void Next() {
      if (!value_.ok()) {
        value_ = IterationTraits<T>::End();
        return;
      }
      value_ = iterator_->Next();
    }

    Result<T> value_;
    std::shared_ptr<Iterator> iterator_;
  };

  // Note: begin() moves this Iterator into the range; iterating consumes it.
  RangeIterator begin() { return RangeIterator(std::move(*this)); }

  RangeIterator end() { return RangeIterator(); }

  /// \brief Move every element of this iterator into a vector.
  Result<std::vector<T>> ToVector() {
    std::vector<T> out;
    for (auto maybe_element : *this) {
      ARROW_ASSIGN_OR_RAISE(auto element, maybe_element);
      out.push_back(std::move(element));
    }
    return out;
  }

 private:
  /// Implementation of deleter for ptr_: Casts from void* to the wrapped type and
  /// deletes that.
  template <typename HasNext>
  static void Delete(void* ptr) {
    delete static_cast<HasNext*>(ptr);
  }

  /// Implementation of Next: Casts from void* to the wrapped type and invokes that
  /// type's Next member function.
  template <typename HasNext>
  static Result<T> Next(void* ptr) {
    return static_cast<HasNext*>(ptr)->Next();
  }

  /// ptr_ is a unique_ptr to void with a custom deleter: a function pointer which first
  /// casts from void* to a pointer to the wrapped type then deletes that.
  std::unique_ptr<void, void (*)(void*)> ptr_;

  /// next_ is a function pointer which first casts from void* to a pointer to the wrapped
  /// type then invokes its Next member function.
  Result<T> (*next_)(void*) = NULLPTR;
};
|
||||
|
||||
/// \brief The outcome of one Transformer invocation: an optional yielded
/// value plus flags telling the driving iterator how to proceed.
template <typename T>
struct TransformFlow {
  using YieldValueType = T;

  /// Construct a flow that yields `value`; `ready_for_next` states whether
  /// the source element has been fully consumed.
  TransformFlow(YieldValueType value, bool ready_for_next)
      : ready_for_next_(ready_for_next), yield_value_(std::move(value)) {}

  /// Construct a flow with no yielded value, optionally marking the
  /// transformed iteration as finished.
  TransformFlow(bool finished, bool ready_for_next)
      : finished_(finished), ready_for_next_(ready_for_next) {}

  /// True when the transformer yielded a value.
  bool HasValue() const { return yield_value_.has_value(); }
  /// True when the transformed iteration should terminate.
  bool Finished() const { return finished_; }
  /// True when the driver may advance to the next source element.
  bool ReadyForNext() const { return ready_for_next_; }
  /// The yielded value; only meaningful when HasValue() is true.
  T Value() const { return *yield_value_; }

  bool finished_ = false;
  bool ready_for_next_ = false;
  std::optional<YieldValueType> yield_value_;
};
|
||||
|
||||
struct TransformFinish {
|
||||
template <typename T>
|
||||
operator TransformFlow<T>() && { // NOLINT explicit
|
||||
return TransformFlow<T>(true, true);
|
||||
}
|
||||
};
|
||||
|
||||
struct TransformSkip {
|
||||
template <typename T>
|
||||
operator TransformFlow<T>() && { // NOLINT explicit
|
||||
return TransformFlow<T>(false, true);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
TransformFlow<T> TransformYield(T value = {}, bool ready_for_next = true) {
|
||||
return TransformFlow<T>(std::move(value), ready_for_next);
|
||||
}
|
||||
|
||||
/// \brief A function mapping a source element of type T to a TransformFlow
/// which may yield a value of type V, skip, or finish the iteration.
template <typename T, typename V>
using Transformer = std::function<Result<TransformFlow<V>>(T)>;
|
||||
|
||||
/// \brief Iterator adapter driving a Transformer over a source iterator.
///
/// Repeatedly pumps the transformer on the current source element until it
/// yields a value, requests the next element, or finishes. The transformer
/// also observes the source's end sentinel (see Pump), so it can emit final
/// values after the source is exhausted.
template <typename T, typename V>
class TransformIterator {
 public:
  explicit TransformIterator(Iterator<T> it, Transformer<T, V> transformer)
      : it_(std::move(it)),
        transformer_(std::move(transformer)),
        last_value_(),
        finished_() {}

  /// \brief Return the next transformed element, or IterationTraits<V>::End()
  /// once the transformed iteration has finished.
  Result<V> Next() {
    while (!finished_) {
      ARROW_ASSIGN_OR_RAISE(std::optional<V> next, Pump());
      if (next.has_value()) {
        return std::move(*next);
      }
      // Pump consumed (or never had) a source element: fetch another.
      // Note the end sentinel itself is stored and later fed to Pump.
      ARROW_ASSIGN_OR_RAISE(last_value_, it_.Next());
    }
    return IterationTraits<V>::End();
  }

 private:
  // Calls the transform function on the current value. Can return in several ways
  // * If the next value is requested (e.g. skip) it will return an empty optional
  // * If an invalid status is encountered that will be returned
  // * If finished it will return IterationTraits<V>::End()
  // * If a value is returned by the transformer that will be returned
  Result<std::optional<V>> Pump() {
    if (!finished_ && last_value_.has_value()) {
      auto next_res = transformer_(*last_value_);
      if (!next_res.ok()) {
        // A failing transformer permanently finishes the iteration.
        finished_ = true;
        return next_res.status();
      }
      auto next = std::move(*next_res);
      if (next.ReadyForNext()) {
        // The source element was consumed; if it was the end sentinel,
        // the whole transformed iteration is done.
        if (IsIterationEnd(*last_value_)) {
          finished_ = true;
        }
        last_value_.reset();
      }
      if (next.Finished()) {
        finished_ = true;
      }
      if (next.HasValue()) {
        return next.Value();
      }
    }
    if (finished_) {
      return IterationTraits<V>::End();
    }
    return std::nullopt;
  }

  Iterator<T> it_;
  Transformer<T, V> transformer_;
  // Holds the source element currently exposed to the transformer; empty
  // when a new element must be pulled from it_.
  std::optional<T> last_value_;
  bool finished_ = false;
};
|
||||
|
||||
/// \brief Transforms an iterator according to a transformer, returning a new Iterator.
|
||||
///
|
||||
/// The transformer will be called on each element of the source iterator and for each
|
||||
/// call it can yield a value, skip, or finish the iteration. When yielding a value the
|
||||
/// transformer can choose to consume the source item (the default, ready_for_next = true)
|
||||
/// or to keep it and it will be called again on the same value.
|
||||
///
|
||||
/// This is essentially a more generic form of the map operation that can return 0, 1, or
|
||||
/// many values for each of the source items.
|
||||
///
|
||||
/// The transformer will be exposed to the end of the source sequence
|
||||
/// (IterationTraits::End) in case it needs to return some penultimate item(s).
|
||||
///
|
||||
/// Any invalid status returned by the transformer will be returned immediately.
|
||||
template <typename T, typename V>
|
||||
Iterator<V> MakeTransformedIterator(Iterator<T> it, Transformer<T, V> op) {
|
||||
return Iterator<V>(TransformIterator<T, V>(std::move(it), std::move(op)));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct IterationTraits<Iterator<T>> {
|
||||
// The end condition for an Iterator of Iterators is a default constructed (null)
|
||||
// Iterator.
|
||||
static Iterator<T> End() { return Iterator<T>(); }
|
||||
static bool IsEnd(const Iterator<T>& val) { return !val; }
|
||||
};
|
||||
|
||||
template <typename Fn, typename T>
|
||||
class FunctionIterator {
|
||||
public:
|
||||
explicit FunctionIterator(Fn fn) : fn_(std::move(fn)) {}
|
||||
|
||||
Result<T> Next() { return fn_(); }
|
||||
|
||||
private:
|
||||
Fn fn_;
|
||||
};
|
||||
|
||||
/// \brief Construct an Iterator which invokes a callable on Next()
|
||||
template <typename Fn,
|
||||
typename Ret = typename internal::call_traits::return_type<Fn>::ValueType>
|
||||
Iterator<Ret> MakeFunctionIterator(Fn fn) {
|
||||
return Iterator<Ret>(FunctionIterator<Fn, Ret>(std::move(fn)));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Iterator<T> MakeEmptyIterator() {
|
||||
return MakeFunctionIterator([]() -> Result<T> { return IterationTraits<T>::End(); });
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Iterator<T> MakeErrorIterator(Status s) {
|
||||
return MakeFunctionIterator([s]() -> Result<T> {
|
||||
ARROW_RETURN_NOT_OK(s);
|
||||
return IterationTraits<T>::End();
|
||||
});
|
||||
}
|
||||
|
||||
/// \brief Simple iterator which yields the elements of a std::vector
|
||||
template <typename T>
|
||||
class VectorIterator {
|
||||
public:
|
||||
explicit VectorIterator(std::vector<T> v) : elements_(std::move(v)) {}
|
||||
|
||||
Result<T> Next() {
|
||||
if (i_ == elements_.size()) {
|
||||
return IterationTraits<T>::End();
|
||||
}
|
||||
return std::move(elements_[i_++]);
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<T> elements_;
|
||||
size_t i_ = 0;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
Iterator<T> MakeVectorIterator(std::vector<T> v) {
|
||||
return Iterator<T>(VectorIterator<T>(std::move(v)));
|
||||
}
|
||||
|
||||
/// \brief Simple iterator which yields *pointers* to the elements of a std::vector<T>.
|
||||
/// This is provided to support T where IterationTraits<T>::End is not specialized
|
||||
template <typename T>
|
||||
class VectorPointingIterator {
|
||||
public:
|
||||
explicit VectorPointingIterator(std::vector<T> v) : elements_(std::move(v)) {}
|
||||
|
||||
Result<T*> Next() {
|
||||
if (i_ == elements_.size()) {
|
||||
return NULLPTR;
|
||||
}
|
||||
return &elements_[i_++];
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<T> elements_;
|
||||
size_t i_ = 0;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
Iterator<T*> MakeVectorPointingIterator(std::vector<T> v) {
|
||||
return Iterator<T*>(VectorPointingIterator<T>(std::move(v)));
|
||||
}
|
||||
|
||||
/// \brief MapIterator takes ownership of an iterator and a function to apply
|
||||
/// on every element. The mapped function is not allowed to fail.
|
||||
template <typename Fn, typename I, typename O>
|
||||
class MapIterator {
|
||||
public:
|
||||
explicit MapIterator(Fn map, Iterator<I> it)
|
||||
: map_(std::move(map)), it_(std::move(it)) {}
|
||||
|
||||
Result<O> Next() {
|
||||
ARROW_ASSIGN_OR_RAISE(I i, it_.Next());
|
||||
|
||||
if (IsIterationEnd(i)) {
|
||||
return IterationTraits<O>::End();
|
||||
}
|
||||
|
||||
return map_(std::move(i));
|
||||
}
|
||||
|
||||
private:
|
||||
Fn map_;
|
||||
Iterator<I> it_;
|
||||
};
|
||||
|
||||
/// \brief MapIterator takes ownership of an iterator and a function to apply
|
||||
/// on every element. The mapped function is not allowed to fail.
|
||||
template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
|
||||
typename To = internal::call_traits::return_type<Fn>>
|
||||
Iterator<To> MakeMapIterator(Fn map, Iterator<From> it) {
|
||||
return Iterator<To>(MapIterator<Fn, From, To>(std::move(map), std::move(it)));
|
||||
}
|
||||
|
||||
/// \brief Like MapIterator, but where the function can fail.
|
||||
template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
|
||||
typename To = typename internal::call_traits::return_type<Fn>::ValueType>
|
||||
Iterator<To> MakeMaybeMapIterator(Fn map, Iterator<From> it) {
|
||||
return Iterator<To>(MapIterator<Fn, From, To>(std::move(map), std::move(it)));
|
||||
}
|
||||
|
||||
struct FilterIterator {
|
||||
enum Action { ACCEPT, REJECT };
|
||||
|
||||
template <typename To>
|
||||
static Result<std::pair<To, Action>> Reject() {
|
||||
return std::make_pair(IterationTraits<To>::End(), REJECT);
|
||||
}
|
||||
|
||||
template <typename To>
|
||||
static Result<std::pair<To, Action>> Accept(To out) {
|
||||
return std::make_pair(std::move(out), ACCEPT);
|
||||
}
|
||||
|
||||
template <typename To>
|
||||
static Result<std::pair<To, Action>> MaybeAccept(Result<To> maybe_out) {
|
||||
return std::move(maybe_out).Map(Accept<To>);
|
||||
}
|
||||
|
||||
template <typename To>
|
||||
static Result<std::pair<To, Action>> Error(Status s) {
|
||||
return s;
|
||||
}
|
||||
|
||||
template <typename Fn, typename From, typename To>
|
||||
class Impl {
|
||||
public:
|
||||
explicit Impl(Fn filter, Iterator<From> it) : filter_(filter), it_(std::move(it)) {}
|
||||
|
||||
Result<To> Next() {
|
||||
To out = IterationTraits<To>::End();
|
||||
Action action;
|
||||
|
||||
for (;;) {
|
||||
ARROW_ASSIGN_OR_RAISE(From i, it_.Next());
|
||||
|
||||
if (IsIterationEnd(i)) {
|
||||
return IterationTraits<To>::End();
|
||||
}
|
||||
|
||||
ARROW_ASSIGN_OR_RAISE(std::tie(out, action), filter_(std::move(i)));
|
||||
|
||||
if (action == ACCEPT) return out;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
Fn filter_;
|
||||
Iterator<From> it_;
|
||||
};
|
||||
};
|
||||
|
||||
/// \brief Like MapIterator, but where the function can fail or reject elements.
|
||||
template <
|
||||
typename Fn, typename From = typename internal::call_traits::argument_type<0, Fn>,
|
||||
typename Ret = typename internal::call_traits::return_type<Fn>::ValueType,
|
||||
typename To = typename std::tuple_element<0, Ret>::type,
|
||||
typename Enable = typename std::enable_if<std::is_same<
|
||||
typename std::tuple_element<1, Ret>::type, FilterIterator::Action>::value>::type>
|
||||
Iterator<To> MakeFilterIterator(Fn filter, Iterator<From> it) {
|
||||
return Iterator<To>(
|
||||
FilterIterator::Impl<Fn, From, To>(std::move(filter), std::move(it)));
|
||||
}
|
||||
|
||||
/// \brief FlattenIterator takes an iterator generating iterators and yields a
|
||||
/// unified iterator that flattens/concatenates in a single stream.
|
||||
template <typename T>
|
||||
class FlattenIterator {
|
||||
public:
|
||||
explicit FlattenIterator(Iterator<Iterator<T>> it) : parent_(std::move(it)) {}
|
||||
|
||||
Result<T> Next() {
|
||||
if (IsIterationEnd(child_)) {
|
||||
// Pop from parent's iterator.
|
||||
ARROW_ASSIGN_OR_RAISE(child_, parent_.Next());
|
||||
|
||||
// Check if final iteration reached.
|
||||
if (IsIterationEnd(child_)) {
|
||||
return IterationTraits<T>::End();
|
||||
}
|
||||
|
||||
return Next();
|
||||
}
|
||||
|
||||
// Pop from child_ and check for depletion.
|
||||
ARROW_ASSIGN_OR_RAISE(T out, child_.Next());
|
||||
if (IsIterationEnd(out)) {
|
||||
// Reset state such that we pop from parent on the recursive call
|
||||
child_ = IterationTraits<Iterator<T>>::End();
|
||||
|
||||
return Next();
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
private:
|
||||
Iterator<Iterator<T>> parent_;
|
||||
Iterator<T> child_ = IterationTraits<Iterator<T>>::End();
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
Iterator<T> MakeFlattenIterator(Iterator<Iterator<T>> it) {
|
||||
return Iterator<T>(FlattenIterator<T>(std::move(it)));
|
||||
}
|
||||
|
||||
template <typename Reader>
|
||||
Iterator<typename Reader::ValueType> MakeIteratorFromReader(
|
||||
const std::shared_ptr<Reader>& reader) {
|
||||
return MakeFunctionIterator([reader] { return reader->Next(); });
|
||||
}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,99 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \brief A container for key-value pair type metadata. Not thread-safe
///
/// Keys and values are kept in two parallel vectors, so duplicate keys are
/// representable and insertion order is preserved.
class ARROW_EXPORT KeyValueMetadata {
 public:
  KeyValueMetadata();
  KeyValueMetadata(std::vector<std::string> keys, std::vector<std::string> values);
  explicit KeyValueMetadata(const std::unordered_map<std::string, std::string>& map);

  /// \brief Create a shared instance from parallel key/value vectors.
  static std::shared_ptr<KeyValueMetadata> Make(std::vector<std::string> keys,
                                                std::vector<std::string> values);

  /// \brief Copy all pairs into `out`. NOTE(review): with duplicate keys the
  /// map can hold only one value per key -- confirm which one wins.
  void ToUnorderedMap(std::unordered_map<std::string, std::string>* out) const;
  /// \brief Append a pair without checking for an existing key.
  void Append(std::string key, std::string value);

  /// \brief Look up a value by key; presumably returns an error Status when
  /// the key is absent (Result-wrapped).
  Result<std::string> Get(std::string_view key) const;
  bool Contains(std::string_view key) const;
  // Note that deleting may invalidate known indices
  Status Delete(std::string_view key);
  Status Delete(int64_t index);
  Status DeleteMany(std::vector<int64_t> indices);
  /// \brief Set the value for a key (unlike Append, which blindly adds).
  Status Set(std::string key, std::string value);

  /// \brief Reserve storage for n pairs.
  void reserve(int64_t n);

  /// \brief Number of key-value pairs.
  int64_t size() const;
  const std::string& key(int64_t i) const;
  const std::string& value(int64_t i) const;
  const std::vector<std::string>& keys() const { return keys_; }
  const std::vector<std::string>& values() const { return values_; }

  /// \brief All pairs, sorted (presumably by key -- confirm ordering).
  std::vector<std::pair<std::string, std::string>> sorted_pairs() const;

  /// \brief Perform linear search for key, returning -1 if not found
  int FindKey(std::string_view key) const;

  /// \brief Deep copy of this instance.
  std::shared_ptr<KeyValueMetadata> Copy() const;

  /// \brief Return a new KeyValueMetadata by combining the passed metadata
  /// with this KeyValueMetadata. Colliding keys will be overridden by the
  /// passed metadata. Assumes keys in both containers are unique
  std::shared_ptr<KeyValueMetadata> Merge(const KeyValueMetadata& other) const;

  bool Equals(const KeyValueMetadata& other) const;
  std::string ToString() const;

 private:
  // Parallel vectors: keys_[i] corresponds to values_[i].
  std::vector<std::string> keys_;
  std::vector<std::string> values_;

  ARROW_DISALLOW_COPY_AND_ASSIGN(KeyValueMetadata);
};
|
||||
|
||||
/// \brief Create a KeyValueMetadata instance
///
/// \param pairs key-value mapping
/// \return a new shared KeyValueMetadata holding the pairs
ARROW_EXPORT std::shared_ptr<KeyValueMetadata> key_value_metadata(
    const std::unordered_map<std::string, std::string>& pairs);

/// \brief Create a KeyValueMetadata instance
///
/// \param keys sequence of metadata keys
/// \param values sequence of corresponding metadata values
/// \return a new shared KeyValueMetadata holding the parallel key/value lists
ARROW_EXPORT std::shared_ptr<KeyValueMetadata> key_value_metadata(
    std::vector<std::string> keys, std::vector<std::string> values);
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,35 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <new>
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
// Use the standard library implementation when the C++17 feature-test macro
// reports it is available; otherwise fall back to a pass-through shim.
#if __cpp_lib_launder
using std::launder;
#else
// NOTE(review): this fallback is a plain identity function and does not
// provide std::launder's pointer-provenance guarantees; presumably acceptable
// for the targeted compilers lacking the feature -- confirm.
template <class T>
constexpr T* launder(T* p) noexcept {
  return p;
}
#endif
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,55 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/result.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace list_util {
|
||||
namespace internal {
|
||||
|
||||
/// \brief Calculate the smallest contiguous range of values used by the
/// var-length list-like input (list, map and list-view types).
///
/// \param input The input array such that is_var_length_list_like(input.type)
/// is true
/// \return A pair of (offset, length) describing the range
ARROW_EXPORT Result<std::pair<int64_t, int64_t>> RangeOfValuesUsed(
    const ArraySpan& input);

/// \brief Calculate the sum of the sizes of all valid lists or list-views
///
/// This is usually the same as the length of the RangeOfValuesUsed() range, but
/// it can be:
/// - Smaller: when the child array contains many values that are not
///   referenced by the lists or list-views in the parent array
/// - Greater: when the list-views share child array ranges
///
/// \param input The input array such that is_var_length_list_like(input.type)
/// is true
/// \return The sum of all list or list-view sizes
ARROW_EXPORT Result<int64_t> SumOfLogicalListSizes(const ArraySpan& input);
|
||||
|
||||
} // namespace internal
|
||||
|
||||
} // namespace list_util
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,186 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <chrono>
|
||||
#include <iosfwd>
|
||||
#include <memory>
|
||||
#include <string_view>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/logging.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace util {
|
||||
|
||||
/// \brief A source file/line pair identifying where a log statement appears.
struct SourceLocation {
  const char* file = "";
  int line = 0;
};
|
||||
|
||||
/// \brief The payload handed to Logger::Log for a single log record.
struct LogDetails {
  ArrowLogLevel severity = ArrowLogLevel::ARROW_INFO;
  // Captured at construction time by default.
  std::chrono::system_clock::time_point timestamp = std::chrono::system_clock::now();
  SourceLocation source_location{};
  // NOTE(review): message is a non-owning view; presumably only valid for
  // the duration of the Log() call -- loggers should copy if they retain it.
  std::string_view message = "";
};
|
||||
|
||||
/// \brief A base interface for custom loggers.
///
/// Loggers can be added to the LoggerRegistry for global access or directly provided to
/// certain logging utilities.
class Logger {
 public:
  virtual ~Logger() = default;

  /// \brief Emit a single log record.
  virtual void Log(const LogDetails& details) = 0;

  /// \brief Flush buffered output, waiting at most `timeout`; the default
  /// implementation is a no-op reporting success.
  virtual bool Flush(std::chrono::microseconds timeout) { return true; }
  /// \brief Flush with an effectively unbounded timeout.
  bool Flush() { return this->Flush(std::chrono::microseconds::max()); }

  /// \brief Whether this logger is active; defaults to true.
  virtual bool is_enabled() const { return true; }

  /// \brief Minimum severity this logger cares about; defaults to the lowest
  /// level (ARROW_TRACE), i.e. everything.
  virtual ArrowLogLevel severity_threshold() const { return ArrowLogLevel::ARROW_TRACE; }
};
|
||||
|
||||
/// \brief Creates a simple logger that redirects output to std::cerr
ARROW_EXPORT std::shared_ptr<Logger> MakeOStreamLogger(ArrowLogLevel severity_threshold);
/// \brief Creates a simple logger that redirects output to the provided ostream
///
/// NOTE(review): the caller must keep `sink` alive for the logger's lifetime.
ARROW_EXPORT std::shared_ptr<Logger> MakeOStreamLogger(ArrowLogLevel severity_threshold,
                                                       std::ostream& sink);
|
||||
|
||||
/// \brief A process-wide registry mapping names to Logger instances.
class ARROW_EXPORT LoggerRegistry {
 public:
  /// \brief Add a logger to the registry with the associated name
  ///
  /// Returns Invalid if a logger with the provided name already exists. Users should call
  /// `UnregisterLogger` first if they wish to overwrite it.
  static Status RegisterLogger(std::string_view name, std::shared_ptr<Logger> logger);

  /// \brief Remove a logger from the registry
  static void UnregisterLogger(std::string_view name);

  /// \brief Return the logger associated with the provided name
  ///
  /// If `name` is empty, the default logger is returned. If `name` doesn't match any of
  /// the registered loggers then a non-null noop logger is returned
  static std::shared_ptr<Logger> GetLogger(std::string_view name = "");

  /// \brief Return the default logger
  static std::shared_ptr<Logger> GetDefaultLogger();
  /// \brief Set the default logger
  static void SetDefaultLogger(std::shared_ptr<Logger> logger);
};
|
||||
|
||||
/// \brief Represents a single log record to be emitted by an underlying logger
class ARROW_EXPORT LogMessage {
 public:
  /// \brief Construct a LogMessage with the provided underlying logger
  LogMessage(ArrowLogLevel severity, std::shared_ptr<Logger> logger,
             SourceLocation source_location = {});
  /// \brief Construct a LogMessage with the provided logger name, which will be used to
  /// find an underlying logger in the registry
  LogMessage(ArrowLogLevel severity, std::string_view logger_name,
             SourceLocation source_location = {});

  /// \brief Stream into which the message text is accumulated.
  std::ostream& Stream();

  // Convenience method - mainly for use in ARROW_LOG_* macros. This prevents unnecessary
  // argument evaluation when log statements are stripped in certain builds
  template <typename... Args>
  LogMessage& Append(Args&&... args) {
    if constexpr (sizeof...(Args) > 0) {
      // Only format the arguments when the record would actually be emitted.
      if (CheckIsEnabled()) {
        (Stream() << ... << args);
      }
    }
    return *this;
  }

 private:
  // Presumably returns true when the underlying logger accepts this message's
  // severity - see the implementation for the exact criteria.
  bool CheckIsEnabled();

  // Pimpl keeps the formatting/emission machinery out of this public header.
  class Impl;
  std::shared_ptr<Impl> impl_;
};
|
||||
|
||||
} // namespace util
|
||||
} // namespace arrow
|
||||
|
||||
// For the following macros, log statements with a lower severity than
// `ARROW_MINIMUM_LOG_LEVEL` will be stripped from the build
#ifndef ARROW_MINIMUM_LOG_LEVEL
#  define ARROW_MINIMUM_LOG_LEVEL -1000
#endif

// Builds a LogMessage for LOGGER (a std::shared_ptr<Logger> or a registry
// name) at severity ARROW_##LEVEL, capturing the call site.
#define ARROW_LOGGER_INTERNAL(LOGGER, LEVEL)                                      \
  (::arrow::util::LogMessage(::arrow::util::ArrowLogLevel::ARROW_##LEVEL, LOGGER, \
                             ::arrow::util::SourceLocation{__FILE__, __LINE__}))

// Each static_assert below pins the numeric enum value that the following
// preprocessor comparison relies on (the preprocessor cannot read the enum).
static_assert(static_cast<int>(::arrow::util::ArrowLogLevel::ARROW_TRACE) == -2);
#if ARROW_MINIMUM_LOG_LEVEL <= -2
#  define ARROW_LOGGER_TRACE(LOGGER, ...) \
    (ARROW_LOGGER_INTERNAL(LOGGER, TRACE).Append(__VA_ARGS__))
#else
#  define ARROW_LOGGER_TRACE(...) ARROW_UNUSED(0)
#endif

static_assert(static_cast<int>(::arrow::util::ArrowLogLevel::ARROW_DEBUG) == -1);
#if ARROW_MINIMUM_LOG_LEVEL <= -1
#  define ARROW_LOGGER_DEBUG(LOGGER, ...) \
    (ARROW_LOGGER_INTERNAL(LOGGER, DEBUG).Append(__VA_ARGS__))
#else
#  define ARROW_LOGGER_DEBUG(...) ARROW_UNUSED(0)
#endif

static_assert(static_cast<int>(::arrow::util::ArrowLogLevel::ARROW_INFO) == 0);
#if ARROW_MINIMUM_LOG_LEVEL <= 0
#  define ARROW_LOGGER_INFO(LOGGER, ...) \
    (ARROW_LOGGER_INTERNAL(LOGGER, INFO).Append(__VA_ARGS__))
#else
#  define ARROW_LOGGER_INFO(...) ARROW_UNUSED(0)
#endif

static_assert(static_cast<int>(::arrow::util::ArrowLogLevel::ARROW_WARNING) == 1);
#if ARROW_MINIMUM_LOG_LEVEL <= 1
#  define ARROW_LOGGER_WARNING(LOGGER, ...) \
    (ARROW_LOGGER_INTERNAL(LOGGER, WARNING).Append(__VA_ARGS__))
#else
#  define ARROW_LOGGER_WARNING(...) ARROW_UNUSED(0)
#endif

static_assert(static_cast<int>(::arrow::util::ArrowLogLevel::ARROW_ERROR) == 2);
#if ARROW_MINIMUM_LOG_LEVEL <= 2
#  define ARROW_LOGGER_ERROR(LOGGER, ...) \
    (ARROW_LOGGER_INTERNAL(LOGGER, ERROR).Append(__VA_ARGS__))
#else
#  define ARROW_LOGGER_ERROR(...) ARROW_UNUSED(0)
#endif

static_assert(static_cast<int>(::arrow::util::ArrowLogLevel::ARROW_FATAL) == 3);
#if ARROW_MINIMUM_LOG_LEVEL <= 3
#  define ARROW_LOGGER_FATAL(LOGGER, ...) \
    (ARROW_LOGGER_INTERNAL(LOGGER, FATAL).Append(__VA_ARGS__))
#else
#  define ARROW_LOGGER_FATAL(...) ARROW_UNUSED(0)
#endif

// Dispatch on a severity name passed as a macro argument, e.g.
// ARROW_LOGGER_CALL(logger, INFO, "message").
#define ARROW_LOGGER_CALL(LOGGER, LEVEL, ...) ARROW_LOGGER_##LEVEL(LOGGER, __VA_ARGS__)
|
||||
@@ -0,0 +1,251 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef GANDIVA_IR

// The LLVM IR code doesn't have an NDEBUG mode. And, it shouldn't include references to
// streams or stdc++. So, making the DCHECK calls void in that case.

// Evaluate and discard an expression (silences unused-result warnings).
#  define ARROW_IGNORE_EXPR(expr) ((void)(expr))

// Under GANDIVA_IR the DCHECK variants still evaluate their first argument
// (preserving any side effects) but perform no actual checking; note the
// second operand of the comparison forms is never evaluated.
#  define ARROW_DCHECK(condition) ARROW_IGNORE_EXPR(condition)
#  define ARROW_DCHECK_OK(status) ARROW_IGNORE_EXPR(status)
#  define ARROW_DCHECK_EQ(val1, val2) ARROW_IGNORE_EXPR(val1)
#  define ARROW_DCHECK_NE(val1, val2) ARROW_IGNORE_EXPR(val1)
#  define ARROW_DCHECK_LE(val1, val2) ARROW_IGNORE_EXPR(val1)
#  define ARROW_DCHECK_LT(val1, val2) ARROW_IGNORE_EXPR(val1)
#  define ARROW_DCHECK_GE(val1, val2) ARROW_IGNORE_EXPR(val1)
#  define ARROW_DCHECK_GT(val1, val2) ARROW_IGNORE_EXPR(val1)
|
||||
|
||||
#else // !GANDIVA_IR
|
||||
|
||||
# include <memory>
|
||||
# include <ostream>
|
||||
# include <string>
|
||||
|
||||
# include "arrow/util/macros.h"
|
||||
# include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace util {
|
||||
|
||||
/// \brief Log severities, ordered from most verbose to most severe.
///
/// The explicit numeric values are relied upon by preprocessor-level
/// stripping (see ARROW_MINIMUM_LOG_LEVEL and the accompanying
/// static_asserts) and must not be changed.
enum class ArrowLogLevel : int {
  ARROW_TRACE = -2,
  ARROW_DEBUG = -1,
  ARROW_INFO = 0,
  ARROW_WARNING = 1,
  ARROW_ERROR = 2,
  ARROW_FATAL = 3
};
|
||||
|
||||
// Builds an ArrowLog that records the current source location.
#  define ARROW_LOG_INTERNAL(level) ::arrow::util::ArrowLog(__FILE__, __LINE__, level)
// Usage: ARROW_LOG(INFO) << "message";
#  define ARROW_LOG(level) ARROW_LOG_INTERNAL(::arrow::util::ArrowLogLevel::ARROW_##level)

// Evaluate and discard an expression (silences unused-result warnings).
#  define ARROW_IGNORE_EXPR(expr) ((void)(expr))

// Expands to a streamable check: nothing happens when `condition` holds;
// otherwise a log at `level` is emitted with the failed condition text plus
// anything `<<`-appended by the caller. Voidify::operator& gives both
// branches of the ?: the same type (void).
#  define ARROW_CHECK_OR_LOG(condition, level) \
    ARROW_PREDICT_TRUE(condition) \
    ? ARROW_IGNORE_EXPR(0) \
    : ::arrow::util::Voidify() & ARROW_LOG(level) << " Check failed: " #condition " "

#  define ARROW_CHECK(condition) ARROW_CHECK_OR_LOG(condition, FATAL)

// If 'to_call' returns a bad status, CHECK immediately with a logged message
// of 'msg' followed by the status.
#  define ARROW_CHECK_OK_PREPEND(to_call, msg, level)                  \
    do {                                                               \
      ::arrow::Status _s = ::arrow::ToStatus(to_call);                 \
      ARROW_CHECK_OR_LOG(_s.ok(), level)                               \
          << "Operation failed: " << ARROW_STRINGIFY(to_call) << "\n"  \
          << (msg) << ": " << _s.ToString();                           \
    } while (false)

// If the status is bad, CHECK immediately, appending the status to the
// logged message.
#  define ARROW_CHECK_OK(s) ARROW_CHECK_OK_PREPEND(s, "Bad status", FATAL)

// Comparison checks: both operands are always evaluated; failure is fatal.
#  define ARROW_CHECK_EQ(val1, val2) ARROW_CHECK((val1) == (val2))
#  define ARROW_CHECK_NE(val1, val2) ARROW_CHECK((val1) != (val2))
#  define ARROW_CHECK_LE(val1, val2) ARROW_CHECK((val1) <= (val2))
#  define ARROW_CHECK_LT(val1, val2) ARROW_CHECK((val1) < (val2))
#  define ARROW_CHECK_GE(val1, val2) ARROW_CHECK((val1) >= (val2))
#  define ARROW_CHECK_GT(val1, val2) ARROW_CHECK((val1) > (val2))
|
||||
|
||||
#  ifdef NDEBUG
// In release builds DFATAL only warns instead of aborting.
#    define ARROW_DFATAL ::arrow::util::ArrowLogLevel::ARROW_WARNING

// CAUTION: DCHECK_OK() always evaluates its argument, but other DCHECK*() macros
// only do so in debug mode.

// Each release-mode DCHECK places its operands inside `while (false)` so they
// are still type-checked but never evaluated, and swallows any streamed
// output with a NullLog.
#    define ARROW_DCHECK(condition) \
      while (false) ARROW_IGNORE_EXPR(condition); \
      while (false) ::arrow::util::detail::NullLog()
#    define ARROW_DCHECK_OK(s) \
      ARROW_IGNORE_EXPR(s); \
      while (false) ::arrow::util::detail::NullLog()
#    define ARROW_DCHECK_EQ(val1, val2) \
      while (false) ARROW_IGNORE_EXPR(val1); \
      while (false) ARROW_IGNORE_EXPR(val2); \
      while (false) ::arrow::util::detail::NullLog()
#    define ARROW_DCHECK_NE(val1, val2) \
      while (false) ARROW_IGNORE_EXPR(val1); \
      while (false) ARROW_IGNORE_EXPR(val2); \
      while (false) ::arrow::util::detail::NullLog()
#    define ARROW_DCHECK_LE(val1, val2) \
      while (false) ARROW_IGNORE_EXPR(val1); \
      while (false) ARROW_IGNORE_EXPR(val2); \
      while (false) ::arrow::util::detail::NullLog()
#    define ARROW_DCHECK_LT(val1, val2) \
      while (false) ARROW_IGNORE_EXPR(val1); \
      while (false) ARROW_IGNORE_EXPR(val2); \
      while (false) ::arrow::util::detail::NullLog()
#    define ARROW_DCHECK_GE(val1, val2) \
      while (false) ARROW_IGNORE_EXPR(val1); \
      while (false) ARROW_IGNORE_EXPR(val2); \
      while (false) ::arrow::util::detail::NullLog()
#    define ARROW_DCHECK_GT(val1, val2) \
      while (false) ARROW_IGNORE_EXPR(val1); \
      while (false) ARROW_IGNORE_EXPR(val2); \
      while (false) ::arrow::util::detail::NullLog()

#  else
// In debug builds DCHECKs are real CHECKs and DFATAL aborts.
#    define ARROW_DFATAL ::arrow::util::ArrowLogLevel::ARROW_FATAL

#    define ARROW_DCHECK ARROW_CHECK
#    define ARROW_DCHECK_OK ARROW_CHECK_OK
#    define ARROW_DCHECK_EQ ARROW_CHECK_EQ
#    define ARROW_DCHECK_NE ARROW_CHECK_NE
#    define ARROW_DCHECK_LE ARROW_CHECK_LE
#    define ARROW_DCHECK_LT ARROW_CHECK_LT
#    define ARROW_DCHECK_GE ARROW_CHECK_GE
#    define ARROW_DCHECK_GT ARROW_CHECK_GT

#  endif  // NDEBUG
|
||||
|
||||
// This code is adapted from
// https://github.com/ray-project/ray/blob/master/src/ray/util/logging.h.

// To make the logging lib pluggable with other logging libs and keep the
// implementation hidden from the user, ArrowLog is only a declaration;
// the implementation lives in logging.cc.
// In logging.cc, we can choose different log libs using different macros.

// This is also a null log which does not output anything.
class ARROW_EXPORT ArrowLogBase {
 public:
  virtual ~ArrowLogBase() {}

  /// \brief Whether streamed output will actually be emitted.
  ///
  /// The base implementation always returns false, which makes ArrowLogBase
  /// itself a null sink.
  virtual bool IsEnabled() const { return false; }

  /// \brief Stream `t` into the underlying sink, but only when enabled.
  template <typename T>
  ArrowLogBase& operator<<(const T& t) {
    if (IsEnabled()) {
      Stream() << t;
    }
    return *this;
  }

 protected:
  // The concrete output stream; only called when IsEnabled() returns true.
  virtual std::ostream& Stream() = 0;
};
|
||||
|
||||
// A log record bound to a severity and source location; presumably emitted
// by the destructor - see logging.cc for the provider-specific behavior.
class ARROW_EXPORT ArrowLog : public ArrowLogBase {
 public:
  ArrowLog(const char* file_name, int line_number, ArrowLogLevel severity);
  ~ArrowLog() override;

  /// Return whether or not current logging instance is enabled.
  ///
  /// \return True if logging is enabled and false otherwise.
  bool IsEnabled() const override;

  /// The init function of arrow log for a program which should be called only once.
  ///
  /// \param appName The app name which starts the log.
  /// \param severity_threshold Logging threshold for the program.
  /// \param logDir Logging output file name. If empty, the log won't output to file.
  static void StartArrowLog(const std::string& appName,
                            ArrowLogLevel severity_threshold = ArrowLogLevel::ARROW_INFO,
                            const std::string& logDir = "");

  /// The shutdown function of arrow log, it should be used with StartArrowLog as a pair.
  static void ShutDownArrowLog();

  /// Install the failure signal handler to output call stack when crash.
  /// If glog is not installed, this function won't do anything.
  static void InstallFailureSignalHandler();

  /// Uninstall the signal actions installed by InstallFailureSignalHandler.
  static void UninstallSignalAction();

  /// Return whether or not the log level is enabled in current setting.
  ///
  /// \param log_level The input log level to test.
  /// \return True if input log level is not lower than the threshold.
  static bool IsLevelEnabled(ArrowLogLevel log_level);

 private:
  ARROW_DISALLOW_COPY_AND_ASSIGN(ArrowLog);

  // Hide the implementation of log provider by void *.
  // Otherwise, lib user may define the same macro to use the correct header file.
  void* logging_provider_;
  /// True if log messages should be logged and false if they should be ignored.
  bool is_enabled_;

  // Process-wide severity threshold shared by all ArrowLog instances.
  static ArrowLogLevel severity_threshold_;

 protected:
  std::ostream& Stream() override;
};
|
||||
|
||||
// Converts a streamed ArrowLogBase expression to void so that both branches
// of the ?: in ARROW_CHECK_OR_LOG have the same type.
// This class is copied from glog.
class ARROW_EXPORT Voidify {
 public:
  Voidify() {}
  // This has to be an operator with a precedence lower than << but
  // higher than ?:
  void operator&(ArrowLogBase&) {}
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
/// @brief A helper for the nil log sink.
///
/// Using this helper is analogous to sending log messages to /dev/null:
/// nothing gets logged.
class NullLog {
 public:
  /// The no-op output operator.
  ///
  /// The parameter is intentionally unnamed: it is never used, and naming it
  /// would trigger -Wunused-parameter warnings in pedantic builds.
  ///
  /// @return Reference to this object, so chained `<<` still compiles.
  template <class T>
  NullLog& operator<<(const T&) {
    return *this;
  }
};
|
||||
|
||||
} // namespace detail
|
||||
} // namespace util
|
||||
} // namespace arrow
|
||||
|
||||
#endif // GANDIVA_IR
|
||||
@@ -0,0 +1,252 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
// ARROW_EXPAND forces an extra macro-expansion pass; ARROW_STRINGIFY turns
// its argument into a string literal; ARROW_CONCAT pastes two tokens.
#define ARROW_EXPAND(x) x
#define ARROW_STRINGIFY(x) #x
#define ARROW_CONCAT(x, y) x##y
||||
|
||||
// From Google gutil
#ifndef ARROW_DISALLOW_COPY_AND_ASSIGN
// Deletes the copy constructor and copy assignment of TypeName.
#  define ARROW_DISALLOW_COPY_AND_ASSIGN(TypeName) \
    TypeName(const TypeName&) = delete;            \
    void operator=(const TypeName&) = delete
#endif

#ifndef ARROW_DEFAULT_MOVE_AND_ASSIGN
// Explicitly defaults the move constructor and move assignment of TypeName
// (needed when another special member suppresses the implicit ones).
#  define ARROW_DEFAULT_MOVE_AND_ASSIGN(TypeName) \
    TypeName(TypeName&&) = default;               \
    TypeName& operator=(TypeName&&) = default
#endif
|
||||
|
||||
// With ARROW_PREDICT_FALSE, GCC and clang can be told that a certain branch is
|
||||
// not likely to be taken (for instance, a CHECK failure), and use that information in
|
||||
// static analysis. Giving the compiler this information can affect the generated code
|
||||
// layout in the absence of better information (i.e. -fprofile-arcs). [1] explains how
|
||||
// this feature can be used to improve code generation. It was written as a positive
|
||||
// comment to a negative article about the use of these annotations.
|
||||
//
|
||||
// ARROW_COMPILER_ASSUME allows the compiler to assume that a given expression is
|
||||
// true, without evaluating it, and to optimise based on this assumption [2]. If this
|
||||
// condition is violated at runtime, the behavior is undefined. This can be useful to
|
||||
// generate both faster and smaller code in compute kernels.
|
||||
//
|
||||
// IMPORTANT: Different optimisers are likely to react differently to this annotation!
|
||||
// It should be used with care when we can prove by some means that the assumption
|
||||
// is (1) guaranteed to always hold and (2) is useful for optimization [3]. If the
|
||||
// assumption is pessimistic, it might even block the compiler from decisions that
|
||||
// could lead to better code [4]. If you have a good intuition for what the compiler
|
||||
// can do with assumptions [5], you can use this macro to guide it and end up with
|
||||
// results you would only get with more complex code transformations.
|
||||
// `clang -S -emit-llvm` can be used to check how the generated code changes with
|
||||
// your specific use of this macro.
|
||||
//
|
||||
// [1] https://lobste.rs/s/uwgtkt/don_t_use_likely_unlikely_attributes#c_xi3wmc
|
||||
// [2] "Portable assumptions"
|
||||
// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p1774r4.pdf
|
||||
// [3] "Assertions Are Pessimistic, Assumptions Are Optimistic"
|
||||
// https://blog.regehr.org/archives/1096
|
||||
// [4] https://discourse.llvm.org/t/llvm-assume-blocks-optimization/71609
|
||||
// [5] J. Doerfert et al. 2019. "Performance Exploration Through Optimistic Static
|
||||
// Program Annotations". https://github.com/jdoerfert/PETOSPA/blob/master/ISC19.pdf
|
||||
// Evaluate and discard a value (silences unused-variable/result warnings).
#define ARROW_UNUSED(x) (void)(x)
// Keeps parameter names visible for documentation builds only.
#ifdef ARROW_WARN_DOCUMENTATION
#  define ARROW_ARG_UNUSED(x) x
#else
#  define ARROW_ARG_UNUSED(x)
#endif
#if defined(__GNUC__)  // GCC and compatible compilers (clang, Intel ICC)
#  define ARROW_NORETURN __attribute__((noreturn))
#  define ARROW_NOINLINE __attribute__((noinline))
#  define ARROW_FORCE_INLINE __attribute__((always_inline))
#  define ARROW_PREDICT_FALSE(x) (__builtin_expect(!!(x), 0))
#  define ARROW_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
#  define ARROW_RESTRICT __restrict
#  if defined(__clang__)  // clang-specific
#    define ARROW_COMPILER_ASSUME(expr) __builtin_assume(expr)
#  else  // GCC-specific
#    if __GNUC__ >= 13
#      define ARROW_COMPILER_ASSUME(expr) __attribute__((assume(expr)))
#    else
// GCC does not have a built-in assume intrinsic before GCC 13, so we use an
// if statement and __builtin_unreachable() to achieve the same effect [2].
// Unlike clang's __builtin_assume and C++23's [[assume(expr)]], using this
// on GCC won't warn about side-effects in the expression, so make sure expr
// is side-effect free when working with GCC versions before 13 (Jan-2024),
// otherwise clang/MSVC builds will fail in CI.
#      define ARROW_COMPILER_ASSUME(expr) \
        if (expr) {                       \
        } else {                          \
          __builtin_unreachable();        \
        }
#    endif  // __GNUC__ >= 13
#  endif
#elif defined(_MSC_VER)  // MSVC
#  define ARROW_NORETURN __declspec(noreturn)
#  define ARROW_NOINLINE __declspec(noinline)
#  define ARROW_FORCE_INLINE __forceinline
// MSVC has no __builtin_expect; branch hints degrade to plain expressions.
#  define ARROW_PREDICT_FALSE(x) (x)
#  define ARROW_PREDICT_TRUE(x) (x)
#  define ARROW_RESTRICT __restrict
#  define ARROW_COMPILER_ASSUME(expr) __assume(expr)
#else
// Unknown compiler: all annotations become no-ops.
#  define ARROW_NORETURN
#  define ARROW_NOINLINE
#  define ARROW_FORCE_INLINE
#  define ARROW_PREDICT_FALSE(x) (x)
#  define ARROW_PREDICT_TRUE(x) (x)
#  define ARROW_RESTRICT
#  define ARROW_COMPILER_ASSUME(expr)
#endif
|
||||
|
||||
// ----------------------------------------------------------------------
// C++/CLI support macros (see ARROW-1134)

#ifndef NULLPTR

#  ifdef __cplusplus_cli
// C++/CLI spells the native null pointer literal __nullptr.
#    define NULLPTR __nullptr
#  else
#    define NULLPTR nullptr
#  endif

#endif  // ifndef NULLPTR
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
// clang-format off
// This macro takes an optional deprecation message.
// Coverity mis-parses the attribute, so it is disabled for that analyzer.
#ifdef __COVERITY__
#  define ARROW_DEPRECATED(...)
#else
#  define ARROW_DEPRECATED(...) [[deprecated(__VA_ARGS__)]]
#endif

#ifdef __COVERITY__
#  define ARROW_DEPRECATED_ENUM_VALUE(...)
#else
#  define ARROW_DEPRECATED_ENUM_VALUE(...) [[deprecated(__VA_ARGS__)]]
#endif

// clang-format on

// Macros to disable deprecation warnings

#ifdef __clang__
#  define ARROW_SUPPRESS_DEPRECATION_WARNING \
    _Pragma("clang diagnostic push");        \
    _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"")
#  define ARROW_UNSUPPRESS_DEPRECATION_WARNING _Pragma("clang diagnostic pop")
#elif defined(__GNUC__)
#  define ARROW_SUPPRESS_DEPRECATION_WARNING \
    _Pragma("GCC diagnostic push");          \
    _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
#  define ARROW_UNSUPPRESS_DEPRECATION_WARNING _Pragma("GCC diagnostic pop")
#elif defined(_MSC_VER)
// C4996 is MSVC's "deprecated declaration" warning.
#  define ARROW_SUPPRESS_DEPRECATION_WARNING \
    __pragma(warning(push)) __pragma(warning(disable : 4996))
#  define ARROW_UNSUPPRESS_DEPRECATION_WARNING __pragma(warning(pop))
#else
#  define ARROW_SUPPRESS_DEPRECATION_WARNING
#  define ARROW_UNSUPPRESS_DEPRECATION_WARNING
#endif
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
// Macros to disable warnings about undeclared global functions
#if defined(__GNUC__)
#  define ARROW_SUPPRESS_MISSING_DECLARATIONS_WARNING \
    _Pragma("GCC diagnostic push");                   \
    _Pragma("GCC diagnostic ignored \"-Wmissing-declarations\"")
#  define ARROW_UNSUPPRESS_MISSING_DECLARATIONS_WARNING _Pragma("GCC diagnostic pop")
#else
// Only GCC-compatible compilers emit -Wmissing-declarations; no-op elsewhere.
#  define ARROW_SUPPRESS_MISSING_DECLARATIONS_WARNING
#  define ARROW_UNSUPPRESS_MISSING_DECLARATIONS_WARNING
#endif
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
// macros to disable padding
// these macros are portable across different compilers and platforms
//[https://github.com/google/flatbuffers/blob/master/include/flatbuffers/flatbuffers.h#L1355]
#if !defined(MANUALLY_ALIGNED_STRUCT)
#  if defined(_MSC_VER)
// Packs members to 1 byte while forcing the struct's overall alignment.
#    define MANUALLY_ALIGNED_STRUCT(alignment) \
      __pragma(pack(1));                       \
      struct __declspec(align(alignment))
// Restores default packing and verifies the expected struct size.
#    define STRUCT_END(name, size) \
      __pragma(pack());            \
      static_assert(sizeof(name) == size, "compiler breaks packing rules")
#  elif defined(__GNUC__) || defined(__clang__)
#    define MANUALLY_ALIGNED_STRUCT(alignment) \
      _Pragma("pack(1)") struct __attribute__((aligned(alignment)))
#    define STRUCT_END(name, size)                        \
      _Pragma("pack()") static_assert(sizeof(name) == size, \
                                      "compiler breaks packing rules")
#  else
#    error Unknown compiler, please define structure alignment macros
#  endif
#endif  // !defined(MANUALLY_ALIGNED_STRUCT)
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Convenience macro disabling a particular UBSan check in a function

#if defined(__clang__)
// `feature` is a sanitizer check name string, e.g. "integer".
#  define ARROW_DISABLE_UBSAN(feature) __attribute__((no_sanitize(feature)))
#else
#  define ARROW_DISABLE_UBSAN(feature)
#endif
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Machine information

// Derive pointer width (in bits) from <cstdint> limits.
#if INTPTR_MAX == INT64_MAX
#  define ARROW_BITNESS 64
#elif INTPTR_MAX == INT32_MAX
#  define ARROW_BITNESS 32
#else
#  error Unexpected INTPTR_MAX
#endif
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// From googletest
|
||||
// (also in parquet-cpp)
|
||||
|
||||
// When you need to test the private or protected members of a class,
|
||||
// use the FRIEND_TEST macro to declare your tests as friends of the
|
||||
// class. For example:
|
||||
//
|
||||
// class MyClass {
|
||||
// private:
|
||||
// void MyMethod();
|
||||
// FRIEND_TEST(MyClassTest, MyMethod);
|
||||
// };
|
||||
//
|
||||
// class MyClassTest : public testing::Test {
|
||||
// // ...
|
||||
// };
|
||||
//
|
||||
// TEST_F(MyClassTest, MyMethod) {
|
||||
// // Can call MyClass::MyMethod() here.
|
||||
// }
|
||||
|
||||
// Grants the googletest test (test_case_name, test_name) friend access to the
// enclosing class; see the usage example above.
#define FRIEND_TEST(test_case_name, test_name) \
  friend class test_case_name##_##test_name##_Test
|
||||
@@ -0,0 +1,32 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cmath>
|
||||
|
||||
// Not provided by default in MSVC,
// and _USE_MATH_DEFINES is not reliable with unity builds
#ifndef M_PI
#  define M_PI 3.14159265358979323846
#endif
// pi / 2
#ifndef M_PI_2
#  define M_PI_2 1.57079632679489661923
#endif
// pi / 4
#ifndef M_PI_4
#  define M_PI_4 0.785398163397448309616
#endif
|
||||
@@ -0,0 +1,85 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace util {
|
||||
|
||||
/// A wrapper around std::mutex since we can't use it directly in
/// public headers due to C++/CLI.
/// https://docs.microsoft.com/en-us/cpp/standard-library/mutex#remarks
class ARROW_EXPORT Mutex {
 public:
  Mutex();
  Mutex(Mutex&&) = default;
  Mutex& operator=(Mutex&&) = default;

  /// A Guard is falsy if a lock could not be acquired.
  class ARROW_EXPORT Guard {
   public:
    /// Construct an empty (falsy) Guard holding no lock.
    Guard() : locked_(NULLPTR, [](Mutex* mutex) {}) {}
    Guard(Guard&&) = default;
    Guard& operator=(Guard&&) = default;

    /// True when this Guard holds the lock.
    explicit operator bool() const { return bool(locked_); }

    /// Release the lock before the Guard is destroyed.
    void Unlock() { locked_.reset(); }

   private:
    explicit Guard(Mutex* locked);

    // Holds the locked Mutex with a deleter that presumably unlocks it;
    // the default constructor's no-op deleter means "no lock held".
    std::unique_ptr<Mutex, void (*)(Mutex*)> locked_;
    friend Mutex;
  };

  /// Attempt to acquire the lock without blocking; the Guard is falsy on failure.
  Guard TryLock();
  /// Acquire the lock; the returned Guard releases it when destroyed.
  Guard Lock();

 private:
  // Pimpl keeps <mutex> out of this public header (see C++/CLI note above).
  struct Impl;
  std::unique_ptr<Impl, void (*)(Impl*)> impl_;
};
|
||||
|
||||
#ifndef _WIN32
/// Return a pointer to a process-wide, process-specific Mutex that can be used
/// at any point in a child process. NULL is returned when called in the parent.
///
/// The rule is to first check that getpid() corresponds to the parent process pid
/// and, if not, call this function to lock any after-fork reinitialization code.
/// Like this:
///
///   std::atomic<pid_t> pid{getpid()};
///   ...
///   if (pid.load() != getpid()) {
///     // In child process
///     auto lock = GlobalForkSafeMutex()->Lock();
///     if (pid.load() != getpid()) {
///       // Reinitialize internal structures after fork
///       ...
///       pid.store(getpid());
///     }
///   }
ARROW_EXPORT
Mutex* GlobalForkSafeMutex();
#endif
|
||||
|
||||
} // namespace util
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,104 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/functional.h"
|
||||
#include "arrow/util/thread_pool.h"
|
||||
#include "arrow/util/vector.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
// A parallelizer that takes a `Status(int)` function and calls it with
// arguments between 0 and `num_tasks - 1`, on an arbitrary number of threads.

template <class FUNCTION>
Status ParallelFor(int num_tasks, FUNCTION&& func,
                   Executor* executor = internal::GetCpuThreadPool()) {
  std::vector<Future<>> futures(num_tasks);

  // Submit every task up front; each Submit takes its own copy of `func`.
  for (int i = 0; i < num_tasks; ++i) {
    ARROW_ASSIGN_OR_RAISE(futures[i], executor->Submit(func, i));
  }
  // Wait on every future - even after a failure - so no task is left running,
  // combining the statuses with Status::operator&=.
  auto st = Status::OK();
  for (auto& fut : futures) {
    st &= fut.status();
  }
  return st;
}
|
||||
|
||||
template <class FUNCTION, typename T,
|
||||
typename R = typename internal::call_traits::return_type<FUNCTION>::ValueType>
|
||||
Future<std::vector<R>> ParallelForAsync(std::vector<T> inputs, FUNCTION&& func,
|
||||
Executor* executor = internal::GetCpuThreadPool(),
|
||||
TaskHints hints = TaskHints{}) {
|
||||
std::vector<Future<R>> futures(inputs.size());
|
||||
for (size_t i = 0; i < inputs.size(); ++i) {
|
||||
ARROW_ASSIGN_OR_RAISE(futures[i],
|
||||
executor->Submit(hints, func, i, std::move(inputs[i])));
|
||||
}
|
||||
return All(std::move(futures))
|
||||
.Then([](const std::vector<Result<R>>& results) -> Result<std::vector<R>> {
|
||||
return UnwrapOrRaise(results);
|
||||
});
|
||||
}
|
||||
|
||||
// A parallelizer that takes a `Status(int)` function and calls it with
|
||||
// arguments between 0 and `num_tasks - 1`, in sequence or in parallel,
|
||||
// depending on the input boolean.
|
||||
|
||||
template <class FUNCTION>
|
||||
Status OptionalParallelFor(bool use_threads, int num_tasks, FUNCTION&& func,
|
||||
Executor* executor = internal::GetCpuThreadPool()) {
|
||||
if (use_threads) {
|
||||
return ParallelFor(num_tasks, std::forward<FUNCTION>(func), executor);
|
||||
} else {
|
||||
for (int i = 0; i < num_tasks; ++i) {
|
||||
RETURN_NOT_OK(func(i));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
}
|
||||
|
||||
// A parallelizer that takes a `Result<R>(int index, T item)` function and
|
||||
// calls it with each item from the input array, in sequence or in parallel,
|
||||
// depending on the input boolean.
|
||||
|
||||
template <class FUNCTION, typename T,
|
||||
typename R = typename internal::call_traits::return_type<FUNCTION>::ValueType>
|
||||
Future<std::vector<R>> OptionalParallelForAsync(
|
||||
bool use_threads, std::vector<T> inputs, FUNCTION&& func,
|
||||
Executor* executor = internal::GetCpuThreadPool(), TaskHints hints = TaskHints{}) {
|
||||
if (use_threads) {
|
||||
return ParallelForAsync(std::move(inputs), std::forward<FUNCTION>(func), executor,
|
||||
hints);
|
||||
} else {
|
||||
std::vector<R> result(inputs.size());
|
||||
for (size_t i = 0; i < inputs.size(); ++i) {
|
||||
ARROW_ASSIGN_OR_RAISE(result[i], func(i, inputs[i]));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,33 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/vendored/pcg/pcg_random.hpp" // IWYU pragma: export
|
||||
|
||||
namespace arrow {
namespace random {

// Convenience aliases so the rest of the codebase can name the vendored PCG
// random number generators without spelling out the arrow_vendored namespace.
using pcg32 = ::arrow_vendored::pcg32;
using pcg64 = ::arrow_vendored::pcg64;
using pcg32_fast = ::arrow_vendored::pcg32_fast;
using pcg64_fast = ::arrow_vendored::pcg64_fast;
using pcg32_oneseq = ::arrow_vendored::pcg32_oneseq;
using pcg64_oneseq = ::arrow_vendored::pcg64_oneseq;

}  // namespace random
}  // namespace arrow
|
||||
@@ -0,0 +1,31 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
// ARROW_PREFETCH(addr): best-effort hint to bring the memory at `addr` into
// cache ahead of use. Expands to nothing on toolchains without a usable
// prefetch intrinsic, so it may only be used as an optimization hint, never
// for correctness.
#if defined(__GNUC__) // GCC and compatible compilers (clang, Intel ICC)
# define ARROW_PREFETCH(addr) __builtin_prefetch(addr)
#elif defined(_MSC_VER) // MSVC
# if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_RUNTIME_SSE4_2)
// _mm_prefetch requires SSE support, so it is only enabled when the build has
// (compile-time or runtime-detected) SSE4.2 available.
# include <nmmintrin.h>
# define ARROW_PREFETCH(addr) _mm_prefetch((const char*)(addr), _MM_HINT_T0)
# else
# define ARROW_PREFETCH(addr)
# endif
#else
// Unknown compiler: no-op.
# define ARROW_PREFETCH(addr)
#endif
|
||||
@@ -0,0 +1,29 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/vendored/ProducerConsumerQueue.h"
|
||||
|
||||
namespace arrow {
namespace util {

// Alias for the vendored folly ProducerConsumerQueue. The name (Spsc =
// single-producer / single-consumer) reflects that queue's contract: exactly
// one producer thread and one consumer thread.
template <typename T>
using SpscQueue = arrow_vendored::folly::ProducerConsumerQueue<T>;

}  // namespace util
}  // namespace arrow
|
||||
@@ -0,0 +1,258 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace arrow::internal {
|
||||
|
||||
/// Create a vector containing the values from start up to stop
|
||||
template <typename T>
std::vector<T> Iota(T start, T stop) {
  // Empty (or inverted) interval yields an empty vector.
  std::vector<T> values;
  if (stop <= start) {
    return values;
  }
  values.reserve(static_cast<size_t>(stop - start));
  for (T current = start; current != stop; ++current) {
    values.push_back(current);
  }
  return values;
}
|
||||
|
||||
/// Create a vector containing the values from 0 up to length
|
||||
template <typename T>
std::vector<T> Iota(T length) {
  // Delegate to the two-argument overload, starting from zero.
  return Iota(T{}, length);
}
|
||||
|
||||
/// Create a range from a callable which takes a single index parameter
|
||||
/// and returns the value of iterator on each call and a length.
|
||||
/// Only iterators obtained from the same range should be compared, the
|
||||
/// behaviour generally similar to other STL containers.
|
||||
template <typename Generator>
class LazyRange {
 private:
  // callable which generates the values
  // has to be defined at the beginning of the class for type deduction
  const Generator gen_;
  // the length of the range
  int64_t length_;
#ifdef _MSC_VER
  // workaround to VS2010 not supporting decltype properly
  // see https://stackoverflow.com/questions/21782846/decltype-for-class-member-function
  static Generator gen_static_;
#endif

 public:
  // The value type produced by the generator when invoked with an index.
#ifdef _MSC_VER
  using return_type = decltype(gen_static_(0));
#else
  using return_type = decltype(gen_(0));
#endif

  /// Construct a new range from a callable and length
  LazyRange(Generator gen, int64_t length) : gen_(gen), length_(length) {}

  // Class of the dependent iterator, created implicitly by begin and end.
  // Values are computed on demand by invoking the parent range's generator,
  // so dereferencing returns by value, not by reference.
  class RangeIter {
   public:
    using difference_type = int64_t;
    using value_type = return_type;
    using reference = const value_type&;
    using pointer = const value_type*;
    using iterator_category = std::forward_iterator_tag;

#ifdef _MSC_VER
    // msvc complains about unchecked iterators,
    // see https://stackoverflow.com/questions/21655496/error-c4996-checked-iterators
    using _Unchecked_type = typename LazyRange<Generator>::RangeIter;
#endif

    // An iterator is only meaningful when bound to a parent range.
    RangeIter() = delete;
    RangeIter(const RangeIter& other) = default;
    RangeIter& operator=(const RangeIter& other) = default;

    RangeIter(const LazyRange<Generator>& range, int64_t index)
        : range_(&range), index_(index) {}

    // Dereference: invoke the generator at the current index (computed fresh
    // on every call; the result is not cached).
    const return_type operator*() const { return range_->gen_(index_); }

    // Random jump forward: returns a new iterator `length` positions ahead.
    RangeIter operator+(difference_type length) const {
      return RangeIter(*range_, index_ + length);
    }

    // pre-increment
    RangeIter& operator++() {
      ++index_;
      return *this;
    }

    // post-increment
    RangeIter operator++(int) {
      auto copy = RangeIter(*this);
      ++index_;
      return copy;
    }

    // Iterators are equal only when they share a parent range AND an index;
    // comparing iterators from different ranges is not meaningful.
    bool operator==(const typename LazyRange<Generator>::RangeIter& other) const {
      return this->index_ == other.index_ && this->range_ == other.range_;
    }

    bool operator!=(const typename LazyRange<Generator>::RangeIter& other) const {
      return this->index_ != other.index_ || this->range_ != other.range_;
    }

    // Distance between two iterators (index difference; ignores parent range).
    int64_t operator-(const typename LazyRange<Generator>::RangeIter& other) const {
      return this->index_ - other.index_;
    }

    // Ordering by index only (assumes both come from the same range).
    bool operator<(const typename LazyRange<Generator>::RangeIter& other) const {
      return this->index_ < other.index_;
    }

   private:
    // parent range reference (non-owning; the range must outlive the iterator)
    const LazyRange* range_;
    // current index
    int64_t index_;
  };

  friend class RangeIter;

  // Create a new begin const iterator
  RangeIter begin() { return RangeIter(*this, 0); }

  // Create a new end const iterator
  RangeIter end() { return RangeIter(*this, length_); }
};
|
||||
|
||||
/// Helper function to create a lazy range from a callable (e.g. lambda) and length
|
||||
template <typename Generator>
|
||||
LazyRange<Generator> MakeLazyRange(Generator&& gen, int64_t length) {
|
||||
return LazyRange<Generator>(std::forward<Generator>(gen), length);
|
||||
}
|
||||
|
||||
/// \brief A helper for iterating multiple ranges simultaneously, similar to C++23's
|
||||
/// zip() view adapter modelled after python's built-in zip() function.
|
||||
///
|
||||
/// \code {.cpp}
|
||||
/// const std::vector<SomeTable>& tables = ...
|
||||
/// std::function<std::vector<std::string>()> GetNames = ...
|
||||
/// for (auto [table, name] : Zip(tables, GetNames())) {
|
||||
/// static_assert(std::is_same_v<decltype(table), const SomeTable&>);
|
||||
/// static_assert(std::is_same_v<decltype(name), std::string&>);
|
||||
/// // temporaries (like this vector of strings) are kept alive for the
|
||||
/// // duration of a loop and are safely movable).
|
||||
/// RegisterTableWithName(std::move(name), &table);
|
||||
/// }
|
||||
/// \endcode
|
||||
///
|
||||
/// The zipped sequence ends as soon as any of its member ranges ends.
|
||||
///
|
||||
/// Always use `auto` for the loop's declaration; it will always be a tuple
|
||||
/// of references so for example using `const auto&` will compile but will
|
||||
/// *look* like forcing const-ness even though the members of the tuple are
|
||||
/// still mutable references.
|
||||
///
|
||||
/// NOTE: we *could* make Zip a more full fledged range and enable things like
|
||||
/// - gtest recognizing it as a container; it currently doesn't since Zip is
|
||||
/// always mutable so this breaks:
|
||||
/// EXPECT_THAT(Zip(std::vector{0}, std::vector{1}),
|
||||
/// ElementsAre(std::tuple{0, 1}));
|
||||
/// - letting it be random access when possible so we can do things like *sort*
|
||||
/// parallel ranges
|
||||
/// - ...
|
||||
///
|
||||
/// However doing this will increase the compile time overhead of using Zip as
|
||||
/// long as we're still using headers. Therefore until we can use c++20 modules:
|
||||
/// *don't* extend Zip.
|
||||
template <typename Ranges, typename Indices>
struct Zip;

// Deduction guide: Zip(a, b, ...) deduces the tuple of range types (lvalues
// become references, rvalues are stored by value) plus an index sequence used
// to expand over the ranges in parallel.
template <typename... Ranges>
Zip(Ranges&&...) -> Zip<std::tuple<Ranges...>, std::index_sequence_for<Ranges...>>;

template <typename... Ranges, size_t... I>
struct Zip<std::tuple<Ranges...>, std::index_sequence<I...>> {
  explicit Zip(Ranges... ranges) : ranges_(std::forward<Ranges>(ranges)...) {}

  // Holds lvalue ranges by reference and rvalue ranges by value, so that
  // temporaries passed to Zip() stay alive for the duration of the loop.
  std::tuple<Ranges...> ranges_;

  // The sentinel is the tuple of every member range's end iterator.
  using sentinel = std::tuple<decltype(std::end(std::get<I>(ranges_)))...>;
  constexpr sentinel end() { return {std::end(std::get<I>(ranges_))...}; }

  struct iterator : std::tuple<decltype(std::begin(std::get<I>(ranges_)))...> {
    using std::tuple<decltype(std::begin(std::get<I>(ranges_)))...>::tuple;

    // Dereference to a tuple of whatever each member iterator dereferences to
    // (typically references into the member ranges).
    constexpr auto operator*() {
      return std::tuple<decltype(*std::get<I>(*this))...>{*std::get<I>(*this)...};
    }

    // Advance every member iterator in lockstep.
    constexpr iterator& operator++() {
      (++std::get<I>(*this), ...);
      return *this;
    }

    // Iteration stops as soon as ANY member iterator reaches its end, which
    // implements "the zipped sequence ends when the shortest range ends".
    constexpr bool operator!=(const sentinel& s) const {
      bool all_iterators_valid = (... && (std::get<I>(*this) != std::get<I>(s)));
      return all_iterators_valid;
    }
  };
  constexpr iterator begin() { return {std::begin(std::get<I>(ranges_))...}; }
};
|
||||
|
||||
/// \brief A lazy sequence of integers which starts from 0 and never stops.
|
||||
///
|
||||
/// This can be used in conjunction with Zip() to emulate python's built-in
|
||||
/// enumerate() function:
|
||||
///
|
||||
/// \code {.cpp}
|
||||
/// const std::vector<SomeTable>& tables = ...
|
||||
/// for (auto [i, table] : Zip(Enumerate<>, tables)) {
|
||||
/// std::cout << "#" << i << ": " << table.name() << std::endl;
|
||||
/// }
|
||||
/// \endcode
|
||||
template <typename I = size_t>
constexpr auto Enumerate = [] {
  // Anonymous range type whose iterator counts up from I{0} and never
  // compares equal to the end sentinel: the sequence is unbounded and
  // iteration only terminates via the other ranges zipped with it.
  struct {
    struct sentinel {};
    constexpr sentinel end() const { return {}; }

    struct iterator {
      // Current counter value; value-initialized to zero.
      I value{0};

      constexpr I operator*() { return value; }

      constexpr iterator& operator++() {
        ++value;
        return *this;
      }

      // Always "not equal" — the range is infinite by construction.
      constexpr std::true_type operator!=(sentinel) const { return {}; }
    };
    constexpr iterator begin() const { return {}; }
  } out;

  return out;
}();
|
||||
|
||||
} // namespace arrow::internal
|
||||
@@ -0,0 +1,584 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/type_traits.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace ree_util {
|
||||
|
||||
/// \brief Get the child array holding the run ends from an REE array
///
/// By convention, child 0 of a run-end encoded array is the run-ends array.
inline const ArraySpan& RunEndsArray(const ArraySpan& span) { return span.child_data[0]; }
|
||||
|
||||
/// \brief Get the child array holding the data values from an REE array
///
/// By convention, child 1 of a run-end encoded array is the values array.
inline const ArraySpan& ValuesArray(const ArraySpan& span) { return span.child_data[1]; }
|
||||
|
||||
/// \brief Get a pointer to run ends values of an REE array
|
||||
template <typename RunEndCType>
|
||||
const RunEndCType* RunEnds(const ArraySpan& span) {
|
||||
assert(RunEndsArray(span).type->id() == CTypeTraits<RunEndCType>::ArrowType::type_id);
|
||||
return RunEndsArray(span).GetValues<RunEndCType>(1);
|
||||
}
|
||||
|
||||
/// \brief Perform basic validations on the parameters of an REE array
|
||||
/// and its two children arrays
|
||||
///
|
||||
/// All the checks complete in O(1) time. Consequently, this function:
|
||||
/// - DOES NOT check that run_ends is sorted and all-positive
|
||||
/// - DOES NOT check the actual contents of the run_ends and values arrays
|
||||
Status ValidateRunEndEncodedChildren(const RunEndEncodedType& type,
|
||||
int64_t logical_length,
|
||||
const std::shared_ptr<ArrayData>& run_ends_data,
|
||||
const std::shared_ptr<ArrayData>& values_data,
|
||||
int64_t null_count, int64_t logical_offset);
|
||||
|
||||
/// \brief Compute the logical null count of an REE array
|
||||
int64_t LogicalNullCount(const ArraySpan& span);
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief Uses binary-search to find the physical offset given a logical offset
|
||||
/// and run-end values
|
||||
///
|
||||
/// \return the physical offset or run_ends_size if the physical offset is not
|
||||
/// found in run_ends
|
||||
template <typename RunEndCType>
int64_t FindPhysicalIndex(const RunEndCType* run_ends, int64_t run_ends_size, int64_t i,
                          int64_t absolute_offset) {
  // Binary-search for the first run whose end is strictly greater than the
  // absolute logical index; that run contains the index.
  const int64_t logical_index = absolute_offset + i;
  assert(logical_index >= 0);
  const RunEndCType* pos =
      std::upper_bound(run_ends, run_ends + run_ends_size, logical_index);
  const int64_t physical_index = pos - run_ends;
  // Equals run_ends_size when the logical index lies beyond the last run end.
  assert(physical_index <= run_ends_size);
  return physical_index;
}
|
||||
|
||||
/// \brief Uses binary-search to calculate the range of physical values (and
|
||||
/// run-ends) necessary to represent the logical range of values from
|
||||
/// offset to length
|
||||
///
|
||||
/// \return a pair of physical offset and physical length
|
||||
template <typename RunEndCType>
|
||||
std::pair<int64_t, int64_t> FindPhysicalRange(const RunEndCType* run_ends,
|
||||
int64_t run_ends_size, int64_t length,
|
||||
int64_t offset) {
|
||||
const int64_t physical_offset =
|
||||
FindPhysicalIndex<RunEndCType>(run_ends, run_ends_size, 0, offset);
|
||||
// The physical length is calculated by finding the offset of the last element
|
||||
// and adding 1 to it, so first we ensure there is at least one element.
|
||||
if (length == 0) {
|
||||
return {physical_offset, 0};
|
||||
}
|
||||
const int64_t physical_index_of_last = FindPhysicalIndex<RunEndCType>(
|
||||
run_ends + physical_offset, run_ends_size - physical_offset, length - 1, offset);
|
||||
|
||||
assert(physical_index_of_last < run_ends_size - physical_offset);
|
||||
return {physical_offset, physical_index_of_last + 1};
|
||||
}
|
||||
|
||||
/// \brief Uses binary-search to calculate the number of physical values (and
|
||||
/// run-ends) necessary to represent the logical range of values from
|
||||
/// offset to length
|
||||
template <typename RunEndCType>
|
||||
int64_t FindPhysicalLength(const RunEndCType* run_ends, int64_t run_ends_size,
|
||||
int64_t length, int64_t offset) {
|
||||
auto [_, physical_length] =
|
||||
FindPhysicalRange<RunEndCType>(run_ends, run_ends_size, length, offset);
|
||||
// GH-37107: This is a workaround for GCC 7. GCC 7 doesn't ignore
|
||||
// variables in structured binding automatically from unused
|
||||
// variables when one of these variables are used.
|
||||
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767
|
||||
ARROW_UNUSED(_);
|
||||
return physical_length;
|
||||
}
|
||||
|
||||
/// \brief Find the physical index into the values array of the REE ArraySpan
|
||||
///
|
||||
/// This function uses binary-search, so it has a O(log N) cost.
|
||||
template <typename RunEndCType>
|
||||
int64_t FindPhysicalIndex(const ArraySpan& span, int64_t i, int64_t absolute_offset) {
|
||||
const int64_t run_ends_size = RunEndsArray(span).length;
|
||||
return FindPhysicalIndex(RunEnds<RunEndCType>(span), run_ends_size, i, absolute_offset);
|
||||
}
|
||||
|
||||
/// \brief Find the physical length of an REE ArraySpan
|
||||
///
|
||||
/// The physical length of an REE is the number of physical values (and
|
||||
/// run-ends) necessary to represent the logical range of values from
|
||||
/// offset to length.
|
||||
///
|
||||
/// Avoid calling this function if the physical length can be established in
|
||||
/// some other way (e.g. when iterating over the runs sequentially until the
|
||||
/// end). This function uses binary-search, so it has a O(log N) cost.
|
||||
template <typename RunEndCType>
|
||||
int64_t FindPhysicalLength(const ArraySpan& span) {
|
||||
return FindPhysicalLength(
|
||||
/*run_ends=*/RunEnds<RunEndCType>(span),
|
||||
/*run_ends_size=*/RunEndsArray(span).length,
|
||||
/*length=*/span.length,
|
||||
/*offset=*/span.offset);
|
||||
}
|
||||
|
||||
template <typename RunEndCType>
|
||||
struct PhysicalIndexFinder;
|
||||
|
||||
// non-inline implementations for each run-end type
|
||||
ARROW_EXPORT int64_t FindPhysicalIndexImpl16(PhysicalIndexFinder<int16_t>& self,
|
||||
int64_t i);
|
||||
ARROW_EXPORT int64_t FindPhysicalIndexImpl32(PhysicalIndexFinder<int32_t>& self,
|
||||
int64_t i);
|
||||
ARROW_EXPORT int64_t FindPhysicalIndexImpl64(PhysicalIndexFinder<int64_t>& self,
|
||||
int64_t i);
|
||||
|
||||
/// \brief Stateful version of FindPhysicalIndex() that caches the result of
|
||||
/// the previous search and uses it to optimize the next search.
|
||||
///
|
||||
/// When new queries for the physical index of a logical index come in,
|
||||
/// binary search is performed again but the first candidate checked is the
|
||||
/// result of the previous search (cached physical index) instead of the
|
||||
/// midpoint of the run-ends array.
|
||||
///
|
||||
/// If that test fails, internal::FindPhysicalIndex() is called with one of the
|
||||
/// partitions defined by the cached index. If the queried logical indices
|
||||
/// follow an increasing or decreasing pattern, this first test is much more
|
||||
/// effective in (1) finding the answer right away (close logical indices belong
|
||||
/// to the same runs) or (2) discarding many more candidates than probing
|
||||
/// the midpoint would.
|
||||
///
|
||||
/// The most adversarial case (i.e. alternating between 0 and length-1 queries)
|
||||
/// only adds one extra binary search probe when compared to always starting
|
||||
/// binary search from the midpoint without any of these optimizations.
|
||||
///
|
||||
/// \tparam RunEndCType The numeric type of the run-ends array.
|
||||
template <typename RunEndCType>
struct PhysicalIndexFinder {
  // View over the run-end encoded array being queried.
  const ArraySpan array_span;
  // Cached pointer to the run-end values of the run-ends child array.
  const RunEndCType* run_ends;
  // Physical index produced by the previous query; used by the Impl
  // functions as the first binary-search candidate for the next query.
  int64_t last_physical_index = 0;

  /// \brief Construct from ArrayData whose run-end type must match
  /// RunEndCType (checked by assert in debug builds).
  explicit PhysicalIndexFinder(const ArrayData& data)
      : array_span(data),
        run_ends(RunEndsArray(array_span).template GetValues<RunEndCType>(1)) {
    assert(CTypeTraits<RunEndCType>::ArrowType::type_id ==
           ::arrow::internal::checked_cast<const RunEndEncodedType&>(*data.type)
               .run_end_type()
               ->id());
  }

  /// \brief Find the physical index into the values array of the REE array.
  ///
  /// \pre 0 <= i < array_span.length()
  /// \param i the logical index into the REE array
  /// \return the physical index into the values array
  int64_t FindPhysicalIndex(int64_t i) {
    // Compile-time dispatch to the out-of-line implementation matching the
    // run-end width; only int16/int32/int64 run ends are supported.
    if constexpr (std::is_same_v<RunEndCType, int16_t>) {
      return FindPhysicalIndexImpl16(*this, i);
    } else if constexpr (std::is_same_v<RunEndCType, int32_t>) {
      return FindPhysicalIndexImpl32(*this, i);
    } else {
      static_assert(std::is_same_v<RunEndCType, int64_t>, "Unsupported RunEndCType.");
      return FindPhysicalIndexImpl64(*this, i);
    }
  }
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \brief Find the physical index into the values array of the REE ArraySpan
|
||||
///
|
||||
/// This function uses binary-search, so it has a O(log N) cost.
|
||||
ARROW_EXPORT int64_t FindPhysicalIndex(const ArraySpan& span, int64_t i,
|
||||
int64_t absolute_offset);
|
||||
|
||||
/// \brief Find the physical length of an REE ArraySpan
|
||||
///
|
||||
/// The physical length of an REE is the number of physical values (and
|
||||
/// run-ends) necessary to represent the logical range of values from
|
||||
/// offset to length.
|
||||
///
|
||||
/// Avoid calling this function if the physical length can be established in
|
||||
/// some other way (e.g. when iterating over the runs sequentially until the
|
||||
/// end). This function uses binary-search, so it has a O(log N) cost.
|
||||
ARROW_EXPORT int64_t FindPhysicalLength(const ArraySpan& span);
|
||||
|
||||
/// \brief Find the physical range of physical values referenced by the REE in
|
||||
/// the logical range from offset to offset + length
|
||||
///
|
||||
/// \return a pair of physical offset and physical length
|
||||
ARROW_EXPORT std::pair<int64_t, int64_t> FindPhysicalRange(const ArraySpan& span,
|
||||
int64_t offset,
|
||||
int64_t length);
|
||||
|
||||
// Publish PhysicalIndexFinder outside of the internal namespace.
|
||||
template <typename RunEndCType>
|
||||
using PhysicalIndexFinder = internal::PhysicalIndexFinder<RunEndCType>;
|
||||
|
||||
template <typename RunEndCType>
|
||||
class RunEndEncodedArraySpan {
|
||||
private:
|
||||
struct PrivateTag {};
|
||||
|
||||
public:
|
||||
/// \brief Iterator representing the current run during iteration over a
|
||||
/// run-end encoded array
|
||||
class Iterator {
|
||||
public:
|
||||
Iterator(PrivateTag, const RunEndEncodedArraySpan& span, int64_t logical_pos,
|
||||
int64_t physical_pos)
|
||||
: span(span), logical_pos_(logical_pos), physical_pos_(physical_pos) {}
|
||||
|
||||
/// \brief Return the physical index of the run
|
||||
///
|
||||
/// The values array can be addressed with this index to get the value
|
||||
/// that makes up the run.
|
||||
///
|
||||
/// NOTE: if this Iterator is equal to RunEndEncodedArraySpan::end(),
|
||||
/// the value returned is undefined.
|
||||
int64_t index_into_array() const { return physical_pos_; }
|
||||
|
||||
/// \brief Return the initial logical position of the run
|
||||
///
|
||||
/// If this Iterator is equal to RunEndEncodedArraySpan::end(), this is
|
||||
/// the same as RunEndEncodedArraySpan::length().
|
||||
int64_t logical_position() const { return logical_pos_; }
|
||||
|
||||
/// \brief Return the logical position immediately after the run.
|
||||
///
|
||||
/// Pre-condition: *this != RunEndEncodedArraySpan::end()
|
||||
int64_t run_end() const { return span.run_end(physical_pos_); }
|
||||
|
||||
/// \brief Returns the logical length of the run.
|
||||
///
|
||||
/// Pre-condition: *this != RunEndEncodedArraySpan::end()
|
||||
int64_t run_length() const { return run_end() - logical_pos_; }
|
||||
|
||||
/// \brief Check if the iterator is at the end of the array.
|
||||
///
|
||||
/// This can be used to avoid paying the cost of a call to
|
||||
/// RunEndEncodedArraySpan::end().
|
||||
///
|
||||
/// \return true if the iterator is at the end of the array
|
||||
bool is_end(const RunEndEncodedArraySpan& span) const {
|
||||
return logical_pos_ >= span.length();
|
||||
}
|
||||
|
||||
Iterator& operator++() {
|
||||
logical_pos_ = span.run_end(physical_pos_);
|
||||
physical_pos_ += 1;
|
||||
return *this;
|
||||
}
|
||||
|
||||
Iterator operator++(int) {
|
||||
const Iterator prev = *this;
|
||||
++(*this);
|
||||
return prev;
|
||||
}
|
||||
|
||||
Iterator& operator--() {
|
||||
physical_pos_ -= 1;
|
||||
logical_pos_ = (physical_pos_ > 0) ? span.run_end(physical_pos_ - 1) : 0;
|
||||
return *this;
|
||||
}
|
||||
|
||||
Iterator operator--(int) {
|
||||
const Iterator prev = *this;
|
||||
--(*this);
|
||||
return prev;
|
||||
}
|
||||
|
||||
bool operator==(const Iterator& other) const {
|
||||
return logical_pos_ == other.logical_pos_;
|
||||
}
|
||||
|
||||
bool operator!=(const Iterator& other) const {
|
||||
return logical_pos_ != other.logical_pos_;
|
||||
}
|
||||
|
||||
public:
|
||||
const RunEndEncodedArraySpan& span;
|
||||
|
||||
private:
|
||||
int64_t logical_pos_;
|
||||
int64_t physical_pos_;
|
||||
};
|
||||
|
||||
// Prevent implicit ArrayData -> ArraySpan conversion in
|
||||
// RunEndEncodedArraySpan instantiation.
|
||||
explicit RunEndEncodedArraySpan(const ArrayData& data) = delete;
|
||||
|
||||
/// \brief Construct a RunEndEncodedArraySpan from an ArraySpan and new
/// absolute offset and length.
///
/// RunEndEncodedArraySpan{span, off, len} is equivalent to:
///
///     span.SetSlice(off, len);
///     RunEndEncodedArraySpan{span}
///
/// ArraySpan::SetSlice() updates the null_count to kUnknownNullCount, but
/// we don't need that here as REE arrays have null_count set to 0 by
/// convention.
///
/// \param array_span a RUN_END_ENCODED ArraySpan; only a reference is kept,
///        so it must outlive this object
/// \param offset absolute logical offset into the un-sliced array
/// \param length logical length of the slice
explicit RunEndEncodedArraySpan(const ArraySpan& array_span, int64_t offset,
                                int64_t length)
    : array_span_{array_span},
      // Cache a raw pointer to the run-ends child values for fast access.
      run_ends_(RunEnds<RunEndCType>(array_span_)),
      length_(length),
      offset_(offset) {
  assert(array_span_.type->id() == Type::RUN_END_ENCODED);
}
|
||||
|
||||
/// \brief Construct from an ArraySpan, adopting its own offset and length.
explicit RunEndEncodedArraySpan(const ArraySpan& array_span)
    : RunEndEncodedArraySpan(array_span, array_span.offset, array_span.length) {}
|
||||
|
||||
/// \brief Absolute logical offset into the un-sliced array.
int64_t offset() const { return offset_; }
/// \brief Logical length of this (possibly sliced) span.
int64_t length() const { return length_; }
|
||||
|
||||
/// \brief Find the index into the run-ends array of the run containing
/// the given logical position.
///
/// Delegates to internal::FindPhysicalIndex (binary search over run ends,
/// so O(log N) in the number of runs).
int64_t PhysicalIndex(int64_t logical_pos) const {
  return internal::FindPhysicalIndex(run_ends_, RunEndsArray(array_span_).length,
                                     logical_pos, offset_);
}
|
||||
|
||||
/// \brief Create an iterator from a logical position and its
/// pre-computed physical offset into the run ends array
///
/// Use this overload to avoid a redundant O(log N) PhysicalIndex() lookup
/// when the physical offset is already known.
///
/// \param logical_pos is an index in the [0, length()] range
/// \param physical_offset the pre-calculated PhysicalIndex(logical_pos)
Iterator iterator(int64_t logical_pos, int64_t physical_offset) const {
  // PrivateTag restricts direct Iterator construction to this class.
  return Iterator{PrivateTag{}, *this, logical_pos, physical_offset};
}
|
||||
|
||||
/// \brief Create an iterator from a logical position
|
||||
///
|
||||
/// \param logical_pos is an index in the [0, length()] range
|
||||
Iterator iterator(int64_t logical_pos) const {
|
||||
if (logical_pos < length()) {
|
||||
return iterator(logical_pos, PhysicalIndex(logical_pos));
|
||||
}
|
||||
// If logical_pos is above the valid range, use length() as the logical
|
||||
// position and calculate the physical address right after the last valid
|
||||
// physical position. Which is the physical index of the last logical
|
||||
// position, plus 1.
|
||||
return (length() == 0) ? iterator(0, PhysicalIndex(0))
|
||||
: iterator(length(), PhysicalIndex(length() - 1) + 1);
|
||||
}
|
||||
|
||||
/// \brief Create an iterator representing the logical begin of the run-end
/// encoded array
///
/// Costs one O(log N) PhysicalIndex() lookup (the offset may start mid-run).
Iterator begin() const { return iterator(0, PhysicalIndex(0)); }
|
||||
|
||||
/// \brief Create an iterator representing the first invalid logical position
|
||||
/// of the run-end encoded array
|
||||
///
|
||||
/// \warning Avoid calling end() in a loop, as it will recompute the physical
|
||||
/// length of the array on each call (O(log N) cost per call).
|
||||
///
|
||||
/// You can write your loops like this instead:
|
||||
///
|
||||
/// \code
|
||||
/// for (auto it = array.begin(), end = array.end(); it != end; ++it) {
|
||||
/// // ...
|
||||
/// }
|
||||
/// \endcode
|
||||
///
|
||||
/// Or this version that does not look like idiomatic C++, but removes
|
||||
/// the need for calling end() completely:
|
||||
///
|
||||
/// \code
|
||||
/// for (auto it = array.begin(); !it.is_end(array); ++it) {
|
||||
/// // ...
|
||||
/// }
|
||||
/// \endcode
|
||||
Iterator end() const {
|
||||
return iterator(length(),
|
||||
(length() == 0) ? PhysicalIndex(0) : PhysicalIndex(length() - 1) + 1);
|
||||
}
|
||||
|
||||
// Pre-condition: physical_pos < RunEndsArray(array_span_).length);
|
||||
inline int64_t run_end(int64_t physical_pos) const {
|
||||
assert(physical_pos < RunEndsArray(array_span_).length);
|
||||
// Logical index of the end of the run at physical_pos with offset applied
|
||||
const int64_t logical_run_end =
|
||||
std::max<int64_t>(static_cast<int64_t>(run_ends_[physical_pos]) - offset(), 0);
|
||||
// The current run may go further than the logical length, cap it
|
||||
return std::min(logical_run_end, length());
|
||||
}
|
||||
|
||||
private:
  const ArraySpan& array_span_;  // the underlying RUN_END_ENCODED ArraySpan
  const RunEndCType* run_ends_;  // cached pointer into the run-ends child values
  const int64_t length_;         // logical length of this (possibly sliced) span
  const int64_t offset_;         // absolute logical offset into the array
};
|
||||
|
||||
/// \brief Iterate over two run-end encoded arrays in runs or sub-runs that are
/// inside run boundaries on both inputs
///
/// Both RunEndEncodedArraySpan should have the same logical length. Instances
/// of this iterator only hold references to the RunEndEncodedArraySpan inputs.
template <typename Left, typename Right>
class MergedRunsIterator {
 private:
  using LeftIterator = typename Left::Iterator;
  using RightIterator = typename Right::Iterator;

  // Internal constructor: positions both child iterators explicitly.
  // Pre-condition (unchecked): the iterators agree with the given length/pos.
  MergedRunsIterator(LeftIterator left_it, RightIterator right_it,
                     int64_t common_logical_length, int64_t common_logical_pos)
      : ree_iterators_{std::move(left_it), std::move(right_it)},
        logical_length_(common_logical_length),
        logical_pos_(common_logical_pos) {}

 public:
  /// \brief Construct a MergedRunsIterator positioned at logical position 0.
  ///
  /// Pre-condition: left.length() == right.length()
  MergedRunsIterator(const Left& left, const Right& right)
      : MergedRunsIterator(left.begin(), right.begin(), left.length(), 0) {
    assert(left.length() == right.length());
  }

  /// \brief Checked factory for a begin iterator.
  ///
  /// \return Status::Invalid if the spans have different logical lengths.
  static Result<MergedRunsIterator> MakeBegin(const Left& left, const Right& right) {
    if (left.length() != right.length()) {
      return Status::Invalid(
          "MergedRunsIterator expects RunEndEncodedArraySpans of the same length");
    }
    return MergedRunsIterator(left, right);
  }

  /// \brief Checked factory for a past-the-end iterator.
  ///
  /// \return Status::Invalid if the spans have different logical lengths.
  static Result<MergedRunsIterator> MakeEnd(const Left& left, const Right& right) {
    if (left.length() != right.length()) {
      return Status::Invalid(
          "MergedRunsIterator expects RunEndEncodedArraySpans of the same length");
    }
    return MergedRunsIterator(left.end(), right.end(), left.length(), left.length());
  }

  /// \brief Return the left RunEndEncodedArraySpan child
  const Left& left() const { return std::get<0>(ree_iterators_).span; }

  /// \brief Return the right RunEndEncodedArraySpan child
  const Right& right() const { return std::get<1>(ree_iterators_).span; }

  /// \brief Return the initial logical position of the run
  ///
  /// If is_end(), this is the same as length().
  int64_t logical_position() const { return logical_pos_; }

  /// \brief Whether the iterator is at logical position 0.
  bool is_begin() const { return logical_pos_ == 0; }

  /// \brief Whether the iterator has reached the end of both arrays
  bool is_end() const { return logical_pos_ == logical_length_; }

  /// \brief Return the logical position immediately after the run.
  ///
  /// A merged run ends wherever either input's current run ends first.
  ///
  /// Pre-condition: !is_end()
  int64_t run_end() const {
    const auto& left_it = std::get<0>(ree_iterators_);
    const auto& right_it = std::get<1>(ree_iterators_);
    return std::min(left_it.run_end(), right_it.run_end());
  }

  /// \brief returns the logical length of the current run
  ///
  /// Pre-condition: !is_end()
  int64_t run_length() const { return run_end() - logical_pos_; }

  /// \brief Return a physical index into the values array of a given input,
  /// pointing to the value of the current run
  template <size_t input_id>
  int64_t index_into_array() const {
    return std::get<input_id>(ree_iterators_).index_into_array();
  }

  int64_t index_into_left_array() const { return index_into_array<0>(); }
  int64_t index_into_right_array() const { return index_into_array<1>(); }

  /// \brief Advance to the next merged run.
  ///
  /// Only the child iterator(s) whose run ends at the current merged-run
  /// boundary are advanced; a child in the middle of a longer run stays put.
  MergedRunsIterator& operator++() {
    auto& left_it = std::get<0>(ree_iterators_);
    auto& right_it = std::get<1>(ree_iterators_);

    const int64_t left_run_end = left_it.run_end();
    const int64_t right_run_end = right_it.run_end();

    if (left_run_end < right_run_end) {
      logical_pos_ = left_run_end;
      ++left_it;
    } else if (left_run_end > right_run_end) {
      logical_pos_ = right_run_end;
      ++right_it;
    } else {
      // Both runs end at the same logical position: advance both.
      logical_pos_ = left_run_end;
      ++left_it;
      ++right_it;
    }
    return *this;
  }

  MergedRunsIterator operator++(int) {
    MergedRunsIterator prev = *this;
    ++(*this);
    return prev;
  }

  /// \brief Step back to the previous merged run.
  ///
  /// Only the child iterator(s) positioned exactly at the current merged
  /// boundary are decremented; the new merged position is the later of the
  /// two child run starts.
  MergedRunsIterator& operator--() {
    auto& left_it = std::get<0>(ree_iterators_);
    auto& right_it = std::get<1>(ree_iterators_);

    // The logical position of each iterator is the run_end() of the previous run.
    const int64_t left_logical_pos = left_it.logical_position();
    const int64_t right_logical_pos = right_it.logical_position();

    if (left_logical_pos < right_logical_pos) {
      --right_it;
      logical_pos_ = std::max(left_logical_pos, right_it.logical_position());
    } else if (left_logical_pos > right_logical_pos) {
      --left_it;
      logical_pos_ = std::max(left_it.logical_position(), right_logical_pos);
    } else {
      --left_it;
      --right_it;
      logical_pos_ = std::max(left_it.logical_position(), right_it.logical_position());
    }
    return *this;
  }

  MergedRunsIterator operator--(int) {
    MergedRunsIterator prev = *this;
    --(*this);
    return prev;
  }

  // NOTE(review): equality compares logical positions only — it assumes both
  // operands iterate over the same pair of spans.
  bool operator==(const MergedRunsIterator& other) const {
    return logical_pos_ == other.logical_position();
  }

  bool operator!=(const MergedRunsIterator& other) const { return !(*this == other); }

 private:
  // Child iterators over the left (index 0) and right (index 1) inputs.
  std::tuple<LeftIterator, RightIterator> ree_iterators_;
  const int64_t logical_length_;  // common logical length of both inputs
  int64_t logical_pos_;           // logical start of the current merged run
};
|
||||
|
||||
} // namespace ree_util
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,51 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <initializer_list>
|
||||
#include <regex>
|
||||
#include <string_view>
|
||||
#include <type_traits>
|
||||
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
/// Match regex against target and produce string_views out of matches.
///
/// \param regex pattern whose number of capture groups must equal
///        out_matches.size() (checked by assert)
/// \param target the text to match; the produced string_views point into it,
///        so it must outlive them
/// \param out_matches one output pointer per capture group, filled in group
///        order; a group that did not participate in the match (e.g. an
///        unmatched optional group) yields an empty string_view
/// \return true iff the whole target matched the pattern
inline bool RegexMatch(const std::regex& regex, std::string_view target,
                       std::initializer_list<std::string_view*> out_matches) {
  assert(regex.mark_count() == out_matches.size());

  std::match_results<decltype(target.begin())> match;
  if (!std::regex_match(target.begin(), target.end(), match, regex)) {
    return false;
  }

  // Match #0 is the whole matched sequence
  assert(regex.mark_count() + 1 == match.size());
  auto out_it = out_matches.begin();
  for (size_t i = 1; i < match.size(); ++i) {
    // position()/length() are only meaningful for participating groups;
    // map non-participating groups to an empty view instead.
    if (match[i].matched) {
      **out_it = target.substr(match.position(i), match.length(i));
    } else {
      **out_it = std::string_view{};
    }
    ++out_it;
  }
  return true;
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,163 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/record_batch.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/table_builder.h"
|
||||
#include "arrow/util/iterator.h"
|
||||
|
||||
#include <type_traits>
|
||||
|
||||
namespace arrow::util {
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Default identity row accessor, used for the common case where each row
// iterated over is itself directly iterable. Returns the row wrapped in a
// std::reference_wrapper, so no copy of the row is made.
[[nodiscard]] constexpr inline auto MakeDefaultRowAccessor() {
  return [](auto& x) -> Result<decltype(std::ref(x))> { return std::ref(x); };
}
|
||||
|
||||
// Detects whether `T` can be iterated over with `std::begin()`/`std::end()`.
// `is_range<T>::value` is false for any `T` that is not a valid range.
template <typename T, typename Enable = void>
struct is_range : std::false_type {};

// Chosen when both std::end(T) and std::begin(T) are well-formed expressions.
template <typename T>
struct is_range<T, std::void_t<decltype(std::end(std::declval<T>())),
                               decltype(std::begin(std::declval<T>()))>>
    : std::true_type {};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// Delete overload for `const Range&& rows` because the data's lifetime must exceed
/// the lifetime of the function call: `rows` is read lazily, each time the client
/// pulls a batch from the returned `RecordBatchReader`, so a temporary would dangle.
template <class Range, class DataPointConvertor,
          class RowAccessor = decltype(detail::MakeDefaultRowAccessor())>
[[nodiscard]] typename std::enable_if_t<detail::is_range<Range>::value,
                                        Result<std::shared_ptr<RecordBatchReader>>>
/* Result<std::shared_ptr<RecordBatchReader>>> */ RowsToBatches(
    const std::shared_ptr<Schema>& schema, const Range&& rows,
    DataPointConvertor&& data_point_convertor,
    RowAccessor&& row_accessor = detail::MakeDefaultRowAccessor(),
    MemoryPool* pool = default_memory_pool(),
    const std::size_t batch_size = 1024) = delete;
||||
|
||||
/// \brief Utility function for converting any row-based structure into an
/// `arrow::RecordBatchReader` (this can be easily converted to an `arrow::Table`
/// using `arrow::RecordBatchReader::ToTable()`).
///
/// Examples of supported types:
/// - `std::vector<std::vector<std::variant<int, std::string>>>`
/// - `std::vector<MyRowStruct>`
///
/// If `rows` (client's row-based structure) is not a valid C++ range, the client will
/// need to either make it iterable, or make an adapter/wrapper that is a valid C++
/// range.
///
/// The client must provide a `DataPointConvertor` callable type that will convert the
/// structure's data points into the corresponding arrow types.
///
/// Complex nested rows can be supported by providing a custom `row_accessor` instead
/// of the default.
///
/// Example usage:
/// \code{.cpp}
/// auto IntConvertor = [](ArrayBuilder& array_builder, int value) {
///   return static_cast<Int64Builder&>(array_builder).Append(value);
/// };
/// std::vector<std::vector<int>> data = {{1, 2, 4}, {5, 6, 7}};
/// auto batches = RowsToBatches(kTestSchema, data, IntConvertor);
/// \endcode
///
/// \param[in] schema - The schema to be used in the `RecordBatchReader`
///
/// \param[in] rows - Iterable row-based structure that will be converted to arrow
/// batches; must outlive the returned reader (batches are produced lazily)
///
/// \param[in] data_point_convertor - Client provided callable type that will convert
/// the structure's data points into the corresponding arrow types. The convertor must
/// return an error `Status` if an error happens during conversion.
///
/// \param[in] row_accessor - In the common case where the value of each row iterated
/// over is itself also directly iterable, the client can just use the default.
/// The provided callable must take the values of the `rows` range and return a
/// `std::reference_wrapper<Range>` to the data points in a given row. The data points
/// must be in order of their corresponding fields in the schema.
/// see: \ref MakeDefaultRowAccessor
///
/// \param[in] pool - The MemoryPool to use for allocations.
///
/// \param[in] batch_size - Number of rows to insert into each RecordBatch.
///
/// \return `Result<std::shared_ptr<RecordBatchReader>>` — a
/// `std::shared_ptr<RecordBatchReader>` if no errors occurred, else an error status.
template <class Range, class DataPointConvertor,
          class RowAccessor = decltype(detail::MakeDefaultRowAccessor())>
[[nodiscard]] typename std::enable_if_t<detail::is_range<Range>::value,
                                        Result<std::shared_ptr<RecordBatchReader>>>
/* Result<std::shared_ptr<RecordBatchReader>>> */ RowsToBatches(
    const std::shared_ptr<Schema>& schema, const Range& rows,
    DataPointConvertor&& data_point_convertor,
    RowAccessor&& row_accessor = detail::MakeDefaultRowAccessor(),
    MemoryPool* pool = default_memory_pool(), const std::size_t batch_size = 1024) {
  // The lambda owns the iteration state (mutable, captured by value), so each
  // call produces the next batch; it returns nullptr once `rows` is exhausted,
  // which signals end-of-stream to the RecordBatchReader.
  auto make_next_batch =
      [pool = pool, batch_size = batch_size, rows_ittr = std::begin(rows),
       rows_ittr_end = std::end(rows), schema = schema,
       row_accessor = std::forward<RowAccessor>(row_accessor),
       data_point_convertor = std::forward<DataPointConvertor>(
           data_point_convertor)]() mutable -> Result<std::shared_ptr<RecordBatch>> {
    if (rows_ittr == rows_ittr_end) return NULLPTR;

    ARROW_ASSIGN_OR_RAISE(auto record_batch_builder,
                          RecordBatchBuilder::Make(schema, pool, batch_size));

    // Fill up to batch_size rows (fewer on the final batch).
    for (size_t i = 0; i < batch_size && (rows_ittr != rows_ittr_end);
         i++, std::advance(rows_ittr, 1)) {
      int col_index = 0;
      ARROW_ASSIGN_OR_RAISE(const auto row, row_accessor(*rows_ittr));

      // If the accessor returns a `std::reference_wrapper`, unwrap it
      const auto& row_unwrapped = [&]() {
        if constexpr (detail::is_range<decltype(row)>::value)
          return row;
        else
          return row.get();
      }();

      // Append each data point to the builder of its (schema-ordered) column.
      for (auto& data_point : row_unwrapped) {
        ArrayBuilder* array_builder = record_batch_builder->GetField(col_index);
        ARROW_RETURN_IF(array_builder == NULLPTR,
                        Status::Invalid("array_builder == NULLPTR"));

        ARROW_RETURN_NOT_OK(data_point_convertor(*array_builder, data_point));
        col_index++;
      }
    }

    ARROW_ASSIGN_OR_RAISE(auto result, record_batch_builder->Flush());
    return result;
  };
  return RecordBatchReader::MakeFromIterator(MakeFunctionIterator(make_next_batch),
                                             schema);
}
|
||||
|
||||
} // namespace arrow::util
|
||||
@@ -0,0 +1,72 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
|
||||
#include "arrow/util/span.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow::util {
|
||||
/**
 * A secure string that ensures the wrapped string is cleared from memory on
 * destruction. This class can only be created from a std::string that is securely
 * erased after creation.
 *
 * Note: This class does not provide a constructor / assignment operator that copies a
 * std::string because that would allow code to create a SecureString while accidentally
 * not noticing the need to securely erase the argument after invoking the constructor /
 * calling the assignment operator.
 */
class ARROW_EXPORT SecureString {
 public:
  SecureString() = default;
  SecureString(SecureString&&) noexcept;
  SecureString(const SecureString&) = default;
  // Takes ownership of the string's buffer; the moved-from string is expected
  // to be securely cleared (see SecureClear) — implemented out-of-line.
  explicit SecureString(std::string&&) noexcept;
  // Construct a secret of `size` copies of the given fill character.
  explicit SecureString(size_t, char) noexcept;

  SecureString& operator=(SecureString&&) noexcept;
  SecureString& operator=(const SecureString&);
  SecureString& operator=(std::string&&) noexcept;

  bool operator==(const SecureString&) const;
  bool operator!=(const SecureString&) const;

  // Securely erases the secret before the wrapped string is destroyed.
  ~SecureString() { Dispose(); }

  [[nodiscard]] bool empty() const;
  [[nodiscard]] std::size_t size() const;
  [[nodiscard]] std::size_t length() const;
  [[nodiscard]] std::size_t capacity() const;

  // Views into the secret bytes; they point into secret_ and are invalidated
  // by assignment, Dispose() and destruction.
  [[nodiscard]] span<uint8_t> as_span();
  [[nodiscard]] span<const uint8_t> as_span() const;
  [[nodiscard]] std::string_view as_view() const;

  // Securely clears the secret; the object is empty afterwards.
  void Dispose();

  // Overwrite the given buffer/string so the secret bytes do not linger in
  // memory (implemented out-of-line).
  static void SecureClear(std::string*);
  static void SecureClear(uint8_t* data, size_t size);

 private:
  std::string secret_;  // the wrapped secret value
};
|
||||
|
||||
} // namespace arrow::util
|
||||
@@ -0,0 +1,51 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef _MSC_VER
|
||||
// MSVC x86_64/arm64
|
||||
|
||||
# if defined(_M_AMD64) || defined(_M_X64)
|
||||
# include <intrin.h>
|
||||
# endif
|
||||
|
||||
#else
|
||||
// gcc/clang (possibly others)
|
||||
|
||||
# if defined(ARROW_HAVE_BMI2) || defined(ARROW_HAVE_RUNTIME_BMI2)
|
||||
# include <x86intrin.h>
|
||||
# endif
|
||||
|
||||
# if defined(ARROW_HAVE_AVX2) || defined(ARROW_HAVE_AVX512) || \
|
||||
defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX512)
|
||||
# include <immintrin.h>
|
||||
# elif defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_RUNTIME_SSE4_2)
|
||||
# include <nmmintrin.h>
|
||||
# endif
|
||||
|
||||
# ifdef ARROW_HAVE_NEON
|
||||
# include <arm_neon.h>
|
||||
# endif
|
||||
|
||||
// GH-44098: Workaround for missing _mm256_set_m128i in older versions of GCC.
|
||||
# if defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 8
|
||||
# define _mm256_set_m128i(hi, lo) \
|
||||
_mm256_inserti128_si256(_mm256_castsi128_si256(lo), (hi), 1)
|
||||
# endif
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,512 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <initializer_list>
|
||||
#include <iterator>
|
||||
#include <limits>
|
||||
#include <new>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/util/aligned_storage.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
// Backing storage for a fixed-capacity vector. This primary template is used
// when T is trivially destructible (see the default for D in
// StaticVectorStorage below): no per-element cleanup is needed, so destroy()
// is a no-op and the implicit destructor suffices.
template <typename T, size_t N, bool NonTrivialDestructor>
struct StaticVectorStorageBase {
  using storage_type = AlignedStorage<T>;

  // Uninitialized storage for up to N elements.
  storage_type static_data_[N];
  // Number of currently constructed elements.
  size_t size_ = 0;

  void destroy() noexcept {}
};
|
||||
|
||||
// Specialization for non-trivially-destructible T: the destructor of each
// constructed element must be invoked explicitly, both on destruction and
// via destroy().
template <typename T, size_t N>
struct StaticVectorStorageBase<T, N, true> {
  using storage_type = AlignedStorage<T>;

  storage_type static_data_[N];
  size_t size_ = 0;

  ~StaticVectorStorageBase() noexcept { destroy(); }

  // Destroy the first size_ elements (does not reset size_).
  void destroy() noexcept { storage_type::destroy_several(static_data_, size_); }
};
|
||||
|
||||
template <typename T, size_t N, bool D = !std::is_trivially_destructible<T>::value>
|
||||
struct StaticVectorStorage : public StaticVectorStorageBase<T, N, D> {
|
||||
using Base = StaticVectorStorageBase<T, N, D>;
|
||||
using typename Base::storage_type;
|
||||
|
||||
using Base::size_;
|
||||
using Base::static_data_;
|
||||
|
||||
StaticVectorStorage() noexcept = default;
|
||||
|
||||
constexpr storage_type* storage_ptr() { return static_data_; }
|
||||
|
||||
constexpr const storage_type* const_storage_ptr() const { return static_data_; }
|
||||
|
||||
// Adjust storage size, but don't initialize any objects
|
||||
void bump_size(size_t addend) {
|
||||
assert(size_ + addend <= N);
|
||||
size_ += addend;
|
||||
}
|
||||
|
||||
void ensure_capacity(size_t min_capacity) { assert(min_capacity <= N); }
|
||||
|
||||
// Adjust storage size, but don't destroy any objects
|
||||
void reduce_size(size_t reduce_by) {
|
||||
assert(reduce_by <= size_);
|
||||
size_ -= reduce_by;
|
||||
}
|
||||
|
||||
// Move objects from another storage, but don't destroy any objects currently
|
||||
// stored in *this.
|
||||
// You need to call destroy() first if necessary (e.g. in a
|
||||
// move assignment operator).
|
||||
void move_construct(StaticVectorStorage&& other) noexcept {
|
||||
size_ = other.size_;
|
||||
if (size_ != 0) {
|
||||
// Use a compile-time memcpy size (N) for trivial types
|
||||
storage_type::move_construct_several(other.static_data_, static_data_, size_, N);
|
||||
}
|
||||
}
|
||||
|
||||
constexpr size_t capacity() const { return N; }
|
||||
|
||||
constexpr size_t max_size() const { return N; }
|
||||
|
||||
void reserve(size_t n) {}
|
||||
|
||||
void clear() {
|
||||
storage_type::destroy_several(static_data_, size_);
|
||||
size_ = 0;
|
||||
}
|
||||
};
|
||||
|
||||
// Small-size-optimized storage policy: elements live in the inline buffer of
// N slots until more capacity is needed, at which point they are moved to a
// heap allocation that can keep growing.
template <typename T, size_t N>
struct SmallVectorStorage {
  using storage_type = AlignedStorage<T>;

  // Inline buffer, used while dynamic_capacity_ == 0.
  storage_type static_data_[N];
  // Number of currently constructed elements.
  size_t size_ = 0;
  // Points at static_data_ while inline, at the heap buffer after switching.
  storage_type* data_ = static_data_;
  // 0 while the inline buffer is in use, otherwise the heap buffer capacity.
  size_t dynamic_capacity_ = 0;

  SmallVectorStorage() noexcept = default;

  ~SmallVectorStorage() { destroy(); }

  constexpr storage_type* storage_ptr() { return data_; }

  constexpr const storage_type* const_storage_ptr() const { return data_; }

  // Grow the logical size, reallocating if needed; new slots are left
  // uninitialized for the caller to construct into.
  void bump_size(size_t addend) {
    const size_t new_size = size_ + addend;
    ensure_capacity(new_size);
    size_ = new_size;
  }

  void ensure_capacity(size_t min_capacity) {
    if (dynamic_capacity_) {
      // Grow dynamic storage if necessary
      if (min_capacity > dynamic_capacity_) {
        // Double the capacity to keep growth amortized O(1).
        size_t new_capacity = std::max(dynamic_capacity_ * 2, min_capacity);
        reallocate_dynamic(new_capacity);
      }
    } else if (min_capacity > N) {
      switch_to_dynamic(min_capacity);
    }
  }

  // Shrink the logical size; the abandoned elements are NOT destroyed here.
  void reduce_size(size_t reduce_by) {
    assert(reduce_by <= size_);
    size_ -= reduce_by;
  }

  // Destroy all elements and free the heap buffer, if any. Does not reset
  // data_/size_, so the object must not be reused without re-initializing it
  // (e.g. via move_construct in a move assignment operator).
  void destroy() noexcept {
    storage_type::destroy_several(data_, size_);
    if (dynamic_capacity_) {
      delete[] data_;
    }
  }

  // Take over `other`'s contents without destroying anything currently stored
  // in *this (call destroy() first when needed). A heap buffer is stolen
  // wholesale; inline elements are moved one by one.
  void move_construct(SmallVectorStorage&& other) noexcept {
    size_ = other.size_;
    dynamic_capacity_ = other.dynamic_capacity_;
    if (dynamic_capacity_) {
      // Steal the heap buffer and return `other` to the empty inline state.
      data_ = other.data_;
      other.data_ = other.static_data_;
      other.dynamic_capacity_ = 0;
      other.size_ = 0;
    } else if (size_ != 0) {
      // Use a compile-time memcpy size (N) for trivial types
      storage_type::move_construct_several(other.static_data_, static_data_, size_, N);
    }
  }

  constexpr size_t capacity() const { return dynamic_capacity_ ? dynamic_capacity_ : N; }

  constexpr size_t max_size() const { return std::numeric_limits<size_t>::max(); }

  // Pre-allocate capacity for at least n elements (no effect if n fits).
  void reserve(size_t n) {
    if (dynamic_capacity_) {
      if (n > dynamic_capacity_) {
        reallocate_dynamic(n);
      }
    } else if (n > N) {
      switch_to_dynamic(n);
    }
  }

  // Destroy all elements and reset to empty (keeps the current buffer).
  void clear() {
    storage_type::destroy_several(data_, size_);
    size_ = 0;
  }

 private:
  // First transition from the inline buffer to a heap buffer: allocate, then
  // move the existing elements over and destroy the inline originals.
  void switch_to_dynamic(size_t new_capacity) {
    dynamic_capacity_ = new_capacity;
    data_ = new storage_type[new_capacity];
    storage_type::move_construct_several_and_destroy_source(static_data_, data_, size_);
  }

  // Grow an existing heap buffer to new_capacity, relocating the elements.
  void reallocate_dynamic(size_t new_capacity) {
    assert(new_capacity >= size_);
    auto new_data = new storage_type[new_capacity];
    storage_type::move_construct_several_and_destroy_source(data_, new_data, size_);
    delete[] data_;
    dynamic_capacity_ = new_capacity;
    data_ = new_data;
  }
};
|
||||
|
||||
template <typename T, size_t N, typename Storage>
|
||||
class StaticVectorImpl {
|
||||
private:
|
||||
Storage storage_;
|
||||
|
||||
// Typed pointer to the first element slot (storage holds AlignedStorage<T>).
T* data_ptr() { return storage_.storage_ptr()->get(); }

constexpr const T* const_data_ptr() const {
  return storage_.const_storage_ptr()->get();
}
|
||||
|
||||
public:
  // Standard container member types, so this type works with generic code
  // and range-based iteration (iterators are plain pointers).
  using size_type = size_t;
  using difference_type = ptrdiff_t;
  using value_type = T;
  using pointer = T*;
  using const_pointer = const T*;
  using reference = T&;
  using const_reference = const T&;
  using iterator = T*;
  using const_iterator = const T*;
  using reverse_iterator = std::reverse_iterator<iterator>;
  using const_reverse_iterator = std::reverse_iterator<const_iterator>;

  // Empty vector; no elements are constructed.
  constexpr StaticVectorImpl() noexcept = default;
|
||||
|
||||
// Move and copy constructors
|
||||
StaticVectorImpl(StaticVectorImpl&& other) noexcept {
|
||||
storage_.move_construct(std::move(other.storage_));
|
||||
}
|
||||
|
||||
StaticVectorImpl& operator=(StaticVectorImpl&& other) noexcept {
|
||||
if (ARROW_PREDICT_TRUE(&other != this)) {
|
||||
// TODO move_assign?
|
||||
storage_.destroy();
|
||||
storage_.move_construct(std::move(other.storage_));
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
StaticVectorImpl(const StaticVectorImpl& other) {
|
||||
init_by_copying(other.storage_.size_, other.const_data_ptr());
|
||||
}
|
||||
|
||||
StaticVectorImpl& operator=(const StaticVectorImpl& other) noexcept {
|
||||
if (ARROW_PREDICT_TRUE(&other != this)) {
|
||||
assign_by_copying(other.storage_.size_, other.data());
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Automatic conversion from std::vector<T>, for convenience
|
||||
StaticVectorImpl(const std::vector<T>& other) { // NOLINT: explicit
|
||||
init_by_copying(other.size(), other.data());
|
||||
}
|
||||
|
||||
StaticVectorImpl(std::vector<T>&& other) noexcept { // NOLINT: explicit
|
||||
init_by_moving(other.size(), other.data());
|
||||
}
|
||||
|
||||
StaticVectorImpl& operator=(const std::vector<T>& other) {
|
||||
assign_by_copying(other.size(), other.data());
|
||||
return *this;
|
||||
}
|
||||
|
||||
StaticVectorImpl& operator=(std::vector<T>&& other) noexcept {
|
||||
assign_by_moving(other.size(), other.data());
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Constructing from count and optional initialization value
|
||||
explicit StaticVectorImpl(size_t count) {
|
||||
storage_.bump_size(count);
|
||||
auto* p = storage_.storage_ptr();
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
p[i].construct();
|
||||
}
|
||||
}
|
||||
|
||||
StaticVectorImpl(size_t count, const T& value) {
|
||||
storage_.bump_size(count);
|
||||
auto* p = storage_.storage_ptr();
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
p[i].construct(value);
|
||||
}
|
||||
}
|
||||
|
||||
StaticVectorImpl(std::initializer_list<T> values) {
|
||||
storage_.bump_size(values.size());
|
||||
auto* p = storage_.storage_ptr();
|
||||
for (auto&& v : values) {
|
||||
// Unfortunately, cannot move initializer values
|
||||
p++->construct(v);
|
||||
}
|
||||
}
|
||||
|
||||
// Size inspection
|
||||
|
||||
constexpr bool empty() const { return storage_.size_ == 0; }
|
||||
|
||||
constexpr size_t size() const { return storage_.size_; }
|
||||
|
||||
constexpr size_t capacity() const { return storage_.capacity(); }
|
||||
|
||||
constexpr size_t max_size() const { return storage_.max_size(); }
|
||||
|
||||
// Data access
|
||||
|
||||
T& operator[](size_t i) { return data_ptr()[i]; }
|
||||
|
||||
constexpr const T& operator[](size_t i) const { return const_data_ptr()[i]; }
|
||||
|
||||
T& front() { return data_ptr()[0]; }
|
||||
|
||||
constexpr const T& front() const { return const_data_ptr()[0]; }
|
||||
|
||||
T& back() { return data_ptr()[storage_.size_ - 1]; }
|
||||
|
||||
constexpr const T& back() const { return const_data_ptr()[storage_.size_ - 1]; }
|
||||
|
||||
T* data() { return data_ptr(); }
|
||||
|
||||
constexpr const T* data() const { return const_data_ptr(); }
|
||||
|
||||
// Iterators
|
||||
|
||||
iterator begin() { return iterator(data_ptr()); }
|
||||
|
||||
constexpr const_iterator begin() const { return const_iterator(const_data_ptr()); }
|
||||
|
||||
constexpr const_iterator cbegin() const { return const_iterator(const_data_ptr()); }
|
||||
|
||||
iterator end() { return iterator(data_ptr() + storage_.size_); }
|
||||
|
||||
constexpr const_iterator end() const {
|
||||
return const_iterator(const_data_ptr() + storage_.size_);
|
||||
}
|
||||
|
||||
constexpr const_iterator cend() const {
|
||||
return const_iterator(const_data_ptr() + storage_.size_);
|
||||
}
|
||||
|
||||
reverse_iterator rbegin() { return reverse_iterator(end()); }
|
||||
|
||||
constexpr const_reverse_iterator rbegin() const {
|
||||
return const_reverse_iterator(end());
|
||||
}
|
||||
|
||||
constexpr const_reverse_iterator crbegin() const {
|
||||
return const_reverse_iterator(end());
|
||||
}
|
||||
|
||||
reverse_iterator rend() { return reverse_iterator(begin()); }
|
||||
|
||||
constexpr const_reverse_iterator rend() const {
|
||||
return const_reverse_iterator(begin());
|
||||
}
|
||||
|
||||
constexpr const_reverse_iterator crend() const {
|
||||
return const_reverse_iterator(begin());
|
||||
}
|
||||
|
||||
// Mutations
|
||||
|
||||
void reserve(size_t n) { storage_.reserve(n); }
|
||||
|
||||
void clear() { storage_.clear(); }
|
||||
|
||||
void push_back(const T& value) {
|
||||
storage_.bump_size(1);
|
||||
storage_.storage_ptr()[storage_.size_ - 1].construct(value);
|
||||
}
|
||||
|
||||
void push_back(T&& value) {
|
||||
storage_.bump_size(1);
|
||||
storage_.storage_ptr()[storage_.size_ - 1].construct(std::move(value));
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
void emplace_back(Args&&... args) {
|
||||
storage_.bump_size(1);
|
||||
storage_.storage_ptr()[storage_.size_ - 1].construct(std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
template <typename InputIt>
|
||||
iterator insert(const_iterator insert_at, InputIt first, InputIt last) {
|
||||
const size_t n = storage_.size_;
|
||||
const size_t it_size = static_cast<size_t>(last - first); // XXX might be O(n)?
|
||||
const size_t pos = static_cast<size_t>(insert_at - const_data_ptr());
|
||||
storage_.bump_size(it_size);
|
||||
auto* p = storage_.storage_ptr();
|
||||
if (it_size == 0) {
|
||||
return p[pos].get();
|
||||
}
|
||||
const size_t end_pos = pos + it_size;
|
||||
|
||||
// Move [pos; n) to [end_pos; end_pos + n - pos)
|
||||
size_t i = n;
|
||||
size_t j = end_pos + n - pos;
|
||||
while (j > std::max(n, end_pos)) {
|
||||
p[--j].move_construct(&p[--i]);
|
||||
}
|
||||
while (j > end_pos) {
|
||||
p[--j].move_assign(&p[--i]);
|
||||
}
|
||||
assert(j == end_pos);
|
||||
// Copy [first; last) to [pos; end_pos)
|
||||
j = pos;
|
||||
while (j < std::min(n, end_pos)) {
|
||||
p[j++].assign(*first++);
|
||||
}
|
||||
while (j < end_pos) {
|
||||
p[j++].construct(*first++);
|
||||
}
|
||||
assert(first == last);
|
||||
return p[pos].get();
|
||||
}
|
||||
|
||||
void resize(size_t n) {
|
||||
const size_t old_size = storage_.size_;
|
||||
if (n > storage_.size_) {
|
||||
storage_.bump_size(n - old_size);
|
||||
auto* p = storage_.storage_ptr();
|
||||
for (size_t i = old_size; i < n; ++i) {
|
||||
p[i].construct(T{});
|
||||
}
|
||||
} else {
|
||||
auto* p = storage_.storage_ptr();
|
||||
for (size_t i = n; i < old_size; ++i) {
|
||||
p[i].destroy();
|
||||
}
|
||||
storage_.reduce_size(old_size - n);
|
||||
}
|
||||
}
|
||||
|
||||
void resize(size_t n, const T& value) {
|
||||
const size_t old_size = storage_.size_;
|
||||
if (n > storage_.size_) {
|
||||
storage_.bump_size(n - old_size);
|
||||
auto* p = storage_.storage_ptr();
|
||||
for (size_t i = old_size; i < n; ++i) {
|
||||
p[i].construct(value);
|
||||
}
|
||||
} else {
|
||||
auto* p = storage_.storage_ptr();
|
||||
for (size_t i = n; i < old_size; ++i) {
|
||||
p[i].destroy();
|
||||
}
|
||||
storage_.reduce_size(old_size - n);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
template <typename InputIt>
|
||||
void init_by_copying(size_t n, InputIt src) {
|
||||
storage_.bump_size(n);
|
||||
auto* dest = storage_.storage_ptr();
|
||||
for (size_t i = 0; i < n; ++i, ++src) {
|
||||
dest[i].construct(*src);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputIt>
|
||||
void init_by_moving(size_t n, InputIt src) {
|
||||
init_by_copying(n, std::make_move_iterator(src));
|
||||
}
|
||||
|
||||
template <typename InputIt>
|
||||
void assign_by_copying(size_t n, InputIt src) {
|
||||
const size_t old_size = storage_.size_;
|
||||
if (n > old_size) {
|
||||
storage_.bump_size(n - old_size);
|
||||
auto* dest = storage_.storage_ptr();
|
||||
for (size_t i = 0; i < old_size; ++i, ++src) {
|
||||
dest[i].assign(*src);
|
||||
}
|
||||
for (size_t i = old_size; i < n; ++i, ++src) {
|
||||
dest[i].construct(*src);
|
||||
}
|
||||
} else {
|
||||
auto* dest = storage_.storage_ptr();
|
||||
for (size_t i = 0; i < n; ++i, ++src) {
|
||||
dest[i].assign(*src);
|
||||
}
|
||||
for (size_t i = n; i < old_size; ++i) {
|
||||
dest[i].destroy();
|
||||
}
|
||||
storage_.reduce_size(old_size - n);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputIt>
|
||||
void assign_by_moving(size_t n, InputIt src) {
|
||||
assign_by_copying(n, std::make_move_iterator(src));
|
||||
}
|
||||
};
|
||||
|
||||
// Vector backed by StaticVectorStorage: fixed in-place capacity of N elements.
template <typename T, size_t N>
using StaticVector = StaticVectorImpl<T, N, StaticVectorStorage<T, N>>;

// Vector backed by SmallVectorStorage: up to N elements stored inline,
// spilling to a heap allocation beyond that (small-size optimization).
template <typename T, size_t N>
using SmallVector = StaticVectorImpl<T, N, SmallVectorStorage<T, N>>;
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,132 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <iterator>
|
||||
#include <type_traits>
|
||||
|
||||
namespace arrow::util {
|
||||
|
||||
template <class T>
class span;

/// std::span polyfill.
///
/// Does not support static extents.
template <typename T>
class span {
  static_assert(sizeof(T),
                R"(
std::span allows contiguous_iterators instead of just pointers, the enforcement
of which requires T to be a complete type. arrow::util::span does not support
contiguous_iterators, but T is still required to be a complete type to prevent
writing code which would break when it is replaced by std::span.)");

 public:
  using element_type = T;
  using value_type = std::remove_cv_t<T>;
  using iterator = T*;
  using const_iterator = T const*;

  span() = default;
  span(const span&) = default;
  span& operator=(const span&) = default;

  // Implicit span<T> -> span<const T> conversion, like std::span.
  template <typename M, typename = std::enable_if_t<std::is_same_v<T, M const>>>
  // NOLINTNEXTLINE runtime/explicit
  constexpr span(span<M> mut) : span{mut.data(), mut.size()} {}

  constexpr span(T* data, size_t count) : data_{data}, size_{count} {}

  constexpr span(T* begin, T* end)
      : data_{begin}, size_{static_cast<size_t>(end - begin)} {}

  // Construct from any range exposing std::data()/std::size() with
  // compatible element type (vector, array, string, another span, ...).
  template <typename R, typename RD = decltype(std::data(std::declval<R>())),
            typename RS = decltype(std::size(std::declval<R>())),
            typename E = std::enable_if_t<std::is_constructible_v<T*, RD> &&
                                          std::is_constructible_v<size_t, RS>>>
  // NOLINTNEXTLINE runtime/explicit, non-const reference
  constexpr span(R&& range) : data_{std::data(range)}, size_{std::size(range)} {}

  constexpr T* begin() const { return data_; }
  constexpr T* end() const { return data_ + size_; }
  constexpr T* data() const { return data_; }

  constexpr size_t size() const { return size_; }
  constexpr size_t size_bytes() const { return size_ * sizeof(T); }
  constexpr bool empty() const { return size_ == 0; }

  // NOTE(review): std::span's operator[] is const-qualified and returns T&
  // (shallow const); this polyfill returns const T& from a const span --
  // confirm the divergence is intended.
  constexpr T& operator[](size_t i) { return data_[i]; }
  constexpr const T& operator[](size_t i) const { return data_[i]; }

  // Unlike std::span (where offset > size is UB), an out-of-range offset
  // yields an empty span here.
  constexpr span subspan(size_t offset) const {
    if (offset > size_) return {data_, data_};
    return {data_ + offset, size_ - offset};
  }

  // Subspan of at most `count` elements starting at `offset` (clamped).
  constexpr span subspan(size_t offset, size_t count) const {
    auto out = subspan(offset);
    if (count < out.size_) {
      out.size_ = count;
    }
    return out;
  }

  // Element-wise equality; not part of std::span.
  constexpr bool operator==(const span& other) const {
    if (size_ != other.size_) return false;

    if constexpr (std::is_integral_v<T>) {
      if (size_ == 0) {
        return true;  // memcmp does not handle null pointers, even if size_ == 0
      }
      return std::memcmp(data_, other.data_, size_bytes()) == 0;
    } else {
      T* ptr = data_;
      for (T const& e : other) {
        if (*ptr++ != e) return false;
      }
      return true;
    }
  }
  constexpr bool operator!=(const span& other) const { return !(*this == other); }

 private:
  T* data_{};
  size_t size_{};
};
|
||||
|
||||
// Deduce span<T> (or span<const T>) from any contiguous range.
template <typename R>
span(R& range) -> span<std::remove_pointer_t<decltype(std::data(range))>>;

// Deduce span<T> from a (pointer, length) pair.
template <typename T>
span(T*, size_t) -> span<T>;
|
||||
|
||||
template <typename T>
|
||||
constexpr span<std::byte const> as_bytes(span<T> s) {
|
||||
return {reinterpret_cast<const std::byte*>(s.data()), s.size_bytes()};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
constexpr span<std::byte> as_writable_bytes(span<T> s) {
|
||||
return {reinterpret_cast<std::byte*>(s.data()), s.size_bytes()};
|
||||
}
|
||||
|
||||
} // namespace arrow::util
|
||||
@@ -0,0 +1,173 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#if __has_include(<charconv>)
|
||||
# include <charconv>
|
||||
#endif
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Status;
|
||||
|
||||
ARROW_EXPORT std::string HexEncode(const uint8_t* data, size_t length);
|
||||
|
||||
ARROW_EXPORT std::string Escape(const char* data, size_t length);
|
||||
|
||||
ARROW_EXPORT std::string HexEncode(const char* data, size_t length);
|
||||
|
||||
ARROW_EXPORT std::string HexEncode(std::string_view str);
|
||||
|
||||
ARROW_EXPORT std::string Escape(std::string_view str);
|
||||
|
||||
ARROW_EXPORT Status ParseHexValue(const char* hex_pair, uint8_t* out);
|
||||
|
||||
ARROW_EXPORT Status ParseHexValues(std::string_view hex_string, uint8_t* out);
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// Like std::string_view::starts_with in C++20
///
/// True iff `prefix` equals the first prefix.length() characters of `s`.
/// Any string starts with the empty prefix.
inline bool StartsWith(std::string_view s, std::string_view prefix) {
  // The length check guarantees substr(0, prefix.length()) is in range,
  // so the former `s.empty() ||` guard was redundant and is dropped.
  return s.length() >= prefix.length() && s.substr(0, prefix.length()) == prefix;
}
|
||||
|
||||
/// Like std::string_view::ends_with in C++20
///
/// True iff `suffix` equals the last suffix.length() characters of `s`.
/// Any string ends with the empty suffix.
inline bool EndsWith(std::string_view s, std::string_view suffix) {
  // The length check guarantees s.length() - suffix.length() >= 0 and the
  // substr position is in range, so the former `s.empty() ||` guard was
  // redundant and is dropped.
  return s.length() >= suffix.length() &&
         s.substr(s.length() - suffix.length()) == suffix;
}
|
||||
|
||||
/// \brief Split a string with a delimiter
|
||||
ARROW_EXPORT
|
||||
std::vector<std::string_view> SplitString(std::string_view v, char delim,
|
||||
int64_t limit = 0);
|
||||
|
||||
/// \brief Join strings with a delimiter
|
||||
ARROW_EXPORT
|
||||
std::string JoinStrings(const std::vector<std::string_view>& strings,
|
||||
std::string_view delimiter);
|
||||
|
||||
/// \brief Join strings with a delimiter
|
||||
ARROW_EXPORT
|
||||
std::string JoinStrings(const std::vector<std::string>& strings,
|
||||
std::string_view delimiter);
|
||||
|
||||
/// \brief Trim whitespace from left and right sides of string
|
||||
ARROW_EXPORT
|
||||
std::string TrimString(std::string value);
|
||||
|
||||
ARROW_EXPORT
|
||||
bool AsciiEqualsCaseInsensitive(std::string_view left, std::string_view right);
|
||||
|
||||
ARROW_EXPORT
|
||||
std::string AsciiToLower(std::string_view value);
|
||||
|
||||
ARROW_EXPORT
|
||||
std::string AsciiToUpper(std::string_view value);
|
||||
|
||||
/// \brief Search for the first instance of a token and replace it or return nullopt if
|
||||
/// the token is not found.
|
||||
ARROW_EXPORT
|
||||
std::optional<std::string> Replace(std::string_view s, std::string_view token,
|
||||
std::string_view replacement);
|
||||
|
||||
/// \brief Get boolean value from string
|
||||
///
|
||||
/// If "1", "true" (case-insensitive), returns true
|
||||
/// If "0", "false" (case-insensitive), returns false
|
||||
/// Otherwise, returns Status::Invalid
|
||||
ARROW_EXPORT
|
||||
arrow::Result<bool> ParseBoolean(std::string_view value);
|
||||
|
||||
#if __has_include(<charconv>)
|
||||
|
||||
namespace detail {
// Detection idiom: true_type iff std::to_chars has an overload for T.
template <typename T, typename = void>
struct can_to_chars : public std::false_type {};

template <typename T>
struct can_to_chars<
    T, std::void_t<decltype(std::to_chars(std::declval<char*>(), std::declval<char*>(),
                                          std::declval<std::remove_reference_t<T>>()))>>
    : public std::true_type {};
}  // namespace detail

/// \brief Whether std::to_chars exists for the current value type.
///
/// This is useful as some C++ libraries do not implement all specified overloads
/// for std::to_chars.
template <typename T>
inline constexpr bool have_to_chars = detail::can_to_chars<T>::value;

/// \brief An ergonomic wrapper around std::to_chars, returning a std::string
///
/// For most inputs, the std::string result will not incur any heap allocation
/// thanks to small string optimization.
///
/// Compared to std::to_string, this function gives locale-agnostic results
/// and might also be faster.
template <typename T, typename... Args>
std::string ToChars(T value, Args&&... args) {
  if constexpr (!have_to_chars<T>) {
    // Some C++ standard libraries do not yet implement std::to_chars for all types,
    // in which case we have to fallback to std::string.
    return std::to_string(value);
  } else {
    // According to various sources, the GNU libstdc++ and Microsoft's C++ STL
    // allow up to 15 bytes of small string optimization, while clang's libc++
    // goes up to 22 bytes. Choose the pessimistic value.
    std::string out(15, 0);
    // Write into the full buffer [data, data + size).  The previous code
    // passed &out.back() as `last`, which points at the final character and
    // therefore excluded it from the writable half-open range, silently
    // wasting one byte per attempt.
    auto res = std::to_chars(out.data(), out.data() + out.size(), value, args...);
    while (res.ec != std::errc{}) {
      // The only possible error from a too-small buffer; grow and retry.
      assert(res.ec == std::errc::value_too_large);
      out.resize(out.capacity() * 2);
      res = std::to_chars(out.data(), out.data() + out.size(), value, args...);
    }
    // Trim to the number of characters actually written.
    const auto length = res.ptr - out.data();
    assert(length <= static_cast<int64_t>(out.length()));
    out.resize(static_cast<size_t>(length));
    return out;
  }
}
|
||||
|
||||
#else // !__has_include(<charconv>)
|
||||
|
||||
// <charconv> unavailable: report no std::to_chars support for any type.
template <typename T>
inline constexpr bool have_to_chars = false;

// Fallback delegating to std::to_string.  Extra `args` are accepted for
// signature compatibility with the <charconv> version but are ignored.
template <typename T, typename... Args>
std::string ToChars(T value, Args&&... args) {
  return std::to_string(value);
}
|
||||
|
||||
#endif
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,82 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Exposes a std::ostringstream through a plain std::ostream& reference,
// keeping the concrete stream behind a unique_ptr.  Constructor,
// destructor and str() are defined out of line (in the .cc file).
class ARROW_EXPORT StringStreamWrapper {
 public:
  StringStreamWrapper();
  ~StringStreamWrapper();

  // Output stream to insert values into.
  std::ostream& stream() { return ostream_; }
  // Accumulated contents of the underlying string stream.
  std::string str();

 protected:
  std::unique_ptr<std::ostringstream> sstream_;
  std::ostream& ostream_;
};
|
||||
|
||||
// Concatenate the stream representations of all arguments, in order,
// with no separator, returning the result as a std::string.
template <typename... Args>
std::string JoinToString(Args&&... args) {
  StringStreamWrapper ss;
  // Fold over the comma operator: invoke the lambda once per argument,
  // left to right.
  (
      [&ss](auto&& arg) {
        // Avoid losing precision when printing floating point numbers
        // NOTE(review): std::to_string uses a fixed 6-decimal %f-style
        // format, which differs from operator<<'s default precision --
        // confirm this is the intended formatting.
        if constexpr (std::is_floating_point_v<std::decay_t<decltype(arg)>>) {
          ss.stream() << std::to_string(arg);
        } else {
          ss.stream() << arg;
        }
      }(std::forward<Args>(args)),
      ...);
  return ss.str();
}
|
||||
} // namespace internal
|
||||
|
||||
namespace util {
|
||||
/// CRTP helper for declaring string representation. Defines operator<<
///
/// Derive as `class T : public util::ToStringOstreamable<T>` and provide
/// `std::string ToString() const`; streaming a T then delegates to it.
template <typename T>
class ToStringOstreamable {
 public:
  // The contract is checked in the destructor (rather than at class scope)
  // because T is still incomplete while ToStringOstreamable<T> is being
  // instantiated as its base.
  ~ToStringOstreamable() {
    static_assert(
        std::is_same<decltype(std::declval<const T>().ToString()), std::string>::value,
        "ToStringOstreamable depends on the method T::ToString() const");
  }

 private:
  // Downcast to the derived type (safe under CRTP usage).
  const T& cast() const { return static_cast<const T&>(*this); }

  // Found via ADL when streaming a T; prints T::ToString().
  friend inline std::ostream& operator<<(std::ostream& os, const ToStringOstreamable& t) {
    return os << t.cast().ToString();
  }
};
|
||||
|
||||
} // namespace util
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,106 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/cancel.h"
|
||||
#include "arrow/util/functional.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
/// \brief A group of related tasks
///
/// A TaskGroup executes tasks with the signature `Status()`.
/// Execution can be serial or parallel, depending on the TaskGroup
/// implementation. When Finish() returns, it is guaranteed that all
/// tasks have finished, or at least one has errored.
///
/// Once an error has occurred any tasks that are submitted to the task group
/// will not run. The call to Append will simply return without scheduling the
/// task.
///
/// If the task group is parallel it is possible that multiple tasks could be
/// running at the same time and one of those tasks fails. This will put the
/// task group in a failure state (so additional tasks cannot be run) however
/// it will not interrupt running tasks. Finish will not complete
/// until all running tasks have finished, even if one task fails.
///
/// Once a task group has finished new tasks may not be added to it. If you need to start
/// a new batch of work then you should create a new task group.
class ARROW_EXPORT TaskGroup : public std::enable_shared_from_this<TaskGroup> {
 public:
  /// Add a Status-returning function to execute. Execution order is
  /// undefined. The function may be executed immediately or later.
  template <typename Function>
  void Append(Function&& func) {
    // Type-erases the callable into FnOnce<Status()> for the implementation.
    return AppendReal(std::forward<Function>(func));
  }

  /// Wait for execution of all tasks (and subgroups) to be finished,
  /// or for at least one task (or subgroup) to error out.
  /// The returned Status propagates the error status of the first failing
  /// task (or subgroup).
  virtual Status Finish() = 0;

  /// Returns a future that will complete the first time all tasks are finished.
  /// This should be called only after all top level tasks
  /// have been added to the task group.
  ///
  /// If you are using a TaskGroup asynchronously there are a few considerations to keep
  /// in mind. The tasks should not block on I/O, etc (defeats the purpose of using
  /// futures) and should not be doing any nested locking or you run the risk of the tasks
  /// getting stuck in the thread pool waiting for tasks which cannot get scheduled.
  ///
  /// Primarily this call is intended to help migrate existing work written with TaskGroup
  /// in mind to using futures without having to do a complete conversion on the first
  /// pass.
  virtual Future<> FinishAsync() = 0;

  /// The current aggregate error Status. Non-blocking, useful for stopping early.
  virtual Status current_status() = 0;

  /// Whether some tasks have already failed. Non-blocking, useful for stopping early.
  virtual bool ok() const = 0;

  /// How many tasks can typically be executed in parallel.
  /// This is only a hint, useful for testing or debugging.
  virtual int parallelism() = 0;

  /// Construct a serial TaskGroup, optionally cancellable via the StopToken.
  static std::shared_ptr<TaskGroup> MakeSerial(StopToken = StopToken::Unstoppable());
  /// Construct a TaskGroup running tasks on the given Executor, optionally
  /// cancellable via the StopToken.
  static std::shared_ptr<TaskGroup> MakeThreaded(internal::Executor*,
                                                 StopToken = StopToken::Unstoppable());

  virtual ~TaskGroup() = default;

 protected:
  TaskGroup() = default;
  ARROW_DISALLOW_COPY_AND_ASSIGN(TaskGroup);

  // Implementation hook receiving the type-erased task from Append().
  virtual void AppendReal(FnOnce<Status()> task) = 0;
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,90 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iosfwd>
|
||||
|
||||
#include "arrow/testing/gtest_util.h"
|
||||
#include "arrow/util/iterator.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
// Integer wrapper used as an element type in iterator tests; the
// default-constructed value serves as the iteration-end sentinel
// (see IterationTraits<TestInt>).  Definitions live in the .cc file.
struct TestInt {
  TestInt();
  TestInt(int i);  // NOLINT runtime/explicit
  int value;

  bool operator==(const TestInt& other) const;

  friend std::ostream& operator<<(std::ostream& os, const TestInt& v);
};
|
||||
|
||||
// Teach the Iterator machinery that a default-constructed TestInt
// marks the end of iteration.
template <>
struct IterationTraits<TestInt> {
  static TestInt End() { return TestInt(); }
  static bool IsEnd(const TestInt& val) { return val == IterationTraits<TestInt>::End(); }
};
|
||||
|
||||
// String wrapper used as an element type in iterator tests; the
// default-constructed value serves as the iteration-end sentinel
// (see IterationTraits<TestStr>).  Definitions live in the .cc file.
struct TestStr {
  TestStr();
  TestStr(const std::string& s);  // NOLINT runtime/explicit
  TestStr(const char* s);         // NOLINT runtime/explicit
  explicit TestStr(const TestInt& test_int);
  std::string value;

  bool operator==(const TestStr& other) const;

  friend std::ostream& operator<<(std::ostream& os, const TestStr& v);
};
|
||||
|
||||
// Teach the Iterator machinery that a default-constructed TestStr
// marks the end of iteration.
template <>
struct IterationTraits<TestStr> {
  static TestStr End() { return TestStr(); }
  static bool IsEnd(const TestStr& val) { return val == IterationTraits<TestStr>::End(); }
};
|
||||
|
||||
std::vector<TestInt> RangeVector(unsigned int max, unsigned int step = 1);
|
||||
|
||||
// Wrap a vector into an Iterator<T> yielding its elements in order.
template <typename T>
inline Iterator<T> VectorIt(std::vector<T> v) {
  return MakeVectorIterator<T>(std::move(v));
}
|
||||
|
||||
// Like VectorIt, but when `slow` is true each element is yielded
// through a transform that calls SleepABit() first, simulating a
// slow producer for tests.
template <typename T>
inline Iterator<T> PossiblySlowVectorIt(std::vector<T> v, bool slow = false) {
  auto iterator = MakeVectorIterator<T>(std::move(v));
  if (slow) {
    return MakeTransformedIterator<T, T>(std::move(iterator),
                                         [](T item) -> Result<TransformFlow<T>> {
                                           SleepABit();
                                           return TransformYield(item);
                                         });
  } else {
    return iterator;
  }
}
|
||||
|
||||
// Assert that the iterator's next value is the end sentinel, i.e. the
// iterator yields no further elements.
template <typename T>
inline void AssertIteratorExhausted(Iterator<T>& it) {
  ASSERT_OK_AND_ASSIGN(T next, it.Next());
  ASSERT_TRUE(IsIterationEnd(next));
}
|
||||
|
||||
Transformer<TestInt, TestStr> MakeFilter(std::function<bool(TestInt&)> filter);
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,643 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <queue>
|
||||
#include <type_traits>
|
||||
#include <unordered_set>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/cancel.h"
|
||||
#include "arrow/util/config.h"
|
||||
#include "arrow/util/functional.h"
|
||||
#include "arrow/util/future.h"
|
||||
#include "arrow/util/iterator.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// Disable harmless warning for decorated name length limit
|
||||
# pragma warning(disable : 4503)
|
||||
#endif
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \brief Get the capacity of the global thread pool
|
||||
///
|
||||
/// Return the number of worker threads in the thread pool to which
|
||||
/// Arrow dispatches various CPU-bound tasks. This is an ideal number,
|
||||
/// not necessarily the exact number of threads at a given point in time.
|
||||
///
|
||||
/// You can change this number using SetCpuThreadPoolCapacity().
|
||||
ARROW_EXPORT int GetCpuThreadPoolCapacity();
|
||||
|
||||
/// \brief Set the capacity of the global thread pool
|
||||
///
|
||||
/// Set the number of worker threads in the thread pool to which
|
||||
/// Arrow dispatches various CPU-bound tasks.
|
||||
///
|
||||
/// The current number is returned by GetCpuThreadPoolCapacity().
|
||||
ARROW_EXPORT Status SetCpuThreadPoolCapacity(int threads);
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Hints about a task that may be used by an Executor.
|
||||
// They are ignored by the provided ThreadPool implementation.
|
||||
struct TaskHints {
  // The lower, the more urgent
  int32_t priority = 0;
  // The IO transfer size in bytes
  // (-1 presumably means "not specified" -- confirm with Executor users)
  int64_t io_size = -1;
  // The approximate CPU cost in number of instructions
  // (-1 presumably means "not specified")
  int64_t cpu_cost = -1;
  // An application-specific ID
  int64_t external_id = -1;
};
|
||||
|
||||
// Abstract interface for scheduling callables.  Concrete subclasses supply
// the scheduling policy by implementing SpawnReal(); the public
// Spawn/Submit/Transfer helpers all funnel into it.
class ARROW_EXPORT Executor {
 public:
  // Callback invoked (with the cancellation Status) when a spawned task's
  // StopToken is triggered.
  using StopCallback = internal::FnOnce<void(const Status&)>;

  virtual ~Executor();

  // Spawn a fire-and-forget task.
  //
  // The overloads below let callers optionally pass TaskHints, a StopToken
  // and a StopCallback; omitted pieces default to empty hints, an
  // unstoppable token and no callback respectively.
  template <typename Function>
  Status Spawn(Function&& func) {
    return SpawnReal(TaskHints{}, std::forward<Function>(func), StopToken::Unstoppable(),
                     StopCallback{});
  }
  template <typename Function>
  Status Spawn(Function&& func, StopToken stop_token) {
    return SpawnReal(TaskHints{}, std::forward<Function>(func), std::move(stop_token),
                     StopCallback{});
  }
  template <typename Function>
  Status Spawn(TaskHints hints, Function&& func) {
    return SpawnReal(hints, std::forward<Function>(func), StopToken::Unstoppable(),
                     StopCallback{});
  }
  template <typename Function>
  Status Spawn(TaskHints hints, Function&& func, StopToken stop_token) {
    return SpawnReal(hints, std::forward<Function>(func), std::move(stop_token),
                     StopCallback{});
  }
  template <typename Function>
  Status Spawn(TaskHints hints, Function&& func, StopToken stop_token,
               StopCallback stop_callback) {
    return SpawnReal(hints, std::forward<Function>(func), std::move(stop_token),
                     std::move(stop_callback));
  }

  // Transfers a future to this executor. Any continuations added to the
  // returned future will run in this executor. Otherwise they would run
  // on the same thread that called MarkFinished.
  //
  // This is necessary when (for example) an I/O task is completing a future.
  // The continuations of that future should run on the CPU thread pool keeping
  // CPU heavy work off the I/O thread pool. So the I/O task should transfer
  // the future to the CPU executor before returning.
  //
  // By default this method will only transfer if the future is not already completed. If
  // the future is already completed then any callback would be run synchronously and so
  // no transfer is typically necessary. However, in cases where you want to force a
  // transfer (e.g. to help the scheduler break up units of work across multiple cores)
  // then you can override this behavior with `always_transfer`.
  template <typename T>
  Future<T> Transfer(Future<T> future) {
    return DoTransfer(std::move(future), false);
  }

  // Overload of Transfer which will always schedule callbacks on new threads even if the
  // future is finished when the callback is added.
  //
  // This can be useful in cases where you want to ensure parallelism
  template <typename T>
  Future<T> TransferAlways(Future<T> future) {
    return DoTransfer(std::move(future), true);
  }

  // Submit a callable and arguments for execution. Return a future that
  // will return the callable's result value once.
  // The callable's arguments are copied before execution.
  template <typename Function, typename... Args,
            typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
                Function && (Args && ...)>>
  Result<FutureType> Submit(TaskHints hints, StopToken stop_token, Function&& func,
                            Args&&... args) {
    using ValueType = typename FutureType::ValueType;

    auto future = FutureType::Make();
    // ContinueFuture runs `func` and marks `future` finished with its result.
    auto task = std::bind(::arrow::detail::ContinueFuture{}, future,
                          std::forward<Function>(func), std::forward<Args>(args)...);
    // On cancellation, finish the future with the cancellation status.  A
    // weak reference is used so the callback does not keep the future alive.
    struct {
      WeakFuture<ValueType> weak_fut;

      void operator()(const Status& st) {
        auto fut = weak_fut.get();
        if (fut.is_valid()) {
          fut.MarkFinished(st);
        }
      }
    } stop_callback{WeakFuture<ValueType>(future)};
    ARROW_RETURN_NOT_OK(SpawnReal(hints, std::move(task), std::move(stop_token),
                                  std::move(stop_callback)));

    return future;
  }

  // Convenience overload: default TaskHints.
  template <typename Function, typename... Args,
            typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
                Function && (Args && ...)>>
  Result<FutureType> Submit(StopToken stop_token, Function&& func, Args&&... args) {
    return Submit(TaskHints{}, stop_token, std::forward<Function>(func),
                  std::forward<Args>(args)...);
  }

  // Convenience overload: unstoppable token.
  template <typename Function, typename... Args,
            typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
                Function && (Args && ...)>>
  Result<FutureType> Submit(TaskHints hints, Function&& func, Args&&... args) {
    return Submit(std::move(hints), StopToken::Unstoppable(),
                  std::forward<Function>(func), std::forward<Args>(args)...);
  }

  // Convenience overload: default TaskHints and unstoppable token.
  template <typename Function, typename... Args,
            typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
                Function && (Args && ...)>>
  Result<FutureType> Submit(Function&& func, Args&&... args) {
    return Submit(TaskHints{}, StopToken::Unstoppable(), std::forward<Function>(func),
                  std::forward<Args>(args)...);
  }

  // Return the level of parallelism (the number of tasks that may be executed
  // concurrently). This may be an approximate number.
  virtual int GetCapacity() = 0;

  // Return true if the thread from which this function is called is owned by this
  // Executor. Returns false if this Executor does not support this property.
  virtual bool OwnsThisThread() { return false; }

  // Return true if this is the current executor being called
  // n.b. this defaults to just calling OwnsThisThread
  // unless the threadpool is disabled
  virtual bool IsCurrentExecutor() { return OwnsThisThread(); }

  /// \brief An interface to represent something with a custom destructor
  ///
  /// \see KeepAlive
  class ARROW_EXPORT Resource {
   public:
    virtual ~Resource() = default;
  };

  /// \brief Keep a resource alive until all executor threads have terminated
  ///
  /// Executors may have static storage duration. In particular, the CPU and I/O
  /// executors are currently implemented this way. These threads may access other
  /// objects with static storage duration such as the OpenTelemetry runtime context
  /// the default memory pool, or other static executors.
  ///
  /// The order in which these objects are destroyed is difficult to control. In order
  /// to ensure those objects remain alive until all threads have finished those objects
  /// should be wrapped in a Resource object and passed into this method. The given
  /// shared_ptr will be kept alive until all threads have finished their worker loops.
  virtual void KeepAlive(std::shared_ptr<Resource> resource);

 protected:
  ARROW_DISALLOW_COPY_AND_ASSIGN(Executor);

  Executor() = default;

  // Implementation behind Transfer/TransferAlways.  Returns a future that is
  // marked finished on this executor, or the original future unchanged when
  // it is already finished and no transfer is forced.
  template <typename T, typename FT = Future<T>, typename FTSync = typename FT::SyncType>
  Future<T> DoTransfer(Future<T> future, bool always_transfer = false) {
    auto transferred = Future<T>::Make();
    if (always_transfer) {
      CallbackOptions callback_options = CallbackOptions::Defaults();
      callback_options.should_schedule = ShouldSchedule::Always;
      callback_options.executor = this;
      auto sync_callback = [transferred](const FTSync& result) mutable {
        transferred.MarkFinished(result);
      };
      future.AddCallback(sync_callback, callback_options);
      return transferred;
    }

    // We could use AddCallback's ShouldSchedule::IfUnfinished but we can save a bit of
    // work by doing the test here.
    auto callback = [this, transferred](const FTSync& result) mutable {
      auto spawn_status =
          Spawn([transferred, result]() mutable { transferred.MarkFinished(result); });
      if (!spawn_status.ok()) {
        transferred.MarkFinished(spawn_status);
      }
    };
    auto callback_factory = [&callback]() { return callback; };
    if (future.TryAddCallback(callback_factory)) {
      return transferred;
    }
    // If the future is already finished and we aren't going to force spawn a thread
    // then we don't need to add another layer of callback and can return the original
    // future
    return future;
  }

  // Subclassing API
  virtual Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
                           StopCallback&&) = 0;
};
|
||||
|
||||
/// \brief An executor implementation that runs all tasks on a single thread using an
|
||||
/// event loop.
|
||||
///
|
||||
/// Note: Any sort of nested parallelism will deadlock this executor. Blocking waits are
|
||||
/// fine but if one task needs to wait for another task it must be expressed as an
|
||||
/// asynchronous continuation.
|
||||
class ARROW_EXPORT SerialExecutor : public Executor {
 public:
  // The task handed to Run/RunInSerialExecutor: receives the executor and
  // returns the future whose completion ends the event loop.
  template <typename T = ::arrow::internal::Empty>
  using TopLevelTask = internal::FnOnce<Future<T>(Executor*)>;

  ~SerialExecutor() override;

  // A serial executor runs tasks one at a time on the borrowed caller thread.
  int GetCapacity() override { return 1; };
  bool OwnsThisThread() override;
  Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
                   StopCallback&&) override;

  // Return the number of tasks either running or in the queue.
  int GetNumTasks();

  /// \brief Runs the TopLevelTask and any scheduled tasks
  ///
  /// The TopLevelTask (or one of the tasks it schedules) must either return an invalid
  /// status or call the finish signal. Failure to do this will result in a deadlock. For
  /// this reason it is preferable (if possible) to use the helper methods (below)
  /// RunSynchronously/RunSerially which delegates the responsibility onto a Future
  /// producer's existing responsibility to always mark a future finished (which can
  /// someday be aided by ARROW-12207).
  template <typename T = internal::Empty, typename FT = Future<T>,
            typename FTSync = typename FT::SyncType>
  static FTSync RunInSerialExecutor(TopLevelTask<T> initial_task) {
    Future<T> fut = SerialExecutor().Run<T>(std::move(initial_task));
    return FutureToSync(fut);
  }

  /// \brief Transform an AsyncGenerator into an Iterator
  ///
  /// An event loop will be created and each call to Next will power the event loop with
  /// the calling thread until the next item is ready to be delivered.
  ///
  /// Note: The iterator's destructor will run until the given generator is fully
  /// exhausted. If you wish to abandon iteration before completion then the correct
  /// approach is to use a stop token to cause the generator to exhaust early.
  template <typename T>
  static Iterator<T> IterateGenerator(
      internal::FnOnce<Result<std::function<Future<T>()>>(Executor*)> initial_task) {
    auto serial_executor = std::unique_ptr<SerialExecutor>(new SerialExecutor());
    // Materialize the generator on the new executor; a failure here becomes
    // an iterator that yields only the error.
    auto maybe_generator = std::move(initial_task)(serial_executor.get());
    if (!maybe_generator.ok()) {
      return MakeErrorIterator<T>(maybe_generator.status());
    }
    auto generator = maybe_generator.MoveValueUnsafe();
    // Iterator adapter: each Next() call drives the executor's event loop
    // until the generator produces one item.
    struct SerialIterator {
      SerialIterator(std::unique_ptr<SerialExecutor> executor,
                     std::function<Future<T>()> generator)
          : executor(std::move(executor)), generator(std::move(generator)) {}
      ARROW_DISALLOW_COPY_AND_ASSIGN(SerialIterator);
      ARROW_DEFAULT_MOVE_AND_ASSIGN(SerialIterator);
      ~SerialIterator() {
        // A serial iterator must be consumed before it can be destroyed. Allowing it to
        // do otherwise would lead to resource leakage. There will likely be deadlocks at
        // this spot in the future but these will be the result of other bugs and not the
        // fact that we are forcing consumption here.

        // If a streaming API needs to support early abandonment then it should be done so
        // with a cancellation token and not simply discarding the iterator and expecting
        // the underlying work to clean up correctly.
        if (executor && !executor->IsFinished()) {
          while (true) {
            Result<T> maybe_next = Next();
            if (!maybe_next.ok() || IsIterationEnd(*maybe_next)) {
              break;
            }
          }
        }
      }

      Result<T> Next() {
        executor->Unpause();
        // This call may lead to tasks being scheduled in the serial executor
        Future<T> next_fut = generator();
        next_fut.AddCallback([this](const Result<T>& res) {
          // If we're done iterating we should drain the rest of the tasks in the executor
          if (!res.ok() || IsIterationEnd(*res)) {
            executor->Finish();
            return;
          }
          // Otherwise we will break out immediately, leaving the remaining tasks for
          // the next call.
          executor->Pause();
        });
#ifdef ARROW_ENABLE_THREADING
        // future must run on this thread
        // Borrow this thread and run tasks until the future is finished
        executor->RunLoop();
#else
        next_fut.Wait();
#endif
        if (!next_fut.is_finished()) {
          // Not clear this is possible since RunLoop wouldn't generally exit
          // unless we paused/finished which would imply next_fut has been
          // finished.
          return Status::Invalid(
              "Serial executor terminated before next result computed");
        }
        // At this point we may still have tasks in the executor, that is ok.
        // We will run those tasks the next time through.
        return next_fut.result();
      }

      std::unique_ptr<SerialExecutor> executor;
      std::function<Future<T>()> generator;
    };
    return Iterator<T>(SerialIterator{std::move(serial_executor), std::move(generator)});
  }

#ifndef ARROW_ENABLE_THREADING
  // run a pending task from loop
  // returns true if any tasks were run in the last go round the loop (i.e. if it
  // returns false, all executors are waiting)
  static bool RunTasksOnAllExecutors();
  static SerialExecutor* GetCurrentExecutor();

  bool IsCurrentExecutor() override;

#endif

 protected:
  virtual void RunLoop();

  // State uses mutex
  struct State;
  std::shared_ptr<State> state_;

  SerialExecutor();

  // We mark the serial executor "finished" when there should be
  // no more tasks scheduled on it. It's not strictly needed but
  // can help catch bugs where we are trying to use the executor
  // after we are done with it.
  void Finish();
  bool IsFinished();
  // We pause the executor when we are running an async generator
  // and we have received an item that we can deliver.
  void Pause();
  void Unpause();

  // Run `initial_task`, then power the event loop until the future it
  // returned is marked finished (which also marks this executor finished).
  template <typename T, typename FTSync = typename Future<T>::SyncType>
  Future<T> Run(TopLevelTask<T> initial_task) {
    auto final_fut = std::move(initial_task)(this);
    final_fut.AddCallback([this](const FTSync&) { Finish(); });
    RunLoop();
    return final_fut;
  }

#ifndef ARROW_ENABLE_THREADING
  // we have to run tasks from all live executors
  // during RunLoop if we don't have threading
  static std::unordered_set<SerialExecutor*> all_executors;
  // a pointer to the last one called by the loop
  // so all tasks get spawned equally
  // on multiple calls to RunTasksOnAllExecutors
  static SerialExecutor* last_called_executor;
  // without threading we can't tell which executor called the
  // current process - so we set it in spawning the task
  static SerialExecutor* current_executor;
#endif  // ARROW_ENABLE_THREADING
};
|
||||
|
||||
#ifdef ARROW_ENABLE_THREADING
|
||||
|
||||
/// An Executor implementation spawning tasks in FIFO manner on a fixed-size
|
||||
/// pool of worker threads.
|
||||
///
|
||||
/// Note: Any sort of nested parallelism will deadlock this executor. Blocking waits are
|
||||
/// fine but if one task needs to wait for another task it must be expressed as an
|
||||
/// asynchronous continuation.
|
||||
class ARROW_EXPORT ThreadPool : public Executor {
 public:
  // Construct a thread pool with the given number of worker threads
  static Result<std::shared_ptr<ThreadPool>> Make(int threads);

  // Like Make(), but takes care that the returned ThreadPool is compatible
  // with destruction late at process exit.
  static Result<std::shared_ptr<ThreadPool>> MakeEternal(int threads);

  // Destroy thread pool; the pool will first be shut down
  ~ThreadPool() override;

  // Return the desired number of worker threads.
  // The actual number of workers may lag a bit before being adjusted to
  // match this value.
  int GetCapacity() override;

  // Return the number of tasks either running or in the queue.
  int GetNumTasks();

  bool OwnsThisThread() override;

  // Dynamically change the number of worker threads.
  //
  // This function always returns immediately.
  // If fewer threads are running than this number, new threads are spawned
  // on-demand when needed for task execution.
  // If more threads are running than this number, excess threads are reaped
  // as soon as possible.
  Status SetCapacity(int threads);

  // Heuristic for the default capacity of a thread pool for CPU-bound tasks.
  // This is exposed as a static method to help with testing.
  // The number returned is guaranteed to be greater or equal to one.
  static int DefaultCapacity();

  // Shutdown the pool. Once the pool starts shutting down, new tasks
  // cannot be submitted anymore.
  // If "wait" is true, shutdown waits for all pending tasks to be finished.
  // If "wait" is false, workers are stopped as soon as currently executing
  // tasks are finished.
  Status Shutdown(bool wait = true);

  // Wait for the thread pool to become idle
  //
  // This is useful for sequencing tests
  void WaitForIdle();

  void KeepAlive(std::shared_ptr<Executor::Resource> resource) override;

  // Opaque shared state; definition lives in the implementation file.
  struct State;

 protected:
  FRIEND_TEST(TestThreadPool, SetCapacity);
  FRIEND_TEST(TestGlobalThreadPool, Capacity);
  ARROW_FRIEND_EXPORT friend ThreadPool* GetCpuThreadPool();

  ThreadPool();

  Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
                   StopCallback&&) override;

  // Collect finished worker threads, making sure the OS threads have exited
  void CollectFinishedWorkersUnlocked();
  // Launch a given number of additional workers
  void LaunchWorkersUnlocked(int threads);
  // Get the current actual capacity
  int GetActualCapacity();

  // Factory for the process-global CPU pool returned by GetCpuThreadPool().
  static std::shared_ptr<ThreadPool> MakeCpuThreadPool();

  // sp_state_ owns the State; state_ is a borrowed pointer to the same object.
  std::shared_ptr<State> sp_state_;
  State* state_;
  // Whether the destructor should shut the pool down (presumably false for
  // pools created via MakeEternal -- confirm in the implementation file).
  bool shutdown_on_destroy_;
};
|
||||
#else // ARROW_ENABLE_THREADING
|
||||
// an executor implementation which pretends to be a thread pool but runs everything
|
||||
// on the main thread using a static queue (shared between all thread pools, otherwise
|
||||
// cross-threadpool dependencies will break everything)
|
||||
class ARROW_EXPORT ThreadPool : public SerialExecutor {
 public:
  ARROW_FRIEND_EXPORT friend ThreadPool* GetCpuThreadPool();

  static Result<std::shared_ptr<ThreadPool>> Make(int threads);

  // Like Make(), but takes care that the returned ThreadPool is compatible
  // with destruction late at process exit.
  static Result<std::shared_ptr<ThreadPool>> MakeEternal(int threads);

  // Destroy thread pool; the pool will first be shut down
  ~ThreadPool() override;

  // Return the desired number of worker threads.
  // The actual number of workers may lag a bit before being adjusted to
  // match this value.
  int GetCapacity() override;

  virtual int GetActualCapacity();

  // Without real threads every task runs on the calling thread, so this
  // executor always owns it.
  bool OwnsThisThread() override { return true; }

  // Dynamically change the number of worker threads.
  // without threading this is equal to the
  // number of tasks that can be running at once
  // (inside each other)
  Status SetCapacity(int threads);

  static int DefaultCapacity() { return 8; }

  // Shutdown the pool. Once the pool starts shutting down, new tasks
  // cannot be submitted anymore.
  // If "wait" is true, shutdown waits for all pending tasks to be finished.
  // If "wait" is false, workers are stopped as soon as currently executing
  // tasks are finished.
  Status Shutdown(bool wait = true);

  // Wait for the thread pool to become idle
  //
  // This is useful for sequencing tests
  void WaitForIdle();

 protected:
  // Factory for the process-global CPU pool returned by GetCpuThreadPool().
  static std::shared_ptr<ThreadPool> MakeCpuThreadPool();
  ThreadPool();
};
|
||||
|
||||
#endif // ARROW_ENABLE_THREADING
|
||||
|
||||
// Return the process-global thread pool for CPU-bound tasks.
|
||||
ARROW_EXPORT ThreadPool* GetCpuThreadPool();
|
||||
|
||||
/// \brief Potentially run an async operation serially (if use_threads is false)
|
||||
/// \see RunSerially
|
||||
///
|
||||
/// If `use_threads` is true, the global CPU executor is used.
|
||||
/// If `use_threads` is false, a temporary SerialExecutor is used.
|
||||
/// `get_future` is called (from this thread) with the chosen executor and must
|
||||
/// return a future that will eventually finish. This function returns once the
|
||||
/// future has finished.
|
||||
template <typename Fut, typename ValueType = typename Fut::ValueType>
|
||||
typename Fut::SyncType RunSynchronously(FnOnce<Fut(Executor*)> get_future,
|
||||
bool use_threads) {
|
||||
if (use_threads) {
|
||||
auto fut = std::move(get_future)(GetCpuThreadPool());
|
||||
return FutureToSync(fut);
|
||||
} else {
|
||||
return SerialExecutor::RunInSerialExecutor<ValueType>(std::move(get_future));
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Potentially iterate an async generator serially (if use_threads is false)
|
||||
/// using a potentially custom Executor
|
||||
/// \see IterateGenerator
|
||||
///
|
||||
/// If `use_threads` is true, the custom executor or, if null,
|
||||
/// the global CPU executor will be used. Each call to
|
||||
/// the iterator will simply wait until the next item is available. Tasks may run in
|
||||
/// the background between calls.
|
||||
///
|
||||
/// If `use_threads` is false, the calling thread only will be used. Each call to
|
||||
/// the iterator will use the calling thread to do enough work to generate one item.
|
||||
/// Tasks will be left in a queue until the next call and no work will be done between
|
||||
/// calls.
|
||||
template <typename T>
|
||||
Iterator<T> IterateSynchronously(
|
||||
FnOnce<Result<std::function<Future<T>()>>(Executor*)> get_gen, bool use_threads,
|
||||
Executor* executor) {
|
||||
if (use_threads) {
|
||||
auto used_executor = executor != NULLPTR ? executor : GetCpuThreadPool();
|
||||
auto maybe_gen = std::move(get_gen)(used_executor);
|
||||
if (!maybe_gen.ok()) {
|
||||
return MakeErrorIterator<T>(maybe_gen.status());
|
||||
}
|
||||
return MakeGeneratorIterator(*maybe_gen);
|
||||
} else {
|
||||
return SerialExecutor::IterateGenerator(std::move(get_gen));
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Potentially iterate an async generator serially (if use_threads is false)
|
||||
/// using the default CPU thread pool
|
||||
/// \see IterateGenerator
|
||||
///
|
||||
/// If `use_threads` is true, the global CPU executor will be used. Each call to
|
||||
/// the iterator will simply wait until the next item is available. Tasks may run in
|
||||
/// the background between calls.
|
||||
///
|
||||
/// If `use_threads` is false, the calling thread only will be used. Each call to
|
||||
/// the iterator will use the calling thread to do enough work to generate one item.
|
||||
/// Tasks will be left in a queue until the next call and no work will be done between
|
||||
/// calls.
|
||||
template <typename T>
Iterator<T> IterateSynchronously(
    FnOnce<Result<std::function<Future<T>()>>(Executor*)> get_gen, bool use_threads) {
  // Convenience overload: no custom executor, so the global CPU pool is used
  // whenever use_threads is true.
  return IterateSynchronously(std::move(get_gen), use_threads, NULLPTR);
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,83 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <chrono>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace util {
|
||||
|
||||
// Direction of a timestamp unit conversion: the factor returned by
// GetTimestampConversion() should either multiply or divide the input value
// (presumably MULTIPLY when converting to a finer unit and DIVIDE to a
// coarser one -- confirm against GetTimestampConversion's implementation).
enum DivideOrMultiply {
  MULTIPLY,
  DIVIDE,
};
|
||||
|
||||
ARROW_EXPORT
|
||||
std::pair<DivideOrMultiply, int64_t> GetTimestampConversion(TimeUnit::type in_unit,
|
||||
TimeUnit::type out_unit);
|
||||
|
||||
// Converts a Timestamp value into another Timestamp value.
|
||||
//
|
||||
// This function takes care of properly transforming from one unit to another.
|
||||
//
|
||||
// \param[in] in the input type. Must be TimestampType.
|
||||
// \param[in] out the output type. Must be TimestampType.
|
||||
// \param[in] value the input value.
|
||||
//
|
||||
// \return The converted value, or an error.
|
||||
ARROW_EXPORT Result<int64_t> ConvertTimestampValue(const std::shared_ptr<DataType>& in,
|
||||
const std::shared_ptr<DataType>& out,
|
||||
int64_t value);
|
||||
|
||||
template <typename Visitor, typename... Args>
|
||||
decltype(std::declval<Visitor>()(std::chrono::seconds{}, std::declval<Args&&>()...))
|
||||
VisitDuration(TimeUnit::type unit, Visitor&& visitor, Args&&... args) {
|
||||
switch (unit) {
|
||||
default:
|
||||
case TimeUnit::SECOND:
|
||||
break;
|
||||
case TimeUnit::MILLI:
|
||||
return visitor(std::chrono::milliseconds{}, std::forward<Args>(args)...);
|
||||
case TimeUnit::MICRO:
|
||||
return visitor(std::chrono::microseconds{}, std::forward<Args>(args)...);
|
||||
case TimeUnit::NANO:
|
||||
return visitor(std::chrono::nanoseconds{}, std::forward<Args>(args)...);
|
||||
}
|
||||
return visitor(std::chrono::seconds{}, std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
/// Convert a count of seconds to the corresponding count in a different TimeUnit
|
||||
/// Visitor functor converting a count of seconds into the equivalent count
/// in the unit named by the Duration tag argument.
struct CastSecondsToUnitImpl {
  template <typename Duration>
  int64_t operator()(Duration, int64_t seconds) {
    const std::chrono::seconds as_seconds{seconds};
    return static_cast<int64_t>(
        std::chrono::duration_cast<Duration>(as_seconds).count());
  }
};
|
||||
|
||||
// Convert a count of seconds into the equivalent count in `unit`
// (e.g. CastSecondsToUnit(TimeUnit::MILLI, 2) == 2000).
inline int64_t CastSecondsToUnit(TimeUnit::type unit, int64_t seconds) {
  return VisitDuration(unit, CastSecondsToUnitImpl{}, seconds);
}
|
||||
|
||||
} // namespace util
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,45 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace util {
|
||||
namespace tracing {
|
||||
|
||||
/// \brief Opaque holder for implementation-specific span state.
///
/// The virtual destructor permits deletion through a base pointer;
/// presumably tracing backends subclass this to attach their own data
/// to a Span — confirm against the tracing implementation.
class ARROW_EXPORT SpanDetails {
 public:
  virtual ~SpanDetails() {}
};
|
||||
|
||||
/// \brief A lightweight handle for a tracing span.
class ARROW_EXPORT Span {
 public:
  Span() noexcept;
  /// True if this span has been started with START_SPAN
  bool valid() const;
  /// End the span early
  void reset();
  // Backend-specific state, owned by this Span (released on destruction).
  std::unique_ptr<SpanDetails> details;
};
|
||||
|
||||
} // namespace tracing
|
||||
} // namespace util
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,82 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace internal {
|
||||
struct Empty;
|
||||
} // namespace internal
|
||||
|
||||
template <typename T = internal::Empty>
|
||||
class WeakFuture;
|
||||
class FutureWaiter;
|
||||
|
||||
class TimestampParser;
|
||||
|
||||
namespace internal {
|
||||
|
||||
class Executor;
|
||||
class TaskGroup;
|
||||
class ThreadPool;
|
||||
class CpuInfo;
|
||||
|
||||
namespace tracing {
|
||||
|
||||
struct Scope;
|
||||
|
||||
} // namespace tracing
|
||||
} // namespace internal
|
||||
|
||||
/// \brief Struct-as-namespace holder for the compression codec enumeration,
/// so enumerators are referenced as e.g. Compression::GZIP.
struct Compression {
  /// \brief Compression algorithm
  enum type {
    UNCOMPRESSED,
    SNAPPY,
    GZIP,
    BROTLI,
    ZSTD,
    LZ4,
    LZ4_FRAME,
    LZO,
    BZ2,
    LZ4_HADOOP
  };
};
|
||||
|
||||
namespace util {
|
||||
class AsyncTaskScheduler;
|
||||
class Compressor;
|
||||
class Decompressor;
|
||||
class Codec;
|
||||
class Uri;
|
||||
} // namespace util
|
||||
|
||||
/// \brief A value paired with its position (and end marker) in a sequence.
template <typename T>
struct Enumerated {
  T value;     ///< the element itself
  int index;   ///< position of `value` in the sequence
  bool last;   ///< whether this is the final element

  /// Equality: compares the cheap fields (index, last) before the value.
  friend inline bool operator==(const Enumerated<T>& left, const Enumerated<T>& right) {
    if (left.index != right.index) return false;
    if (left.last != right.last) return false;
    return left.value == right.value;
  }
};
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,73 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <type_traits>
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
/// \brief Metafunction to allow checking if a type matches any of another set of types
template <typename...>
struct IsOneOf : std::false_type {};  /// Base case: nothing has matched

/// Recursive case: true when T == U, or when T matches one of the remaining
/// Args... (U excluded from the recursion).
template <typename T, typename U, typename... Args>
struct IsOneOf<T, U, Args...>
    : std::integral_constant<bool, std::is_same<T, U>::value ||
                                       IsOneOf<T, Args...>::value> {};
|
||||
|
||||
/// \brief Shorthand for using IsOneOf + std::enable_if
///
/// Resolves to T when T is one of Args...; otherwise substitution fails and
/// the overload is removed from the candidate set (SFINAE).
template <typename T, typename... Args>
using EnableIfIsOneOf = typename std::enable_if<IsOneOf<T, Args...>::value, T>::type;
|
||||
|
||||
/// \brief is_null_pointer from C++17
///
/// True when T, stripped of cv-qualifiers, is std::nullptr_t.
template <typename T>
struct is_null_pointer
    : std::is_same<typename std::remove_cv<T>::type, std::nullptr_t> {};
|
||||
|
||||
// Primary template is left undefined: requesting an unsupported byte width
// fails at compile time.
template <int kNumBytes>
struct SizedIntImpl;

template <>
struct SizedIntImpl<1> {
  using type = int8_t;
};

template <>
struct SizedIntImpl<2> {
  using type = int16_t;
};

template <>
struct SizedIntImpl<4> {
  using type = int32_t;
};

template <>
struct SizedIntImpl<8> {
  using type = int64_t;
};

// Map a number of bytes to a type
template <int kNumBytes>
using SizedInt = typename SizedIntImpl<kNumBytes>::type;
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,87 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Contains utilities for making UBSan happy.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <type_traits>
|
||||
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace util {
|
||||
|
||||
namespace internal {
|
||||
|
||||
constexpr uint8_t kNonNullFiller = 0;
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \brief Returns maybe_null if not null or a non-null pointer to an arbitrary memory
/// that shouldn't be dereferenced.
///
/// Memset/Memcpy are undefined when a nullptr is passed as an argument use this utility
/// method to wrap locations where this could happen.
///
/// Note: Flatbuffers has UBSan warnings if a zero length vector is passed.
/// https://github.com/google/flatbuffers/pull/5355 is trying to resolve
/// them.
template <typename T>
inline T* MakeNonNull(T* maybe_null = NULLPTR) {
  if (ARROW_PREDICT_TRUE(maybe_null != NULLPTR)) {
    return maybe_null;
  }

  // Substitute the address of a static filler byte: a valid, non-null
  // address that callers must never dereference.
  return const_cast<T*>(reinterpret_cast<const T*>(&internal::kNonNullFiller));
}
|
||||
|
||||
/// \brief Load a T from a possibly-unaligned byte buffer without undefined
/// behavior, by copying the bytes into a properly-aligned local.
template <typename T>
inline std::enable_if_t<std::is_trivially_copyable_v<T>, T> SafeLoadAs(
    const uint8_t* unaligned) {
  std::remove_const_t<T> value;
  std::memcpy(static_cast<void*>(&value), unaligned, sizeof(T));
  return value;
}
|
||||
|
||||
/// \brief Load a T through a possibly-unaligned T* without undefined
/// behavior, by copying the bytes into a properly-aligned local.
template <typename T>
inline std::enable_if_t<std::is_trivially_copyable_v<T>, T> SafeLoad(const T* unaligned) {
  std::remove_const_t<T> value;
  std::memcpy(static_cast<void*>(&value), static_cast<const void*>(unaligned),
              sizeof(T));
  return value;
}
|
||||
|
||||
/// \brief Bit-cast `value` from T to an equally-sized trivially-copyable U
/// by copying its object representation (memcpy-based type punning).
template <typename U, typename T>
inline std::enable_if_t<std::is_trivially_copyable_v<T> &&
                            std::is_trivially_copyable_v<U> && sizeof(T) == sizeof(U),
                        U>
SafeCopy(T value) {
  std::remove_const_t<U> dest;
  std::memcpy(static_cast<void*>(&dest), static_cast<const void*>(&value), sizeof(U));
  return dest;
}
|
||||
|
||||
/// \brief Store `value` to a possibly-unaligned destination without
/// undefined behavior, by copying its bytes.
template <typename T>
inline std::enable_if_t<std::is_trivially_copyable_v<T>, void> SafeStore(void* unaligned,
                                                                         T value) {
  std::memcpy(unaligned, static_cast<const void*>(&value), sizeof(T));
}
|
||||
|
||||
} // namespace util
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,31 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include <cstdint>
|
||||
#include "arrow/array/data.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace union_util {
|
||||
|
||||
/// \brief Compute the number of of logical nulls in a sparse union array
|
||||
int64_t LogicalSparseUnionNullCount(const ArraySpan& span);
|
||||
|
||||
/// \brief Compute the number of of logical nulls in a dense union array
|
||||
int64_t LogicalDenseUnionNullCount(const ArraySpan& span);
|
||||
|
||||
} // namespace union_util
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,30 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
#include <string_view>
|
||||
|
||||
namespace arrow {
|
||||
|
||||
[[noreturn]] ARROW_EXPORT void Unreachable(const char* message = "Unreachable");
|
||||
|
||||
[[noreturn]] ARROW_EXPORT void Unreachable(std::string_view message);
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,119 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow::util {
|
||||
|
||||
/// \brief A parsed URI
class ARROW_EXPORT Uri {
 public:
  Uri();
  ~Uri();
  // Move-only: the parser state lives behind a unique_ptr (pimpl below).
  Uri(Uri&&);
  Uri& operator=(Uri&&);

  // XXX Should we use std::string_view instead? These functions are
  // not performance-critical.

  /// The URI scheme, such as "http", or the empty string if the URI has no
  /// explicit scheme.
  std::string scheme() const;

  /// Convenience function that returns true if the scheme() is "file"
  bool is_file_scheme() const;

  /// Whether the URI has an explicit host name. This may return true if
  /// the URI has an empty host (e.g. "file:///tmp/foo"), while it returns
  /// false is the URI has not host component at all (e.g. "file:/tmp/foo").
  bool has_host() const;
  /// The URI host name, such as "localhost", "127.0.0.1" or "::1", or the empty
  /// string is the URI does not have a host component.
  std::string host() const;

  /// The URI port number, as a string such as "80", or the empty string is the URI
  /// does not have a port number component.
  std::string port_text() const;
  /// The URI port parsed as an integer, or -1 if the URI does not have a port
  /// number component.
  int32_t port() const;

  /// The username specified in the URI.
  std::string username() const;
  /// The password specified in the URI.
  std::string password() const;

  /// The URI path component.
  std::string path() const;

  /// The URI query string
  std::string query_string() const;

  /// The URI query items
  ///
  /// Note this API doesn't allow differentiating between an empty value
  /// and a missing value, such in "a&b=1" vs. "a=&b=1".
  Result<std::vector<std::pair<std::string, std::string>>> query_items() const;

  /// Get the string representation of this URI.
  const std::string& ToString() const;

  /// Factory function to parse a URI from its string representation.
  Status Parse(const std::string& uri_string);

  /// Factory function to parse a URI from its string representation.
  static Result<Uri> FromString(const std::string& uri_string);

 private:
  // Pimpl: hides the parsing implementation from this public header.
  struct Impl;
  std::unique_ptr<Impl> impl_;
};
|
||||
|
||||
/// Percent-encode the input string, for use e.g. as a URI query parameter.
|
||||
///
|
||||
/// This will escape directory separators, making this function unsuitable
|
||||
/// for encoding URI paths directly. See UriFromAbsolutePath() instead.
|
||||
ARROW_EXPORT
|
||||
std::string UriEscape(std::string_view s);
|
||||
|
||||
ARROW_EXPORT
|
||||
std::string UriUnescape(std::string_view s);
|
||||
|
||||
/// Encode a host for use within a URI, such as "localhost",
|
||||
/// "127.0.0.1", or "[::1]".
|
||||
ARROW_EXPORT
|
||||
std::string UriEncodeHost(std::string_view host);
|
||||
|
||||
/// Whether the string is a syntactically valid URI scheme according to RFC 3986.
|
||||
ARROW_EXPORT
|
||||
bool IsValidUriScheme(std::string_view s);
|
||||
|
||||
/// Create a file uri from a given absolute path
|
||||
ARROW_EXPORT
|
||||
Result<std::string> UriFromAbsolutePath(std::string_view path);
|
||||
|
||||
} // namespace arrow::util
|
||||
@@ -0,0 +1,59 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace util {
|
||||
|
||||
// Convert a UTF8 string to a wstring (either UTF16 or UTF32, depending
|
||||
// on the wchar_t width).
|
||||
ARROW_EXPORT Result<std::wstring> UTF8ToWideString(std::string_view source);
|
||||
|
||||
// Similarly, convert a wstring to a UTF8 string.
|
||||
ARROW_EXPORT Result<std::string> WideStringToUTF8(const std::wstring& source);
|
||||
|
||||
// Convert UTF8 string to a UTF16 string.
|
||||
ARROW_EXPORT Result<std::u16string> UTF8StringToUTF16(std::string_view source);
|
||||
|
||||
// Convert UTF16 string to a UTF8 string.
|
||||
ARROW_EXPORT Result<std::string> UTF16StringToUTF8(std::u16string_view source);
|
||||
|
||||
// This function needs to be called before doing UTF8 validation.
|
||||
ARROW_EXPORT void InitializeUTF8();
|
||||
|
||||
ARROW_EXPORT bool ValidateUTF8(const uint8_t* data, int64_t size);
|
||||
|
||||
ARROW_EXPORT bool ValidateUTF8(std::string_view str);
|
||||
|
||||
// Skip UTF8 byte order mark, if any.
|
||||
ARROW_EXPORT
|
||||
Result<const uint8_t*> SkipUTF8BOM(const uint8_t* data, int64_t size);
|
||||
|
||||
static constexpr uint32_t kMaxUnicodeCodepoint = 0x110000;
|
||||
|
||||
} // namespace util
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,947 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This is a private header for string-to-number parsing utilities
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <chrono>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_traits.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/config.h"
|
||||
#include "arrow/util/float16.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/time.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
#include "arrow/vendored/datetime.h"
|
||||
#include "arrow/vendored/strptime.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \brief A virtual string to timestamp parser
class ARROW_EXPORT TimestampParser {
 public:
  virtual ~TimestampParser() = default;

  /// \brief Parse `length` bytes at `s` as a timestamp expressed in `out_unit`.
  ///
  /// Returns true on success, storing the parsed value in `*out`.
  /// If `out_zone_offset_present` is non-null, implementations may use it to
  /// report whether the input carried an explicit zone offset — confirm the
  /// exact contract against concrete implementations.
  virtual bool operator()(const char* s, size_t length, TimeUnit::type out_unit,
                          int64_t* out,
                          bool* out_zone_offset_present = NULLPTR) const = 0;

  /// \brief A short identifier for this parser kind.
  virtual const char* kind() const = 0;

  /// \brief The format string, where the parser has one (non-pure: base
  /// provides a default).
  virtual const char* format() const;

  /// \brief Create a TimestampParser that recognizes strptime-like format strings
  static std::shared_ptr<TimestampParser> MakeStrptime(std::string format);

  /// \brief Create a TimestampParser that recognizes (locale-agnostic) ISO8601
  /// timestamps
  static std::shared_ptr<TimestampParser> MakeISO8601();
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief The entry point for conversion from strings.
|
||||
///
|
||||
/// Specializations of StringConverter for `ARROW_TYPE` must define:
|
||||
/// - A default constructible member type `value_type` which will be yielded on a
|
||||
/// successful parse.
|
||||
/// - The static member function `Convert`, callable with signature
|
||||
/// `(const ARROW_TYPE& t, const char* s, size_t length, value_type* out)`.
|
||||
/// `Convert` returns truthy for successful parses and assigns the parsed values to
|
||||
/// `*out`. Parameters required for parsing (for example a timestamp's TimeUnit)
|
||||
/// are acquired from the type parameter `t`.
|
||||
template <typename ARROW_TYPE, typename Enable = void>
|
||||
struct StringConverter;
|
||||
|
||||
template <typename T>
|
||||
struct is_parseable {
|
||||
template <typename U, typename = typename StringConverter<U>::value_type>
|
||||
static std::true_type Test(U*);
|
||||
|
||||
template <typename U>
|
||||
static std::false_type Test(...);
|
||||
|
||||
static constexpr bool value = decltype(Test<T>(NULLPTR))::value;
|
||||
};
|
||||
|
||||
template <typename T, typename R = void>
|
||||
using enable_if_parseable = enable_if_t<is_parseable<T>::value, R>;
|
||||
|
||||
template <>
|
||||
struct StringConverter<BooleanType> {
|
||||
using value_type = bool;
|
||||
|
||||
bool Convert(const BooleanType&, const char* s, size_t length, value_type* out) {
|
||||
if (length == 1) {
|
||||
// "0" or "1"?
|
||||
if (s[0] == '0') {
|
||||
*out = false;
|
||||
return true;
|
||||
}
|
||||
if (s[0] == '1') {
|
||||
*out = true;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
if (length == 4) {
|
||||
// "true"?
|
||||
*out = true;
|
||||
return ((s[0] == 't' || s[0] == 'T') && (s[1] == 'r' || s[1] == 'R') &&
|
||||
(s[2] == 'u' || s[2] == 'U') && (s[3] == 'e' || s[3] == 'E'));
|
||||
}
|
||||
if (length == 5) {
|
||||
// "false"?
|
||||
*out = false;
|
||||
return ((s[0] == 'f' || s[0] == 'F') && (s[1] == 'a' || s[1] == 'A') &&
|
||||
(s[2] == 'l' || s[2] == 'L') && (s[3] == 's' || s[3] == 'S') &&
|
||||
(s[4] == 'e' || s[4] == 'E'));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
// Ideas for faster float parsing:
|
||||
// - http://rapidjson.org/md_doc_internals.html#ParsingDouble
|
||||
// - https://github.com/google/double-conversion [used here]
|
||||
// - https://github.com/achan001/dtoa-fast
|
||||
|
||||
ARROW_EXPORT
|
||||
bool StringToFloat(const char* s, size_t length, char decimal_point, float* out);
|
||||
|
||||
ARROW_EXPORT
|
||||
bool StringToFloat(const char* s, size_t length, char decimal_point, double* out);
|
||||
|
||||
ARROW_EXPORT
|
||||
bool StringToFloat(const char* s, size_t length, char decimal_point,
|
||||
::arrow::util::Float16* out);
|
||||
|
||||
template <>
struct StringConverter<FloatType> {
  using value_type = float;

  /// \param decimal_point the character accepted as the decimal separator
  explicit StringConverter(char decimal_point = '.') : decimal_point(decimal_point) {}

  /// Parse `length` bytes at `s` as a float; returns true on success and
  /// stores the result in `*out`. Delegates to StringToFloat.
  bool Convert(const FloatType&, const char* s, size_t length, value_type* out) {
    return ARROW_PREDICT_TRUE(StringToFloat(s, length, decimal_point, out));
  }

 private:
  const char decimal_point;
};
|
||||
|
||||
template <>
struct StringConverter<DoubleType> {
  using value_type = double;

  /// \param decimal_point the character accepted as the decimal separator
  explicit StringConverter(char decimal_point = '.') : decimal_point(decimal_point) {}

  /// Parse `length` bytes at `s` as a double; returns true on success and
  /// stores the result in `*out`. Delegates to StringToFloat.
  bool Convert(const DoubleType&, const char* s, size_t length, value_type* out) {
    return ARROW_PREDICT_TRUE(StringToFloat(s, length, decimal_point, out));
  }

 private:
  const char decimal_point;
};
|
||||
|
||||
template <>
struct StringConverter<HalfFloatType> {
  using value_type = ::arrow::util::Float16;

  /// \param decimal_point the character accepted as the decimal separator
  explicit StringConverter(char decimal_point = '.') : decimal_point(decimal_point) {}

  /// Parse `length` bytes at `s` as a half-float; returns true on success
  /// and stores the result in `*out`. Delegates to the Float16 overload of
  /// StringToFloat.
  bool Convert(const HalfFloatType&, const char* s, size_t length, value_type* out) {
    return ARROW_PREDICT_TRUE(StringToFloat(s, length, decimal_point, out));
  }

 private:
  const char decimal_point;
};
|
||||
|
||||
// NOTE: HalfFloatType would require a half<->float conversion library
|
||||
|
||||
// Map an ASCII digit to its value. Non-digit characters yield a value > 9
// (negative differences wrap around in the uint8_t cast), which callers
// detect as a parse failure.
inline uint8_t ParseDecimalDigit(char c) { return static_cast<uint8_t>(c - '0'); }
|
||||
|
||||
// Consume one decimal digit into `result` (multiplying by 10 first), or
// break out of the enclosing do-loop when the input is exhausted.
// Overflow is NOT checked here: callers use this only for digit positions
// that cannot overflow C_TYPE; the final possible digit must go through
// PARSE_UNSIGNED_ITERATION_LAST instead.
#define PARSE_UNSIGNED_ITERATION(C_TYPE)          \
  if (length > 0) {                               \
    uint8_t digit = ParseDecimalDigit(*s++);      \
    result = static_cast<C_TYPE>(result * 10U);   \
    length--;                                     \
    if (ARROW_PREDICT_FALSE(digit > 9U)) {        \
      /* Non-digit */                             \
      return false;                               \
    }                                             \
    result = static_cast<C_TYPE>(result + digit); \
  } else {                                        \
    break;                                        \
  }
|
||||
|
||||
// Consume the final allowed digit with full checking: pre-multiplication
// overflow, non-digit input, post-addition wraparound, and trailing
// characters (any remaining length after this digit means too many digits,
// hence failure).
#define PARSE_UNSIGNED_ITERATION_LAST(C_TYPE)                                     \
  if (length > 0) {                                                               \
    if (ARROW_PREDICT_FALSE(result > std::numeric_limits<C_TYPE>::max() / 10U)) { \
      /* Overflow */                                                              \
      return false;                                                               \
    }                                                                             \
    uint8_t digit = ParseDecimalDigit(*s++);                                      \
    result = static_cast<C_TYPE>(result * 10U);                                   \
    C_TYPE new_result = static_cast<C_TYPE>(result + digit);                      \
    if (ARROW_PREDICT_FALSE(--length > 0)) {                                      \
      /* Too many digits */                                                       \
      return false;                                                               \
    }                                                                             \
    if (ARROW_PREDICT_FALSE(digit > 9U)) {                                        \
      /* Non-digit */                                                             \
      return false;                                                               \
    }                                                                             \
    if (ARROW_PREDICT_FALSE(new_result < result)) {                               \
      /* Overflow */                                                              \
      return false;                                                               \
    }                                                                             \
    result = new_result;                                                          \
  }
|
||||
|
||||
/// \brief Parse `length` decimal characters into a uint8_t.
///
/// Returns false on a non-digit character, on more than 3 digits, or on
/// overflow (> 255). `*out` is written only on success. An empty input
/// parses as 0 (callers strip leading zeros first, so "0" arrives here
/// with length == 0).
inline bool ParseUnsigned(const char* s, size_t length, uint8_t* out) {
  // A bounded loop with a widened accumulator replaces the macro-unrolled
  // form: uint8_t needs at most 3 digits and uint16_t holds any 3-digit
  // value, so no wraparound checks are required.
  if (length > 3) {
    return false;
  }
  uint16_t acc = 0;
  for (size_t i = 0; i < length; ++i) {
    const uint8_t digit = static_cast<uint8_t>(s[i] - '0');
    if (digit > 9U) {
      // Non-digit
      return false;
    }
    acc = static_cast<uint16_t>(acc * 10U + digit);
  }
  if (acc > std::numeric_limits<uint8_t>::max()) {
    // Overflow
    return false;
  }
  *out = static_cast<uint8_t>(acc);
  return true;
}
|
||||
|
||||
/// \brief Parse `length` decimal characters into a uint16_t.
///
/// Returns false on a non-digit character, on more than 5 digits, or on
/// overflow (> 65535). `*out` is written only on success; an empty input
/// parses as 0.
inline bool ParseUnsigned(const char* s, size_t length, uint16_t* out) {
  // Bounded loop with a widened accumulator (replaces the macro-unrolled
  // form): uint16_t needs at most 5 digits and uint32_t holds any 5-digit
  // value, so no wraparound checks are required.
  if (length > 5) {
    return false;
  }
  uint32_t acc = 0;
  for (size_t i = 0; i < length; ++i) {
    const uint8_t digit = static_cast<uint8_t>(s[i] - '0');
    if (digit > 9U) {
      // Non-digit
      return false;
    }
    acc = acc * 10U + digit;
  }
  if (acc > std::numeric_limits<uint16_t>::max()) {
    // Overflow
    return false;
  }
  *out = static_cast<uint16_t>(acc);
  return true;
}
|
||||
|
||||
/// \brief Parse `length` decimal characters into a uint32_t.
///
/// Returns false on a non-digit character, on more than 10 digits, or on
/// overflow (> 4294967295). `*out` is written only on success; an empty
/// input parses as 0.
inline bool ParseUnsigned(const char* s, size_t length, uint32_t* out) {
  // Bounded loop with a widened accumulator (replaces the macro-unrolled
  // form): uint32_t needs at most 10 digits and uint64_t holds any
  // 10-digit value, so no wraparound checks are required.
  if (length > 10) {
    return false;
  }
  uint64_t acc = 0;
  for (size_t i = 0; i < length; ++i) {
    const uint8_t digit = static_cast<uint8_t>(s[i] - '0');
    if (digit > 9U) {
      // Non-digit
      return false;
    }
    acc = acc * 10U + digit;
  }
  if (acc > std::numeric_limits<uint32_t>::max()) {
    // Overflow
    return false;
  }
  *out = static_cast<uint32_t>(acc);
  return true;
}
|
||||
|
||||
/// \brief Parse `length` decimal characters into a uint64_t.
///
/// Unrolled for up to 20 digits (the width of UINT64_MAX): 19 unchecked
/// iterations followed by a fully-checked final iteration that rejects
/// overflow, non-digits, and trailing characters. `*out` is written only
/// on success; an empty input parses as 0.
inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) {
  uint64_t result = 0;
  do {
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);

    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);

    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);

    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);

    PARSE_UNSIGNED_ITERATION_LAST(uint64_t);
  } while (false);
  *out = result;
  return true;
}
|
||||
|
||||
#undef PARSE_UNSIGNED_ITERATION
|
||||
#undef PARSE_UNSIGNED_ITERATION_LAST
|
||||
|
||||
/// \brief Parse `length` hexadecimal characters (no "0x" prefix) into *out.
///
/// Accepts upper- and lower-case hex digits. Fails on an empty string, on
/// more than sizeof(T) * 2 characters, or on any non-hex character.
/// `*out` is written only on success.
template <typename T>
bool ParseHex(const char* s, size_t length, T* out) {
  // The value must fit: at most two hex characters per byte of T.
  if (length == 0 || length > sizeof(T) * 2) {
    return false;
  }
  T acc = 0;
  for (size_t i = 0; i < length; ++i) {
    const char c = s[i];
    unsigned nibble;
    if (c >= '0' && c <= '9') {
      nibble = static_cast<unsigned>(c - '0');
    } else if (c >= 'A' && c <= 'F') {
      nibble = static_cast<unsigned>(c - 'A') + 10;
    } else if (c >= 'a' && c <= 'f') {
      nibble = static_cast<unsigned>(c - 'a') + 10;
    } else {
      /* Non-digit */
      return false;
    }
    acc = static_cast<T>(static_cast<T>(acc << 4) | static_cast<T>(nibble));
  }
  *out = acc;
  return true;
}
|
||||
|
||||
/// \brief Shared Convert() implementation for the unsigned integer types.
///
/// Accepts a "0x"/"0X" prefix for hexadecimal input; otherwise strips
/// leading zeros and parses decimal via ParseUnsigned.
template <class ARROW_TYPE>
struct StringToUnsignedIntConverterMixin {
  using value_type = typename ARROW_TYPE::c_type;

  /// Returns true on success, storing the parsed value in `*out`.
  bool Convert(const ARROW_TYPE&, const char* s, size_t length, value_type* out) {
    if (ARROW_PREDICT_FALSE(length == 0)) {
      return false;
    }
    // If it starts with 0x then its hex
    if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))) {
      length -= 2;
      s += 2;

      return ARROW_PREDICT_TRUE(ParseHex(s, length, out));
    }
    // Skip leading zeros (so ParseUnsigned's digit-count limit only sees
    // significant digits; "0" becomes the empty string, which parses as 0).
    while (length > 0 && *s == '0') {
      length--;
      s++;
    }
    return ParseUnsigned(s, length, out);
  }
};
|
||||
|
||||
// The concrete unsigned-integer converters simply inherit Convert() (and any
// constructors) from the mixin above.
template <>
struct StringConverter<UInt8Type> : public StringToUnsignedIntConverterMixin<UInt8Type> {
  using StringToUnsignedIntConverterMixin<UInt8Type>::StringToUnsignedIntConverterMixin;
};

template <>
struct StringConverter<UInt16Type>
    : public StringToUnsignedIntConverterMixin<UInt16Type> {
  using StringToUnsignedIntConverterMixin<UInt16Type>::StringToUnsignedIntConverterMixin;
};

template <>
struct StringConverter<UInt32Type>
    : public StringToUnsignedIntConverterMixin<UInt32Type> {
  using StringToUnsignedIntConverterMixin<UInt32Type>::StringToUnsignedIntConverterMixin;
};

template <>
struct StringConverter<UInt64Type>
    : public StringToUnsignedIntConverterMixin<UInt64Type> {
  using StringToUnsignedIntConverterMixin<UInt64Type>::StringToUnsignedIntConverterMixin;
};
|
||||
|
||||
// Parse a signed integer.  Accepts:
//  - decimal digits with an optional leading '-' (range-checked against the
//    signed type's limits),
//  - a "0x"/"0X"-prefixed hex string, which is taken as the raw two's
//    complement bit pattern (so hex values above max_positive come out
//    negative, with no sign handling).
template <class ARROW_TYPE>
struct StringToSignedIntConverterMixin {
  using value_type = typename ARROW_TYPE::c_type;
  using unsigned_type = typename std::make_unsigned<value_type>::type;

  bool Convert(const ARROW_TYPE&, const char* s, size_t length, value_type* out) {
    static constexpr auto max_positive =
        static_cast<unsigned_type>(std::numeric_limits<value_type>::max());
    // Assuming two's complement: the most negative value is max_positive + 1
    static constexpr unsigned_type max_negative = max_positive + 1;
    bool negative = false;
    unsigned_type unsigned_value = 0;

    if (ARROW_PREDICT_FALSE(length == 0)) {
      return false;
    }
    // If it starts with 0x then it's hex: no sign or extra range check,
    // the digits are reinterpreted as the type's bit pattern
    if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))) {
      length -= 2;
      s += 2;

      if (!ARROW_PREDICT_TRUE(ParseHex(s, length, &unsigned_value))) {
        return false;
      }
      *out = static_cast<value_type>(unsigned_value);
      return true;
    }

    // Note a bare "-" (nothing after the sign) is rejected
    if (*s == '-') {
      negative = true;
      s++;
      if (--length == 0) {
        return false;
      }
    }
    // Skip leading zeros
    while (length > 0 && *s == '0') {
      length--;
      s++;
    }
    if (!ARROW_PREDICT_TRUE(ParseUnsigned(s, length, &unsigned_value))) {
      return false;
    }
    if (negative) {
      // The magnitude of the most negative value is one larger than max_positive
      if (ARROW_PREDICT_FALSE(unsigned_value > max_negative)) {
        return false;
      }
      // To avoid both compiler warnings (with unsigned negation)
      // and undefined behaviour (with signed negation overflow),
      // use the expanded formula for 2's complement negation.
      *out = static_cast<value_type>(~unsigned_value + 1);
    } else {
      if (ARROW_PREDICT_FALSE(unsigned_value > max_positive)) {
        return false;
      }
      *out = static_cast<value_type>(unsigned_value);
    }
    return true;
  }
};
|
||||
|
||||
// Signed 8-bit integers: optional '-' sign, decimal or "0x" hex input.
template <>
struct StringConverter<Int8Type> : public StringToSignedIntConverterMixin<Int8Type> {
  using StringToSignedIntConverterMixin<Int8Type>::StringToSignedIntConverterMixin;
};
|
||||
|
||||
// Signed 16-bit integers: optional '-' sign, decimal or "0x" hex input.
template <>
struct StringConverter<Int16Type> : public StringToSignedIntConverterMixin<Int16Type> {
  using StringToSignedIntConverterMixin<Int16Type>::StringToSignedIntConverterMixin;
};
|
||||
|
||||
// Signed 32-bit integers: optional '-' sign, decimal or "0x" hex input.
template <>
struct StringConverter<Int32Type> : public StringToSignedIntConverterMixin<Int32Type> {
  using StringToSignedIntConverterMixin<Int32Type>::StringToSignedIntConverterMixin;
};
|
||||
|
||||
// Signed 64-bit integers: optional '-' sign, decimal or "0x" hex input.
template <>
struct StringConverter<Int64Type> : public StringToSignedIntConverterMixin<Int64Type> {
  using StringToSignedIntConverterMixin<Int64Type>::StringToSignedIntConverterMixin;
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Inline-able ISO-8601 parser
|
||||
|
||||
using ts_type = TimestampType::c_type;
|
||||
|
||||
template <typename Duration>
|
||||
static inline bool ParseHH(const char* s, Duration* out) {
|
||||
uint8_t hours = 0;
|
||||
if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
|
||||
return false;
|
||||
}
|
||||
if (ARROW_PREDICT_FALSE(hours >= 24)) {
|
||||
return false;
|
||||
}
|
||||
*out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours));
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename Duration>
|
||||
static inline bool ParseHH_MM(const char* s, Duration* out) {
|
||||
uint8_t hours = 0;
|
||||
uint8_t minutes = 0;
|
||||
if (ARROW_PREDICT_FALSE(s[2] != ':')) {
|
||||
return false;
|
||||
}
|
||||
if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
|
||||
return false;
|
||||
}
|
||||
if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 3, 2, &minutes))) {
|
||||
return false;
|
||||
}
|
||||
if (ARROW_PREDICT_FALSE(hours >= 24)) {
|
||||
return false;
|
||||
}
|
||||
if (ARROW_PREDICT_FALSE(minutes >= 60)) {
|
||||
return false;
|
||||
}
|
||||
*out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours) +
|
||||
std::chrono::minutes(minutes));
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename Duration>
|
||||
static inline bool ParseHHMM(const char* s, Duration* out) {
|
||||
uint8_t hours = 0;
|
||||
uint8_t minutes = 0;
|
||||
if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
|
||||
return false;
|
||||
}
|
||||
if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 2, 2, &minutes))) {
|
||||
return false;
|
||||
}
|
||||
if (ARROW_PREDICT_FALSE(hours >= 24)) {
|
||||
return false;
|
||||
}
|
||||
if (ARROW_PREDICT_FALSE(minutes >= 60)) {
|
||||
return false;
|
||||
}
|
||||
*out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours) +
|
||||
std::chrono::minutes(minutes));
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename Duration>
|
||||
static inline bool ParseHH_MM_SS(const char* s, Duration* out) {
|
||||
uint8_t hours = 0;
|
||||
uint8_t minutes = 0;
|
||||
uint8_t seconds = 0;
|
||||
if (ARROW_PREDICT_FALSE(s[2] != ':') || ARROW_PREDICT_FALSE(s[5] != ':')) {
|
||||
return false;
|
||||
}
|
||||
if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
|
||||
return false;
|
||||
}
|
||||
if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 3, 2, &minutes))) {
|
||||
return false;
|
||||
}
|
||||
if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 6, 2, &seconds))) {
|
||||
return false;
|
||||
}
|
||||
if (ARROW_PREDICT_FALSE(hours >= 24)) {
|
||||
return false;
|
||||
}
|
||||
if (ARROW_PREDICT_FALSE(minutes >= 60)) {
|
||||
return false;
|
||||
}
|
||||
if (ARROW_PREDICT_FALSE(seconds >= 60)) {
|
||||
return false;
|
||||
}
|
||||
*out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours) +
|
||||
std::chrono::minutes(minutes) +
|
||||
std::chrono::seconds(seconds));
|
||||
return true;
|
||||
}
|
||||
|
||||
// Parse the fractional-second digits of a timestamp/time value and scale them
// up to the requested unit (e.g. "12" with unit MICRO becomes 120000).
// `s`/`length` delimit the digit run only; the caller has already consumed
// the decimal point.  Returns false when `unit` is SECOND (no sub-second
// digits allowed), when more digits are given than the unit can represent,
// or when the digits fail to parse.
static inline bool ParseSubSeconds(const char* s, size_t length, TimeUnit::type unit,
                                   uint32_t* out) {
  // The decimal point has been peeled off at this point

  // Fail if number of decimal places provided exceeds what the unit can hold.
  // Calculate how many trailing decimal places are omitted for the unit
  // e.g. if 4 decimal places are provided and unit is MICRO, 2 are missing
  size_t omitted = 0;
  switch (unit) {
    case TimeUnit::MILLI:
      if (ARROW_PREDICT_FALSE(length > 3)) {
        return false;
      }
      if (length < 3) {
        omitted = 3 - length;
      }
      break;
    case TimeUnit::MICRO:
      if (ARROW_PREDICT_FALSE(length > 6)) {
        return false;
      }
      if (length < 6) {
        omitted = 6 - length;
      }
      break;
    case TimeUnit::NANO:
      if (ARROW_PREDICT_FALSE(length > 9)) {
        return false;
      }
      if (length < 9) {
        omitted = 9 - length;
      }
      break;
    default:
      // TimeUnit::SECOND (or an unknown unit) cannot carry sub-second digits
      return false;
  }

  if (ARROW_PREDICT_TRUE(omitted == 0)) {
    return ParseUnsigned(s, length, out);
  } else {
    uint32_t subseconds = 0;
    bool success = ParseUnsigned(s, length, &subseconds);
    if (ARROW_PREDICT_TRUE(success)) {
      // Multiply by 10^omitted to pad the value to the unit's full precision
      switch (omitted) {
        case 1:
          *out = subseconds * 10;
          break;
        case 2:
          *out = subseconds * 100;
          break;
        case 3:
          *out = subseconds * 1000;
          break;
        case 4:
          *out = subseconds * 10000;
          break;
        case 5:
          *out = subseconds * 100000;
          break;
        case 6:
          *out = subseconds * 1000000;
          break;
        case 7:
          *out = subseconds * 10000000;
          break;
        case 8:
          *out = subseconds * 100000000;
          break;
        default:
          // Impossible case: omitted is always in [1, 8] here
          break;
      }
      return true;
    } else {
      return false;
    }
  }
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Parse a "YYYY-MM-DD" date (exactly 10 characters; the caller checks the
// length) into a Duration since the UNIX epoch.  Returns false on malformed
// input or a date that does not exist on the civil calendar.
template <typename Duration>
static inline bool ParseYYYY_MM_DD(const char* s, Duration* since_epoch) {
  uint16_t year = 0;
  uint8_t month = 0;
  uint8_t day = 0;
  if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 4, &year))) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 5, 2, &month))) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 8, 2, &day))) {
    return false;
  }
  arrow_vendored::date::year_month_day ymd{arrow_vendored::date::year{year},
                                           arrow_vendored::date::month{month},
                                           arrow_vendored::date::day{day}};
  // ymd.ok() rejects out-of-range fields and impossible dates (e.g. Feb 30)
  if (ARROW_PREDICT_FALSE(!ymd.ok())) return false;

  *since_epoch = std::chrono::duration_cast<Duration>(
      arrow_vendored::date::sys_days{ymd}.time_since_epoch());
  return true;
}
|
||||
|
||||
// Parse an ISO-8601 timestamp into `unit` since the UNIX epoch.
// If `out_zone_offset_present` is non-null, it is set to true when the input
// carried an explicit zone designator ("Z" or a numeric offset).
static inline bool ParseTimestampISO8601(const char* s, size_t length,
                                         TimeUnit::type unit, TimestampType::c_type* out,
                                         bool* out_zone_offset_present = NULLPTR) {
  using seconds_type = std::chrono::duration<TimestampType::c_type>;

  // We allow the following zone offset formats:
  // - (none)
  // - Z
  // - [+-]HH(:?MM)?
  //
  // We allow the following formats for all units:
  // - "YYYY-MM-DD"
  // - "YYYY-MM-DD[ T]hhZ?"
  // - "YYYY-MM-DD[ T]hh:mmZ?"
  // - "YYYY-MM-DD[ T]hh:mm:ssZ?"
  //
  // We allow the following formats for unit == MILLI, MICRO, or NANO:
  // - "YYYY-MM-DD[ T]hh:mm:ss.s{1,3}Z?"
  //
  // We allow the following formats for unit == MICRO, or NANO:
  // - "YYYY-MM-DD[ T]hh:mm:ss.s{4,6}Z?"
  //
  // We allow the following formats for unit == NANO:
  // - "YYYY-MM-DD[ T]hh:mm:ss.s{7,9}Z?"
  //
  // UTC is always assumed, and the DataType's timezone is ignored.
  //

  // 10 characters is the bare "YYYY-MM-DD" date
  if (ARROW_PREDICT_FALSE(length < 10)) return false;

  seconds_type seconds_since_epoch;
  if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &seconds_since_epoch))) {
    return false;
  }

  if (length == 10) {
    *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count());
    return true;
  }

  // Date and time-of-day are separated by ' ' or 'T'
  if (ARROW_PREDICT_FALSE(s[10] != ' ') && ARROW_PREDICT_FALSE(s[10] != 'T')) {
    return false;
  }

  if (out_zone_offset_present) {
    *out_zone_offset_present = false;
  }

  // Peel a trailing zone designator off the end of the string, shrinking
  // `length` so the remainder is parsed as a plain local time below.
  seconds_type zone_offset(0);
  if (s[length - 1] == 'Z') {
    --length;
    if (out_zone_offset_present) *out_zone_offset_present = true;
  } else if (s[length - 3] == '+' || s[length - 3] == '-') {
    // [+-]HH
    length -= 3;
    if (ARROW_PREDICT_FALSE(!detail::ParseHH(s + length + 1, &zone_offset))) {
      return false;
    }
    // A positive offset means local time is ahead of UTC, so subtract it
    if (s[length] == '+') zone_offset *= -1;
    if (out_zone_offset_present) *out_zone_offset_present = true;
  } else if (s[length - 5] == '+' || s[length - 5] == '-') {
    // [+-]HHMM
    length -= 5;
    if (ARROW_PREDICT_FALSE(!detail::ParseHHMM(s + length + 1, &zone_offset))) {
      return false;
    }
    if (s[length] == '+') zone_offset *= -1;
    if (out_zone_offset_present) *out_zone_offset_present = true;
  } else if ((s[length - 6] == '+' || s[length - 6] == '-') && (s[length - 3] == ':')) {
    // [+-]HH:MM
    length -= 6;
    if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s + length + 1, &zone_offset))) {
      return false;
    }
    if (s[length] == '+') zone_offset *= -1;
    if (out_zone_offset_present) *out_zone_offset_present = true;
  }

  // The remaining length (zone designator removed) identifies the layout
  seconds_type seconds_since_midnight;
  switch (length) {
    case 13:  // YYYY-MM-DD[ T]hh
      if (ARROW_PREDICT_FALSE(!detail::ParseHH(s + 11, &seconds_since_midnight))) {
        return false;
      }
      break;
    case 16:  // YYYY-MM-DD[ T]hh:mm
      if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s + 11, &seconds_since_midnight))) {
        return false;
      }
      break;
    case 19:  // YYYY-MM-DD[ T]hh:mm:ss
    case 21:  // YYYY-MM-DD[ T]hh:mm:ss.s
    case 22:  // YYYY-MM-DD[ T]hh:mm:ss.ss
    case 23:  // YYYY-MM-DD[ T]hh:mm:ss.sss
    case 24:  // YYYY-MM-DD[ T]hh:mm:ss.ssss
    case 25:  // YYYY-MM-DD[ T]hh:mm:ss.sssss
    case 26:  // YYYY-MM-DD[ T]hh:mm:ss.ssssss
    case 27:  // YYYY-MM-DD[ T]hh:mm:ss.sssssss
    case 28:  // YYYY-MM-DD[ T]hh:mm:ss.ssssssss
    case 29:  // YYYY-MM-DD[ T]hh:mm:ss.sssssssss
      if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM_SS(s + 11, &seconds_since_midnight))) {
        return false;
      }
      break;
    default:
      // Note length 20 ("...ss." with no digits) falls through to here
      return false;
  }

  seconds_since_epoch += seconds_since_midnight;
  seconds_since_epoch += zone_offset;

  if (length <= 19) {
    *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count());
    return true;
  }

  if (ARROW_PREDICT_FALSE(s[19] != '.')) {
    return false;
  }

  uint32_t subseconds = 0;
  if (ARROW_PREDICT_FALSE(
          !detail::ParseSubSeconds(s + 20, length - 20, unit, &subseconds))) {
    return false;
  }

  *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count()) + subseconds;
  return true;
}
|
||||
|
||||
#if defined(_WIN32) || defined(ARROW_WITH_MUSL)
|
||||
static constexpr bool kStrptimeSupportsZone = false;
|
||||
#else
|
||||
static constexpr bool kStrptimeSupportsZone = true;
|
||||
#endif
|
||||
|
||||
/// \brief Returns time since the UNIX epoch in the requested unit
///
/// \param buf input characters (need not be nul-terminated)
/// \param length number of characters in `buf`
/// \param format a strptime()-style format string
/// \param ignore_time_in_day when true, only the parsed date contributes
/// \param allow_trailing_chars when false, fail unless strptime consumed
///        the entire input
/// \param unit target unit for the result
/// \param out receives the converted value on success
static inline bool ParseTimestampStrptime(const char* buf, size_t length,
                                          const char* format, bool ignore_time_in_day,
                                          bool allow_trailing_chars, TimeUnit::type unit,
                                          int64_t* out) {
  // NOTE: strptime() is more than 10x faster than arrow_vendored::date::parse().
  // The buffer may not be nul-terminated
  std::string clean_copy(buf, length);
  struct tm result;
  memset(&result, 0, sizeof(struct tm));
#ifdef _WIN32
  char* ret = arrow_strptime(clean_copy.c_str(), format, &result);
#else
  char* ret = strptime(clean_copy.c_str(), format, &result);
#endif
  if (ret == NULLPTR) {
    return false;
  }
  if (!allow_trailing_chars && static_cast<size_t>(ret - clean_copy.c_str()) != length) {
    return false;
  }
  // Date part; a tm_mday of 0 (field absent from the format) is clamped to 1
  arrow_vendored::date::sys_seconds secs =
      arrow_vendored::date::sys_days(arrow_vendored::date::year(result.tm_year + 1900) /
                                     (result.tm_mon + 1) / std::max(result.tm_mday, 1));
  if (!ignore_time_in_day) {
    secs += (std::chrono::hours(result.tm_hour) + std::chrono::minutes(result.tm_min) +
             std::chrono::seconds(result.tm_sec));
#if !defined(_WIN32) && !defined(_AIX)
    // Subtract the zone offset strptime recorded, yielding UTC
    secs -= std::chrono::seconds(result.tm_gmtoff);
#endif
  }
  *out = util::CastSecondsToUnit(unit, secs.time_since_epoch().count());
  return true;
}
|
||||
|
||||
template <>
struct StringConverter<TimestampType> {
  using value_type = int64_t;

  // Parses an ISO-8601 timestamp in the type's unit; per
  // ParseTimestampISO8601, UTC is assumed and the type's timezone is ignored.
  bool Convert(const TimestampType& type, const char* s, size_t length, value_type* out) {
    return ParseTimestampISO8601(s, length, type.unit(), out);
  }
};
|
||||
|
||||
// Durations are parsed as a plain signed integer count in the type's unit.
template <>
struct StringConverter<DurationType>
    : public StringToSignedIntConverterMixin<DurationType> {
  using StringToSignedIntConverterMixin<DurationType>::StringToSignedIntConverterMixin;
};
|
||||
|
||||
// Date parser: accepts exactly "YYYY-MM-DD".
template <typename DATE_TYPE>
struct StringConverter<DATE_TYPE, enable_if_date<DATE_TYPE>> {
  using value_type = typename DATE_TYPE::c_type;

  // Date32 stores days since epoch; Date64 stores milliseconds since epoch
  using duration_type =
      typename std::conditional<std::is_same<DATE_TYPE, Date32Type>::value,
                                arrow_vendored::date::days,
                                std::chrono::milliseconds>::type;

  bool Convert(const DATE_TYPE& type, const char* s, size_t length, value_type* out) {
    // Only the fixed-width "YYYY-MM-DD" form (10 characters) is accepted
    if (ARROW_PREDICT_FALSE(length != 10)) {
      return false;
    }

    duration_type since_epoch;
    if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &since_epoch))) {
      return false;
    }

    *out = static_cast<value_type>(since_epoch.count());
    return true;
  }
};
|
||||
|
||||
// Time-of-day parser; the accepted precision depends on the type's unit.
template <typename TIME_TYPE>
struct StringConverter<TIME_TYPE, enable_if_time<TIME_TYPE>> {
  using value_type = typename TIME_TYPE::c_type;

  // We allow the following formats for all units:
  // - "hh:mm"
  // - "hh:mm:ss"
  //
  // We allow the following formats for unit == MILLI, MICRO, or NANO:
  // - "hh:mm:ss.s{1,3}"
  //
  // We allow the following formats for unit == MICRO, or NANO:
  // - "hh:mm:ss.s{4,6}"
  //
  // We allow the following formats for unit == NANO:
  // - "hh:mm:ss.s{7,9}"

  bool Convert(const TIME_TYPE& type, const char* s, size_t length, value_type* out) {
    const auto unit = type.unit();
    std::chrono::seconds since_midnight;

    // Length 5 can only be "hh:mm"
    if (length == 5) {
      if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s, &since_midnight))) {
        return false;
      }
      *out =
          static_cast<value_type>(util::CastSecondsToUnit(unit, since_midnight.count()));
      return true;
    }

    // Anything else must start with a full "hh:mm:ss" (8 characters)
    if (ARROW_PREDICT_FALSE(length < 8)) {
      return false;
    }
    if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM_SS(s, &since_midnight))) {
      return false;
    }

    *out = static_cast<value_type>(util::CastSecondsToUnit(unit, since_midnight.count()));

    if (length == 8) {
      return true;
    }

    // Remaining characters are a '.' followed by sub-second digits
    if (ARROW_PREDICT_FALSE(s[8] != '.')) {
      return false;
    }

    uint32_t subseconds_count = 0;
    if (ARROW_PREDICT_FALSE(
            !detail::ParseSubSeconds(s + 9, length - 9, unit, &subseconds_count))) {
      return false;
    }

    // ParseSubSeconds already scaled the digits to `unit`
    *out += subseconds_count;
    return true;
  }
};
|
||||
|
||||
/// \brief Convenience wrappers around internal::StringConverter.
///
/// Parses `length` characters of `s` as a value of `type`, writing the
/// result to `out`.  Returns false on parse failure.
template <typename T>
bool ParseValue(const T& type, const char* s, size_t length,
                typename StringConverter<T>::value_type* out) {
  return StringConverter<T>{}.Convert(type, s, length, out);
}
|
||||
|
||||
/// Overload for parameter-free types: no type instance is required from the
/// caller; a function-local static instance is shared across calls.
template <typename T>
enable_if_parameter_free<T, bool> ParseValue(
    const char* s, size_t length, typename StringConverter<T>::value_type* out) {
  static T type;
  return StringConverter<T>{}.Convert(type, s, length, out);
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,172 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/util/algorithm.h"
|
||||
#include "arrow/util/functional.h"
|
||||
#include "arrow/util/logging.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
/// Return a copy of `values` with the element at `index` removed.
template <typename T>
std::vector<T> DeleteVectorElement(const std::vector<T>& values, size_t index) {
  ARROW_DCHECK(!values.empty());
  ARROW_DCHECK_LT(index, values.size());
  std::vector<T> out;
  out.reserve(values.size() - 1);
  // Copy everything before the deleted slot, then everything after it
  out.insert(out.end(), values.begin(), values.begin() + index);
  out.insert(out.end(), values.begin() + index + 1, values.end());
  return out;
}
|
||||
|
||||
/// Return a copy of `values` with `new_element` inserted at `index`.
template <typename T>
std::vector<T> AddVectorElement(const std::vector<T>& values, size_t index,
                                T new_element) {
  ARROW_DCHECK_LE(index, values.size());
  std::vector<T> out;
  out.reserve(values.size() + 1);
  // Copy the prefix, move the new element into place, then copy the suffix
  out.insert(out.end(), values.begin(), values.begin() + index);
  out.push_back(std::move(new_element));
  out.insert(out.end(), values.begin() + index, values.end());
  return out;
}
|
||||
|
||||
/// Return a copy of `values` with the element at `index` replaced by
/// `new_element`.
template <typename T>
std::vector<T> ReplaceVectorElement(const std::vector<T>& values, size_t index,
                                    T new_element) {
  ARROW_DCHECK_LE(index, values.size());
  std::vector<T> out;
  out.reserve(values.size());
  // Copy the prefix, move the replacement into place, then copy the suffix
  out.insert(out.end(), values.begin(), values.begin() + index);
  out.push_back(std::move(new_element));
  out.insert(out.end(), values.begin() + index + 1, values.end());
  return out;
}
|
||||
|
||||
/// Keep only the elements of `values` for which `predicate` returns true,
/// preserving their relative order.
template <typename T, typename Predicate>
std::vector<T> FilterVector(std::vector<T> values, Predicate&& predicate) {
  const auto is_rejected = [&](const T& element) { return !predicate(element); };
  // Classic erase-remove: compact the kept elements, then drop the tail
  values.erase(std::remove_if(values.begin(), values.end(), is_rejected),
               values.end());
  return values;
}
|
||||
|
||||
/// Apply `map` to every element of `source` and collect the results.
template <typename Fn, typename From,
          typename To = decltype(std::declval<Fn>()(std::declval<From>()))>
std::vector<To> MapVector(Fn&& map, const std::vector<From>& source) {
  std::vector<To> result;
  result.reserve(source.size());
  auto appender = std::back_inserter(result);
  std::transform(source.cbegin(), source.cend(), appender, std::forward<Fn>(map));
  return result;
}
|
||||
|
||||
/// Rvalue overload of MapVector: elements of `source` are moved into `map`.
template <typename Fn, typename From,
          typename To = decltype(std::declval<Fn>()(std::declval<From>()))>
std::vector<To> MapVector(Fn&& map, std::vector<From>&& source) {
  std::vector<To> result;
  result.reserve(source.size());
  // Move iterators hand each element to `map` as an rvalue
  auto first = std::make_move_iterator(source.begin());
  auto last = std::make_move_iterator(source.end());
  std::transform(first, last, std::back_inserter(result), std::forward<Fn>(map));
  return result;
}
|
||||
|
||||
/// \brief Like MapVector, but where the function can fail.
///
/// `map` returns a Result; the first non-OK Status encountered is returned
/// and the remaining elements are not processed.
template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
          typename To = typename internal::call_traits::return_type<Fn>::ValueType>
Result<std::vector<To>> MaybeMapVector(Fn&& map, const std::vector<From>& source) {
  std::vector<To> out;
  out.reserve(source.size());
  ARROW_RETURN_NOT_OK(MaybeTransform(source.begin(), source.end(),
                                     std::back_inserter(out), std::forward<Fn>(map)));
  return out;
}
|
||||
|
||||
/// Rvalue overload of MaybeMapVector: elements of `source` are moved into
/// `map`; the first non-OK Status is returned.
template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
          typename To = typename internal::call_traits::return_type<Fn>::ValueType>
Result<std::vector<To>> MaybeMapVector(Fn&& map, std::vector<From>&& source) {
  std::vector<To> out;
  out.reserve(source.size());
  ARROW_RETURN_NOT_OK(MaybeTransform(std::make_move_iterator(source.begin()),
                                     std::make_move_iterator(source.end()),
                                     std::back_inserter(out), std::forward<Fn>(map)));
  return std::move(out);
}
|
||||
|
||||
/// Concatenate the inner vectors of `vecs` into a single vector,
/// preserving order.
template <typename T>
std::vector<T> FlattenVectors(const std::vector<std::vector<T>>& vecs) {
  // Pre-compute the total element count so the output allocates only once
  std::size_t total_size = 0;
  for (const auto& inner : vecs) {
    total_size += inner.size();
  }
  std::vector<T> flattened;
  flattened.reserve(total_size);
  for (const auto& inner : vecs) {
    flattened.insert(flattened.end(), inner.begin(), inner.end());
  }
  return flattened;
}
|
||||
|
||||
/// Collect the values of a vector of Results, moving them out.
/// Returns the first non-OK Status encountered, if any.
template <typename T>
Result<std::vector<T>> UnwrapOrRaise(std::vector<Result<T>>&& results) {
  std::vector<T> out;
  out.reserve(results.size());
  auto end = std::make_move_iterator(results.end());
  for (auto it = std::make_move_iterator(results.begin()); it != end; it++) {
    if (!it->ok()) {
      return it->status();
    }
    // Safe: ok() was checked just above
    out.push_back(it->MoveValueUnsafe());
  }
  return out;
}
|
||||
|
||||
/// Collect the values of a vector of Results by copy.
/// Returns the first non-OK Status encountered, if any.
template <typename T>
Result<std::vector<T>> UnwrapOrRaise(const std::vector<Result<T>>& results) {
  std::vector<T> out;
  out.reserve(results.size());
  for (const auto& result : results) {
    if (!result.ok()) {
      return result.status();
    }
    // Safe: ok() was checked just above
    out.push_back(result.ValueUnsafe());
  }
  return out;
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,95 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#if defined(_WIN32) || defined(__CYGWIN__)
|
||||
// Windows
|
||||
|
||||
# if defined(_MSC_VER)
|
||||
# pragma warning(disable : 4251)
|
||||
# else
|
||||
# pragma GCC diagnostic ignored "-Wattributes"
|
||||
# endif
|
||||
|
||||
# if defined(__cplusplus) && defined(__GNUC__) && !defined(__clang__)
|
||||
// Use C++ attribute syntax where possible to avoid GCC parser bug
|
||||
// (https://stackoverflow.com/questions/57993818/gcc-how-to-combine-attribute-dllexport-and-nodiscard-in-a-struct-de)
|
||||
# define ARROW_DLLEXPORT [[gnu::dllexport]]
|
||||
# define ARROW_DLLIMPORT [[gnu::dllimport]]
|
||||
# else
|
||||
# define ARROW_DLLEXPORT __declspec(dllexport)
|
||||
# define ARROW_DLLIMPORT __declspec(dllimport)
|
||||
# endif
|
||||
|
||||
// _declspec(dllexport) even when #included by a non-arrow source
|
||||
# define ARROW_FORCE_EXPORT ARROW_DLLEXPORT
|
||||
|
||||
# ifdef ARROW_STATIC
|
||||
# define ARROW_EXPORT
|
||||
# define ARROW_FRIEND_EXPORT
|
||||
# define ARROW_TEMPLATE_EXPORT
|
||||
# elif defined(ARROW_EXPORTING)
|
||||
# define ARROW_EXPORT ARROW_DLLEXPORT
|
||||
// For some reason [[gnu::dllexport]] doesn't work well with friend declarations
|
||||
# define ARROW_FRIEND_EXPORT __declspec(dllexport)
|
||||
# define ARROW_TEMPLATE_EXPORT ARROW_DLLEXPORT
|
||||
# else
|
||||
# define ARROW_EXPORT ARROW_DLLIMPORT
|
||||
# define ARROW_FRIEND_EXPORT __declspec(dllimport)
|
||||
# define ARROW_TEMPLATE_EXPORT ARROW_DLLIMPORT
|
||||
# endif
|
||||
|
||||
# define ARROW_NO_EXPORT
|
||||
|
||||
#else
|
||||
|
||||
// Non-Windows
|
||||
|
||||
# if defined(__cplusplus) && (defined(__GNUC__) || defined(__clang__))
|
||||
# ifndef ARROW_EXPORT
|
||||
# define ARROW_EXPORT [[gnu::visibility("default")]]
|
||||
# endif
|
||||
# ifndef ARROW_NO_EXPORT
|
||||
# define ARROW_NO_EXPORT [[gnu::visibility("hidden")]]
|
||||
# endif
|
||||
// The C++ language does not have clear rules for how to export explicit template
|
||||
// instantiations, and clang/gcc have differing syntax. See
|
||||
// https://github.com/llvm/llvm-project/issues/29464 and
|
||||
// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0537r0.html
|
||||
# if defined(__clang__)
|
||||
# define ARROW_TEMPLATE_EXPORT
|
||||
# else
|
||||
# define ARROW_TEMPLATE_EXPORT ARROW_EXPORT
|
||||
# endif
|
||||
# else
|
||||
// Not C++, or not gcc/clang
|
||||
# ifndef ARROW_EXPORT
|
||||
# define ARROW_EXPORT
|
||||
# endif
|
||||
# ifndef ARROW_NO_EXPORT
|
||||
# define ARROW_NO_EXPORT
|
||||
# endif
|
||||
# define ARROW_TEMPLATE_EXPORT
|
||||
# endif
|
||||
|
||||
# define ARROW_FRIEND_EXPORT
|
||||
|
||||
// [[gnu::visibility("default")]] even when #included by a non-arrow source
|
||||
# define ARROW_FORCE_EXPORT [[gnu::visibility("default")]]
|
||||
|
||||
#endif // Non-Windows
|
||||
@@ -0,0 +1,39 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
// Windows defines min and max macros that mess up std::min/max
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
|
||||
// Set Windows 7 as a conservative minimum for Apache Arrow
|
||||
# if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x601
|
||||
# undef _WIN32_WINNT
|
||||
# endif
|
||||
# ifndef _WIN32_WINNT
|
||||
# define _WIN32_WINNT 0x601
|
||||
# endif
|
||||
|
||||
# include <winsock2.h>
|
||||
|
||||
# include "arrow/util/windows_fixup.h"
|
||||
|
||||
#endif // _WIN32
|
||||
@@ -0,0 +1,52 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This header needs to be included multiple times.
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
# ifdef max
|
||||
# undef max
|
||||
# endif
|
||||
# ifdef min
|
||||
# undef min
|
||||
# endif
|
||||
|
||||
// The Windows API defines macros from *File resolving to either
|
||||
// *FileA or *FileW. Need to undo them.
|
||||
# ifdef CopyFile
|
||||
# undef CopyFile
|
||||
# endif
|
||||
# ifdef CreateFile
|
||||
# undef CreateFile
|
||||
# endif
|
||||
# ifdef DeleteFile
|
||||
# undef DeleteFile
|
||||
# endif
|
||||
|
||||
// Other annoying Windows macro definitions...
|
||||
# ifdef IN
|
||||
# undef IN
|
||||
# endif
|
||||
# ifdef OUT
|
||||
# undef OUT
|
||||
# endif
|
||||
|
||||
// Note that we can't undefine OPTIONAL, because it can be used in other
|
||||
// Windows headers...
|
||||
|
||||
#endif // _WIN32
|
||||
Reference in New Issue
Block a user