Initial commit
This commit is contained in:
@@ -0,0 +1,323 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <iosfwd>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/compare.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
#include "arrow/visitor.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// User array accessor types
|
||||
|
||||
/// \brief Array base type
|
||||
/// Immutable data array with some logical type and some length.
|
||||
///
|
||||
/// Any memory is owned by the respective Buffer instance (or its parents).
|
||||
///
|
||||
/// The base class is only required to have a null bitmap buffer if the null
|
||||
/// count is greater than 0
|
||||
///
|
||||
/// If known, the null count can be provided in the base Array constructor. If
|
||||
/// the null count is not known, pass -1 to indicate that the null count is to
|
||||
/// be computed on the first call to null_count()
|
||||
class ARROW_EXPORT Array {
|
||||
public:
|
||||
virtual ~Array() = default;
|
||||
|
||||
/// \brief Return true if value at index is null. Does not boundscheck
|
||||
bool IsNull(int64_t i) const { return !IsValid(i); }
|
||||
|
||||
/// \brief Return true if value at index is valid (not null). Does not
|
||||
/// boundscheck
|
||||
bool IsValid(int64_t i) const {
|
||||
if (null_bitmap_data_ != NULLPTR) {
|
||||
return bit_util::GetBit(null_bitmap_data_, i + data_->offset);
|
||||
}
|
||||
// Dispatching with a few conditionals like this makes IsNull more
|
||||
// efficient for how it is used in practice. Making IsNull virtual
|
||||
// would add a vtable lookup to every call and prevent inlining +
|
||||
// a potential inner-branch removal.
|
||||
if (type_id() == Type::SPARSE_UNION) {
|
||||
return !internal::IsNullSparseUnion(*data_, i);
|
||||
}
|
||||
if (type_id() == Type::DENSE_UNION) {
|
||||
return !internal::IsNullDenseUnion(*data_, i);
|
||||
}
|
||||
if (type_id() == Type::RUN_END_ENCODED) {
|
||||
return !internal::IsNullRunEndEncoded(*data_, i);
|
||||
}
|
||||
return data_->null_count != data_->length;
|
||||
}
|
||||
|
||||
/// \brief Return a Scalar containing the value of this array at i
|
||||
Result<std::shared_ptr<Scalar>> GetScalar(int64_t i) const;
|
||||
|
||||
/// Size in the number of elements this array contains.
|
||||
int64_t length() const { return data_->length; }
|
||||
|
||||
/// A relative position into another array's data, to enable zero-copy
|
||||
/// slicing. This value defaults to zero
|
||||
int64_t offset() const { return data_->offset; }
|
||||
|
||||
/// The number of null entries in the array. If the null count was not known
|
||||
/// at time of construction (and set to a negative value), then the null
|
||||
/// count will be computed and cached on the first invocation of this
|
||||
/// function
|
||||
int64_t null_count() const;
|
||||
|
||||
/// \brief Computes the logical null count for arrays of all types including
|
||||
/// those that do not have a validity bitmap like union and run-end encoded
|
||||
/// arrays
|
||||
///
|
||||
/// If the array has a validity bitmap, this function behaves the same as
|
||||
/// null_count(). For types that have no validity bitmap, this function will
|
||||
/// recompute the null count every time it is called.
|
||||
///
|
||||
/// \see GetNullCount
|
||||
int64_t ComputeLogicalNullCount() const;
|
||||
|
||||
const std::shared_ptr<DataType>& type() const { return data_->type; }
|
||||
Type::type type_id() const { return data_->type->id(); }
|
||||
|
||||
/// Buffer for the validity (null) bitmap, if any. Note that Union types
|
||||
/// never have a null bitmap.
|
||||
///
|
||||
/// Note that for `null_count == 0` or for null type, this will be null.
|
||||
/// This buffer does not account for any slice offset
|
||||
const std::shared_ptr<Buffer>& null_bitmap() const { return data_->buffers[0]; }
|
||||
|
||||
/// Raw pointer to the null bitmap.
|
||||
///
|
||||
/// Note that for `null_count == 0` or for null type, this will be null.
|
||||
/// This buffer does not account for any slice offset
|
||||
const uint8_t* null_bitmap_data() const { return null_bitmap_data_; }
|
||||
|
||||
/// Equality comparison with another array
|
||||
///
|
||||
/// Note that arrow::ArrayStatistics is not included in the comparison.
|
||||
bool Equals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
bool Equals(const std::shared_ptr<Array>& arr,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
|
||||
/// \brief Return the formatted unified diff of arrow::Diff between this
|
||||
/// Array and another Array
|
||||
std::string Diff(const Array& other) const;
|
||||
|
||||
/// Approximate equality comparison with another array
|
||||
///
|
||||
/// epsilon is only used if this is FloatArray or DoubleArray
|
||||
///
|
||||
/// Note that arrow::ArrayStatistics is not included in the comparison.
|
||||
bool ApproxEquals(const std::shared_ptr<Array>& arr,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
bool ApproxEquals(const Array& arr,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
|
||||
/// Compare if the range of slots specified are equal for the given array and
|
||||
/// this array. end_idx exclusive. This methods does not bounds check.
|
||||
///
|
||||
/// Note that arrow::ArrayStatistics is not included in the comparison.
|
||||
bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
|
||||
const Array& other,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
|
||||
const std::shared_ptr<Array>& other,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
|
||||
int64_t other_start_idx,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
bool RangeEquals(const std::shared_ptr<Array>& other, int64_t start_idx,
|
||||
int64_t end_idx, int64_t other_start_idx,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
|
||||
/// \brief Apply the ArrayVisitor::Visit() method specialized to the array type
|
||||
Status Accept(ArrayVisitor* visitor) const;
|
||||
|
||||
/// Construct a zero-copy view of this array with the given type.
|
||||
///
|
||||
/// This method checks if the types are layout-compatible.
|
||||
/// Nested types are traversed in depth-first order. Data buffers must have
|
||||
/// the same item sizes, even though the logical types may be different.
|
||||
/// An error is returned if the types are not layout-compatible.
|
||||
Result<std::shared_ptr<Array>> View(const std::shared_ptr<DataType>& type) const;
|
||||
|
||||
/// \brief Construct a copy of the array with all buffers on destination
|
||||
/// Memory Manager
|
||||
///
|
||||
/// This method recursively copies the array's buffers and those of its children
|
||||
/// onto the destination MemoryManager device and returns the new Array.
|
||||
Result<std::shared_ptr<Array>> CopyTo(const std::shared_ptr<MemoryManager>& to) const;
|
||||
|
||||
/// \brief Construct a new array attempting to zero-copy view if possible.
|
||||
///
|
||||
/// Like CopyTo this method recursively goes through all of the array's buffers
|
||||
/// and those of it's children and first attempts to create zero-copy
|
||||
/// views on the destination MemoryManager device. If it can't, it falls back
|
||||
/// to performing a copy. See Buffer::ViewOrCopy.
|
||||
Result<std::shared_ptr<Array>> ViewOrCopyTo(
|
||||
const std::shared_ptr<MemoryManager>& to) const;
|
||||
|
||||
/// Construct a zero-copy slice of the array with the indicated offset and
|
||||
/// length
|
||||
///
|
||||
/// \param[in] offset the position of the first element in the constructed
|
||||
/// slice
|
||||
/// \param[in] length the length of the slice. If there are not enough
|
||||
/// elements in the array, the length will be adjusted accordingly
|
||||
///
|
||||
/// \return a new object wrapped in std::shared_ptr<Array>
|
||||
std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const;
|
||||
|
||||
/// Slice from offset until end of the array
|
||||
std::shared_ptr<Array> Slice(int64_t offset) const;
|
||||
|
||||
/// Input-checking variant of Array::Slice
|
||||
Result<std::shared_ptr<Array>> SliceSafe(int64_t offset, int64_t length) const;
|
||||
/// Input-checking variant of Array::Slice
|
||||
Result<std::shared_ptr<Array>> SliceSafe(int64_t offset) const;
|
||||
|
||||
const std::shared_ptr<ArrayData>& data() const { return data_; }
|
||||
|
||||
int num_fields() const { return static_cast<int>(data_->child_data.size()); }
|
||||
|
||||
/// \return PrettyPrint representation of array suitable for debugging
|
||||
std::string ToString() const;
|
||||
|
||||
/// \brief Perform cheap validation checks to determine obvious inconsistencies
|
||||
/// within the array's internal data.
|
||||
///
|
||||
/// This is O(k) where k is the number of descendents.
|
||||
///
|
||||
/// \return Status
|
||||
Status Validate() const;
|
||||
|
||||
/// \brief Perform extensive validation checks to determine inconsistencies
|
||||
/// within the array's internal data.
|
||||
///
|
||||
/// This is potentially O(k*n) where k is the number of descendents and n
|
||||
/// is the array length.
|
||||
///
|
||||
/// \return Status
|
||||
Status ValidateFull() const;
|
||||
|
||||
/// \brief Return the device_type that this array's data is allocated on
|
||||
///
|
||||
/// This just delegates to calling device_type on the underlying ArrayData
|
||||
/// object which backs this Array.
|
||||
///
|
||||
/// \return DeviceAllocationType
|
||||
DeviceAllocationType device_type() const { return data_->device_type(); }
|
||||
|
||||
/// \brief Return the statistics of this Array
|
||||
///
|
||||
/// This just delegates to calling statistics on the underlying ArrayData
|
||||
/// object which backs this Array.
|
||||
///
|
||||
/// \return const std::shared_ptr<ArrayStatistics>&
|
||||
const std::shared_ptr<ArrayStatistics>& statistics() const { return data_->statistics; }
|
||||
|
||||
protected:
|
||||
Array() = default;
|
||||
ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);
|
||||
|
||||
std::shared_ptr<ArrayData> data_;
|
||||
const uint8_t* null_bitmap_data_ = NULLPTR;
|
||||
|
||||
/// Protected method for constructors
|
||||
void SetData(const std::shared_ptr<ArrayData>& data) {
|
||||
if (data->buffers.size() > 0) {
|
||||
null_bitmap_data_ = data->GetValuesSafe<uint8_t>(0, /*offset=*/0);
|
||||
} else {
|
||||
null_bitmap_data_ = NULLPTR;
|
||||
}
|
||||
data_ = data;
|
||||
}
|
||||
|
||||
private:
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(Array);
|
||||
};
|
||||
|
||||
ARROW_EXPORT void PrintTo(const Array& x, std::ostream* os);
|
||||
|
||||
/// Stream the ToString() representation of an Array into the given ostream.
static inline std::ostream& operator<<(std::ostream& os, const Array& x) {
  return os << x.ToString();
}
|
||||
|
||||
/// Base class for non-nested arrays
|
||||
class ARROW_EXPORT FlatArray : public Array {
 protected:
  // Re-expose the base-class constructors to subclasses.
  using Array::Array;
};
|
||||
|
||||
/// Base class for arrays of fixed-size logical types
|
||||
class ARROW_EXPORT PrimitiveArray : public FlatArray {
 public:
  /// The values buffer (buffer 1 of the ArrayData).
  /// Does not account for any slice offset
  const std::shared_ptr<Buffer>& values() const { return data_->buffers[1]; }

 protected:
  PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t length,
                 const std::shared_ptr<Buffer>& data,
                 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                 int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  PrimitiveArray() : raw_values_(NULLPTR) {}

  /// Protected method for constructors: run the base-class setup, then cache
  /// the raw pointer of buffer 1 (requested with an explicit offset of 0).
  void SetData(const std::shared_ptr<ArrayData>& data) {
    Array::SetData(data);
    raw_values_ = data->GetValuesSafe<uint8_t>(1, /*offset=*/0);
  }

  explicit PrimitiveArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }

  // Raw pointer into buffer 1, refreshed by SetData().
  const uint8_t* raw_values_;
};
|
||||
|
||||
/// Degenerate null type Array
|
||||
class ARROW_EXPORT NullArray : public FlatArray {
 public:
  using TypeClass = NullType;

  explicit NullArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
  explicit NullArray(int64_t length);

 private:
  /// Store the ArrayData after forcing its null count to equal its length;
  /// no validity bitmap pointer is kept for this array type.
  void SetData(const std::shared_ptr<ArrayData>& data) {
    data->null_count = data->length;
    data_ = data;
    null_bitmap_data_ = NULLPTR;
  }
};
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,321 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Array accessor classes for Binary, LargeBinary, String, LargeString,
|
||||
// FixedSizeBinary
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/stl_iterator.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup binary-arrays
|
||||
///
|
||||
/// @{
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Binary and String
|
||||
|
||||
/// Base class for variable-sized binary arrays, regardless of offset size
|
||||
/// and logical interpretation.
|
||||
template <typename TYPE>
|
||||
class BaseBinaryArray : public FlatArray {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename TypeClass::offset_type;
|
||||
using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>;
|
||||
|
||||
/// Return the pointer to the given elements bytes
|
||||
// XXX should GetValue(int64_t i) return a string_view?
|
||||
const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
|
||||
const offset_type pos = raw_value_offsets_[i];
|
||||
*out_length = raw_value_offsets_[i + 1] - pos;
|
||||
return raw_data_ + pos;
|
||||
}
|
||||
|
||||
/// \brief Get binary value as a string_view
|
||||
///
|
||||
/// \param i the value index
|
||||
/// \return the view over the selected value
|
||||
std::string_view GetView(int64_t i) const {
|
||||
const offset_type pos = raw_value_offsets_[i];
|
||||
return std::string_view(reinterpret_cast<const char*>(raw_data_ + pos),
|
||||
raw_value_offsets_[i + 1] - pos);
|
||||
}
|
||||
|
||||
std::optional<std::string_view> operator[](int64_t i) const {
|
||||
return *IteratorType(*this, i);
|
||||
}
|
||||
|
||||
/// \brief Get binary value as a string_view
|
||||
/// Provided for consistency with other arrays.
|
||||
///
|
||||
/// \param i the value index
|
||||
/// \return the view over the selected value
|
||||
std::string_view Value(int64_t i) const { return GetView(i); }
|
||||
|
||||
/// \brief Get binary value as a std::string
|
||||
///
|
||||
/// \param i the value index
|
||||
/// \return the value copied into a std::string
|
||||
std::string GetString(int64_t i) const { return std::string(GetView(i)); }
|
||||
|
||||
/// Note that this buffer does not account for any slice offset
|
||||
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
|
||||
|
||||
/// Note that this buffer does not account for any slice offset
|
||||
std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; }
|
||||
|
||||
const offset_type* raw_value_offsets() const { return raw_value_offsets_; }
|
||||
|
||||
const uint8_t* raw_data() const { return raw_data_; }
|
||||
|
||||
/// \brief Return the data buffer absolute offset of the data for the value
|
||||
/// at the passed index.
|
||||
///
|
||||
/// Does not perform boundschecking
|
||||
offset_type value_offset(int64_t i) const { return raw_value_offsets_[i]; }
|
||||
|
||||
/// \brief Return the length of the data for the value at the passed index.
|
||||
///
|
||||
/// Does not perform boundschecking
|
||||
offset_type value_length(int64_t i) const {
|
||||
return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
|
||||
}
|
||||
|
||||
/// \brief Return the total length of the memory in the data buffer
|
||||
/// referenced by this array. If the array has been sliced then this may be
|
||||
/// less than the size of the data buffer (data_->buffers[2]).
|
||||
offset_type total_values_length() const {
|
||||
if (data_->length > 0) {
|
||||
return raw_value_offsets_[data_->length] - raw_value_offsets_[0];
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
IteratorType begin() const { return IteratorType(*this); }
|
||||
|
||||
IteratorType end() const { return IteratorType(*this, length()); }
|
||||
|
||||
protected:
|
||||
// For subclasses
|
||||
BaseBinaryArray() = default;
|
||||
|
||||
// Protected method for constructors
|
||||
void SetData(const std::shared_ptr<ArrayData>& data) {
|
||||
this->Array::SetData(data);
|
||||
raw_value_offsets_ = data->GetValuesSafe<offset_type>(1);
|
||||
raw_data_ = data->GetValuesSafe<uint8_t>(2, /*offset=*/0);
|
||||
}
|
||||
|
||||
const offset_type* raw_value_offsets_ = NULLPTR;
|
||||
const uint8_t* raw_data_ = NULLPTR;
|
||||
};
|
||||
|
||||
/// Concrete Array class for variable-size binary data
|
||||
class ARROW_EXPORT BinaryArray : public BaseBinaryArray<BinaryType> {
 public:
  /// Construct from an existing ArrayData instance
  explicit BinaryArray(const std::shared_ptr<ArrayData>& data);

  /// Construct from individual offsets / data / validity buffers
  BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
              const std::shared_ptr<Buffer>& data,
              const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
              int64_t null_count = kUnknownNullCount, int64_t offset = 0);

 protected:
  // For subclasses such as StringArray
  BinaryArray() : BaseBinaryArray() {}
};
|
||||
|
||||
/// Concrete Array class for variable-size string (utf-8) data
|
||||
class ARROW_EXPORT StringArray : public BinaryArray {
 public:
  using TypeClass = StringType;

  /// Construct from an existing ArrayData instance
  explicit StringArray(const std::shared_ptr<ArrayData>& data);

  /// Construct from individual offsets / data / validity buffers
  StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
              const std::shared_ptr<Buffer>& data,
              const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
              int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Validate that this array contains only valid UTF8 entries
  ///
  /// This check is also implied by ValidateFull()
  Status ValidateUTF8() const;
};
|
||||
|
||||
/// Concrete Array class for large variable-size binary data
|
||||
class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray<LargeBinaryType> {
 public:
  /// Construct from an existing ArrayData instance
  explicit LargeBinaryArray(const std::shared_ptr<ArrayData>& data);

  /// Construct from individual offsets / data / validity buffers
  LargeBinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
                   const std::shared_ptr<Buffer>& data,
                   const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                   int64_t null_count = kUnknownNullCount, int64_t offset = 0);

 protected:
  // For subclasses such as LargeStringArray
  LargeBinaryArray() : BaseBinaryArray() {}
};
|
||||
|
||||
/// Concrete Array class for large variable-size string (utf-8) data
|
||||
class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
 public:
  using TypeClass = LargeStringType;

  /// Construct from an existing ArrayData instance
  explicit LargeStringArray(const std::shared_ptr<ArrayData>& data);

  /// Construct from individual offsets / data / validity buffers
  LargeStringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
                   const std::shared_ptr<Buffer>& data,
                   const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                   int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Validate that this array contains only valid UTF8 entries
  ///
  /// This check is also implied by ValidateFull()
  Status ValidateUTF8() const;
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// BinaryView and StringView
|
||||
|
||||
/// Concrete Array class for variable-size binary view data using the
|
||||
/// BinaryViewType::c_type struct to reference in-line or out-of-line string values
|
||||
class ARROW_EXPORT BinaryViewArray : public FlatArray {
 public:
  using TypeClass = BinaryViewType;
  using IteratorType = stl::ArrayIterator<BinaryViewArray>;
  using c_type = BinaryViewType::c_type;

  /// Construct from an existing ArrayData instance
  explicit BinaryViewArray(std::shared_ptr<ArrayData> data);

  /// Construct from a views buffer, the data buffers and an optional
  /// validity bitmap
  BinaryViewArray(std::shared_ptr<DataType> type, int64_t length,
                  std::shared_ptr<Buffer> views, BufferVector data_buffers,
                  std::shared_ptr<Buffer> null_bitmap = NULLPTR,
                  int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  // For API compatibility with BinaryArray etc.
  std::string_view GetView(int64_t i) const;
  std::string GetString(int64_t i) const { return std::string{GetView(i)}; }

  /// The views buffer (buffer 1 of the ArrayData)
  const auto& values() const { return data_->buffers[1]; }
  /// Raw pointer to the view structs, as cached by SetData()
  const c_type* raw_values() const { return raw_values_; }

  /// Dereference an iterator positioned at index i, yielding an optional view.
  std::optional<std::string_view> operator[](int64_t i) const {
    return *IteratorType(*this, i);
  }

  IteratorType begin() const { return IteratorType(*this); }
  IteratorType end() const { return IteratorType(*this, length()); }

 protected:
  using FlatArray::FlatArray;

  /// Protected method for constructors: run the base-class setup, then cache
  /// the raw pointer of buffer 1 reinterpreted as c_type.
  void SetData(std::shared_ptr<ArrayData> data) {
    FlatArray::SetData(std::move(data));
    // `data` may have been moved from; read through the stored data_ instead.
    raw_values_ = data_->GetValuesSafe<c_type>(1);
  }

  // Default-initialized so raw_values() never exposes an indeterminate pointer
  // if a construction path does not go through SetData(); this matches the
  // NULLPTR defaults used by BaseBinaryArray's raw pointers.
  const c_type* raw_values_ = NULLPTR;
};
|
||||
|
||||
/// Concrete Array class for variable-size string view (utf-8) data using
|
||||
/// BinaryViewType::c_type to reference in-line or out-of-line string values
|
||||
class ARROW_EXPORT StringViewArray : public BinaryViewArray {
 public:
  using TypeClass = StringViewType;

  /// Construct from an existing ArrayData instance
  explicit StringViewArray(std::shared_ptr<ArrayData> data);

  // Inherit the remaining BinaryViewArray constructors.
  using BinaryViewArray::BinaryViewArray;

  /// \brief Validate that this array contains only valid UTF8 entries
  ///
  /// This check is also implied by ValidateFull()
  Status ValidateUTF8() const;
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Fixed width binary
|
||||
|
||||
/// Concrete Array class for fixed-size binary data
|
||||
class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
 public:
  using TypeClass = FixedSizeBinaryType;
  using IteratorType = stl::ArrayIterator<FixedSizeBinaryArray>;

  /// Construct from an existing ArrayData instance
  explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data);

  /// Construct from a values buffer and an optional validity bitmap
  FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
                       const std::shared_ptr<Buffer>& data,
                       const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                       int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// Pointer to the first byte of the value at index i: the slice-adjusted
  /// base pointer advanced by i * byte_width(). No bounds check is performed
  /// here.
  const uint8_t* GetValue(int64_t i) const { return values_ + i * byte_width_; }
  /// Same as GetValue(i)
  const uint8_t* Value(int64_t i) const { return GetValue(i); }

  /// View of byte_width() bytes starting at the value at index i
  std::string_view GetView(int64_t i) const {
    return std::string_view(reinterpret_cast<const char*>(GetValue(i)), byte_width_);
  }

  /// Dereference an iterator positioned at index i, yielding an optional view.
  std::optional<std::string_view> operator[](int64_t i) const {
    return *IteratorType(*this, i);
  }

  /// The value at index i copied into a std::string
  std::string GetString(int64_t i) const { return std::string(GetView(i)); }

  /// Width in bytes of each value, as reported by the FixedSizeBinaryType
  int32_t byte_width() const { return byte_width_; }

  /// Base pointer of the values, already advanced by the slice offset
  const uint8_t* raw_values() const { return values_; }

  IteratorType begin() const { return IteratorType(*this); }

  IteratorType end() const { return IteratorType(*this, length()); }

 protected:
  /// Protected method for constructors: run the PrimitiveArray setup, read the
  /// byte width from the type, then pre-compute the slice-adjusted base pointer.
  void SetData(const std::shared_ptr<ArrayData>& data) {
    this->PrimitiveArray::SetData(data);
    byte_width_ =
        internal::checked_cast<const FixedSizeBinaryType&>(*type()).byte_width();
    values_ = raw_values_ + data_->offset * byte_width_;
  }

  // Default-initialized so the accessors never expose indeterminate values if
  // a construction path does not go through SetData(); SetData() overwrites
  // both members.
  const uint8_t* values_ = NULLPTR;
  int32_t byte_width_ = 0;
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,104 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "arrow/array/array_binary.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup numeric-arrays
|
||||
///
|
||||
/// @{
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Decimal32Array
|
||||
|
||||
/// Concrete Array class for 32-bit decimal data
|
||||
class ARROW_EXPORT Decimal32Array : public FixedSizeBinaryArray {
 public:
  using TypeClass = Decimal32Type;

  // Inherit the FixedSizeBinaryArray constructors.
  using FixedSizeBinaryArray::FixedSizeBinaryArray;

  /// \brief Construct Decimal32Array from ArrayData instance
  explicit Decimal32Array(const std::shared_ptr<ArrayData>& data);

  /// \param i the value index
  std::string FormatValue(int64_t i) const;
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Decimal64Array
|
||||
|
||||
/// Concrete Array class for 64-bit decimal data
|
||||
class ARROW_EXPORT Decimal64Array : public FixedSizeBinaryArray {
 public:
  using TypeClass = Decimal64Type;

  // Inherit the FixedSizeBinaryArray constructors.
  using FixedSizeBinaryArray::FixedSizeBinaryArray;

  /// \brief Construct Decimal64Array from ArrayData instance
  explicit Decimal64Array(const std::shared_ptr<ArrayData>& data);

  /// \param i the value index
  std::string FormatValue(int64_t i) const;
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Decimal128Array
|
||||
|
||||
/// Concrete Array class for 128-bit decimal data
|
||||
class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray {
 public:
  using TypeClass = Decimal128Type;

  // Inherit the FixedSizeBinaryArray constructors.
  using FixedSizeBinaryArray::FixedSizeBinaryArray;

  /// \brief Construct Decimal128Array from ArrayData instance
  explicit Decimal128Array(const std::shared_ptr<ArrayData>& data);

  /// \param i the value index
  std::string FormatValue(int64_t i) const;
};
|
||||
|
||||
// Backward compatibility
|
||||
using DecimalArray = Decimal128Array;
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Decimal256Array
|
||||
|
||||
/// Concrete Array class for 256-bit decimal data
|
||||
class ARROW_EXPORT Decimal256Array : public FixedSizeBinaryArray {
 public:
  using TypeClass = Decimal256Type;

  // Inherit the FixedSizeBinaryArray constructors.
  using FixedSizeBinaryArray::FixedSizeBinaryArray;

  /// \brief Construct Decimal256Array from ArrayData instance
  explicit Decimal256Array(const std::shared_ptr<ArrayData>& data);

  /// \param i the value index
  std::string FormatValue(int64_t i) const;
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,182 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// DictionaryArray
|
||||
|
||||
/// \brief Array type for dictionary-encoded data with a
|
||||
/// data-dependent dictionary
|
||||
///
|
||||
/// A dictionary array contains an array of non-negative integers (the
|
||||
/// "dictionary indices") along with a data type containing a "dictionary"
|
||||
/// corresponding to the distinct values represented in the data.
|
||||
///
|
||||
/// For example, the array
|
||||
///
|
||||
/// ["foo", "bar", "foo", "bar", "foo", "bar"]
|
||||
///
|
||||
/// with dictionary ["bar", "foo"], would have dictionary array representation
|
||||
///
|
||||
/// indices: [1, 0, 1, 0, 1, 0]
|
||||
/// dictionary: ["bar", "foo"]
|
||||
///
|
||||
/// The indices in principle may be any integer type.
|
||||
class ARROW_EXPORT DictionaryArray : public Array {
 public:
  using TypeClass = DictionaryType;

  /// \brief Construct DictionaryArray from an ArrayData instance
  explicit DictionaryArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct DictionaryArray from a type, an indices array and a
  /// dictionary array
  ///
  /// Being a constructor, this cannot report failures through a Result; use
  /// FromArrays() when the inputs need to be validated.
  DictionaryArray(const std::shared_ptr<DataType>& type,
                  const std::shared_ptr<Array>& indices,
                  const std::shared_ptr<Array>& dictionary);

  /// \brief Construct DictionaryArray from dictionary and indices
  /// array and validate
  ///
  /// This function does the validation of the indices and input type. It checks if
  /// all indices are non-negative and smaller than the size of the dictionary.
  ///
  /// \param[in] type a dictionary type
  /// \param[in] indices an array of non-negative integers smaller than the
  /// size of the dictionary
  /// \param[in] dictionary the dictionary with same value type as the
  /// type object
  static Result<std::shared_ptr<Array>> FromArrays(
      const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices,
      const std::shared_ptr<Array>& dictionary);

  /// \brief Convenience overload that derives the dictionary type from the
  /// types of the two input arrays, then defers to the overload above
  ///
  /// Both pointers are dereferenced unconditionally, so neither may be null.
  ///
  /// \param[in] indices the dictionary indices
  /// \param[in] dictionary the dictionary values
  static Result<std::shared_ptr<Array>> FromArrays(
      const std::shared_ptr<Array>& indices, const std::shared_ptr<Array>& dictionary) {
    return FromArrays(::arrow::dictionary(indices->type(), dictionary->type()), indices,
                      dictionary);
  }

  /// \brief Transpose this DictionaryArray
  ///
  /// This method constructs a new dictionary array with the given dictionary
  /// type, transposing indices using the transpose map. The type and the
  /// transpose map are typically computed using DictionaryUnifier.
  ///
  /// \param[in] type the new type object
  /// \param[in] dictionary the new dictionary
  /// \param[in] transpose_map transposition array of this array's indices
  /// into the target array's indices
  /// \param[in] pool a pool to allocate the array data from
  Result<std::shared_ptr<Array>> Transpose(
      const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
      const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const;

  /// \brief Compact this DictionaryArray
  ///
  /// NOTE(review): undocumented in this header; presumably returns an array
  /// whose dictionary no longer holds values that no index refers to -- confirm
  /// against the implementation.
  ///
  /// \param[in] pool a pool to allocate the array data from
  Result<std::shared_ptr<Array>> Compact(MemoryPool* pool = default_memory_pool()) const;

  /// \brief Determine whether dictionary arrays may be compared without unification
  bool CanCompareIndices(const DictionaryArray& other) const;

  /// \brief Return the dictionary for this array, which is stored as
  /// a member of the ArrayData internal structure
  const std::shared_ptr<Array>& dictionary() const;

  /// \brief Return the array of dictionary indices
  const std::shared_ptr<Array>& indices() const;

  /// \brief Return the ith value of indices, cast to int64_t. Not recommended
  /// for use in performance-sensitive code. Does not validate whether the
  /// value is null or out-of-bounds.
  int64_t GetValueIndex(int64_t i) const;

  /// \brief Return the cached pointer to this array's DictionaryType
  const DictionaryType* dict_type() const { return dict_type_; }

 private:
  void SetData(const std::shared_ptr<ArrayData>& data);

  const DictionaryType* dict_type_;
  std::shared_ptr<Array> indices_;

  // Lazily initialized when invoking dictionary()
  mutable std::shared_ptr<Array> dictionary_;
};
|
||||
|
||||
/// \brief Helper class for incremental dictionary unification
|
||||
class ARROW_EXPORT DictionaryUnifier {
|
||||
public:
|
||||
virtual ~DictionaryUnifier() = default;
|
||||
|
||||
/// \brief Construct a DictionaryUnifier
|
||||
/// \param[in] value_type the data type of the dictionaries
|
||||
/// \param[in] pool MemoryPool to use for memory allocations
|
||||
static Result<std::unique_ptr<DictionaryUnifier>> Make(
|
||||
std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Unify dictionaries across array chunks
|
||||
///
|
||||
/// The dictionaries in the array chunks will be unified, their indices
|
||||
/// accordingly transposed.
|
||||
///
|
||||
/// Only dictionaries with a primitive value type are currently supported.
|
||||
/// However, dictionaries nested inside a more complex type are correctly unified.
|
||||
static Result<std::shared_ptr<ChunkedArray>> UnifyChunkedArray(
|
||||
const std::shared_ptr<ChunkedArray>& array,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Unify dictionaries across the chunks of each table column
|
||||
///
|
||||
/// The dictionaries in each table column will be unified, their indices
|
||||
/// accordingly transposed.
|
||||
///
|
||||
/// Only dictionaries with a primitive value type are currently supported.
|
||||
/// However, dictionaries nested inside a more complex type are correctly unified.
|
||||
static Result<std::shared_ptr<Table>> UnifyTable(
|
||||
const Table& table, MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Append dictionary to the internal memo
|
||||
virtual Status Unify(const Array& dictionary) = 0;
|
||||
|
||||
/// \brief Append dictionary and compute transpose indices
|
||||
/// \param[in] dictionary the dictionary values to unify
|
||||
/// \param[out] out_transpose a Buffer containing computed transpose indices
|
||||
/// as int32_t values equal in length to the passed dictionary. The value in
|
||||
/// each slot corresponds to the new index value for each original index
|
||||
/// for a DictionaryArray with the old dictionary
|
||||
virtual Status Unify(const Array& dictionary,
|
||||
std::shared_ptr<Buffer>* out_transpose) = 0;
|
||||
|
||||
/// \brief Return a result DictionaryType with the smallest possible index
|
||||
/// type to accommodate the unified dictionary. The unifier cannot be used
|
||||
/// after this is called
|
||||
virtual Status GetResult(std::shared_ptr<DataType>* out_type,
|
||||
std::shared_ptr<Array>* out_dict) = 0;
|
||||
|
||||
/// \brief Return a unified dictionary with the given index type. If
|
||||
/// the index type is not large enough then an invalid status will be returned.
|
||||
/// The unifier cannot be used after this is called
|
||||
virtual Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
|
||||
std::shared_ptr<Array>* out_dict) = 0;
|
||||
};
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,887 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Array accessor classes for List, LargeList, ListView, LargeListView, FixedSizeList,
|
||||
// Map, Struct, and Union
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup nested-arrays
|
||||
///
|
||||
/// @{
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// VarLengthListLikeArray
|
||||
|
||||
// Forward declaration: internal::SetListData() takes a pointer to this class
// template before the template itself is defined.
template <typename TYPE>
class VarLengthListLikeArray;
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Private helper for [Large]List[View]Array::SetData.
|
||||
// Unfortunately, trying to define VarLengthListLikeArray::SetData outside of this header
|
||||
// doesn't play well with MSVC.
|
||||
template <typename TYPE>
|
||||
void SetListData(VarLengthListLikeArray<TYPE>* self,
|
||||
const std::shared_ptr<ArrayData>& data,
|
||||
Type::type expected_type_id = TYPE::type_id);
|
||||
|
||||
/// \brief A version of Flatten that keeps recursively flattening until an array of
|
||||
/// non-list values is reached.
|
||||
///
|
||||
/// Array types considered to be lists by this function:
|
||||
/// - list
|
||||
/// - large_list
|
||||
/// - list_view
|
||||
/// - large_list_view
|
||||
/// - fixed_size_list
|
||||
///
|
||||
/// \see ListArray::Flatten
|
||||
ARROW_EXPORT Result<std::shared_ptr<Array>> FlattenLogicalListRecursively(
|
||||
const Array& in_array, MemoryPool* memory_pool);
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// Base class for variable-sized list and list-view arrays, regardless of offset size.
|
||||
template <typename TYPE>
|
||||
class VarLengthListLikeArray : public Array {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename TypeClass::offset_type;
|
||||
|
||||
const TypeClass* var_length_list_like_type() const { return this->list_type_; }
|
||||
|
||||
/// \brief Return array object containing the list's values
|
||||
///
|
||||
/// Note that this buffer does not account for any slice offset or length.
|
||||
const std::shared_ptr<Array>& values() const { return values_; }
|
||||
|
||||
/// Note that this buffer does not account for any slice offset or length.
|
||||
const std::shared_ptr<Buffer>& value_offsets() const { return data_->buffers[1]; }
|
||||
|
||||
const std::shared_ptr<DataType>& value_type() const { return list_type_->value_type(); }
|
||||
|
||||
/// Return pointer to raw value offsets accounting for any slice offset
|
||||
const offset_type* raw_value_offsets() const { return raw_value_offsets_; }
|
||||
|
||||
// The following functions will not perform boundschecking
|
||||
|
||||
offset_type value_offset(int64_t i) const { return raw_value_offsets_[i]; }
|
||||
|
||||
/// \brief Return the size of the value at a particular index
|
||||
///
|
||||
/// Since non-empty null lists and list-views are possible, avoid calling this
|
||||
/// function when the list at slot i is null.
|
||||
///
|
||||
/// \pre IsValid(i)
|
||||
virtual offset_type value_length(int64_t i) const = 0;
|
||||
|
||||
/// \pre IsValid(i)
|
||||
std::shared_ptr<Array> value_slice(int64_t i) const {
|
||||
return values_->Slice(value_offset(i), value_length(i));
|
||||
}
|
||||
|
||||
/// \brief Flatten all level recursively until reach a non-list type, and return
|
||||
/// a non-list type Array.
|
||||
///
|
||||
/// \see internal::FlattenLogicalListRecursively
|
||||
Result<std::shared_ptr<Array>> FlattenRecursively(
|
||||
MemoryPool* memory_pool = default_memory_pool()) const {
|
||||
return internal::FlattenLogicalListRecursively(*this, memory_pool);
|
||||
}
|
||||
|
||||
protected:
|
||||
friend void internal::SetListData<TYPE>(VarLengthListLikeArray<TYPE>* self,
|
||||
const std::shared_ptr<ArrayData>& data,
|
||||
Type::type expected_type_id);
|
||||
|
||||
const TypeClass* list_type_ = NULLPTR;
|
||||
std::shared_ptr<Array> values_;
|
||||
const offset_type* raw_value_offsets_ = NULLPTR;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// ListArray / LargeListArray
|
||||
|
||||
template <typename TYPE>
|
||||
class BaseListArray : public VarLengthListLikeArray<TYPE> {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename TYPE::offset_type;
|
||||
|
||||
const TypeClass* list_type() const { return this->var_length_list_like_type(); }
|
||||
|
||||
/// \brief Return the size of the value at a particular index
|
||||
///
|
||||
/// Since non-empty null lists are possible, avoid calling this
|
||||
/// function when the list at slot i is null.
|
||||
///
|
||||
/// \pre IsValid(i)
|
||||
offset_type value_length(int64_t i) const final {
|
||||
return this->raw_value_offsets_[i + 1] - this->raw_value_offsets_[i];
|
||||
}
|
||||
};
|
||||
|
||||
/// Concrete Array class for list data
|
||||
class ARROW_EXPORT ListArray : public BaseListArray<ListType> {
 public:
  /// \brief Construct ListArray from an ArrayData instance
  explicit ListArray(std::shared_ptr<ArrayData> data);

  /// \brief Construct ListArray from its type, length, offsets buffer and
  /// child values array, with an optional validity bitmap
  ListArray(std::shared_ptr<DataType> type, int64_t length,
            std::shared_ptr<Buffer> value_offsets, std::shared_ptr<Array> values,
            std::shared_ptr<Buffer> null_bitmap = NULLPTR,
            int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Construct ListArray from array of offsets and child value array
  ///
  /// This function does the bare minimum of validation of the offsets and
  /// input types, and will allocate a new offsets array if necessary (i.e. if
  /// the offsets contain any nulls). If the offsets do not have nulls, they
  /// are assumed to be well-formed.
  ///
  /// If a null_bitmap is not provided, the nulls will be inferred from the offsets'
  /// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls.
  ///
  /// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an
  /// array with offset() > 0).
  ///
  /// \param[in] offsets Array containing n + 1 offsets encoding length and
  /// size. Must be of int32 type
  /// \param[in] values Array containing list values
  /// \param[in] pool MemoryPool in case new offsets array needs to be
  /// allocated because of null values
  /// \param[in] null_bitmap Optional validity bitmap
  /// \param[in] null_count Optional null count in null_bitmap
  static Result<std::shared_ptr<ListArray>> FromArrays(
      const Array& offsets, const Array& values, MemoryPool* pool = default_memory_pool(),
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount);

  /// \brief Overload of FromArrays() that additionally takes the data type of
  /// the array to build as its first argument
  static Result<std::shared_ptr<ListArray>> FromArrays(
      std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
      MemoryPool* pool = default_memory_pool(),
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount);

  /// \brief Build a ListArray from a ListViewArray
  static Result<std::shared_ptr<ListArray>> FromListView(const ListViewArray& source,
                                                         MemoryPool* pool);

  /// \brief Return an Array that is a concatenation of the lists in this array.
  ///
  /// Note that it's different from `values()` in that it takes into
  /// consideration of this array's offsets as well as null elements backed
  /// by non-empty lists (they are skipped, thus copying may be needed).
  Result<std::shared_ptr<Array>> Flatten(
      MemoryPool* memory_pool = default_memory_pool()) const;

  /// \brief Return list offsets as an Int32Array
  ///
  /// The returned array will not have a validity bitmap, so you cannot expect
  /// to pass it to ListArray::FromArrays() and get back the same list array
  /// if the original one has nulls.
  std::shared_ptr<Array> offsets() const;

 protected:
  // This constructor defers SetData to a derived array class
  ListArray() = default;

  void SetData(const std::shared_ptr<ArrayData>& data);
};
|
||||
|
||||
/// Concrete Array class for large list data (with 64-bit offsets)
|
||||
class ARROW_EXPORT LargeListArray : public BaseListArray<LargeListType> {
 public:
  /// \brief Construct LargeListArray from an ArrayData instance
  explicit LargeListArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct LargeListArray from its type, length, offsets buffer and
  /// child values array, with an optional validity bitmap
  LargeListArray(const std::shared_ptr<DataType>& type, int64_t length,
                 const std::shared_ptr<Buffer>& value_offsets,
                 const std::shared_ptr<Array>& values,
                 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                 int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Construct LargeListArray from array of offsets and child value array
  ///
  /// This function does the bare minimum of validation of the offsets and
  /// input types, and will allocate a new offsets array if necessary (i.e. if
  /// the offsets contain any nulls). If the offsets do not have nulls, they
  /// are assumed to be well-formed.
  ///
  /// If a null_bitmap is not provided, the nulls will be inferred from the offsets'
  /// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls.
  ///
  /// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an
  /// array with offset() > 0).
  ///
  /// \param[in] offsets Array containing n + 1 offsets encoding length and
  /// size. Must be of int64 type
  /// \param[in] values Array containing list values
  /// \param[in] pool MemoryPool in case new offsets array needs to be
  /// allocated because of null values
  /// \param[in] null_bitmap Optional validity bitmap
  /// \param[in] null_count Optional null count in null_bitmap
  static Result<std::shared_ptr<LargeListArray>> FromArrays(
      const Array& offsets, const Array& values, MemoryPool* pool = default_memory_pool(),
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount);

  /// \brief Overload of FromArrays() that additionally takes the data type of
  /// the array to build as its first argument
  static Result<std::shared_ptr<LargeListArray>> FromArrays(
      std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
      MemoryPool* pool = default_memory_pool(),
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount);

  /// \brief Build a LargeListArray from a LargeListViewArray
  static Result<std::shared_ptr<LargeListArray>> FromListView(
      const LargeListViewArray& source, MemoryPool* pool);

  /// \brief Return an Array that is a concatenation of the lists in this array.
  ///
  /// Note that it's different from `values()` in that it takes into
  /// consideration of this array's offsets as well as null elements backed
  /// by non-empty lists (they are skipped, thus copying may be needed).
  Result<std::shared_ptr<Array>> Flatten(
      MemoryPool* memory_pool = default_memory_pool()) const;

  /// \brief Return list offsets as an Int64Array
  std::shared_ptr<Array> offsets() const;

 protected:
  void SetData(const std::shared_ptr<ArrayData>& data);
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// ListViewArray / LargeListViewArray
|
||||
|
||||
template <typename TYPE>
|
||||
class BaseListViewArray : public VarLengthListLikeArray<TYPE> {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename TYPE::offset_type;
|
||||
|
||||
const TypeClass* list_view_type() const { return this->var_length_list_like_type(); }
|
||||
|
||||
/// \brief Note that this buffer does not account for any slice offset or length.
|
||||
const std::shared_ptr<Buffer>& value_sizes() const { return this->data_->buffers[2]; }
|
||||
|
||||
/// \brief Return pointer to raw value offsets accounting for any slice offset
|
||||
const offset_type* raw_value_sizes() const { return raw_value_sizes_; }
|
||||
|
||||
/// \brief Return the size of the value at a particular index
|
||||
///
|
||||
/// This should not be called if the list-view at slot i is null.
|
||||
/// The returned size in those cases could be any value from 0 to the
|
||||
/// length of the child values array.
|
||||
///
|
||||
/// \pre IsValid(i)
|
||||
offset_type value_length(int64_t i) const final { return this->raw_value_sizes_[i]; }
|
||||
|
||||
protected:
|
||||
const offset_type* raw_value_sizes_ = NULLPTR;
|
||||
};
|
||||
|
||||
/// \brief Concrete Array class for list-view data
|
||||
class ARROW_EXPORT ListViewArray : public BaseListViewArray<ListViewType> {
 public:
  /// \brief Construct ListViewArray from an ArrayData instance
  explicit ListViewArray(std::shared_ptr<ArrayData> data);

  /// \brief Construct ListViewArray from its type, length, offsets and sizes
  /// buffers and child values array, with an optional validity bitmap
  ListViewArray(std::shared_ptr<DataType> type, int64_t length,
                std::shared_ptr<Buffer> value_offsets,
                std::shared_ptr<Buffer> value_sizes, std::shared_ptr<Array> values,
                std::shared_ptr<Buffer> null_bitmap = NULLPTR,
                int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Construct ListViewArray from array of offsets, sizes, and child
  /// value array
  ///
  /// Construct a ListViewArray using buffers from offsets and sizes arrays
  /// that project views into the child values array.
  ///
  /// This function does the bare minimum of validation of the offsets/sizes and
  /// input types. The offset and length of the offsets and sizes arrays must
  /// match and that will be checked, but their contents will be assumed to be
  /// well-formed.
  ///
  /// If a null_bitmap is not provided, the nulls will be inferred from the
  /// offsets' null bitmap. But if a null_bitmap is provided, the offsets array
  /// can't have nulls.
  ///
  /// And when a null_bitmap is provided, neither the offsets nor the sizes array
  /// can be a slice (i.e. an array with offset() > 0).
  ///
  /// \param[in] offsets An array of int32 offsets into the values array. NULL values are
  /// supported if the corresponding values in sizes is NULL or 0.
  /// \param[in] sizes An array containing the int32 sizes of every view. NULL values are
  /// taken to represent a NULL list-view in the array being created.
  /// \param[in] values Array containing list values
  /// \param[in] pool MemoryPool
  /// \param[in] null_bitmap Optional validity bitmap
  /// \param[in] null_count Optional null count in null_bitmap
  static Result<std::shared_ptr<ListViewArray>> FromArrays(
      const Array& offsets, const Array& sizes, const Array& values,
      MemoryPool* pool = default_memory_pool(),
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount);

  /// \brief Overload of FromArrays() that additionally takes the data type of
  /// the array to build as its first argument
  static Result<std::shared_ptr<ListViewArray>> FromArrays(
      std::shared_ptr<DataType> type, const Array& offsets, const Array& sizes,
      const Array& values, MemoryPool* pool = default_memory_pool(),
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount);

  /// \brief Build a ListViewArray from a ListArray
  static Result<std::shared_ptr<ListViewArray>> FromList(const ListArray& list_array,
                                                         MemoryPool* pool);

  /// \brief Return an Array that is a concatenation of the list-views in this array.
  ///
  /// Note that it's different from `values()` in that it takes into
  /// consideration this array's offsets (which can be in any order)
  /// and sizes. Nulls are skipped.
  ///
  /// This function invokes Concatenate() if list-views are non-contiguous. It
  /// will try to minimize the number of array slices passed to Concatenate() by
  /// maximizing the size of each slice (containing as many contiguous
  /// list-views as possible).
  Result<std::shared_ptr<Array>> Flatten(
      MemoryPool* memory_pool = default_memory_pool()) const;

  /// \brief Return list-view offsets as an Int32Array
  ///
  /// The returned array will not have a validity bitmap, so you cannot expect
  /// to pass it to ListViewArray::FromArrays() and get back the same list-view
  /// array if the original one has nulls.
  std::shared_ptr<Array> offsets() const;

  /// \brief Return list-view sizes as an Int32Array
  ///
  /// The returned array will not have a validity bitmap, so you cannot expect
  /// to pass it to ListViewArray::FromArrays() and get back the same list
  /// array if the original one has nulls.
  std::shared_ptr<Array> sizes() const;

 protected:
  // This constructor defers SetData to a derived array class
  ListViewArray() = default;

  void SetData(const std::shared_ptr<ArrayData>& data);
};
|
||||
|
||||
/// \brief Concrete Array class for large list-view data (with 64-bit offsets
|
||||
/// and sizes)
|
||||
class ARROW_EXPORT LargeListViewArray : public BaseListViewArray<LargeListViewType> {
 public:
  /// \brief Construct LargeListViewArray from an ArrayData instance
  explicit LargeListViewArray(std::shared_ptr<ArrayData> data);

  /// \brief Construct LargeListViewArray from its type, length, offsets and
  /// sizes buffers and child values array, with an optional validity bitmap
  LargeListViewArray(std::shared_ptr<DataType> type, int64_t length,
                     std::shared_ptr<Buffer> value_offsets,
                     std::shared_ptr<Buffer> value_sizes, std::shared_ptr<Array> values,
                     std::shared_ptr<Buffer> null_bitmap = NULLPTR,
                     int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Construct LargeListViewArray from array of offsets, sizes, and child
  /// value array
  ///
  /// Construct a LargeListViewArray using buffers from offsets and sizes arrays
  /// that project views into the values array.
  ///
  /// This function does the bare minimum of validation of the offsets/sizes and
  /// input types. The offset and length of the offsets and sizes arrays must
  /// match and that will be checked, but their contents will be assumed to be
  /// well-formed.
  ///
  /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' or
  /// sizes' null bitmap. Only one of these two is allowed to have a null bitmap. But if a
  /// null_bitmap is provided, the offsets array and the sizes array can't have nulls.
  ///
  /// And when a null_bitmap is provided, neither the offsets nor the sizes array
  /// can be a slice (i.e. an array with offset() > 0).
  ///
  /// \param[in] offsets An array of int64 offsets into the values array. NULL values are
  /// supported if the corresponding values in sizes is NULL or 0.
  /// \param[in] sizes An array containing the int64 sizes of every view. NULL values are
  /// taken to represent a NULL list-view in the array being created.
  /// \param[in] values Array containing list values
  /// \param[in] pool MemoryPool
  /// \param[in] null_bitmap Optional validity bitmap
  /// \param[in] null_count Optional null count in null_bitmap
  static Result<std::shared_ptr<LargeListViewArray>> FromArrays(
      const Array& offsets, const Array& sizes, const Array& values,
      MemoryPool* pool = default_memory_pool(),
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount);

  /// \brief Overload of FromArrays() that additionally takes the data type of
  /// the array to build as its first argument
  static Result<std::shared_ptr<LargeListViewArray>> FromArrays(
      std::shared_ptr<DataType> type, const Array& offsets, const Array& sizes,
      const Array& values, MemoryPool* pool = default_memory_pool(),
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount);

  /// \brief Build a LargeListViewArray from a LargeListArray
  static Result<std::shared_ptr<LargeListViewArray>> FromList(
      const LargeListArray& list_array, MemoryPool* pool);

  /// \brief Return an Array that is a concatenation of the large list-views in this
  /// array.
  ///
  /// Note that it's different from `values()` in that it takes into
  /// consideration this array's offsets (which can be in any order)
  /// and sizes. Nulls are skipped.
  Result<std::shared_ptr<Array>> Flatten(
      MemoryPool* memory_pool = default_memory_pool()) const;

  /// \brief Return list-view offsets as an Int64Array
  ///
  /// The returned array will not have a validity bitmap, so you cannot expect
  /// to pass it to LargeListViewArray::FromArrays() and get back the same
  /// list-view array if the original one has nulls.
  std::shared_ptr<Array> offsets() const;

  /// \brief Return list-view sizes as an Int64Array
  ///
  /// The returned array will not have a validity bitmap, so you cannot expect
  /// to pass it to LargeListViewArray::FromArrays() and get back the same list
  /// array if the original one has nulls.
  std::shared_ptr<Array> sizes() const;

 protected:
  // This constructor defers SetData to a derived array class
  LargeListViewArray() = default;

  void SetData(const std::shared_ptr<ArrayData>& data);
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// MapArray
|
||||
|
||||
/// Concrete Array class for map data
|
||||
///
|
||||
/// NB: "value" in this context refers to a pair of a key and the corresponding item
|
||||
class ARROW_EXPORT MapArray : public ListArray {
 public:
  using TypeClass = MapType;

  /// \brief Construct MapArray from an ArrayData instance
  explicit MapArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct MapArray from an offsets buffer and separate keys and
  /// items arrays, with an optional validity bitmap
  MapArray(const std::shared_ptr<DataType>& type, int64_t length,
           const std::shared_ptr<Buffer>& value_offsets,
           const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
           const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
           int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Construct MapArray from a vector of buffers and separate keys and
  /// items arrays
  MapArray(const std::shared_ptr<DataType>& type, int64_t length, BufferVector buffers,
           const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
           int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Construct MapArray from an offsets buffer and a single values array
  ///
  /// NOTE(review): `values` is presumably the array of key/item pairs (see the
  /// "value" remark on the class) -- confirm against the implementation.
  MapArray(const std::shared_ptr<DataType>& type, int64_t length,
           const std::shared_ptr<Buffer>& value_offsets,
           const std::shared_ptr<Array>& values,
           const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
           int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Construct MapArray from array of offsets and child key, item arrays
  ///
  /// This function does the bare minimum of validation of the offsets and
  /// input types, and will allocate a new offsets array if necessary (i.e. if
  /// the offsets contain any nulls). If the offsets do not have nulls, they
  /// are assumed to be well-formed
  ///
  /// \param[in] offsets Array containing n + 1 offsets encoding length and
  /// size. Must be of int32 type
  /// \param[in] keys Array containing key values
  /// \param[in] items Array containing item values
  /// \param[in] pool MemoryPool in case new offsets array needs to be
  /// allocated because of null values
  /// \param[in] null_bitmap Optional validity bitmap
  static Result<std::shared_ptr<Array>> FromArrays(
      const std::shared_ptr<Array>& offsets, const std::shared_ptr<Array>& keys,
      const std::shared_ptr<Array>& items, MemoryPool* pool = default_memory_pool(),
      std::shared_ptr<Buffer> null_bitmap = NULLPTR);

  /// \brief Overload of FromArrays() that additionally takes the data type of
  /// the array to build as its first argument
  static Result<std::shared_ptr<Array>> FromArrays(
      std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
      const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
      MemoryPool* pool = default_memory_pool(),
      std::shared_ptr<Buffer> null_bitmap = NULLPTR);

  /// \brief Return the cached pointer to this array's MapType
  const MapType* map_type() const { return map_type_; }

  /// \brief Return array object containing all map keys
  const std::shared_ptr<Array>& keys() const { return keys_; }

  /// \brief Return array object containing all mapped items
  const std::shared_ptr<Array>& items() const { return items_; }

  /// Validate child data before constructing the actual MapArray.
  static Status ValidateChildData(
      const std::vector<std::shared_ptr<ArrayData>>& child_data);

 protected:
  void SetData(const std::shared_ptr<ArrayData>& data);

  // Shared implementation behind the public FromArrays() overloads.
  // NOTE(review): inferred from the name and matching parameters -- confirm.
  static Result<std::shared_ptr<Array>> FromArraysInternal(
      std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
      const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
      MemoryPool* pool, std::shared_ptr<Buffer> null_bitmap = NULLPTR);

 private:
  // Null until set, like list_type_ in VarLengthListLikeArray.
  const MapType* map_type_ = NULLPTR;
  std::shared_ptr<Array> keys_, items_;
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// FixedSizeListArray
|
||||
|
||||
/// Concrete Array class for fixed size list data
|
||||
class ARROW_EXPORT FixedSizeListArray : public Array {
 public:
  using TypeClass = FixedSizeListType;
  using offset_type = TypeClass::offset_type;

  /// \brief Construct a FixedSizeListArray from existing ArrayData
  explicit FixedSizeListArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct a FixedSizeListArray from a type, a length and a child
  /// array of values
  FixedSizeListArray(const std::shared_ptr<DataType>& type, int64_t length,
                     const std::shared_ptr<Array>& values,
                     const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                     int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  const FixedSizeListType* list_type() const;

  /// \brief Return array object containing the list's values
  const std::shared_ptr<Array>& values() const;

  const std::shared_ptr<DataType>& value_type() const;

  // None of the accessors below perform bounds checking.

  /// \brief Return the position in the values array where list `i` starts
  ///
  /// The array's own offset is added to `i` before scaling by the list size.
  int64_t value_offset(int64_t i) const {
    const int64_t absolute_index = i + data_->offset;
    return list_size_ * absolute_index;
  }

  /// \brief Return the fixed-size of the values
  ///
  /// The index parameter is ignored: the same list size is returned for
  /// every slot, so a non-zero size is reported even when the value at
  /// slot i is null.
  ///
  /// \pre IsValid(i)
  int32_t value_length(int64_t i = 0) const {
    ARROW_UNUSED(i);
    return list_size_;
  }

  /// \brief Return the slice of the values array that makes up list `i`
  ///
  /// \pre IsValid(i)
  std::shared_ptr<Array> value_slice(int64_t i) const {
    const int64_t start = value_offset(i);
    const int64_t count = value_length(i);
    return values_->Slice(start, count);
  }

  /// \brief Return an Array that is a concatenation of the lists in this array.
  ///
  /// Unlike `values()`, null elements are taken into consideration: they are
  /// skipped, so copying may be needed.
  Result<std::shared_ptr<Array>> Flatten(
      MemoryPool* memory_pool = default_memory_pool()) const;

  /// \brief Flatten all levels recursively until a non-list type is reached,
  /// and return a non-list type Array.
  ///
  /// \see internal::FlattenLogicalListRecursively
  Result<std::shared_ptr<Array>> FlattenRecursively(
      MemoryPool* memory_pool = default_memory_pool()) const {
    return internal::FlattenLogicalListRecursively(*this, memory_pool);
  }

  /// \brief Construct FixedSizeListArray from child value array and value_length
  ///
  /// \param[in] values Array containing list values
  /// \param[in] list_size The fixed length of each list
  /// \param[in] null_bitmap Optional validity bitmap
  /// \param[in] null_count Optional null count in null_bitmap
  /// \return Will have length equal to values.length() / list_size
  static Result<std::shared_ptr<Array>> FromArrays(
      const std::shared_ptr<Array>& values, int32_t list_size,
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount);

  /// \brief Construct FixedSizeListArray from child value array and type
  ///
  /// \param[in] values Array containing list values
  /// \param[in] type The fixed sized list type
  /// \param[in] null_bitmap Optional validity bitmap
  /// \param[in] null_count Optional null count in null_bitmap
  /// \return Will have length equal to values.length() / type.list_size()
  static Result<std::shared_ptr<Array>> FromArrays(
      const std::shared_ptr<Array>& values, std::shared_ptr<DataType> type,
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount);

 protected:
  void SetData(const std::shared_ptr<ArrayData>& data);

  // Number of values in each list; used by value_offset() and value_length().
  int32_t list_size_;

 private:
  // Child array; value_slice() slices into it.
  std::shared_ptr<Array> values_;
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Struct
|
||||
|
||||
/// Concrete Array class for struct data
|
||||
class ARROW_EXPORT StructArray : public Array {
 public:
  using TypeClass = StructType;

  /// \brief Construct a StructArray from existing ArrayData
  explicit StructArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct a StructArray from a type, a length and child arrays
  StructArray(const std::shared_ptr<DataType>& type, int64_t length,
              const std::vector<std::shared_ptr<Array>>& children,
              std::shared_ptr<Buffer> null_bitmap = NULLPTR,
              int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Return a StructArray from child arrays and field names.
  ///
  /// Both the length and the data type are inferred from the arguments.
  /// At least one child array should be given.
  static Result<std::shared_ptr<StructArray>> Make(
      const ArrayVector& children, const std::vector<std::string>& field_names,
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Return a StructArray from child arrays and fields.
  ///
  /// The length is inferred from the arguments, and at least one child array
  /// should be given. Consistency between field types and child array types
  /// is not checked by this method.
  static Result<std::shared_ptr<StructArray>> Make(
      const ArrayVector& children, const FieldVector& fields,
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  const StructType* struct_type() const;

  // A shared pointer is returned so that the requestor may share ownership
  // with this array. The returned array has its offset, length and null
  // count adjusted.
  const std::shared_ptr<Array>& field(int pos) const;

  const ArrayVector& fields() const;

  /// Returns null if name not found
  std::shared_ptr<Array> GetFieldByName(const std::string& name) const;

  /// Indicate if field named `name` can be found unambiguously in the struct.
  Status CanReferenceFieldByName(const std::string& name) const;

  /// Indicate if fields named `names` can be found unambiguously in the struct.
  Status CanReferenceFieldsByNames(const std::vector<std::string>& names) const;

  /// \brief Flatten this array as a vector of arrays, one for each field
  ///
  /// \param[in] pool The pool to allocate null bitmaps from, if necessary
  Result<ArrayVector> Flatten(MemoryPool* pool = default_memory_pool()) const;

  /// \brief Get one of the child arrays, combining its null bitmap
  /// with the parent struct array's bitmap.
  ///
  /// \param[in] index Which child array to get
  /// \param[in] pool The pool to allocate null bitmaps from, if necessary
  Result<std::shared_ptr<Array>> GetFlattenedField(
      int index, MemoryPool* pool = default_memory_pool()) const;

 private:
  // Cache of boxed child data.
  // XXX This cache is not handled in a thread-safe manner.
  mutable ArrayVector boxed_fields_;
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Union
|
||||
|
||||
/// Base class for SparseUnionArray and DenseUnionArray
|
||||
class ARROW_EXPORT UnionArray : public Array {
 public:
  using type_code_t = int8_t;

  /// Note that this buffer does not account for any slice offset
  const std::shared_ptr<Buffer>& type_codes() const {
    return data_->buffers[1];
  }

  /// Raw pointer to the type codes, as cached in this array.
  const type_code_t* raw_type_codes() const {
    return raw_type_codes_;
  }

  /// The logical type code of the value at index.
  type_code_t type_code(int64_t i) const {
    return raw_type_codes_[i];
  }

  /// The physical child id containing value at index.
  int child_id(int64_t i) const {
    // Translate the slot's logical type code through the union type's table.
    const type_code_t code = raw_type_codes_[i];
    return union_type_->child_ids()[code];
  }

  const UnionType* union_type() const {
    return union_type_;
  }

  /// The union mode, as reported by the union type.
  UnionMode::type mode() const {
    return union_type_->mode();
  }

  /// \brief Return the given field as an individual array.
  ///
  /// For sparse unions, the returned array has its offset, length and null
  /// count adjusted.
  std::shared_ptr<Array> field(int pos) const;

 protected:
  void SetData(std::shared_ptr<ArrayData> data);

  const type_code_t* raw_type_codes_;
  const UnionType* union_type_;

  // Cache of boxed child data.
  mutable std::vector<std::shared_ptr<Array>> boxed_fields_;
};
|
||||
|
||||
/// Concrete Array class for sparse union data
|
||||
class ARROW_EXPORT SparseUnionArray : public UnionArray {
 public:
  using TypeClass = SparseUnionType;

  /// \brief Construct a SparseUnionArray from existing ArrayData
  explicit SparseUnionArray(std::shared_ptr<ArrayData> data);

  /// \brief Construct a SparseUnionArray from a type, a length, child arrays
  /// and a buffer of type ids
  SparseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
                   std::shared_ptr<Buffer> type_ids, int64_t offset = 0);

  /// \brief Construct SparseUnionArray from type_ids and children
  ///
  /// This function does the bare minimum of validation of the input types.
  ///
  /// \param[in] type_ids An array of logical type ids for the union type
  /// \param[in] children Vector of children Arrays containing the data for each type.
  /// \param[in] type_codes Vector of type codes.
  static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
                                             std::vector<type_code_t> type_codes) {
    // `type_ids` is a const reference, so it is passed through as-is; only the
    // by-value arguments are moved. An empty field-name vector selects the
    // four-argument overload below.
    return Make(type_ids, std::move(children), std::vector<std::string>{},
                std::move(type_codes));
  }

  /// \brief Construct SparseUnionArray with custom field names from type_ids and children
  ///
  /// This function does the bare minimum of validation of the input types.
  ///
  /// \param[in] type_ids An array of logical type ids for the union type
  /// \param[in] children Vector of children Arrays containing the data for each type.
  /// \param[in] field_names Vector of strings containing the name of each field.
  /// \param[in] type_codes Vector of type codes.
  static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
                                             std::vector<std::string> field_names = {},
                                             std::vector<type_code_t> type_codes = {});

  /// \brief Return the union type, cast to SparseUnionType
  const SparseUnionType* union_type() const {
    return internal::checked_cast<const SparseUnionType*>(union_type_);
  }

  /// \brief Get one of the child arrays, adjusting its null bitmap
  /// where the union array type code does not match.
  ///
  /// \param[in] index Which child array to get (i.e. the physical index, not
  /// the type code)
  /// \param[in] pool The pool to allocate null bitmaps from, if necessary
  Result<std::shared_ptr<Array>> GetFlattenedField(
      int index, MemoryPool* pool = default_memory_pool()) const;

 protected:
  void SetData(std::shared_ptr<ArrayData> data);
};
|
||||
|
||||
/// \brief Concrete Array class for dense union data
|
||||
///
|
||||
/// Note that union types do not have a validity bitmap
|
||||
class ARROW_EXPORT DenseUnionArray : public UnionArray {
 public:
  using TypeClass = DenseUnionType;

  /// \brief Construct a DenseUnionArray from existing ArrayData
  explicit DenseUnionArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct a DenseUnionArray from a type, a length, child arrays,
  /// a buffer of type ids and an optional buffer of value offsets
  DenseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
                  std::shared_ptr<Buffer> type_ids,
                  std::shared_ptr<Buffer> value_offsets = NULLPTR, int64_t offset = 0);

  /// \brief Construct DenseUnionArray from type_ids, value_offsets, and children
  ///
  /// Only the bare minimum of validation is done on the offsets and the
  /// input types.
  ///
  /// \param[in] type_ids An array of logical type ids for the union type
  /// \param[in] value_offsets An array of signed int32 values indicating the
  /// relative offset into the respective child array for the type in a given slot.
  /// The respective offsets for each child value array must be in order / increasing.
  /// \param[in] children Vector of children Arrays containing the data for each type.
  /// \param[in] type_codes Vector of type codes.
  static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
                                             const Array& value_offsets,
                                             ArrayVector children,
                                             std::vector<type_code_t> type_codes) {
    // No custom field names: hand an empty vector to the five-argument overload.
    std::vector<std::string> no_field_names{};
    return Make(type_ids, value_offsets, std::move(children), std::move(no_field_names),
                std::move(type_codes));
  }

  /// \brief Construct DenseUnionArray with custom field names from type_ids,
  /// value_offsets, and children
  ///
  /// Only the bare minimum of validation is done on the offsets and the
  /// input types.
  ///
  /// \param[in] type_ids An array of logical type ids for the union type
  /// \param[in] value_offsets An array of signed int32 values indicating the
  /// relative offset into the respective child array for the type in a given slot.
  /// The respective offsets for each child value array must be in order / increasing.
  /// \param[in] children Vector of children Arrays containing the data for each type.
  /// \param[in] field_names Vector of strings containing the name of each field.
  /// \param[in] type_codes Vector of type codes.
  static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
                                             const Array& value_offsets,
                                             ArrayVector children,
                                             std::vector<std::string> field_names = {},
                                             std::vector<type_code_t> type_codes = {});

  /// \brief Return the union type, cast to DenseUnionType
  const DenseUnionType* union_type() const {
    return internal::checked_cast<const DenseUnionType*>(union_type_);
  }

  /// Note that this buffer does not account for any slice offset
  const std::shared_ptr<Buffer>& value_offsets() const {
    return data_->buffers[2];
  }

  /// Value offset stored for slot `i`, read through the cached raw pointer.
  int32_t value_offset(int64_t i) const {
    return raw_value_offsets_[i];
  }

  /// Raw pointer to the value offsets, as cached in this array.
  const int32_t* raw_value_offsets() const {
    return raw_value_offsets_;
  }

 protected:
  const int32_t* raw_value_offsets_;

  void SetData(const std::shared_ptr<ArrayData>& data);
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,220 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Array accessor types for primitive/C-type-based arrays, such as numbers,
|
||||
// boolean, and temporal types.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/stl_iterator.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_fwd.h" // IWYU pragma: export
|
||||
#include "arrow/type_traits.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// Concrete Array class for boolean data
|
||||
class ARROW_EXPORT BooleanArray : public PrimitiveArray {
 public:
  using TypeClass = BooleanType;
  using IteratorType = stl::ArrayIterator<BooleanArray>;

  /// \brief Construct a BooleanArray from existing ArrayData
  explicit BooleanArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct a BooleanArray from a length and a data buffer
  BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data,
               const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
               int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Read the bit for slot `i`, taking the array offset into account
  bool Value(int64_t i) const {
    const uint8_t* bits = reinterpret_cast<const uint8_t*>(raw_values_);
    const int64_t bit_index = i + data_->offset;
    return bit_util::GetBit(bits, bit_index);
  }

  /// \brief Same result as Value(i)
  bool GetView(int64_t i) const {
    return Value(i);
  }

  /// \brief Return the element at `i` as produced by dereferencing an
  /// iterator positioned there
  std::optional<bool> operator[](int64_t i) const {
    IteratorType position(*this, i);
    return *position;
  }

  /// \brief Return the number of false (0) values among the valid
  /// values. Result is not cached.
  int64_t false_count() const;

  /// \brief Return the number of true (1) values among the valid
  /// values. Result is not cached.
  int64_t true_count() const;

  /// Iterator positioned at the start of the array.
  IteratorType begin() const {
    return IteratorType(*this);
  }

  /// Iterator positioned at index length().
  IteratorType end() const {
    return IteratorType(*this, length());
  }

 protected:
  using PrimitiveArray::PrimitiveArray;
};
|
||||
|
||||
/// \addtogroup numeric-arrays
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Concrete Array class for numeric data with a corresponding C type
|
||||
///
|
||||
/// This class is templated on the corresponding DataType subclass for the
|
||||
/// given data, for example NumericArray<Int8Type> or NumericArray<Date32Type>.
|
||||
///
|
||||
/// Note that convenience aliases are available for all accepted types
|
||||
/// (for example Int8Array for NumericArray<Int8Type>).
|
||||
template <typename TYPE>
|
||||
class NumericArray : public PrimitiveArray {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using value_type = typename TypeClass::c_type;
|
||||
using IteratorType = stl::ArrayIterator<NumericArray<TYPE>>;
|
||||
|
||||
explicit NumericArray(const std::shared_ptr<ArrayData>& data) {
|
||||
NumericArray::SetData(data);
|
||||
}
|
||||
|
||||
// Only enable this constructor without a type argument for types without additional
|
||||
// metadata
|
||||
template <typename T1 = TYPE>
|
||||
NumericArray(enable_if_parameter_free<T1, int64_t> length,
|
||||
const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0) {
|
||||
NumericArray::SetData(ArrayData::Make(TypeTraits<T1>::type_singleton(), length,
|
||||
{null_bitmap, data}, null_count, offset));
|
||||
}
|
||||
|
||||
NumericArray(std::shared_ptr<DataType> type, int64_t length,
|
||||
const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0) {
|
||||
NumericArray::SetData(ArrayData::Make(std::move(type), length, {null_bitmap, data},
|
||||
null_count, offset));
|
||||
}
|
||||
|
||||
const value_type* raw_values() const { return values_; }
|
||||
|
||||
value_type Value(int64_t i) const { return values_[i]; }
|
||||
|
||||
// For API compatibility with BinaryArray etc.
|
||||
value_type GetView(int64_t i) const { return values_[i]; }
|
||||
|
||||
std::optional<value_type> operator[](int64_t i) const {
|
||||
return *IteratorType(*this, i);
|
||||
}
|
||||
|
||||
IteratorType begin() const { return IteratorType(*this); }
|
||||
|
||||
IteratorType end() const { return IteratorType(*this, length()); }
|
||||
|
||||
protected:
|
||||
NumericArray() : values_(NULLPTR) {}
|
||||
|
||||
void SetData(const std::shared_ptr<ArrayData>& data) {
|
||||
this->PrimitiveArray::SetData(data);
|
||||
values_ = raw_values_
|
||||
? (reinterpret_cast<const value_type*>(raw_values_) + data_->offset)
|
||||
: NULLPTR;
|
||||
}
|
||||
|
||||
const value_type* values_;
|
||||
};
|
||||
|
||||
/// DayTimeArray
|
||||
/// ---------------------
|
||||
/// \brief Array of Day and Millisecond values.
|
||||
class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray {
 public:
  using TypeClass = DayTimeIntervalType;
  using IteratorType = stl::ArrayIterator<DayTimeIntervalArray>;

  /// \brief Construct a DayTimeIntervalArray from existing ArrayData
  explicit DayTimeIntervalArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct a DayTimeIntervalArray with an explicitly supplied type
  DayTimeIntervalArray(const std::shared_ptr<DataType>& type, int64_t length,
                       const std::shared_ptr<Buffer>& data,
                       const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                       int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Construct a DayTimeIntervalArray without a type argument
  DayTimeIntervalArray(int64_t length, const std::shared_ptr<Buffer>& data,
                       const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                       int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  TypeClass::DayMilliseconds GetValue(int64_t i) const;

  /// Same result as GetValue(i).
  TypeClass::DayMilliseconds Value(int64_t i) const {
    return GetValue(i);
  }

  // Provided for compatibility with the Take kernel.
  TypeClass::DayMilliseconds GetView(int64_t i) const {
    return GetValue(i);
  }

  /// Iterator positioned at the start of the array.
  IteratorType begin() const {
    return IteratorType(*this);
  }

  /// Iterator positioned at index length().
  IteratorType end() const {
    return IteratorType(*this, length());
  }

  /// \brief Return the element at `i` as produced by dereferencing an
  /// iterator positioned there
  std::optional<TypeClass::DayMilliseconds> operator[](int64_t i) const {
    IteratorType position(*this, i);
    return *position;
  }

  /// Size in bytes of one DayMilliseconds element.
  int32_t byte_width() const {
    return sizeof(TypeClass::DayMilliseconds);
  }

  /// Raw byte pointer advanced by the array offset (in whole elements).
  const uint8_t* raw_values() const {
    return raw_values_ + data_->offset * byte_width();
  }
};
|
||||
|
||||
/// \brief Array of Month, Day and nanosecond values.
|
||||
class ARROW_EXPORT MonthDayNanoIntervalArray : public PrimitiveArray {
 public:
  using TypeClass = MonthDayNanoIntervalType;
  using IteratorType = stl::ArrayIterator<MonthDayNanoIntervalArray>;

  /// \brief Construct a MonthDayNanoIntervalArray from existing ArrayData
  explicit MonthDayNanoIntervalArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct a MonthDayNanoIntervalArray with an explicitly supplied type
  MonthDayNanoIntervalArray(const std::shared_ptr<DataType>& type, int64_t length,
                            const std::shared_ptr<Buffer>& data,
                            const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                            int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Construct a MonthDayNanoIntervalArray without a type argument
  MonthDayNanoIntervalArray(int64_t length, const std::shared_ptr<Buffer>& data,
                            const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                            int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  TypeClass::MonthDayNanos GetValue(int64_t i) const;

  /// Same result as GetValue(i).
  TypeClass::MonthDayNanos Value(int64_t i) const {
    return GetValue(i);
  }

  // Provided for compatibility with the Take kernel.
  TypeClass::MonthDayNanos GetView(int64_t i) const {
    return GetValue(i);
  }

  /// Iterator positioned at the start of the array.
  IteratorType begin() const {
    return IteratorType(*this);
  }

  /// Iterator positioned at index length().
  IteratorType end() const {
    return IteratorType(*this, length());
  }

  /// \brief Return the element at `i` as produced by dereferencing an
  /// iterator positioned there
  std::optional<TypeClass::MonthDayNanos> operator[](int64_t i) const {
    IteratorType position(*this, i);
    return *position;
  }

  /// Size in bytes of one MonthDayNanos element.
  int32_t byte_width() const {
    return sizeof(TypeClass::MonthDayNanos);
  }

  /// Raw byte pointer advanced by the array offset (in whole elements).
  const uint8_t* raw_values() const {
    return raw_values_ + data_->offset * byte_width();
  }
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,133 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Array accessor classes for run-end encoded arrays
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup run-end-encoded-arrays
|
||||
///
|
||||
/// @{
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// RunEndEncoded
|
||||
|
||||
/// \brief Array type for run-end encoded data
|
||||
class ARROW_EXPORT RunEndEncodedArray : public Array {
 private:
  // Child arrays handed out by run_ends() and values().
  std::shared_ptr<Array> run_ends_array_;
  std::shared_ptr<Array> values_array_;

 public:
  using TypeClass = RunEndEncodedType;

  /// \brief Construct a RunEndEncodedArray from existing ArrayData
  explicit RunEndEncodedArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct a RunEndEncodedArray from all parameters
  ///
  /// Length and offset describe the logical array, i.e. the array obtained
  /// by expanding every run into repeated values. The length can therefore
  /// be much greater than the length of the child run_ends and values arrays.
  RunEndEncodedArray(const std::shared_ptr<DataType>& type, int64_t length,
                     const std::shared_ptr<Array>& run_ends,
                     const std::shared_ptr<Array>& values, int64_t offset = 0);

  /// \brief Construct a RunEndEncodedArray from all parameters
  ///
  /// Length and offset describe the logical array, i.e. the array obtained
  /// by expanding every run into repeated values. The length can therefore
  /// be much greater than the length of the child run_ends and values arrays.
  static Result<std::shared_ptr<RunEndEncodedArray>> Make(
      const std::shared_ptr<DataType>& type, int64_t logical_length,
      const std::shared_ptr<Array>& run_ends, const std::shared_ptr<Array>& values,
      int64_t logical_offset = 0);

  /// \brief Construct a RunEndEncodedArray from values and run ends arrays
  ///
  /// The data type is inferred from the arguments. The run_ends and values
  /// arrays must have the same length.
  static Result<std::shared_ptr<RunEndEncodedArray>> Make(
      int64_t logical_length, const std::shared_ptr<Array>& run_ends,
      const std::shared_ptr<Array>& values, int64_t logical_offset = 0);

 protected:
  void SetData(const std::shared_ptr<ArrayData>& data);

 public:
  /// \brief Returns an array holding the logical indexes of each run-end
  ///
  /// The physical offset to the array is applied.
  const std::shared_ptr<Array>& run_ends() const {
    return run_ends_array_;
  }

  /// \brief Returns an array holding the values of each run
  ///
  /// The physical offset to the array is applied.
  const std::shared_ptr<Array>& values() const {
    return values_array_;
  }

  /// \brief Returns an array holding the logical indexes of each run end
  ///
  /// When a non-zero logical offset is set, a new array is allocated in which
  /// every run end value is rewritten relative to the logical offset and the
  /// end of the array is cut to the logical length.
  Result<std::shared_ptr<Array>> LogicalRunEnds(MemoryPool* pool) const;

  /// \brief Returns an array holding the values of each run
  ///
  /// When a non-zero logical offset is set, a new array is allocated that
  /// contains only the values within the logical range.
  std::shared_ptr<Array> LogicalValues() const;

  /// \brief Find the physical offset of this REE array
  ///
  /// A binary search is used, so the cost is O(log N).
  int64_t FindPhysicalOffset() const;

  /// \brief Find the physical length of this REE array
  ///
  /// The physical length of an REE is the number of physical values (and
  /// run-ends) necessary to represent the logical range of values from offset
  /// to length.
  ///
  /// Prefer another way of establishing the physical length when one exists
  /// (e.g. when iterating over the runs sequentially until the end): this
  /// function uses a binary search, so the cost is O(log N).
  int64_t FindPhysicalLength() const;
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,215 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <type_traits>
|
||||
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup numeric-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
namespace internal {
|
||||
|
||||
class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
|
||||
public:
|
||||
AdaptiveIntBuilderBase(uint8_t start_int_size, MemoryPool* pool,
|
||||
int64_t alignment = kDefaultBufferAlignment);
|
||||
|
||||
explicit AdaptiveIntBuilderBase(MemoryPool* pool,
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: AdaptiveIntBuilderBase(sizeof(uint8_t), pool, alignment) {}
|
||||
|
||||
/// \brief Append multiple nulls
|
||||
/// \param[in] length the number of nulls to append
|
||||
Status AppendNulls(int64_t length) final {
  // Commit whatever is buffered in the pending arrays before writing
  // directly into data_.
  ARROW_RETURN_NOT_OK(CommitPendingData());
  if (ARROW_PREDICT_FALSE(length <= 0)) {
    return Status::OK();
  }
  ARROW_RETURN_NOT_OK(Reserve(length));
  // Zero-fill the value slots that back the new null entries.
  auto* first_slot = data_->mutable_data() + length_ * int_size_;
  memset(first_slot, 0, int_size_ * length);
  UnsafeSetNull(length);
  return Status::OK();
}
|
||||
|
||||
/// \brief Append a single null by recording it in the pending arrays
///
/// \return the status of CommitPendingData() when this append filled the
/// pending arrays, Status::OK() otherwise
Status AppendNull() final {
  const int32_t slot = pending_pos_++;
  pending_valid_[slot] = 0;
  pending_data_[slot] = 0;
  pending_has_nulls_ = true;
  null_count_ += 1;
  length_ += 1;

  // The pending arrays hold pending_size_ entries; commit once they are full.
  if (ARROW_PREDICT_TRUE(pending_pos_ < pending_size_)) {
    return Status::OK();
  }
  return CommitPendingData();
}
|
||||
|
||||
/// \brief Append multiple valid (non-null) slots whose values are zero
/// \param[in] length the number of slots to append; nothing is appended
/// when it is not positive
Status AppendEmptyValues(int64_t length) final {
  // Commit whatever is buffered in the pending arrays before writing
  // directly into data_.
  ARROW_RETURN_NOT_OK(CommitPendingData());
  if (ARROW_PREDICT_FALSE(length <= 0)) {
    return Status::OK();
  }
  ARROW_RETURN_NOT_OK(Reserve(length));
  // Zero-fill the value slots so the appended memory is initialized.
  auto* first_slot = data_->mutable_data() + length_ * int_size_;
  memset(first_slot, 0, int_size_ * length);
  UnsafeSetNotNull(length);
  return Status::OK();
}
|
||||
|
||||
/// \brief Append a single valid slot holding zero via the pending arrays
///
/// \return the status of CommitPendingData() when this append filled the
/// pending arrays, Status::OK() otherwise
Status AppendEmptyValue() final {
  const int32_t slot = pending_pos_++;
  pending_valid_[slot] = 1;
  pending_data_[slot] = 0;
  length_ += 1;

  // The pending arrays hold pending_size_ entries; commit once they are full.
  if (ARROW_PREDICT_TRUE(pending_pos_ < pending_size_)) {
    return Status::OK();
  }
  return CommitPendingData();
}
|
||||
|
||||
void Reset() override;
|
||||
Status Resize(int64_t capacity) override;
|
||||
|
||||
protected:
|
||||
/// \brief Record one valid value in the pending arrays
///
/// \param[in] val the value to store, held as a 64-bit unsigned integer
/// \return the status of CommitPendingData() when this append filled the
/// pending arrays, Status::OK() otherwise
Status AppendInternal(const uint64_t val) {
  const int32_t slot = pending_pos_++;
  pending_valid_[slot] = 1;
  pending_data_[slot] = val;
  length_ += 1;

  // The pending arrays hold pending_size_ entries; commit once they are full.
  if (ARROW_PREDICT_TRUE(pending_pos_ < pending_size_)) {
    return Status::OK();
  }
  return CommitPendingData();
}
|
||||
|
||||
virtual Status CommitPendingData() = 0;
|
||||
|
||||
template <typename new_type, typename old_type>
|
||||
typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
|
||||
ExpandIntSizeInternal();
|
||||
template <typename new_type, typename old_type>
|
||||
typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type
|
||||
ExpandIntSizeInternal();
|
||||
|
||||
std::shared_ptr<ResizableBuffer> data_;
|
||||
uint8_t* raw_data_ = NULLPTR;
|
||||
|
||||
const uint8_t start_int_size_;
|
||||
uint8_t int_size_;
|
||||
|
||||
static constexpr int32_t pending_size_ = 1024;
|
||||
uint8_t pending_valid_[pending_size_];
|
||||
uint64_t pending_data_[pending_size_];
|
||||
int32_t pending_pos_ = 0;
|
||||
bool pending_has_nulls_ = false;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase {
|
||||
public:
|
||||
explicit AdaptiveUIntBuilder(uint8_t start_int_size,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
explicit AdaptiveUIntBuilder(MemoryPool* pool = default_memory_pool())
|
||||
: AdaptiveUIntBuilder(sizeof(uint8_t), pool) {}
|
||||
|
||||
using internal::AdaptiveIntBuilderBase::Reset;
|
||||
|
||||
/// Scalar append
|
||||
Status Append(const uint64_t val) { return AppendInternal(val); }
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a contiguous C array of values
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
|
||||
/// indicates a valid (non-null) value
|
||||
/// \return Status
|
||||
Status AppendValues(const uint64_t* values, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR);
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
std::shared_ptr<DataType> type() const override;
|
||||
|
||||
protected:
|
||||
Status CommitPendingData() override;
|
||||
Status ExpandIntSize(uint8_t new_int_size);
|
||||
|
||||
Status AppendValuesInternal(const uint64_t* values, int64_t length,
|
||||
const uint8_t* valid_bytes);
|
||||
|
||||
template <typename new_type>
|
||||
Status ExpandIntSizeN();
|
||||
};
|
||||
|
||||
class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase {
|
||||
public:
|
||||
explicit AdaptiveIntBuilder(uint8_t start_int_size,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment);
|
||||
|
||||
explicit AdaptiveIntBuilder(MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: AdaptiveIntBuilder(sizeof(uint8_t), pool, alignment) {}
|
||||
|
||||
using internal::AdaptiveIntBuilderBase::Reset;
|
||||
|
||||
/// Scalar append
|
||||
Status Append(const int64_t val) { return AppendInternal(static_cast<uint64_t>(val)); }
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a contiguous C array of values
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
|
||||
/// indicates a valid (non-null) value
|
||||
/// \return Status
|
||||
Status AppendValues(const int64_t* values, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR);
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
std::shared_ptr<DataType> type() const override;
|
||||
|
||||
protected:
|
||||
Status CommitPendingData() override;
|
||||
Status ExpandIntSize(uint8_t new_int_size);
|
||||
|
||||
Status AppendValuesInternal(const int64_t* values, int64_t length,
|
||||
const uint8_t* valid_bytes);
|
||||
|
||||
template <typename new_type>
|
||||
Status ExpandIntSizeN();
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,371 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm> // IWYU pragma: keep
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/array_primitive.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/buffer_builder.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template <class Builder, class V>
|
||||
class ArrayBuilderExtraOps {
|
||||
public:
|
||||
/// \brief Append a value from an optional or null if it has no value.
|
||||
Status AppendOrNull(const std::optional<V>& value) {
|
||||
auto* self = static_cast<Builder*>(this);
|
||||
return value.has_value() ? self->Append(*value) : self->AppendNull();
|
||||
}
|
||||
|
||||
/// \brief Append a value from an optional or null if it has no value.
|
||||
///
|
||||
/// Unsafe methods don't check existing size.
|
||||
void UnsafeAppendOrNull(const std::optional<V>& value) {
|
||||
auto* self = static_cast<Builder*>(this);
|
||||
return value.has_value() ? self->UnsafeAppend(*value) : self->UnsafeAppendNull();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \defgroup numeric-builders Concrete builder subclasses for numeric types
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup temporal-builders Concrete builder subclasses for temporal types
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup binary-builders Concrete builder subclasses for binary types
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup nested-builders Concrete builder subclasses for nested types
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup dictionary-builders Concrete builder subclasses for dictionary types
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup run-end-encoded-builders Concrete builder subclasses for run-end encoded
|
||||
/// arrays
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
constexpr int64_t kMinBuilderCapacity = 1 << 5;
|
||||
constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1;
|
||||
|
||||
/// Base class for all data array builders.
|
||||
///
|
||||
/// This class provides facilities for incrementally building the null bitmap
|
||||
/// (see Append methods) and as a side effect the current number of slots and
|
||||
/// the null count.
|
||||
///
|
||||
/// \note Users are expected to use builders as one of the concrete types below.
|
||||
/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use.
|
||||
class ARROW_EXPORT ArrayBuilder {
|
||||
public:
|
||||
explicit ArrayBuilder(MemoryPool* pool, int64_t alignment = kDefaultBufferAlignment)
|
||||
: pool_(pool), alignment_(alignment), null_bitmap_builder_(pool, alignment) {}
|
||||
|
||||
ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder);
|
||||
|
||||
virtual ~ArrayBuilder() = default;
|
||||
|
||||
/// For nested types. Since the objects are owned by this class instance, we
|
||||
/// skip shared pointers and just return a raw pointer
|
||||
ArrayBuilder* child(int i) { return children_[i].get(); }
|
||||
|
||||
const std::shared_ptr<ArrayBuilder>& child_builder(int i) const { return children_[i]; }
|
||||
|
||||
int num_children() const { return static_cast<int>(children_.size()); }
|
||||
|
||||
virtual int64_t length() const { return length_; }
|
||||
int64_t null_count() const { return null_count_; }
|
||||
int64_t capacity() const { return capacity_; }
|
||||
|
||||
/// \brief Ensure that enough memory has been allocated to fit the indicated
|
||||
/// number of total elements in the builder, including any that have already
|
||||
/// been appended. Does not account for reallocations that may be due to
|
||||
/// variable size data, like binary values. To make space for incremental
|
||||
/// appends, use Reserve instead.
|
||||
///
|
||||
/// \param[in] capacity the minimum number of total array values to
|
||||
/// accommodate. Must be greater than the current capacity.
|
||||
/// \return Status
|
||||
virtual Status Resize(int64_t capacity);
|
||||
|
||||
/// \brief Ensure that there is enough space allocated to append the indicated
|
||||
/// number of elements without any further reallocation. Overallocation is
|
||||
/// used in order to minimize the impact of incremental Reserve() calls.
|
||||
/// Note that additional_capacity is relative to the current number of elements
|
||||
/// rather than to the current capacity, so calls to Reserve() which are not
|
||||
/// interspersed with addition of new elements may not increase the capacity.
|
||||
///
|
||||
/// \param[in] additional_capacity the number of additional array values
|
||||
/// \return Status
|
||||
Status Reserve(int64_t additional_capacity) {
  const int64_t available = capacity();
  const int64_t required = length() + additional_capacity;
  if (required > available) {
    // The growth factor is decided by BufferBuilder, not here.
    return Resize(BufferBuilder::GrowByFactor(available, required));
  }
  // Already large enough: nothing to allocate.
  return Status::OK();
}
|
||||
|
||||
/// Reset the builder.
|
||||
virtual void Reset();
|
||||
|
||||
/// \brief Append a null value to builder
|
||||
virtual Status AppendNull() = 0;
|
||||
/// \brief Append a number of null values to builder
|
||||
virtual Status AppendNulls(int64_t length) = 0;
|
||||
|
||||
/// \brief Append a non-null value to builder
|
||||
///
|
||||
/// The appended value is an implementation detail, but the corresponding
|
||||
/// memory slot is guaranteed to be initialized.
|
||||
/// This method is useful when appending a null value to a parent nested type.
|
||||
virtual Status AppendEmptyValue() = 0;
|
||||
|
||||
/// \brief Append a number of non-null values to builder
|
||||
///
|
||||
/// The appended values are an implementation detail, but the corresponding
|
||||
/// memory slot is guaranteed to be initialized.
|
||||
/// This method is useful when appending null values to a parent nested type.
|
||||
virtual Status AppendEmptyValues(int64_t length) = 0;
|
||||
|
||||
/// \brief Append a value from a scalar
|
||||
Status AppendScalar(const Scalar& scalar) { return AppendScalar(scalar, 1); }
|
||||
virtual Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
|
||||
virtual Status AppendScalars(const ScalarVector& scalars);
|
||||
|
||||
/// \brief Append a range of values from an array.
|
||||
///
|
||||
/// The given array must be the same type as the builder.
|
||||
virtual Status AppendArraySlice(const ArraySpan& ARROW_ARG_UNUSED(array),
|
||||
int64_t ARROW_ARG_UNUSED(offset),
|
||||
int64_t ARROW_ARG_UNUSED(length)) {
|
||||
return Status::NotImplemented("AppendArraySlice for builder for ", *type());
|
||||
}
|
||||
|
||||
/// \brief Return result of builder as an internal generic ArrayData
|
||||
/// object. Resets builder except for dictionary builder
|
||||
///
|
||||
/// \param[out] out the finalized ArrayData object
|
||||
/// \return Status
|
||||
virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;
|
||||
|
||||
/// \brief Return result of builder as an Array object.
|
||||
///
|
||||
/// The builder is reset except for DictionaryBuilder.
|
||||
///
|
||||
/// \param[out] out the finalized Array object
|
||||
/// \return Status
|
||||
Status Finish(std::shared_ptr<Array>* out);
|
||||
|
||||
/// \brief Return result of builder as an Array object.
|
||||
///
|
||||
/// The builder is reset except for DictionaryBuilder.
|
||||
///
|
||||
/// \return The finalized Array object
|
||||
Result<std::shared_ptr<Array>> Finish();
|
||||
|
||||
/// \brief Return the type of the built Array
|
||||
virtual std::shared_ptr<DataType> type() const = 0;
|
||||
|
||||
protected:
|
||||
/// Append to null bitmap
|
||||
Status AppendToBitmap(bool is_valid);
|
||||
|
||||
/// Vector append. Treat each zero byte as a null. If valid_bytes is null
|
||||
/// assume all of length bits are valid.
|
||||
Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);
|
||||
|
||||
/// Uniform append. Append N times the same validity bit.
|
||||
Status AppendToBitmap(int64_t num_bits, bool value);
|
||||
|
||||
/// Set the next length bits to not null (i.e. valid).
|
||||
Status SetNotNull(int64_t length);
|
||||
|
||||
// Unsafe operations (don't check capacity/don't resize)
|
||||
|
||||
void UnsafeAppendNull() { UnsafeAppendToBitmap(false); }
|
||||
|
||||
// Append to null bitmap, update the length
|
||||
void UnsafeAppendToBitmap(bool is_valid) {
|
||||
null_bitmap_builder_.UnsafeAppend(is_valid);
|
||||
++length_;
|
||||
if (!is_valid) ++null_count_;
|
||||
}
|
||||
|
||||
// Vector append. Treat each zero byte as a null. If valid_bytes is null
|
||||
// assume all of length bits are valid.
|
||||
void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) {
|
||||
if (valid_bytes == NULLPTR) {
|
||||
return UnsafeSetNotNull(length);
|
||||
}
|
||||
null_bitmap_builder_.UnsafeAppend(valid_bytes, length);
|
||||
length_ += length;
|
||||
null_count_ = null_bitmap_builder_.false_count();
|
||||
}
|
||||
|
||||
// Vector append. Copy from a given bitmap. If bitmap is null assume
|
||||
// all of length bits are valid.
|
||||
void UnsafeAppendToBitmap(const uint8_t* bitmap, int64_t offset, int64_t length) {
|
||||
if (bitmap == NULLPTR) {
|
||||
return UnsafeSetNotNull(length);
|
||||
}
|
||||
null_bitmap_builder_.UnsafeAppend(bitmap, offset, length);
|
||||
length_ += length;
|
||||
null_count_ = null_bitmap_builder_.false_count();
|
||||
}
|
||||
|
||||
// Append the same validity value a given number of times.
|
||||
void UnsafeAppendToBitmap(const int64_t num_bits, bool value) {
|
||||
if (value) {
|
||||
UnsafeSetNotNull(num_bits);
|
||||
} else {
|
||||
UnsafeSetNull(num_bits);
|
||||
}
|
||||
}
|
||||
|
||||
void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);
|
||||
|
||||
// Set the next validity bits to not null (i.e. valid).
|
||||
void UnsafeSetNotNull(int64_t length);
|
||||
|
||||
// Set the next validity bits to null (i.e. invalid).
|
||||
void UnsafeSetNull(int64_t length);
|
||||
|
||||
static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer);
|
||||
|
||||
/// \brief Finish to an array of the specified ArrayType
|
||||
template <typename ArrayType>
|
||||
Status FinishTyped(std::shared_ptr<ArrayType>* out) {
|
||||
std::shared_ptr<Array> out_untyped;
|
||||
ARROW_RETURN_NOT_OK(Finish(&out_untyped));
|
||||
*out = std::static_pointer_cast<ArrayType>(std::move(out_untyped));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Check the requested capacity for validity
|
||||
Status CheckCapacity(int64_t new_capacity) {
  // Only negative requests are rejected here; a capacity of zero passes
  // this first check.
  const bool is_negative = new_capacity < 0;
  if (ARROW_PREDICT_FALSE(is_negative)) {
    return Status::Invalid(
        "Resize capacity must be positive (requested: ", new_capacity, ")");
  }

  // The capacity may never drop below the number of elements already held.
  const bool would_truncate = new_capacity < length_;
  if (ARROW_PREDICT_FALSE(would_truncate)) {
    return Status::Invalid("Resize cannot downsize (requested: ", new_capacity,
                           ", current length: ", length_, ")");
  }

  return Status::OK();
}
|
||||
|
||||
// Check for array type
|
||||
Status CheckArrayType(const std::shared_ptr<DataType>& expected_type,
|
||||
const Array& array, const char* message);
|
||||
Status CheckArrayType(Type::type expected_type, const Array& array,
|
||||
const char* message);
|
||||
|
||||
MemoryPool* pool_;
|
||||
int64_t alignment_;
|
||||
|
||||
TypedBufferBuilder<bool> null_bitmap_builder_;
|
||||
int64_t null_count_ = 0;
|
||||
|
||||
// Array length, so far. Also, the index of the next element to be added
|
||||
int64_t length_ = 0;
|
||||
int64_t capacity_ = 0;
|
||||
|
||||
// Child value array builders. These are owned by this class
|
||||
std::vector<std::shared_ptr<ArrayBuilder>> children_;
|
||||
|
||||
private:
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
|
||||
};
|
||||
|
||||
/// \brief Construct an empty ArrayBuilder corresponding to the data
|
||||
/// type
|
||||
/// \param[in] pool the MemoryPool to use for allocations
|
||||
/// \param[in] type the data type to create the builder for
|
||||
/// \param[out] out the created ArrayBuilder
|
||||
ARROW_EXPORT
|
||||
Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
|
||||
std::unique_ptr<ArrayBuilder>* out);
|
||||
|
||||
inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilder(
|
||||
const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
|
||||
std::unique_ptr<ArrayBuilder> out;
|
||||
ARROW_RETURN_NOT_OK(MakeBuilder(pool, type, &out));
|
||||
return out;
|
||||
}
|
||||
|
||||
/// \brief Construct an empty ArrayBuilder corresponding to the data
|
||||
/// type, where any top-level or nested dictionary builders return the
|
||||
/// exact index type specified by the type.
|
||||
ARROW_EXPORT
|
||||
Status MakeBuilderExactIndex(MemoryPool* pool, const std::shared_ptr<DataType>& type,
|
||||
std::unique_ptr<ArrayBuilder>* out);
|
||||
|
||||
inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilderExactIndex(
|
||||
const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
|
||||
std::unique_ptr<ArrayBuilder> out;
|
||||
ARROW_RETURN_NOT_OK(MakeBuilderExactIndex(pool, type, &out));
|
||||
return out;
|
||||
}
|
||||
|
||||
/// \brief Construct an empty DictionaryBuilder initialized optionally
|
||||
/// with a preexisting dictionary
|
||||
/// \param[in] pool the MemoryPool to use for allocations
|
||||
/// \param[in] type the dictionary type to create the builder for
|
||||
/// \param[in] dictionary the initial dictionary, if any. May be nullptr
|
||||
/// \param[out] out the created ArrayBuilder
|
||||
ARROW_EXPORT
|
||||
Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
|
||||
const std::shared_ptr<Array>& dictionary,
|
||||
std::unique_ptr<ArrayBuilder>* out);
|
||||
|
||||
inline Result<std::unique_ptr<ArrayBuilder>> MakeDictionaryBuilder(
|
||||
const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
|
||||
MemoryPool* pool = default_memory_pool()) {
|
||||
std::unique_ptr<ArrayBuilder> out;
|
||||
ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, type, dictionary, &out));
|
||||
return out;
|
||||
}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,993 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/array_binary.h"
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/buffer_builder.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/binary_view_util.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup binary-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Binary and String
|
||||
|
||||
template <typename TYPE>
|
||||
class BaseBinaryBuilder
|
||||
: public ArrayBuilder,
|
||||
public internal::ArrayBuilderExtraOps<BaseBinaryBuilder<TYPE>, std::string_view> {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename TypeClass::offset_type;
|
||||
|
||||
explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
offsets_builder_(pool, alignment),
|
||||
value_data_builder_(pool, alignment) {}
|
||||
|
||||
BaseBinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
|
||||
: BaseBinaryBuilder(pool) {}
|
||||
|
||||
/// \brief Append one value given as a pointer and a byte length
///
/// \param[in] value pointer to the bytes to copy
/// \param[in] length the number of bytes to copy; when it is not positive
/// only the offset and validity bit are appended
/// \return Status
Status Append(const uint8_t* value, offset_type length) {
  ARROW_RETURN_NOT_OK(Reserve(1));
  UnsafeAppendNextOffset();

  // Zero-length values never reach the data buffer (UBSAN safety check).
  const bool has_bytes = length > 0;
  if (ARROW_PREDICT_TRUE(has_bytes)) {
    ARROW_RETURN_NOT_OK(ValidateOverflow(length));
    ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
  }

  UnsafeAppendToBitmap(true);
  return Status::OK();
}
|
||||
|
||||
Status Append(const char* value, offset_type length) {
|
||||
return Append(reinterpret_cast<const uint8_t*>(value), length);
|
||||
}
|
||||
|
||||
Status Append(std::string_view value) {
|
||||
return Append(value.data(), static_cast<offset_type>(value.size()));
|
||||
}
|
||||
|
||||
/// Extend the last appended value by appending more data at the end
|
||||
///
|
||||
/// Unlike Append, this does not create a new offset.
|
||||
Status ExtendCurrent(const uint8_t* value, offset_type length) {
  // Zero-length extensions never reach the data buffer (UBSAN safety check).
  if (ARROW_PREDICT_FALSE(length <= 0)) {
    return Status::OK();
  }
  ARROW_RETURN_NOT_OK(ValidateOverflow(length));
  ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
  return Status::OK();
}
|
||||
|
||||
Status ExtendCurrent(std::string_view value) {
|
||||
return ExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
|
||||
static_cast<offset_type>(value.size()));
|
||||
}
|
||||
|
||||
/// \brief Append a number of nulls
///
/// Each null gets an offset equal to the current value data length, so it
/// adds no bytes to the data buffer.
/// \param[in] length the number of nulls to append
Status AppendNulls(int64_t length) final {
  const auto repeated_offset = static_cast<offset_type>(value_data_builder_.length());
  ARROW_RETURN_NOT_OK(Reserve(length));
  for (int64_t remaining = length; remaining > 0; --remaining) {
    offsets_builder_.UnsafeAppend(repeated_offset);
  }
  UnsafeAppendToBitmap(length, false);
  return Status::OK();
}
|
||||
|
||||
Status AppendNull() final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
UnsafeAppendNextOffset();
|
||||
UnsafeAppendToBitmap(false);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValue() final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
UnsafeAppendNextOffset();
|
||||
UnsafeAppendToBitmap(true);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a number of valid, zero-length values
///
/// Each value gets an offset equal to the current value data length, so it
/// adds no bytes to the data buffer.
/// \param[in] length the number of empty values to append
Status AppendEmptyValues(int64_t length) final {
  const auto repeated_offset = static_cast<offset_type>(value_data_builder_.length());
  ARROW_RETURN_NOT_OK(Reserve(length));
  for (int64_t remaining = length; remaining > 0; --remaining) {
    offsets_builder_.UnsafeAppend(repeated_offset);
  }
  UnsafeAppendToBitmap(length, true);
  return Status::OK();
}
|
||||
|
||||
/// \brief Append without checking capacity
|
||||
///
|
||||
/// Offsets and data should have been presized using Reserve() and
|
||||
/// ReserveData(), respectively.
|
||||
void UnsafeAppend(const uint8_t* value, offset_type length) {
|
||||
UnsafeAppendNextOffset();
|
||||
value_data_builder_.UnsafeAppend(value, length);
|
||||
UnsafeAppendToBitmap(true);
|
||||
}
|
||||
|
||||
void UnsafeAppend(const char* value, offset_type length) {
|
||||
UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
|
||||
}
|
||||
|
||||
void UnsafeAppend(const std::string& value) {
|
||||
UnsafeAppend(value.c_str(), static_cast<offset_type>(value.size()));
|
||||
}
|
||||
|
||||
void UnsafeAppend(std::string_view value) {
|
||||
UnsafeAppend(value.data(), static_cast<offset_type>(value.size()));
|
||||
}
|
||||
|
||||
/// Like ExtendCurrent, but do not check capacity
|
||||
void UnsafeExtendCurrent(const uint8_t* value, offset_type length) {
|
||||
value_data_builder_.UnsafeAppend(value, length);
|
||||
}
|
||||
|
||||
void UnsafeExtendCurrent(std::string_view value) {
|
||||
UnsafeExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
|
||||
static_cast<offset_type>(value.size()));
|
||||
}
|
||||
|
||||
void UnsafeAppendNull() {
|
||||
const int64_t num_bytes = value_data_builder_.length();
|
||||
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
|
||||
UnsafeAppendToBitmap(false);
|
||||
}
|
||||
|
||||
void UnsafeAppendEmptyValue() {
|
||||
const int64_t num_bytes = value_data_builder_.length();
|
||||
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
|
||||
UnsafeAppendToBitmap(true);
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of strings in one shot.
|
||||
///
|
||||
/// \param[in] values a vector of strings
|
||||
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
|
||||
/// indicates a valid (non-null) value
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<std::string>& values,
                    const uint8_t* valid_bytes = NULLPTR) {
  // Size both buffers up front so the loop can use the unchecked appends.
  std::size_t total_length = 0;
  for (const std::string& str : values) {
    total_length += str.size();
  }
  ARROW_RETURN_NOT_OK(Reserve(values.size()));
  ARROW_RETURN_NOT_OK(ReserveData(total_length));

  const std::size_t count = values.size();
  for (std::size_t i = 0; i < count; ++i) {
    UnsafeAppendNextOffset();
    // Bytes are copied for every entry unless valid_bytes marks it as null.
    const bool copy_bytes = valid_bytes == NULLPTR || valid_bytes[i];
    if (copy_bytes) {
      const std::string& str = values[i];
      value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(str.data()),
                                       str.size());
    }
  }

  UnsafeAppendToBitmap(valid_bytes, values.size());
  return Status::OK();
}
|
||||
|
||||
/// \brief Append a sequence of nul-terminated strings in one shot.
|
||||
/// If one of the values is NULL, it is processed as a null
|
||||
/// value even if the corresponding valid_bytes entry is 1.
|
||||
///
|
||||
/// \param[in] values a contiguous C array of nul-terminated char *
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
|
||||
/// indicates a valid (non-null) value
|
||||
/// \return Status
|
||||
Status AppendValues(const char** values, int64_t length,
                    const uint8_t* valid_bytes = NULLPTR) {
  // First pass: measure every non-null string so both buffers can be sized
  // up front and the loops below can use the unchecked appends.
  std::vector<std::size_t> value_lengths(length);
  std::size_t total_length = 0;
  bool have_null_value = false;
  for (int64_t i = 0; i < length; ++i) {
    if (values[i] == NULLPTR) {
      have_null_value = true;
      continue;
    }
    value_lengths[i] = strlen(values[i]);
    total_length += value_lengths[i];
  }
  ARROW_RETURN_NOT_OK(Reserve(length));
  ARROW_RETURN_NOT_OK(ReserveData(total_length));

  // Copies the bytes of values[i] using the length measured above.
  const auto append_bytes = [&](int64_t i) {
    value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
                                     value_lengths[i]);
  };

  if (valid_bytes != NULLPTR) {
    // Validity bytes are flushed in runs. A run is cut whenever an entry is
    // flagged valid but its pointer is NULL, so that entry can be recorded
    // as a null instead.
    int64_t run_start = 0;
    for (int64_t i = 0; i < length; ++i) {
      UnsafeAppendNextOffset();
      if (!valid_bytes[i]) {
        continue;
      }
      if (values[i] != NULLPTR) {
        append_bytes(i);
        continue;
      }
      UnsafeAppendToBitmap(valid_bytes + run_start, i - run_start);
      UnsafeAppendToBitmap(false);
      run_start = i + 1;
    }
    UnsafeAppendToBitmap(valid_bytes + run_start, length - run_start);
    return Status::OK();
  }

  if (!have_null_value) {
    // No validity bytes and no NULL pointers: everything is valid.
    for (int64_t i = 0; i < length; ++i) {
      UnsafeAppendNextOffset();
      append_bytes(i);
    }
    UnsafeAppendToBitmap(NULLPTR, length);
    return Status::OK();
  }

  // No validity bytes were given but some pointers are NULL: derive the
  // validity bytes from the pointers themselves.
  std::vector<uint8_t> valid_vector(length, 0);
  for (int64_t i = 0; i < length; ++i) {
    UnsafeAppendNextOffset();
    if (values[i] == NULLPTR) {
      continue;
    }
    append_bytes(i);
    valid_vector[i] = 1;
  }
  UnsafeAppendToBitmap(valid_vector.data(), length);
  return Status::OK();
}
|
||||
|
||||
/// \brief Append `length` entries of `array`, starting at `offset`
///
/// Reads buffer 0 as the validity bitmap, buffer 1 as the offsets and
/// buffer 2 as the value bytes. Entries whose validity bit is unset are
/// appended as nulls.
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                        int64_t length) override {
  const auto validity = array.GetValues<uint8_t>(0, 0);
  const auto src_offsets = array.GetValues<offset_type>(1);
  const auto src_bytes = array.GetValues<uint8_t>(2, 0);

  // Reserve room for every slot and for all bytes spanned by the slice.
  const auto total_length = src_offsets[offset + length] - src_offsets[offset];
  ARROW_RETURN_NOT_OK(Reserve(length));
  ARROW_RETURN_NOT_OK(ReserveData(total_length));

  for (int64_t i = 0; i < length; ++i) {
    // An absent bitmap means every entry is valid.
    const bool is_null =
        validity && !bit_util::GetBit(validity, array.offset + offset + i);
    if (is_null) {
      UnsafeAppendNull();
      continue;
    }
    const offset_type start = src_offsets[offset + i];
    const offset_type end = src_offsets[offset + i + 1];
    UnsafeAppend(src_bytes + start, end - start);
  }
  return Status::OK();
}
|
||||
|
||||
/// \brief Reset the builder: the base ArrayBuilder state first, then the
/// offsets builder and the value data builder.
void Reset() override {
|
||||
ArrayBuilder::Reset();
|
||||
offsets_builder_.Reset();
|
||||
value_data_builder_.Reset();
|
||||
}
|
||||
|
||||
/// \brief Check that `new_bytes` more bytes of value data still fit under
/// memory_limit().
///
/// \return Status::CapacityError if the resulting size would exceed the limit,
/// Status::OK() otherwise
Status ValidateOverflow(int64_t new_bytes) {
  const auto projected_size = value_data_builder_.length() + new_bytes;
  if (ARROW_PREDICT_FALSE(projected_size > memory_limit())) {
    return Status::CapacityError("array cannot contain more than ", memory_limit(),
                                 " bytes, have ", projected_size);
  }
  return Status::OK();
}
|
||||
|
||||
/// \brief Resize the builder to hold `capacity` elements.
///
/// The requested capacity is first validated with CheckCapacity(); the
/// offsets builder is then resized before the base ArrayBuilder.
Status Resize(int64_t capacity) override {
|
||||
ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
|
||||
// One more than requested for offsets
|
||||
ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
|
||||
return ArrayBuilder::Resize(capacity);
|
||||
}
|
||||
|
||||
/// \brief Ensures there is enough allocated capacity to append the indicated
|
||||
/// number of bytes to the value data buffer without additional allocations
///
/// \param[in] elements number of additional bytes of value data to make room for
/// \return the error from ValidateOverflow() if the value data would exceed
/// memory_limit(), otherwise the result of reserving space in the value data
/// builder
|
||||
Status ReserveData(int64_t elements) {
|
||||
ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
|
||||
return value_data_builder_.Reserve(elements);
|
||||
}
|
||||
|
||||
/// \brief Assemble the accumulated buffers into an ArrayData stored in `*out`,
/// then reset the builder.
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
  // Close the offsets with a final entry: the current length of the value data
  ARROW_RETURN_NOT_OK(AppendNextOffset());

  // The padding of each of these buffers is zeroed by BufferBuilder
  std::shared_ptr<Buffer> offsets_buffer;
  ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets_buffer));
  std::shared_ptr<Buffer> values_buffer;
  ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&values_buffer));
  std::shared_ptr<Buffer> validity_buffer;
  ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&validity_buffer));

  *out = ArrayData::Make(type(), length_,
                         {validity_buffer, offsets_buffer, values_buffer}, null_count_,
                         0);
  Reset();
  return Status::OK();
}
|
||||
|
||||
/// \return data pointer of the value data builder
|
||||
const uint8_t* value_data() const { return value_data_builder_.data(); }
|
||||
/// \return size of values buffer so far (the length of the value data builder)
|
||||
int64_t value_data_length() const { return value_data_builder_.length(); }
|
||||
/// \return capacity of values buffer (the capacity of the value data builder)
|
||||
int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
|
||||
|
||||
/// \return data pointer of the offsets builder
|
||||
const offset_type* offsets_data() const { return offsets_builder_.data(); }
|
||||
|
||||
/// Temporary access to a value.
|
||||
///
|
||||
/// This pointer becomes invalid on the next modifying operation.
|
||||
const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
|
||||
const offset_type* offsets = offsets_builder_.data();
|
||||
const auto offset = offsets[i];
|
||||
if (i == (length_ - 1)) {
|
||||
*out_length = static_cast<offset_type>(value_data_builder_.length()) - offset;
|
||||
} else {
|
||||
*out_length = offsets[i + 1] - offset;
|
||||
}
|
||||
return value_data_builder_.data() + offset;
|
||||
}
|
||||
|
||||
/// \return the entry at index `i` of the offsets data (no bounds checking is
/// performed here)
offset_type offset(int64_t i) const { return offsets_data()[i]; }
|
||||
|
||||
/// Temporary access to a value.
|
||||
///
|
||||
/// This view becomes invalid on the next modifying operation.
|
||||
std::string_view GetView(int64_t i) const {
|
||||
offset_type value_length;
|
||||
const uint8_t* value_data = GetValue(i, &value_length);
|
||||
return std::string_view(reinterpret_cast<const char*>(value_data), value_length);
|
||||
}
|
||||
|
||||
/// \return the limit enforced by ValidateOverflow() on the size of the value
/// data: one less than the largest value representable by offset_type
// Cannot make this a static attribute because of linking issues
|
||||
static constexpr int64_t memory_limit() {
|
||||
return std::numeric_limits<offset_type>::max() - 1;
|
||||
}
|
||||
|
||||
protected:
|
||||
TypedBufferBuilder<offset_type> offsets_builder_;
|
||||
TypedBufferBuilder<uint8_t> value_data_builder_;
|
||||
|
||||
/// \brief Append the current length of the value data as the next offset,
/// growing the offsets builder if needed.
Status AppendNextOffset() {
  return offsets_builder_.Append(
      static_cast<offset_type>(value_data_builder_.length()));
}
|
||||
|
||||
void UnsafeAppendNextOffset() {
|
||||
const int64_t num_bytes = value_data_builder_.length();
|
||||
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
|
||||
}
|
||||
};
|
||||
|
||||
/// \class BinaryBuilder
|
||||
/// \brief Builder class for variable-length binary data
|
||||
class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder<BinaryType> {
|
||||
public:
|
||||
using BaseBinaryBuilder::BaseBinaryBuilder;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<BinaryArray>* out) { return FinishTyped(out); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override { return binary(); }
|
||||
};
|
||||
|
||||
/// \class StringBuilder
|
||||
/// \brief Builder class for UTF8 strings
|
||||
class ARROW_EXPORT StringBuilder : public BinaryBuilder {
|
||||
public:
|
||||
using BinaryBuilder::BinaryBuilder;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<StringArray>* out) { return FinishTyped(out); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override { return utf8(); }
|
||||
};
|
||||
|
||||
/// \class LargeBinaryBuilder
|
||||
/// \brief Builder class for large variable-length binary data
|
||||
class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder<LargeBinaryType> {
|
||||
public:
|
||||
using BaseBinaryBuilder::BaseBinaryBuilder;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<LargeBinaryArray>* out) { return FinishTyped(out); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override { return large_binary(); }
|
||||
};
|
||||
|
||||
/// \class LargeStringBuilder
|
||||
/// \brief Builder class for large UTF8 strings
|
||||
class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
|
||||
public:
|
||||
using LargeBinaryBuilder::LargeBinaryBuilder;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<LargeStringArray>* out) { return FinishTyped(out); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override { return large_utf8(); }
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// BinaryViewBuilder, StringViewBuilder
|
||||
//
|
||||
// These builders do not support building raw pointer view arrays.
|
||||
|
||||
namespace internal {
|
||||
|
||||
// We allocate medium-sized memory chunks and accumulate data in those, which
|
||||
// may result in some waste if there are many large-ish strings. If a string
|
||||
// comes along that does not fit into a block, we allocate a new block and
|
||||
// write into that.
|
||||
//
|
||||
// Later we can implement optimizations to continuing filling underfull blocks
|
||||
// after encountering a large string that required allocating a new block.
|
||||
class ARROW_EXPORT StringHeapBuilder {
|
||||
public:
|
||||
static constexpr int64_t kDefaultBlocksize = 32 << 10; // 32KB
|
||||
|
||||
StringHeapBuilder(MemoryPool* pool, int64_t alignment)
|
||||
: pool_(pool), alignment_(alignment) {}
|
||||
|
||||
void SetBlockSize(int64_t blocksize) { blocksize_ = blocksize; }
|
||||
|
||||
using c_type = BinaryViewType::c_type;
|
||||
|
||||
template <bool Safe>
|
||||
std::conditional_t<Safe, Result<c_type>, c_type> Append(const uint8_t* value,
|
||||
int64_t length) {
|
||||
if (length <= BinaryViewType::kInlineSize) {
|
||||
return util::ToInlineBinaryView(value, static_cast<int32_t>(length));
|
||||
}
|
||||
|
||||
if constexpr (Safe) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
}
|
||||
|
||||
auto v = util::ToNonInlineBinaryView(value, static_cast<int32_t>(length),
|
||||
static_cast<int32_t>(blocks_.size() - 1),
|
||||
current_offset_);
|
||||
|
||||
memcpy(current_out_buffer_, value, static_cast<size_t>(length));
|
||||
current_out_buffer_ += length;
|
||||
current_remaining_bytes_ -= length;
|
||||
current_offset_ += static_cast<int32_t>(length);
|
||||
return v;
|
||||
}
|
||||
|
||||
static constexpr int64_t ValueSizeLimit() {
|
||||
return std::numeric_limits<int32_t>::max();
|
||||
}
|
||||
|
||||
/// \brief Ensure that the indicated number of bytes can be appended via
|
||||
/// UnsafeAppend operations without the need to allocate more memory
|
||||
Status Reserve(int64_t num_bytes) {
|
||||
if (ARROW_PREDICT_FALSE(num_bytes > ValueSizeLimit())) {
|
||||
return Status::CapacityError(
|
||||
"BinaryView or StringView elements cannot reference "
|
||||
"strings larger than 2GB");
|
||||
}
|
||||
if (num_bytes > current_remaining_bytes_) {
|
||||
ARROW_RETURN_NOT_OK(FinishLastBlock());
|
||||
current_remaining_bytes_ = num_bytes > blocksize_ ? num_bytes : blocksize_;
|
||||
ARROW_ASSIGN_OR_RAISE(
|
||||
std::shared_ptr<ResizableBuffer> new_block,
|
||||
AllocateResizableBuffer(current_remaining_bytes_, alignment_, pool_));
|
||||
current_offset_ = 0;
|
||||
current_out_buffer_ = new_block->mutable_data();
|
||||
blocks_.emplace_back(std::move(new_block));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void Reset() {
|
||||
current_offset_ = 0;
|
||||
current_out_buffer_ = NULLPTR;
|
||||
current_remaining_bytes_ = 0;
|
||||
blocks_.clear();
|
||||
}
|
||||
|
||||
int64_t current_remaining_bytes() const { return current_remaining_bytes_; }
|
||||
|
||||
Result<std::vector<std::shared_ptr<ResizableBuffer>>> Finish() {
|
||||
if (!blocks_.empty()) {
|
||||
ARROW_RETURN_NOT_OK(FinishLastBlock());
|
||||
}
|
||||
current_offset_ = 0;
|
||||
current_out_buffer_ = NULLPTR;
|
||||
current_remaining_bytes_ = 0;
|
||||
return std::move(blocks_);
|
||||
}
|
||||
|
||||
private:
|
||||
Status FinishLastBlock() {
|
||||
if (current_remaining_bytes_ > 0) {
|
||||
// Avoid leaking uninitialized bytes from the allocator
|
||||
ARROW_RETURN_NOT_OK(
|
||||
blocks_.back()->Resize(blocks_.back()->size() - current_remaining_bytes_,
|
||||
/*shrink_to_fit=*/true));
|
||||
blocks_.back()->ZeroPadding();
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
MemoryPool* pool_;
|
||||
int64_t alignment_;
|
||||
int64_t blocksize_ = kDefaultBlocksize;
|
||||
std::vector<std::shared_ptr<ResizableBuffer>> blocks_;
|
||||
|
||||
int32_t current_offset_ = 0;
|
||||
uint8_t* current_out_buffer_ = NULLPTR;
|
||||
int64_t current_remaining_bytes_ = 0;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \brief Builder for binary view arrays.
///
/// One view is accumulated per element in `data_builder_`; the bytes of values
/// that are not stored inline in their view are copied into the blocks managed
/// by `data_heap_builder_`.
class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
 public:
  using TypeClass = BinaryViewType;

  // this constructor provided for MakeBuilder compatibility
  BinaryViewBuilder(const std::shared_ptr<DataType>&, MemoryPool* pool);

  explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(),
                             int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        data_builder_(pool, alignment),
        data_heap_builder_(pool, alignment) {}

  /// Set the size for future preallocated data buffers.
  ///
  /// The default size is 32KB, so after each 32KB of string data appended to the builder
  /// a new data buffer will be allocated. Adjust this to a larger value to decrease the
  /// frequency of allocation, or to a smaller value to lower the overhead of each
  /// allocation.
  void SetBlockSize(int64_t blocksize) { data_heap_builder_.SetBlockSize(blocksize); }

  /// The number of bytes which can be appended to this builder without allocating another
  /// data buffer.
  int64_t current_block_bytes_remaining() const {
    return data_heap_builder_.current_remaining_bytes();
  }

  /// \brief Append a value of `length` bytes
  ///
  /// \return the error from Reserve() or from the data heap (for example when
  /// the value is too large or a new data buffer cannot be allocated); in that
  /// case no element has been added to the builder
  Status Append(const uint8_t* value, int64_t length) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    // Run the only remaining fallible step before touching the validity
    // bitmap, so that a failure cannot leave the bitmap one element ahead of
    // the views.
    ARROW_ASSIGN_OR_RAISE(auto v,
                          data_heap_builder_.Append</*Safe=*/true>(value, length));
    UnsafeAppendToBitmap(true);
    data_builder_.UnsafeAppend(v);
    return Status::OK();
  }

  Status Append(const char* value, int64_t length) {
    return Append(reinterpret_cast<const uint8_t*>(value), length);
  }

  Status Append(std::string_view value) {
    return Append(value.data(), static_cast<int64_t>(value.size()));
  }

  /// \brief Append without checking capacity
  ///
  /// Builder should have been presized using Reserve() and ReserveData(),
  /// respectively, and the value must not be larger than 2GB
  void UnsafeAppend(const uint8_t* value, int64_t length) {
    UnsafeAppendToBitmap(true);
    auto v = data_heap_builder_.Append</*Safe=*/false>(value, length);
    data_builder_.UnsafeAppend(v);
  }

  void UnsafeAppend(const char* value, int64_t length) {
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
  }

  void UnsafeAppend(const std::string& value) {
    UnsafeAppend(value.c_str(), static_cast<int64_t>(value.size()));
  }

  void UnsafeAppend(std::string_view value) {
    UnsafeAppend(value.data(), static_cast<int64_t>(value.size()));
  }

  /// \brief Ensures there is enough allocated available capacity in the
  /// out-of-line data heap to append the indicated number of bytes without
  /// additional allocations
  Status ReserveData(int64_t length);

  /// \brief Append several null elements
  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, BinaryViewType::c_type{});
    UnsafeSetNull(length);
    return Status::OK();
  }

  /// \brief Append a single null element
  Status AppendNull() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(false);
    return Status::OK();
  }

  /// \brief Append an empty element (length-0 inline string)
  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(true);
    return Status::OK();
  }

  /// \brief Append several empty elements
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, BinaryViewType::c_type{});
    UnsafeSetNotNull(length);
    return Status::OK();
  }

  /// \brief Append a single null element without checking capacity
  void UnsafeAppendNull() {
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(false);
  }

  /// \brief Append a single empty element without checking capacity
  void UnsafeAppendEmptyValue() {
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(true);
  }

  /// \brief Append a slice of a BinaryViewArray passed as an ArraySpan. Copies
  /// the underlying out-of-line string memory to avoid memory lifetime issues
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override;

  void Reset() override;

  /// \brief Resize the views buffer and the base builder to `capacity`
  /// elements, but never to less than kMinBuilderCapacity
  Status Resize(int64_t capacity) override {
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
    capacity = std::max(capacity, kMinBuilderCapacity);
    ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
    return ArrayBuilder::Resize(capacity);
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  std::shared_ptr<DataType> type() const override { return binary_view(); }

 protected:
  // Accumulates one view per appended element
  TypedBufferBuilder<BinaryViewType::c_type> data_builder_;

  // Accumulates out-of-line data in fixed-size chunks which are then attached
  // to the resulting ArrayData
  internal::StringHeapBuilder data_heap_builder_;
};
|
||||
|
||||
/// \brief Variant of BinaryViewBuilder that inherits its constructors and
/// reports utf8_view() as its data type
class ARROW_EXPORT StringViewBuilder : public BinaryViewBuilder {
|
||||
public:
|
||||
using BinaryViewBuilder::BinaryViewBuilder;
|
||||
std::shared_ptr<DataType> type() const override { return utf8_view(); }
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// FixedSizeBinaryBuilder
|
||||
|
||||
/// \brief Builder for arrays whose values all occupy byte_width() bytes
class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
 public:
  using TypeClass = FixedSizeBinaryType;

  explicit FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
                                  MemoryPool* pool = default_memory_pool(),
                                  int64_t alignment = kDefaultBufferAlignment);

  /// \brief Append one value, read as byte_width() bytes starting at `value`
  Status Append(const uint8_t* value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(value);
    return Status::OK();
  }

  Status Append(const char* value) {
    return Append(reinterpret_cast<const uint8_t*>(value));
  }

  /// \brief Append one value given as a view
  Status Append(std::string_view view) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(view);
    return Status::OK();
  }

  /// \brief Append one value given as a string
  Status Append(const std::string& s) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(s);
    return Status::OK();
  }

  /// \brief Append one value given as the contents of a buffer
  Status Append(const Buffer& s) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(s);
    return Status::OK();
  }

  Status Append(const std::shared_ptr<Buffer>& s) { return Append(*s); }

  /// \brief Append one value given as a fixed-size array of bytes
  template <size_t NBYTES>
  Status Append(const std::array<uint8_t, NBYTES>& value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    const auto* chars = reinterpret_cast<const char*>(value.data());
    UnsafeAppend(std::string_view(chars, value.size()));
    return Status::OK();
  }

  Status AppendValues(const uint8_t* data, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* validity,
                      int64_t bitmap_offset);

  Status AppendNull() final;
  Status AppendNulls(int64_t length) final;

  Status AppendEmptyValue() final;
  Status AppendEmptyValues(int64_t length) final;

  /// \brief Append `length` elements of `array`, starting at element `offset`
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override {
    // Position of the first element of the slice, in elements
    const int64_t first = array.offset + offset;
    const uint8_t* slice_data = array.GetValues<uint8_t>(1, 0) + (first * byte_width_);
    const uint8_t* validity = array.GetValues<uint8_t>(0, 0);
    return AppendValues(slice_data, length, validity, first);
  }

  /// \brief Append one value without checking capacity
  void UnsafeAppend(const uint8_t* value) {
    UnsafeAppendToBitmap(true);
    // Nothing to copy for zero-width values
    if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
      byte_builder_.UnsafeAppend(value, byte_width_);
    }
  }

  void UnsafeAppend(const char* value) {
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value));
  }

  /// \brief Append one value without checking capacity
  ///
  /// The size of the view is only passed to CheckValueSize() when NDEBUG is
  /// not defined.
  void UnsafeAppend(std::string_view value) {
#ifndef NDEBUG
    CheckValueSize(static_cast<size_t>(value.size()));
#endif
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()));
  }

  void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view{s}); }

  void UnsafeAppend(const std::shared_ptr<Buffer>& s) { UnsafeAppend(*s); }

  /// \brief Append a null without checking capacity; its slot is filled with
  /// byte_width() zero bytes
  void UnsafeAppendNull() {
    UnsafeAppendToBitmap(false);
    byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
  }

  /// \brief Check that `new_bytes` more bytes of value data still fit under
  /// memory_limit()
  Status ValidateOverflow(int64_t new_bytes) const {
    const auto projected_size = byte_builder_.length() + new_bytes;
    if (ARROW_PREDICT_FALSE(projected_size > memory_limit())) {
      return Status::CapacityError("array cannot contain more than ", memory_limit(),
                                   " bytes, have ", projected_size);
    }
    return Status::OK();
  }

  /// \brief Ensures there is enough allocated capacity to append the indicated
  /// number of bytes to the value data buffer without additional allocations
  Status ReserveData(int64_t elements) {
    ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
    return byte_builder_.Reserve(elements);
  }

  void Reset() override;
  Status Resize(int64_t capacity) override;
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<FixedSizeBinaryArray>* out) { return FinishTyped(out); }

  /// \return size of values buffer so far
  int64_t value_data_length() const { return byte_builder_.length(); }

  /// \return the width of each value
  int32_t byte_width() const { return byte_width_; }

  /// Temporary access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  const uint8_t* GetValue(int64_t i) const;

  /// Temporary mutable access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  uint8_t* GetMutableValue(int64_t i) {
    return byte_builder_.mutable_data() + i * byte_width_;
  }

  /// Temporary access to a value.
  ///
  /// This view becomes invalid on the next modifying operation.
  std::string_view GetView(int64_t i) const;

  /// Advance builder without allocating nor writing any values
  ///
  /// The internal pointer is advanced by `length` values and the same number
  /// of non-null entries are appended to the validity bitmap.
  /// This method assumes that the `length` values were populated directly,
  /// for example using `GetMutableValue`.
  void UnsafeAdvance(int64_t length) {
    byte_builder_.UnsafeAdvance(length * byte_width_);
    UnsafeAppendToBitmap(length, true);
  }

  /// Advance builder without allocating nor writing any values
  ///
  /// The internal pointer is advanced by `length` values and the same number
  /// of validity bits are appended to the validity bitmap.
  /// This method assumes that the `length` values were populated directly,
  /// for example using `GetMutableValue`.
  void UnsafeAdvance(int64_t length, const uint8_t* validity, int64_t valid_bits_offset) {
    byte_builder_.UnsafeAdvance(length * byte_width_);
    UnsafeAppendToBitmap(validity, valid_bits_offset, length);
  }

  /// \return the limit enforced by ValidateOverflow() on the value data size
  static constexpr int64_t memory_limit() {
    return std::numeric_limits<int64_t>::max() - 1;
  }

  std::shared_ptr<DataType> type() const override {
    return fixed_size_binary(byte_width_);
  }

 protected:
  int32_t byte_width_;
  BufferBuilder byte_builder_;

  void CheckValueSize(int64_t size);
};
|
||||
|
||||
/// @}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Chunked builders: build a sequence of BinaryArray or StringArray that are
|
||||
// limited to a particular size (to the upper limit of 2GB)
|
||||
|
||||
namespace internal {
|
||||
|
||||
class ARROW_EXPORT ChunkedBinaryBuilder {
|
||||
public:
|
||||
explicit ChunkedBinaryBuilder(int32_t max_chunk_value_length,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
ChunkedBinaryBuilder(int32_t max_chunk_value_length, int32_t max_chunk_length,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
virtual ~ChunkedBinaryBuilder() = default;
|
||||
|
||||
Status Append(const uint8_t* value, int32_t length) {
|
||||
if (ARROW_PREDICT_FALSE(length + builder_->value_data_length() >
|
||||
max_chunk_value_length_)) {
|
||||
if (builder_->value_data_length() == 0) {
|
||||
// The current item is larger than max_chunk_size_;
|
||||
// this chunk will be oversize and hold *only* this item
|
||||
ARROW_RETURN_NOT_OK(builder_->Append(value, length));
|
||||
return NextChunk();
|
||||
}
|
||||
// The current item would cause builder_->value_data_length() to exceed
|
||||
// max_chunk_size_, so finish this chunk and append the current item to the next
|
||||
// chunk
|
||||
ARROW_RETURN_NOT_OK(NextChunk());
|
||||
return Append(value, length);
|
||||
}
|
||||
|
||||
if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
|
||||
// The current item would cause builder_->length() to exceed max_chunk_length_, so
|
||||
// finish this chunk and append the current item to the next chunk
|
||||
ARROW_RETURN_NOT_OK(NextChunk());
|
||||
}
|
||||
|
||||
return builder_->Append(value, length);
|
||||
}
|
||||
|
||||
Status Append(std::string_view value) {
|
||||
return Append(reinterpret_cast<const uint8_t*>(value.data()),
|
||||
static_cast<int32_t>(value.size()));
|
||||
}
|
||||
|
||||
Status AppendNull() {
|
||||
if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
|
||||
ARROW_RETURN_NOT_OK(NextChunk());
|
||||
}
|
||||
return builder_->AppendNull();
|
||||
}
|
||||
|
||||
Status Reserve(int64_t values);
|
||||
|
||||
virtual Status Finish(ArrayVector* out);
|
||||
|
||||
protected:
|
||||
Status NextChunk();
|
||||
|
||||
// maximum total character data size per chunk
|
||||
int64_t max_chunk_value_length_;
|
||||
|
||||
// maximum elements allowed per chunk
|
||||
int64_t max_chunk_length_ = kListMaximumElements;
|
||||
|
||||
// when Reserve() would cause builder_ to exceed its max_chunk_length_,
|
||||
// add to extra_capacity_ instead and wait to reserve until the next chunk
|
||||
int64_t extra_capacity_ = 0;
|
||||
|
||||
std::unique_ptr<BinaryBuilder> builder_;
|
||||
std::vector<std::shared_ptr<Array>> chunks_;
|
||||
};
|
||||
|
||||
/// \brief Variant of ChunkedBinaryBuilder that inherits its constructors and
/// overrides Finish() (defined out of line)
class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder {
|
||||
public:
|
||||
using ChunkedBinaryBuilder::ChunkedBinaryBuilder;
|
||||
|
||||
Status Finish(ArrayVector* out) override;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,164 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/array/array_decimal.h"
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/array/builder_binary.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup numeric-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Builder for Decimal32 arrays, layered on FixedSizeBinaryBuilder
class ARROW_EXPORT Decimal32Builder : public FixedSizeBinaryBuilder {
 public:
  using TypeClass = Decimal32Type;
  using ValueType = Decimal32;

  // Keep the inherited overloads visible alongside the ones declared below
  using FixedSizeBinaryBuilder::Append;
  using FixedSizeBinaryBuilder::AppendValues;
  using FixedSizeBinaryBuilder::Reset;

  explicit Decimal32Builder(const std::shared_ptr<DataType>& type,
                            MemoryPool* pool = default_memory_pool(),
                            int64_t alignment = kDefaultBufferAlignment);

  /// \brief The decimal type held by this builder
  std::shared_ptr<DataType> type() const override { return decimal_type_; }

  Status Append(Decimal32 val);
  void UnsafeAppend(Decimal32 val);
  void UnsafeAppend(std::string_view val);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the builder, storing the result in `out` as a Decimal32Array
  Status Finish(std::shared_ptr<Decimal32Array>* out) { return FinishTyped(out); }

 protected:
  std::shared_ptr<Decimal32Type> decimal_type_;
};
|
||||
|
||||
/// \brief Builder for Decimal64 arrays, layered on FixedSizeBinaryBuilder
class ARROW_EXPORT Decimal64Builder : public FixedSizeBinaryBuilder {
 public:
  using TypeClass = Decimal64Type;
  using ValueType = Decimal64;

  // Keep the inherited overloads visible alongside the ones declared below
  using FixedSizeBinaryBuilder::Append;
  using FixedSizeBinaryBuilder::AppendValues;
  using FixedSizeBinaryBuilder::Reset;

  explicit Decimal64Builder(const std::shared_ptr<DataType>& type,
                            MemoryPool* pool = default_memory_pool(),
                            int64_t alignment = kDefaultBufferAlignment);

  /// \brief The decimal type held by this builder
  std::shared_ptr<DataType> type() const override { return decimal_type_; }

  Status Append(Decimal64 val);
  void UnsafeAppend(Decimal64 val);
  void UnsafeAppend(std::string_view val);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the builder, storing the result in `out` as a Decimal64Array
  Status Finish(std::shared_ptr<Decimal64Array>* out) { return FinishTyped(out); }

 protected:
  std::shared_ptr<Decimal64Type> decimal_type_;
};
|
||||
|
||||
/// \brief Builder for Decimal128 arrays, layered on FixedSizeBinaryBuilder
class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
 public:
  using TypeClass = Decimal128Type;
  using ValueType = Decimal128;

  // Keep the inherited overloads visible alongside the ones declared below
  using FixedSizeBinaryBuilder::Append;
  using FixedSizeBinaryBuilder::AppendValues;
  using FixedSizeBinaryBuilder::Reset;

  explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
                             MemoryPool* pool = default_memory_pool(),
                             int64_t alignment = kDefaultBufferAlignment);

  /// \brief The decimal type held by this builder
  std::shared_ptr<DataType> type() const override { return decimal_type_; }

  Status Append(Decimal128 val);
  void UnsafeAppend(Decimal128 val);
  void UnsafeAppend(std::string_view val);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the builder, storing the result in `out` as a Decimal128Array
  Status Finish(std::shared_ptr<Decimal128Array>* out) { return FinishTyped(out); }

 protected:
  std::shared_ptr<Decimal128Type> decimal_type_;
};
|
||||
|
||||
/// \brief Builder for Decimal256 arrays, layered on FixedSizeBinaryBuilder
class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder {
 public:
  using TypeClass = Decimal256Type;
  using ValueType = Decimal256;

  // Keep the inherited overloads visible alongside the ones declared below
  using FixedSizeBinaryBuilder::Append;
  using FixedSizeBinaryBuilder::AppendValues;
  using FixedSizeBinaryBuilder::Reset;

  explicit Decimal256Builder(const std::shared_ptr<DataType>& type,
                             MemoryPool* pool = default_memory_pool(),
                             int64_t alignment = kDefaultBufferAlignment);

  /// \brief The decimal type held by this builder
  std::shared_ptr<DataType> type() const override { return decimal_type_; }

  Status Append(const Decimal256& val);
  void UnsafeAppend(const Decimal256& val);
  void UnsafeAppend(std::string_view val);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the builder, storing the result in `out` as a Decimal256Array
  Status Finish(std::shared_ptr<Decimal256Array>* out) { return FinishTyped(out); }

 protected:
  std::shared_ptr<Decimal256Type> decimal_type_;
};
|
||||
|
||||
/// \brief DecimalBuilder is an alias for the 128-bit decimal builder.
using DecimalBuilder = Decimal128Builder;
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,728 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <type_traits>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/array_binary.h"
|
||||
#include "arrow/array/builder_adaptive.h" // IWYU pragma: export
|
||||
#include "arrow/array/builder_base.h" // IWYU pragma: export
|
||||
#include "arrow/array/builder_primitive.h" // IWYU pragma: export
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/array/util.h"
|
||||
#include "arrow/scalar.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_traits.h"
|
||||
#include "arrow/util/bit_block_counter.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/decimal.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Dictionary builder
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief Maps a value type T to the representation used by the dictionary memo.
///
/// Primary template: the type is its own physical type, and values are passed
/// around as T::c_type.
template <typename T, typename Enable = void>
struct DictionaryValue {
  /// Type tag used to select the memo table overload.
  using PhysicalType = T;
  /// C++ type in which a single dictionary value is passed.
  using type = typename T::c_type;
};
|
||||
|
||||
template <typename T>
|
||||
struct DictionaryValue<T, enable_if_base_binary<T>> {
|
||||
using type = std::string_view;
|
||||
using PhysicalType =
|
||||
typename std::conditional<std::is_same<typename T::offset_type, int32_t>::value,
|
||||
BinaryType, LargeBinaryType>::type;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct DictionaryValue<T, enable_if_binary_view_like<T>> {
|
||||
using type = std::string_view;
|
||||
using PhysicalType = BinaryViewType;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct DictionaryValue<T, enable_if_fixed_size_binary<T>> {
|
||||
using type = std::string_view;
|
||||
using PhysicalType = BinaryType;
|
||||
};
|
||||
|
||||
class ARROW_EXPORT DictionaryMemoTable {
|
||||
public:
|
||||
DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr<DataType>& type);
|
||||
DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr<Array>& dictionary);
|
||||
~DictionaryMemoTable();
|
||||
|
||||
Status GetArrayData(int64_t start_offset, std::shared_ptr<ArrayData>* out);
|
||||
|
||||
/// \brief Insert new memo values
|
||||
Status InsertValues(const Array& values);
|
||||
|
||||
int32_t size() const;
|
||||
|
||||
template <typename T>
|
||||
Status GetOrInsert(typename DictionaryValue<T>::type value, int32_t* out) {
|
||||
// We want to keep the DictionaryMemoTable implementation private, also we can't
|
||||
// use extern template classes because of compiler issues (MinGW?). Instead,
|
||||
// we expose explicit function overrides for each supported physical type.
|
||||
const typename DictionaryValue<T>::PhysicalType* physical_type = NULLPTR;
|
||||
return GetOrInsert(physical_type, value, out);
|
||||
}
|
||||
|
||||
private:
|
||||
Status GetOrInsert(const BooleanType*, bool value, int32_t* out);
|
||||
Status GetOrInsert(const Int8Type*, int8_t value, int32_t* out);
|
||||
Status GetOrInsert(const Int16Type*, int16_t value, int32_t* out);
|
||||
Status GetOrInsert(const Int32Type*, int32_t value, int32_t* out);
|
||||
Status GetOrInsert(const Int64Type*, int64_t value, int32_t* out);
|
||||
Status GetOrInsert(const UInt8Type*, uint8_t value, int32_t* out);
|
||||
Status GetOrInsert(const UInt16Type*, uint16_t value, int32_t* out);
|
||||
Status GetOrInsert(const UInt32Type*, uint32_t value, int32_t* out);
|
||||
Status GetOrInsert(const UInt64Type*, uint64_t value, int32_t* out);
|
||||
Status GetOrInsert(const DurationType*, int64_t value, int32_t* out);
|
||||
Status GetOrInsert(const TimestampType*, int64_t value, int32_t* out);
|
||||
Status GetOrInsert(const Date32Type*, int32_t value, int32_t* out);
|
||||
Status GetOrInsert(const Date64Type*, int64_t value, int32_t* out);
|
||||
Status GetOrInsert(const Time32Type*, int32_t value, int32_t* out);
|
||||
Status GetOrInsert(const Time64Type*, int64_t value, int32_t* out);
|
||||
Status GetOrInsert(const MonthDayNanoIntervalType*,
|
||||
MonthDayNanoIntervalType::MonthDayNanos value, int32_t* out);
|
||||
Status GetOrInsert(const DayTimeIntervalType*,
|
||||
DayTimeIntervalType::DayMilliseconds value, int32_t* out);
|
||||
Status GetOrInsert(const MonthIntervalType*, int32_t value, int32_t* out);
|
||||
Status GetOrInsert(const FloatType*, float value, int32_t* out);
|
||||
Status GetOrInsert(const DoubleType*, double value, int32_t* out);
|
||||
|
||||
Status GetOrInsert(const BinaryType*, std::string_view value, int32_t* out);
|
||||
Status GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out);
|
||||
Status GetOrInsert(const BinaryViewType*, std::string_view value, int32_t* out);
|
||||
|
||||
class DictionaryMemoTableImpl;
|
||||
std::unique_ptr<DictionaryMemoTableImpl> impl_;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \addtogroup dictionary-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief Array builder for created encoded DictionaryArray from
|
||||
/// dense array
|
||||
///
|
||||
/// Unlike other builders, dictionary builder does not completely
|
||||
/// reset the state on Finish calls.
|
||||
template <typename BuilderType, typename T>
|
||||
class DictionaryBuilderBase : public ArrayBuilder {
|
||||
public:
|
||||
using TypeClass = DictionaryType;
|
||||
using Value = typename DictionaryValue<T>::type;
|
||||
|
||||
// WARNING: the type given below is the value type, not the DictionaryType.
|
||||
// The DictionaryType is instantiated on the Finish() call.
|
||||
template <typename B = BuilderType, typename T1 = T>
|
||||
DictionaryBuilderBase(uint8_t start_int_size,
|
||||
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value &&
|
||||
!is_fixed_size_binary_type<T1>::value,
|
||||
const std::shared_ptr<DataType>&>
|
||||
value_type,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
|
||||
delta_offset_(0),
|
||||
byte_width_(-1),
|
||||
indices_builder_(start_int_size, pool, alignment),
|
||||
value_type_(value_type) {}
|
||||
|
||||
template <typename T1 = T>
|
||||
explicit DictionaryBuilderBase(
|
||||
enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
|
||||
value_type,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
|
||||
delta_offset_(0),
|
||||
byte_width_(-1),
|
||||
indices_builder_(pool, alignment),
|
||||
value_type_(value_type) {}
|
||||
|
||||
template <typename T1 = T>
|
||||
explicit DictionaryBuilderBase(
|
||||
const std::shared_ptr<DataType>& index_type,
|
||||
enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
|
||||
value_type,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
|
||||
delta_offset_(0),
|
||||
byte_width_(-1),
|
||||
indices_builder_(index_type, pool, alignment),
|
||||
value_type_(value_type) {}
|
||||
|
||||
template <typename B = BuilderType, typename T1 = T>
|
||||
DictionaryBuilderBase(uint8_t start_int_size,
|
||||
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value &&
|
||||
is_fixed_size_binary_type<T1>::value,
|
||||
const std::shared_ptr<DataType>&>
|
||||
value_type,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
|
||||
delta_offset_(0),
|
||||
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
|
||||
indices_builder_(start_int_size, pool, alignment),
|
||||
value_type_(value_type) {}
|
||||
|
||||
template <typename T1 = T>
|
||||
explicit DictionaryBuilderBase(
|
||||
enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
|
||||
delta_offset_(0),
|
||||
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
|
||||
indices_builder_(pool, alignment),
|
||||
value_type_(value_type) {}
|
||||
|
||||
template <typename T1 = T>
|
||||
explicit DictionaryBuilderBase(
|
||||
const std::shared_ptr<DataType>& index_type,
|
||||
enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
|
||||
delta_offset_(0),
|
||||
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
|
||||
indices_builder_(index_type, pool, alignment),
|
||||
value_type_(value_type) {}
|
||||
|
||||
template <typename T1 = T>
|
||||
explicit DictionaryBuilderBase(
|
||||
enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool())
|
||||
: DictionaryBuilderBase<BuilderType, T1>(TypeTraits<T1>::type_singleton(), pool) {}
|
||||
|
||||
// This constructor doesn't check for errors. Use InsertMemoValues instead.
|
||||
explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, dictionary)),
|
||||
delta_offset_(0),
|
||||
byte_width_(-1),
|
||||
indices_builder_(pool, alignment),
|
||||
value_type_(dictionary->type()) {}
|
||||
|
||||
~DictionaryBuilderBase() override = default;
|
||||
|
||||
/// \brief The current number of entries in the dictionary
|
||||
int64_t dictionary_length() const { return memo_table_->size(); }
|
||||
|
||||
/// \brief The value byte width (for FixedSizeBinaryType)
|
||||
template <typename T1 = T>
|
||||
enable_if_fixed_size_binary<T1, int32_t> byte_width() const {
|
||||
return byte_width_;
|
||||
}
|
||||
|
||||
/// \brief Append a scalar value
|
||||
Status Append(Value value) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
|
||||
int32_t memo_index;
|
||||
ARROW_RETURN_NOT_OK(memo_table_->GetOrInsert<T>(value, &memo_index));
|
||||
ARROW_RETURN_NOT_OK(indices_builder_.Append(memo_index));
|
||||
length_ += 1;
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a fixed-width string (only for FixedSizeBinaryType)
|
||||
template <typename T1 = T>
|
||||
enable_if_fixed_size_binary<T1, Status> Append(const uint8_t* value) {
|
||||
return Append(std::string_view(reinterpret_cast<const char*>(value), byte_width_));
|
||||
}
|
||||
|
||||
/// \brief Append a fixed-width string (only for FixedSizeBinaryType)
|
||||
template <typename T1 = T>
|
||||
enable_if_fixed_size_binary<T1, Status> Append(const char* value) {
|
||||
return Append(std::string_view(value, byte_width_));
|
||||
}
|
||||
|
||||
/// \brief Append a string (only for binary types)
|
||||
template <typename T1 = T>
|
||||
enable_if_binary_like<T1, Status> Append(const uint8_t* value, int32_t length) {
|
||||
return Append(reinterpret_cast<const char*>(value), length);
|
||||
}
|
||||
|
||||
/// \brief Append a string (only for binary types)
|
||||
template <typename T1 = T>
|
||||
enable_if_binary_like<T1, Status> Append(const char* value, int32_t length) {
|
||||
return Append(std::string_view(value, length));
|
||||
}
|
||||
|
||||
/// \brief Append a string (only for string types)
|
||||
template <typename T1 = T>
|
||||
enable_if_string_like<T1, Status> Append(const char* value, int32_t length) {
|
||||
return Append(std::string_view(value, length));
|
||||
}
|
||||
|
||||
/// \brief Append a decimal (only for Decimal32/64/128/256 Type)
|
||||
template <typename T1 = T, typename CType = typename TypeTraits<T1>::CType>
|
||||
enable_if_decimal<T1, Status> Append(const CType& value) {
|
||||
auto bytes = value.ToBytes();
|
||||
return Append(bytes.data(), static_cast<int32_t>(bytes.size()));
|
||||
}
|
||||
|
||||
/// \brief Append a scalar null value
|
||||
Status AppendNull() final {
|
||||
length_ += 1;
|
||||
null_count_ += 1;
|
||||
|
||||
return indices_builder_.AppendNull();
|
||||
}
|
||||
|
||||
Status AppendNulls(int64_t length) final {
|
||||
length_ += length;
|
||||
null_count_ += length;
|
||||
|
||||
return indices_builder_.AppendNulls(length);
|
||||
}
|
||||
|
||||
Status AppendEmptyValue() final {
|
||||
length_ += 1;
|
||||
|
||||
return indices_builder_.AppendEmptyValue();
|
||||
}
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
length_ += length;
|
||||
|
||||
return indices_builder_.AppendEmptyValues(length);
|
||||
}
|
||||
|
||||
Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override {
|
||||
if (!scalar.is_valid) return AppendNulls(n_repeats);
|
||||
|
||||
const auto& dict_ty = internal::checked_cast<const DictionaryType&>(*scalar.type);
|
||||
const DictionaryScalar& dict_scalar =
|
||||
internal::checked_cast<const DictionaryScalar&>(scalar);
|
||||
const auto& dict = internal::checked_cast<const typename TypeTraits<T>::ArrayType&>(
|
||||
*dict_scalar.value.dictionary);
|
||||
ARROW_RETURN_NOT_OK(Reserve(n_repeats));
|
||||
switch (dict_ty.index_type()->id()) {
|
||||
case Type::UINT8:
|
||||
return AppendScalarImpl<UInt8Type>(dict, *dict_scalar.value.index, n_repeats);
|
||||
case Type::INT8:
|
||||
return AppendScalarImpl<Int8Type>(dict, *dict_scalar.value.index, n_repeats);
|
||||
case Type::UINT16:
|
||||
return AppendScalarImpl<UInt16Type>(dict, *dict_scalar.value.index, n_repeats);
|
||||
case Type::INT16:
|
||||
return AppendScalarImpl<Int16Type>(dict, *dict_scalar.value.index, n_repeats);
|
||||
case Type::UINT32:
|
||||
return AppendScalarImpl<UInt32Type>(dict, *dict_scalar.value.index, n_repeats);
|
||||
case Type::INT32:
|
||||
return AppendScalarImpl<Int32Type>(dict, *dict_scalar.value.index, n_repeats);
|
||||
case Type::UINT64:
|
||||
return AppendScalarImpl<UInt64Type>(dict, *dict_scalar.value.index, n_repeats);
|
||||
case Type::INT64:
|
||||
return AppendScalarImpl<Int64Type>(dict, *dict_scalar.value.index, n_repeats);
|
||||
default:
|
||||
return Status::TypeError("Invalid index type: ", dict_ty);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendScalars(const ScalarVector& scalars) override {
|
||||
for (const auto& scalar : scalars) {
|
||||
ARROW_RETURN_NOT_OK(AppendScalar(*scalar, /*n_repeats=*/1));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final {
|
||||
// Visit the indices and insert the unpacked values.
|
||||
const auto& dict_ty = internal::checked_cast<const DictionaryType&>(*array.type);
|
||||
// See if possible to avoid using ToArrayData here
|
||||
const typename TypeTraits<T>::ArrayType dict(array.dictionary().ToArrayData());
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
switch (dict_ty.index_type()->id()) {
|
||||
case Type::UINT8:
|
||||
return AppendArraySliceImpl<uint8_t>(dict, array, offset, length);
|
||||
case Type::INT8:
|
||||
return AppendArraySliceImpl<int8_t>(dict, array, offset, length);
|
||||
case Type::UINT16:
|
||||
return AppendArraySliceImpl<uint16_t>(dict, array, offset, length);
|
||||
case Type::INT16:
|
||||
return AppendArraySliceImpl<int16_t>(dict, array, offset, length);
|
||||
case Type::UINT32:
|
||||
return AppendArraySliceImpl<uint32_t>(dict, array, offset, length);
|
||||
case Type::INT32:
|
||||
return AppendArraySliceImpl<int32_t>(dict, array, offset, length);
|
||||
case Type::UINT64:
|
||||
return AppendArraySliceImpl<uint64_t>(dict, array, offset, length);
|
||||
case Type::INT64:
|
||||
return AppendArraySliceImpl<int64_t>(dict, array, offset, length);
|
||||
default:
|
||||
return Status::TypeError("Invalid index type: ", dict_ty);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Insert values into the dictionary's memo, but do not append any
|
||||
/// indices. Can be used to initialize a new builder with known dictionary
|
||||
/// values
|
||||
/// \param[in] values dictionary values to add to memo. Type must match
|
||||
/// builder type
|
||||
Status InsertMemoValues(const Array& values) {
|
||||
return memo_table_->InsertValues(values);
|
||||
}
|
||||
|
||||
/// \brief Append a whole dense array to the builder
|
||||
template <typename T1 = T>
|
||||
enable_if_t<!is_fixed_size_binary_type<T1>::value, Status> AppendArray(
|
||||
const Array& array) {
|
||||
using ArrayType = typename TypeTraits<T>::ArrayType;
|
||||
|
||||
#ifndef NDEBUG
|
||||
ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
|
||||
value_type_, array, "Wrong value type of array to be appended"));
|
||||
#endif
|
||||
|
||||
const auto& concrete_array = static_cast<const ArrayType&>(array);
|
||||
for (int64_t i = 0; i < array.length(); i++) {
|
||||
if (array.IsNull(i)) {
|
||||
ARROW_RETURN_NOT_OK(AppendNull());
|
||||
} else {
|
||||
ARROW_RETURN_NOT_OK(Append(concrete_array.GetView(i)));
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename T1 = T>
|
||||
enable_if_fixed_size_binary<T1, Status> AppendArray(const Array& array) {
|
||||
#ifndef NDEBUG
|
||||
ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
|
||||
value_type_, array, "Wrong value type of array to be appended"));
|
||||
#endif
|
||||
|
||||
const auto& concrete_array = static_cast<const FixedSizeBinaryArray&>(array);
|
||||
for (int64_t i = 0; i < array.length(); i++) {
|
||||
if (array.IsNull(i)) {
|
||||
ARROW_RETURN_NOT_OK(AppendNull());
|
||||
} else {
|
||||
ARROW_RETURN_NOT_OK(Append(concrete_array.GetValue(i)));
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void Reset() override {
|
||||
// Perform a partial reset. Call ResetFull to also reset the accumulated
|
||||
// dictionary values
|
||||
ArrayBuilder::Reset();
|
||||
indices_builder_.Reset();
|
||||
}
|
||||
|
||||
/// \brief Reset and also clear accumulated dictionary values in memo table
|
||||
void ResetFull() {
|
||||
Reset();
|
||||
memo_table_.reset(new internal::DictionaryMemoTable(pool_, value_type_));
|
||||
}
|
||||
|
||||
Status Resize(int64_t capacity) override {
|
||||
ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
|
||||
capacity = std::max(capacity, kMinBuilderCapacity);
|
||||
ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity));
|
||||
capacity_ = indices_builder_.capacity();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Return dictionary indices and a delta dictionary since the last
|
||||
/// time that Finish or FinishDelta were called, and reset state of builder
|
||||
/// (except the memo table)
|
||||
Status FinishDelta(std::shared_ptr<Array>* out_indices,
|
||||
std::shared_ptr<Array>* out_delta) {
|
||||
std::shared_ptr<ArrayData> indices_data;
|
||||
std::shared_ptr<ArrayData> delta_data;
|
||||
ARROW_RETURN_NOT_OK(FinishWithDictOffset(delta_offset_, &indices_data, &delta_data));
|
||||
*out_indices = MakeArray(indices_data);
|
||||
*out_delta = MakeArray(delta_data);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override {
|
||||
return ::arrow::dictionary(indices_builder_.type(), value_type_);
|
||||
}
|
||||
|
||||
protected:
|
||||
template <typename c_type>
|
||||
Status AppendArraySliceImpl(const typename TypeTraits<T>::ArrayType& dict,
|
||||
const ArraySpan& array, int64_t offset, int64_t length) {
|
||||
const c_type* values = array.GetValues<c_type>(1) + offset;
|
||||
return VisitBitBlocks(
|
||||
array.buffers[0].data, array.offset + offset, length,
|
||||
[&](const int64_t position) {
|
||||
const int64_t index = static_cast<int64_t>(values[position]);
|
||||
if (dict.IsValid(index)) {
|
||||
return Append(dict.GetView(index));
|
||||
}
|
||||
return AppendNull();
|
||||
},
|
||||
[&]() { return AppendNull(); });
|
||||
}
|
||||
|
||||
template <typename IndexType>
|
||||
Status AppendScalarImpl(const typename TypeTraits<T>::ArrayType& dict,
|
||||
const Scalar& index_scalar, int64_t n_repeats) {
|
||||
using ScalarType = typename TypeTraits<IndexType>::ScalarType;
|
||||
const auto index = internal::checked_cast<const ScalarType&>(index_scalar).value;
|
||||
if (index_scalar.is_valid && dict.IsValid(index)) {
|
||||
const auto& value = dict.GetView(index);
|
||||
for (int64_t i = 0; i < n_repeats; i++) {
|
||||
ARROW_RETURN_NOT_OK(Append(value));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
return AppendNulls(n_repeats);
|
||||
}
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
|
||||
std::shared_ptr<ArrayData> dictionary;
|
||||
ARROW_RETURN_NOT_OK(FinishWithDictOffset(/*offset=*/0, out, &dictionary));
|
||||
|
||||
// Set type of array data to the right dictionary type
|
||||
(*out)->type = type();
|
||||
(*out)->dictionary = dictionary;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status FinishWithDictOffset(int64_t dict_offset,
|
||||
std::shared_ptr<ArrayData>* out_indices,
|
||||
std::shared_ptr<ArrayData>* out_dictionary) {
|
||||
// Finalize indices array
|
||||
ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out_indices));
|
||||
|
||||
// Generate dictionary array from hash table contents
|
||||
ARROW_RETURN_NOT_OK(memo_table_->GetArrayData(dict_offset, out_dictionary));
|
||||
delta_offset_ = memo_table_->size();
|
||||
|
||||
// Update internals for further uses of this DictionaryBuilder
|
||||
ArrayBuilder::Reset();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
std::unique_ptr<DictionaryMemoTable> memo_table_;
|
||||
|
||||
// The size of the dictionary memo at last invocation of Finish, to use in
|
||||
// FinishDelta for computing dictionary deltas
|
||||
int32_t delta_offset_;
|
||||
|
||||
// Only used for FixedSizeBinaryType
|
||||
int32_t byte_width_;
|
||||
|
||||
BuilderType indices_builder_;
|
||||
std::shared_ptr<DataType> value_type_;
|
||||
};
|
||||
|
||||
template <typename BuilderType>
|
||||
class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {
|
||||
public:
|
||||
template <typename B = BuilderType>
|
||||
DictionaryBuilderBase(
|
||||
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
|
||||
start_int_size,
|
||||
const std::shared_ptr<DataType>& value_type,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}
|
||||
|
||||
explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& value_type,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), indices_builder_(pool) {}
|
||||
|
||||
explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& index_type,
|
||||
const std::shared_ptr<DataType>& value_type,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), indices_builder_(index_type, pool) {}
|
||||
|
||||
template <typename B = BuilderType>
|
||||
explicit DictionaryBuilderBase(
|
||||
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
|
||||
start_int_size,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}
|
||||
|
||||
explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), indices_builder_(pool) {}
|
||||
|
||||
explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), indices_builder_(pool) {}
|
||||
|
||||
/// \brief Append a scalar null value
|
||||
Status AppendNull() final {
|
||||
length_ += 1;
|
||||
null_count_ += 1;
|
||||
|
||||
return indices_builder_.AppendNull();
|
||||
}
|
||||
|
||||
Status AppendNulls(int64_t length) final {
|
||||
length_ += length;
|
||||
null_count_ += length;
|
||||
|
||||
return indices_builder_.AppendNulls(length);
|
||||
}
|
||||
|
||||
Status AppendEmptyValue() final {
|
||||
length_ += 1;
|
||||
|
||||
return indices_builder_.AppendEmptyValue();
|
||||
}
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
length_ += length;
|
||||
|
||||
return indices_builder_.AppendEmptyValues(length);
|
||||
}
|
||||
|
||||
/// \brief Append a whole dense array to the builder
|
||||
Status AppendArray(const Array& array) {
|
||||
#ifndef NDEBUG
|
||||
ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
|
||||
Type::NA, array, "Wrong value type of array to be appended"));
|
||||
#endif
|
||||
for (int64_t i = 0; i < array.length(); i++) {
|
||||
ARROW_RETURN_NOT_OK(AppendNull());
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Resize(int64_t capacity) override {
|
||||
ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
|
||||
capacity = std::max(capacity, kMinBuilderCapacity);
|
||||
|
||||
ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity));
|
||||
capacity_ = indices_builder_.capacity();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
|
||||
ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out));
|
||||
(*out)->type = dictionary((*out)->type, null());
|
||||
(*out)->dictionary = NullArray(0).data();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override {
|
||||
return ::arrow::dictionary(indices_builder_.type(), null());
|
||||
}
|
||||
|
||||
protected:
|
||||
BuilderType indices_builder_;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \brief A DictionaryArray builder that uses AdaptiveIntBuilder to return the
|
||||
/// smallest index size that can accommodate the dictionary indices
|
||||
template <typename T>
|
||||
class DictionaryBuilder : public internal::DictionaryBuilderBase<AdaptiveIntBuilder, T> {
|
||||
public:
|
||||
using BASE = internal::DictionaryBuilderBase<AdaptiveIntBuilder, T>;
|
||||
using BASE::BASE;
|
||||
|
||||
/// \brief Append dictionary indices directly without modifying memo
|
||||
///
|
||||
/// NOTE: Experimental API
|
||||
Status AppendIndices(const int64_t* values, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR) {
|
||||
int64_t null_count_before = this->indices_builder_.null_count();
|
||||
ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes));
|
||||
this->capacity_ = this->indices_builder_.capacity();
|
||||
this->length_ += length;
|
||||
this->null_count_ += this->indices_builder_.null_count() - null_count_before;
|
||||
return Status::OK();
|
||||
}
|
||||
};
|
||||
|
||||
/// \brief A DictionaryArray builder that always returns int32 dictionary
|
||||
/// indices so that data cast to dictionary form will have a consistent index
|
||||
/// type, e.g. for creating a ChunkedArray
|
||||
template <typename T>
|
||||
class Dictionary32Builder : public internal::DictionaryBuilderBase<Int32Builder, T> {
|
||||
public:
|
||||
using BASE = internal::DictionaryBuilderBase<Int32Builder, T>;
|
||||
using BASE::BASE;
|
||||
|
||||
/// \brief Append dictionary indices directly without modifying memo
|
||||
///
|
||||
/// NOTE: Experimental API
|
||||
Status AppendIndices(const int32_t* values, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR) {
|
||||
int64_t null_count_before = this->indices_builder_.null_count();
|
||||
ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes));
|
||||
this->capacity_ = this->indices_builder_.capacity();
|
||||
this->length_ += length;
|
||||
this->null_count_ += this->indices_builder_.null_count() - null_count_before;
|
||||
return Status::OK();
|
||||
}
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Binary / Unicode builders
|
||||
// (compatibility aliases; those used to be derived classes with additional
|
||||
// Append() overloads, but they have been folded into DictionaryBuilderBase)
|
||||
|
||||
/// \brief DictionaryBuilder (adaptive index width) over BinaryType values.
using BinaryDictionaryBuilder = DictionaryBuilder<BinaryType>;
|
||||
/// \brief DictionaryBuilder (adaptive index width) over StringType values.
using StringDictionaryBuilder = DictionaryBuilder<StringType>;
|
||||
/// \brief Dictionary32Builder (int32 indices) over BinaryType values.
using BinaryDictionary32Builder = Dictionary32Builder<BinaryType>;
|
||||
/// \brief Dictionary32Builder (int32 indices) over StringType values.
using StringDictionary32Builder = Dictionary32Builder<StringType>;
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,836 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_nested.h"
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/buffer_builder.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup nested-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// VarLengthListLikeBuilder
|
||||
|
||||
template <typename TYPE>
|
||||
class VarLengthListLikeBuilder : public ArrayBuilder {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename TypeClass::offset_type;
|
||||
|
||||
/// Use this constructor to incrementally build the value array along with offsets and
|
||||
/// null bitmap.
|
||||
VarLengthListLikeBuilder(MemoryPool* pool,
|
||||
const std::shared_ptr<ArrayBuilder>& value_builder,
|
||||
const std::shared_ptr<DataType>& type,
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
offsets_builder_(pool, alignment),
|
||||
value_builder_(value_builder),
|
||||
value_field_(type->field(0)->WithType(NULLPTR)) {}
|
||||
|
||||
VarLengthListLikeBuilder(MemoryPool* pool,
|
||||
const std::shared_ptr<ArrayBuilder>& value_builder,
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: VarLengthListLikeBuilder(pool, value_builder,
|
||||
std::make_shared<TYPE>(value_builder->type()),
|
||||
alignment) {}
|
||||
|
||||
~VarLengthListLikeBuilder() override = default;
|
||||
|
||||
Status Resize(int64_t capacity) override {
|
||||
if (ARROW_PREDICT_FALSE(capacity > maximum_elements())) {
|
||||
return Status::CapacityError(type_name(),
|
||||
" array cannot reserve space for more than ",
|
||||
maximum_elements(), " got ", capacity);
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
|
||||
|
||||
// One more than requested for list offsets
|
||||
const int64_t offsets_capacity =
|
||||
is_list_view(TYPE::type_id) ? capacity : capacity + 1;
|
||||
ARROW_RETURN_NOT_OK(offsets_builder_.Resize(offsets_capacity));
|
||||
return ArrayBuilder::Resize(capacity);
|
||||
}
|
||||
|
||||
void Reset() override {
|
||||
ArrayBuilder::Reset();
|
||||
offsets_builder_.Reset();
|
||||
value_builder_->Reset();
|
||||
}
|
||||
|
||||
/// \brief Start a new variable-length list slot
|
||||
///
|
||||
/// This function should be called before appending elements to the
|
||||
/// value builder. Elements appended to the value builder before this function
|
||||
/// is called for the first time, will not be members of any list value.
|
||||
///
|
||||
/// After this function is called, list_length elements SHOULD be appended to
|
||||
/// the values builder. If this contract is violated, the behavior is defined by
|
||||
/// the concrete builder implementation and SHOULD NOT be relied upon unless
|
||||
/// the caller is specifically building a [Large]List or [Large]ListView array.
|
||||
///
|
||||
/// For [Large]List arrays, the list slot length will be the number of elements
|
||||
/// appended to the values builder before the next call to Append* or Finish. For
|
||||
/// [Large]ListView arrays, the list slot length will be exactly list_length, but if
|
||||
/// Append* is called before at least list_length elements are appended to the values
|
||||
/// builder, the current list slot will share elements with the next list
|
||||
/// slots or an invalid [Large]ListView array will be generated because there
|
||||
/// aren't enough elements in the values builder to fill the list slots.
|
||||
///
|
||||
/// If you're building a [Large]List and don't need to be compatible
|
||||
/// with [Large]ListView, then `BaseListBuilder::Append(bool is_valid)`
|
||||
/// is a simpler API.
|
||||
///
|
||||
/// \pre if is_valid is false, list_length MUST be 0
|
||||
/// \param is_valid Whether the new list slot is valid
|
||||
/// \param list_length The number of elements in the list
|
||||
Status Append(bool is_valid, int64_t list_length) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
assert(is_valid || list_length == 0);
|
||||
UnsafeAppendToBitmap(is_valid);
|
||||
UnsafeAppendDimensions(/*offset=*/value_builder_->length(), /*size=*/list_length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendNull() final {
|
||||
// Append() a null list slot with list_length=0.
|
||||
//
|
||||
// When building [Large]List arrays, elements being appended to the values builder
|
||||
// before the next call to Append* or Finish will extend the list slot length, but
|
||||
// that is totally fine because list arrays admit non-empty null list slots.
|
||||
//
|
||||
// In the case of [Large]ListViews that's not a problem either because the
|
||||
// list slot length remains zero.
|
||||
return Append(false, 0);
|
||||
}
|
||||
|
||||
Status AppendNulls(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
UnsafeAppendToBitmap(length, false);
|
||||
UnsafeAppendEmptyDimensions(/*num_values=*/length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append an empty list slot
|
||||
///
|
||||
/// \post Another call to Append* or Finish should be made before appending to
|
||||
/// the values builder to ensure list slot remains empty
|
||||
Status AppendEmptyValue() final { return Append(true, 0); }
|
||||
|
||||
/// \brief Append an empty list slot
|
||||
///
|
||||
/// \post Another call to Append* or Finish should be made before appending to
|
||||
/// the values builder to ensure the last list slot remains empty
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
UnsafeAppendToBitmap(length, true);
|
||||
UnsafeAppendEmptyDimensions(/*num_values=*/length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Vector append
|
||||
///
|
||||
/// For list-array builders, the sizes are inferred from the offsets.
|
||||
/// BaseListBuilder<T> provides an implementation that doesn't take sizes, but
|
||||
/// this virtual function allows dispatching calls to both list-array and
|
||||
/// list-view-array builders (which need the sizes)
|
||||
///
|
||||
/// \param offsets The offsets of the variable-length lists
|
||||
/// \param sizes The sizes of the variable-length lists
|
||||
/// \param length The number of offsets, sizes, and validity bits to append
|
||||
/// \param valid_bytes If passed, valid_bytes is of equal length to values,
|
||||
/// and any zero byte will be considered as a null for that slot
|
||||
virtual Status AppendValues(const offset_type* offsets, const offset_type* sizes,
|
||||
int64_t length, const uint8_t* valid_bytes) = 0;
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length) override {
|
||||
const offset_type* offsets = array.GetValues<offset_type>(1);
|
||||
[[maybe_unused]] const offset_type* sizes = NULLPTR;
|
||||
if constexpr (is_list_view(TYPE::type_id)) {
|
||||
sizes = array.GetValues<offset_type>(2);
|
||||
}
|
||||
static_assert(internal::may_have_validity_bitmap(TYPE::type_id));
|
||||
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
for (int64_t row = offset; row < offset + length; row++) {
|
||||
const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row);
|
||||
int64_t size = 0;
|
||||
if (is_valid) {
|
||||
if constexpr (is_list_view(TYPE::type_id)) {
|
||||
size = sizes[row];
|
||||
} else {
|
||||
size = offsets[row + 1] - offsets[row];
|
||||
}
|
||||
}
|
||||
UnsafeAppendToBitmap(is_valid);
|
||||
UnsafeAppendDimensions(/*offset=*/value_builder_->length(), size);
|
||||
if (is_valid) {
|
||||
ARROW_RETURN_NOT_OK(
|
||||
value_builder_->AppendArraySlice(array.child_data[0], offsets[row], size));
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status ValidateOverflow(int64_t new_elements) const {
|
||||
auto new_length = value_builder_->length() + new_elements;
|
||||
if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) {
|
||||
return Status::CapacityError(type_name(), " array cannot contain more than ",
|
||||
maximum_elements(), " elements, have ", new_elements);
|
||||
} else {
|
||||
return Status::OK();
|
||||
}
|
||||
}
|
||||
|
||||
ArrayBuilder* value_builder() const { return value_builder_.get(); }
|
||||
|
||||
// Cannot make this a static attribute because of linking issues
|
||||
static constexpr int64_t maximum_elements() {
|
||||
return std::numeric_limits<offset_type>::max() - 1;
|
||||
}
|
||||
|
||||
std::shared_ptr<DataType> type() const override {
|
||||
return std::make_shared<TYPE>(value_field_->WithType(value_builder_->type()));
|
||||
}
|
||||
|
||||
private:
|
||||
static constexpr const char* type_name() {
|
||||
if constexpr (is_list_view(TYPE::type_id)) {
|
||||
return "ListView";
|
||||
} else {
|
||||
return "List";
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
/// \brief Append dimensions for num_values empty list slots.
|
||||
///
|
||||
/// ListViewBuilder overrides this to also append the sizes.
|
||||
virtual void UnsafeAppendEmptyDimensions(int64_t num_values) {
|
||||
const int64_t offset = value_builder_->length();
|
||||
for (int64_t i = 0; i < num_values; ++i) {
|
||||
offsets_builder_.UnsafeAppend(static_cast<offset_type>(offset));
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Append dimensions for a single list slot.
|
||||
///
|
||||
/// ListViewBuilder overrides this to also append the size.
|
||||
virtual void UnsafeAppendDimensions(int64_t offset, int64_t ARROW_ARG_UNUSED(size)) {
|
||||
offsets_builder_.UnsafeAppend(static_cast<offset_type>(offset));
|
||||
}
|
||||
|
||||
TypedBufferBuilder<offset_type> offsets_builder_;
|
||||
std::shared_ptr<ArrayBuilder> value_builder_;
|
||||
std::shared_ptr<Field> value_field_;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// ListBuilder / LargeListBuilder
|
||||
|
||||
template <typename TYPE>
|
||||
class BaseListBuilder : public VarLengthListLikeBuilder<TYPE> {
|
||||
private:
|
||||
using BASE = VarLengthListLikeBuilder<TYPE>;
|
||||
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename BASE::offset_type;
|
||||
|
||||
using BASE::BASE;
|
||||
|
||||
using BASE::Append;
|
||||
|
||||
~BaseListBuilder() override = default;
|
||||
|
||||
/// \brief Start a new variable-length list slot
|
||||
///
|
||||
/// This function should be called before beginning to append elements to the
|
||||
/// value builder
|
||||
Status Append(bool is_valid = true) {
|
||||
// The value_length parameter to BASE::Append(bool, int64_t) is ignored when
|
||||
// building a list array, so we can pass 0 here.
|
||||
return BASE::Append(is_valid, 0);
|
||||
}
|
||||
|
||||
/// \brief Vector append
|
||||
///
|
||||
/// If passed, valid_bytes is of equal length to values, and any zero byte
|
||||
/// will be considered as a null for that slot
|
||||
Status AppendValues(const offset_type* offsets, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR) {
|
||||
ARROW_RETURN_NOT_OK(this->Reserve(length));
|
||||
this->UnsafeAppendToBitmap(valid_bytes, length);
|
||||
this->offsets_builder_.UnsafeAppend(offsets, length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendValues(const offset_type* offsets, const offset_type* sizes,
|
||||
int64_t length, const uint8_t* valid_bytes) final {
|
||||
// Offsets are assumed to be valid, but the first length-1 sizes have to be
|
||||
// consistent with the offsets to partially rule out the possibility that the
|
||||
// caller is passing sizes that could work if building a list-view, but don't
|
||||
// work on building a list that requires offsets to be non-decreasing.
|
||||
//
|
||||
// CAUTION: the last size element (`sizes[length - 1]`) is not
|
||||
// validated and could be inconsistent with the offsets given in a
|
||||
// subsequent call to AppendValues.
|
||||
#ifndef NDEBUG
|
||||
if (sizes) {
|
||||
for (int64_t i = 0; i < length - 1; ++i) {
|
||||
if (ARROW_PREDICT_FALSE(offsets[i] != offsets[i + 1] - sizes[i])) {
|
||||
if (!valid_bytes || valid_bytes[i]) {
|
||||
return Status::Invalid(
|
||||
"BaseListBuilder: sizes are inconsistent with offsets provided");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return AppendValues(offsets, length, valid_bytes);
|
||||
}
|
||||
|
||||
Status AppendValues(const offset_type* offsets, const offset_type* sizes,
|
||||
int64_t length) {
|
||||
return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR);
|
||||
}
|
||||
|
||||
Status AppendNextOffset() {
|
||||
ARROW_RETURN_NOT_OK(this->ValidateOverflow(0));
|
||||
const int64_t num_values = this->value_builder_->length();
|
||||
return this->offsets_builder_.Append(static_cast<offset_type>(num_values));
|
||||
}
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
|
||||
ARROW_RETURN_NOT_OK(AppendNextOffset());
|
||||
|
||||
// Offset padding zeroed by BufferBuilder
|
||||
std::shared_ptr<Buffer> offsets;
|
||||
std::shared_ptr<Buffer> null_bitmap;
|
||||
ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets));
|
||||
ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&null_bitmap));
|
||||
|
||||
if (this->value_builder_->length() == 0) {
|
||||
// Try to make sure we get a non-null values buffer (ARROW-2744)
|
||||
ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0));
|
||||
}
|
||||
|
||||
std::shared_ptr<ArrayData> items;
|
||||
ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items));
|
||||
|
||||
*out = ArrayData::Make(this->type(), this->length_,
|
||||
{std::move(null_bitmap), std::move(offsets)},
|
||||
{std::move(items)}, this->null_count_);
|
||||
this->Reset();
|
||||
return Status::OK();
|
||||
}
|
||||
};
|
||||
|
||||
/// \class ListBuilder
|
||||
/// \brief Builder class for variable-length list array value types
|
||||
///
|
||||
/// To use this class, you must append values to the child array builder and use
|
||||
/// the Append function to delimit each distinct list value (once the values
|
||||
/// have been appended to the child array) or use the bulk API to append
|
||||
/// a sequence of offsets and null values.
|
||||
///
|
||||
/// A note on types. Per arrow/type.h all types in the c++ implementation are
|
||||
/// logical so even though this class always builds list array, this can
|
||||
/// represent multiple different logical types. If no logical type is provided
|
||||
/// at construction time, the class defaults to List<T> where t is taken from the
|
||||
/// value_builder/values that the object is constructed with.
|
||||
class ARROW_EXPORT ListBuilder : public BaseListBuilder<ListType> {
|
||||
public:
|
||||
using BaseListBuilder::BaseListBuilder;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<ListArray>* out) { return FinishTyped(out); }
|
||||
};
|
||||
|
||||
/// \class LargeListBuilder
|
||||
/// \brief Builder class for large variable-length list array value types
|
||||
///
|
||||
/// Like ListBuilder, but to create large list arrays (with 64-bit offsets).
|
||||
class ARROW_EXPORT LargeListBuilder : public BaseListBuilder<LargeListType> {
|
||||
public:
|
||||
using BaseListBuilder::BaseListBuilder;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<LargeListArray>* out) { return FinishTyped(out); }
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// ListViewBuilder / LargeListViewBuilder
|
||||
|
||||
template <typename TYPE>
|
||||
class BaseListViewBuilder : public VarLengthListLikeBuilder<TYPE> {
|
||||
private:
|
||||
using BASE = VarLengthListLikeBuilder<TYPE>;
|
||||
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename BASE::offset_type;
|
||||
|
||||
using BASE::BASE;
|
||||
|
||||
~BaseListViewBuilder() override = default;
|
||||
|
||||
Status Resize(int64_t capacity) override {
|
||||
ARROW_RETURN_NOT_OK(BASE::Resize(capacity));
|
||||
return sizes_builder_.Resize(capacity);
|
||||
}
|
||||
|
||||
void Reset() override {
|
||||
BASE::Reset();
|
||||
sizes_builder_.Reset();
|
||||
}
|
||||
|
||||
/// \brief Vector append
|
||||
///
|
||||
/// If passed, valid_bytes is of equal length to values, and any zero byte
|
||||
/// will be considered as a null for that slot
|
||||
Status AppendValues(const offset_type* offsets, const offset_type* sizes,
|
||||
int64_t length, const uint8_t* valid_bytes) final {
|
||||
ARROW_RETURN_NOT_OK(this->Reserve(length));
|
||||
this->UnsafeAppendToBitmap(valid_bytes, length);
|
||||
this->offsets_builder_.UnsafeAppend(offsets, length);
|
||||
this->sizes_builder_.UnsafeAppend(sizes, length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendValues(const offset_type* offsets, const offset_type* sizes,
|
||||
int64_t length) {
|
||||
return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR);
|
||||
}
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
|
||||
// Offset and sizes padding zeroed by BufferBuilder
|
||||
std::shared_ptr<Buffer> null_bitmap;
|
||||
std::shared_ptr<Buffer> offsets;
|
||||
std::shared_ptr<Buffer> sizes;
|
||||
ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&null_bitmap));
|
||||
ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets));
|
||||
ARROW_RETURN_NOT_OK(this->sizes_builder_.Finish(&sizes));
|
||||
|
||||
if (this->value_builder_->length() == 0) {
|
||||
// Try to make sure we get a non-null values buffer (ARROW-2744)
|
||||
ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0));
|
||||
}
|
||||
|
||||
std::shared_ptr<ArrayData> items;
|
||||
ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items));
|
||||
|
||||
*out = ArrayData::Make(this->type(), this->length_,
|
||||
{std::move(null_bitmap), std::move(offsets), std::move(sizes)},
|
||||
{std::move(items)}, this->null_count_);
|
||||
this->Reset();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
protected:
|
||||
void UnsafeAppendEmptyDimensions(int64_t num_values) override {
|
||||
for (int64_t i = 0; i < num_values; ++i) {
|
||||
this->offsets_builder_.UnsafeAppend(0);
|
||||
}
|
||||
for (int64_t i = 0; i < num_values; ++i) {
|
||||
this->sizes_builder_.UnsafeAppend(0);
|
||||
}
|
||||
}
|
||||
|
||||
void UnsafeAppendDimensions(int64_t offset, int64_t size) override {
|
||||
this->offsets_builder_.UnsafeAppend(static_cast<offset_type>(offset));
|
||||
this->sizes_builder_.UnsafeAppend(static_cast<offset_type>(size));
|
||||
}
|
||||
|
||||
private:
|
||||
TypedBufferBuilder<offset_type> sizes_builder_;
|
||||
};
|
||||
|
||||
/// \class ListViewBuilder
/// \brief Builder class for list-view arrays (BaseListViewBuilder over ListViewType)
class ARROW_EXPORT ListViewBuilder final : public BaseListViewBuilder<ListViewType> {
 public:
  using BaseListViewBuilder::BaseListViewBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the builder, returning the result as a ListViewArray
  Status Finish(std::shared_ptr<ListViewArray>* out) {
    return FinishTyped(out);
  }
};
|
||||
|
||||
/// \class LargeListViewBuilder
/// \brief Builder class for large list-view arrays (BaseListViewBuilder over
/// LargeListViewType)
class ARROW_EXPORT LargeListViewBuilder final
    : public BaseListViewBuilder<LargeListViewType> {
 public:
  using BaseListViewBuilder::BaseListViewBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the builder, returning the result as a LargeListViewArray
  Status Finish(std::shared_ptr<LargeListViewArray>* out) {
    return FinishTyped(out);
  }
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Map builder
|
||||
|
||||
/// \class MapBuilder
|
||||
/// \brief Builder class for arrays of variable-size maps
|
||||
///
|
||||
/// To use this class, you must use the Append function to delimit each distinct
|
||||
/// map before appending values to the key and item array builders, or use the
|
||||
/// bulk API to append a sequence of offsets and null maps.
|
||||
///
|
||||
/// Key uniqueness and ordering are not validated.
|
||||
class ARROW_EXPORT MapBuilder : public ArrayBuilder {
|
||||
public:
|
||||
/// Use this constructor to define the built array's type explicitly. If key_builder
|
||||
/// or item_builder has indeterminate type, this builder will also.
|
||||
MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
|
||||
const std::shared_ptr<ArrayBuilder>& item_builder,
|
||||
const std::shared_ptr<DataType>& type);
|
||||
|
||||
/// Use this constructor to infer the built array's type. If key_builder or
|
||||
/// item_builder has indeterminate type, this builder will also.
|
||||
MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
|
||||
const std::shared_ptr<ArrayBuilder>& item_builder, bool keys_sorted = false);
|
||||
|
||||
MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& item_builder,
|
||||
const std::shared_ptr<DataType>& type);
|
||||
|
||||
Status Resize(int64_t capacity) override;
|
||||
void Reset() override;
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<MapArray>* out) { return FinishTyped(out); }
|
||||
|
||||
/// \brief Vector append
|
||||
///
|
||||
/// If passed, valid_bytes is of equal length to values, and any zero byte
|
||||
/// will be considered as a null for that slot
|
||||
Status AppendValues(const int32_t* offsets, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR);
|
||||
|
||||
/// \brief Start a new variable-length map slot
|
||||
///
|
||||
/// This function should be called before beginning to append elements to the
|
||||
/// key and item builders
|
||||
Status Append();
|
||||
|
||||
Status AppendNull() final;
|
||||
|
||||
Status AppendNulls(int64_t length) final;
|
||||
|
||||
Status AppendEmptyValue() final;
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final;
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length) override {
|
||||
const auto* offsets = array.GetValues<int32_t>(1);
|
||||
static_assert(internal::may_have_validity_bitmap(MapType::type_id));
|
||||
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
|
||||
for (int64_t row = offset; row < offset + length; row++) {
|
||||
const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row);
|
||||
if (is_valid) {
|
||||
ARROW_RETURN_NOT_OK(Append());
|
||||
const int64_t slot_length = offsets[row + 1] - offsets[row];
|
||||
// Add together the inner StructArray offset to the Map/List offset
|
||||
int64_t key_value_offset = array.child_data[0].offset + offsets[row];
|
||||
ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice(
|
||||
array.child_data[0].child_data[0], key_value_offset, slot_length));
|
||||
ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice(
|
||||
array.child_data[0].child_data[1], key_value_offset, slot_length));
|
||||
} else {
|
||||
ARROW_RETURN_NOT_OK(AppendNull());
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Get builder to append keys.
|
||||
///
|
||||
/// Append a key with this builder should be followed by appending
|
||||
/// an item or null value with item_builder().
|
||||
ArrayBuilder* key_builder() const { return key_builder_.get(); }
|
||||
|
||||
/// \brief Get builder to append items
|
||||
///
|
||||
/// Appending an item with this builder should have been preceded
|
||||
/// by appending a key with key_builder().
|
||||
ArrayBuilder* item_builder() const { return item_builder_.get(); }
|
||||
|
||||
/// \brief Get builder to add Map entries as struct values.
|
||||
///
|
||||
/// This is used instead of key_builder()/item_builder() and allows
|
||||
/// the Map to be built as a list of struct values.
|
||||
ArrayBuilder* value_builder() const { return list_builder_->value_builder(); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override {
|
||||
// Key and Item builder may update types, but they don't contain the field names,
|
||||
// so we need to reconstruct the type. (See ARROW-13735.)
|
||||
return std::make_shared<MapType>(
|
||||
field(entries_name_,
|
||||
struct_({field(key_name_, key_builder_->type(), false),
|
||||
field(item_name_, item_builder_->type(), item_nullable_)}),
|
||||
false),
|
||||
keys_sorted_);
|
||||
}
|
||||
|
||||
Status ValidateOverflow(int64_t new_elements) {
|
||||
return list_builder_->ValidateOverflow(new_elements);
|
||||
}
|
||||
|
||||
protected:
|
||||
inline Status AdjustStructBuilderLength();
|
||||
|
||||
protected:
|
||||
bool keys_sorted_ = false;
|
||||
bool item_nullable_ = false;
|
||||
std::string entries_name_;
|
||||
std::string key_name_;
|
||||
std::string item_name_;
|
||||
std::shared_ptr<ListBuilder> list_builder_;
|
||||
std::shared_ptr<ArrayBuilder> key_builder_;
|
||||
std::shared_ptr<ArrayBuilder> item_builder_;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// FixedSizeList builder
|
||||
|
||||
/// \class FixedSizeListBuilder
|
||||
/// \brief Builder class for fixed-length list array value types
|
||||
class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
|
||||
public:
|
||||
using TypeClass = FixedSizeListType;
|
||||
|
||||
/// Use this constructor to define the built array's type explicitly. If value_builder
|
||||
/// has indeterminate type, this builder will also.
|
||||
FixedSizeListBuilder(MemoryPool* pool,
|
||||
const std::shared_ptr<ArrayBuilder>& value_builder,
|
||||
int32_t list_size);
|
||||
|
||||
/// Use this constructor to infer the built array's type. If value_builder has
|
||||
/// indeterminate type, this builder will also.
|
||||
FixedSizeListBuilder(MemoryPool* pool,
|
||||
const std::shared_ptr<ArrayBuilder>& value_builder,
|
||||
const std::shared_ptr<DataType>& type);
|
||||
|
||||
Status Resize(int64_t capacity) override;
|
||||
void Reset() override;
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<FixedSizeListArray>* out) { return FinishTyped(out); }
|
||||
|
||||
/// \brief Append a valid fixed length list.
|
||||
///
|
||||
/// This function affects only the validity bitmap; the child values must be appended
|
||||
/// using the child array builder.
|
||||
Status Append();
|
||||
|
||||
/// \brief Vector append
|
||||
///
|
||||
/// If passed, valid_bytes will be read and any zero byte
|
||||
/// will cause the corresponding slot to be null
|
||||
///
|
||||
/// This function affects only the validity bitmap; the child values must be appended
|
||||
/// using the child array builder. This includes appending nulls for null lists.
|
||||
/// XXX this restriction is confusing, should this method be omitted?
|
||||
Status AppendValues(int64_t length, const uint8_t* valid_bytes = NULLPTR);
|
||||
|
||||
/// \brief Append a null fixed length list.
|
||||
///
|
||||
/// The child array builder will have the appropriate number of nulls appended
|
||||
/// automatically.
|
||||
Status AppendNull() final;
|
||||
|
||||
/// \brief Append length null fixed length lists.
|
||||
///
|
||||
/// The child array builder will have the appropriate number of nulls appended
|
||||
/// automatically.
|
||||
Status AppendNulls(int64_t length) final;
|
||||
|
||||
Status ValidateOverflow(int64_t new_elements);
|
||||
|
||||
Status AppendEmptyValue() final;
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final;
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final {
|
||||
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
|
||||
for (int64_t row = offset; row < offset + length; row++) {
|
||||
if (!validity || bit_util::GetBit(validity, array.offset + row)) {
|
||||
ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(
|
||||
array.child_data[0], list_size_ * (array.offset + row), list_size_));
|
||||
ARROW_RETURN_NOT_OK(Append());
|
||||
} else {
|
||||
ARROW_RETURN_NOT_OK(AppendNull());
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
ArrayBuilder* value_builder() const { return value_builder_.get(); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override {
|
||||
return fixed_size_list(value_field_->WithType(value_builder_->type()), list_size_);
|
||||
}
|
||||
|
||||
// Cannot make this a static attribute because of linking issues
|
||||
static constexpr int64_t maximum_elements() {
|
||||
return std::numeric_limits<FixedSizeListType::offset_type>::max() - 1;
|
||||
}
|
||||
|
||||
protected:
|
||||
std::shared_ptr<Field> value_field_;
|
||||
const int32_t list_size_;
|
||||
std::shared_ptr<ArrayBuilder> value_builder_;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Struct
|
||||
|
||||
// ---------------------------------------------------------------------------------
|
||||
// StructArray builder
|
||||
/// Append, Resize and Reserve methods are acting on StructBuilder.
|
||||
/// Please make sure all these methods of all child-builders' are consistently
|
||||
/// called to maintain data-structure consistency.
|
||||
class ARROW_EXPORT StructBuilder : public ArrayBuilder {
|
||||
public:
|
||||
/// If any of field_builders has indeterminate type, this builder will also
|
||||
StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
|
||||
std::vector<std::shared_ptr<ArrayBuilder>> field_builders);
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<StructArray>* out) { return FinishTyped(out); }
|
||||
|
||||
/// Null bitmap is of equal length to every child field, and any zero byte
|
||||
/// will be considered as a null for that field, but users must using app-
|
||||
/// end methods or advance methods of the child builders' independently to
|
||||
/// insert data.
|
||||
Status AppendValues(int64_t length, const uint8_t* valid_bytes) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
UnsafeAppendToBitmap(valid_bytes, length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// Append an element to the Struct. All child-builders' Append method must
|
||||
/// be called independently to maintain data-structure consistency.
|
||||
Status Append(bool is_valid = true) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
UnsafeAppendToBitmap(is_valid);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a null value. Automatically appends an empty value to each child
|
||||
/// builder.
|
||||
Status AppendNull() final {
|
||||
for (const auto& field : children_) {
|
||||
ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
|
||||
}
|
||||
return Append(false);
|
||||
}
|
||||
|
||||
/// \brief Append multiple null values. Automatically appends empty values to each
|
||||
/// child builder.
|
||||
Status AppendNulls(int64_t length) final {
|
||||
for (const auto& field : children_) {
|
||||
ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
UnsafeAppendToBitmap(length, false);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValue() final {
|
||||
for (const auto& field : children_) {
|
||||
ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
|
||||
}
|
||||
return Append(true);
|
||||
}
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
for (const auto& field : children_) {
|
||||
ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
UnsafeAppendToBitmap(length, true);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a slice of `array` to this builder and to its child builders.
///
/// Each child builder receives the matching entry of `array.child_data`,
/// sliced at `array.offset + offset`. The validity bits for the same range are
/// then appended to this builder (no validity buffer is read when the array
/// reports that it cannot have nulls).
///
/// \param[in] array the array span to copy from
/// \param[in] offset the start of the slice, relative to `array.offset`
/// \param[in] length the number of slots to append
/// \return Status
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                        int64_t length) override {
  // Position of the slice in the coordinates of the underlying buffers.
  const int64_t start = array.offset + offset;

  const size_t num_children = children_.size();
  for (size_t child = 0; child < num_children; ++child) {
    ARROW_RETURN_NOT_OK(
        children_[child]->AppendArraySlice(array.child_data[child], start, length));
  }

  const uint8_t* validity = NULLPTR;
  if (array.MayHaveNulls()) {
    validity = array.buffers[0].data;
  }

  ARROW_RETURN_NOT_OK(Reserve(length));

  UnsafeAppendToBitmap(validity, start, length);

  return Status::OK();
}
|
||||
|
||||
/// \brief Reset the builder (override of the base-class Reset).
///
/// Only declared here; the definition is not part of this header.
void Reset() override;
|
||||
|
||||
/// \brief Return a non-owning pointer to the i-th child builder.
///
/// The index is used as-is to subscript the children; no bounds check is
/// performed here.
///
/// \param[in] i the zero-based index of the child builder
ArrayBuilder* field_builder(int i) const {
  const auto& child = children_[i];
  return child.get();
}
|
||||
|
||||
/// \brief Return the number of child builders, narrowed to int.
int num_fields() const {
  const auto child_count = children_.size();
  return static_cast<int>(child_count);
}
|
||||
|
||||
/// \brief Return the data type associated with this builder.
///
/// Only declared here; the definition is not part of this header.
std::shared_ptr<DataType> type() const override;
|
||||
|
||||
private:
|
||||
std::shared_ptr<DataType> type_;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,689 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_traits.h"
|
||||
#include "arrow/util/float16.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \brief Builder for arrays of the null type.
///
/// The inline append methods of this class do not write to any buffer: they
/// only advance the length and null-count counters of the builder.
class ARROW_EXPORT NullBuilder : public ArrayBuilder {
 public:
  /// \brief Construct from a memory pool; the alignment argument is unused.
  explicit NullBuilder(MemoryPool* pool = default_memory_pool(),
                       int64_t ARROW_ARG_UNUSED(alignment) = kDefaultBufferAlignment)
      : ArrayBuilder(pool) {}

  /// \brief Construct from a data type (unused), a memory pool and an alignment.
  explicit NullBuilder(const std::shared_ptr<DataType>& ARROW_ARG_UNUSED(type),
                       MemoryPool* pool = default_memory_pool(),
                       int64_t alignment = kDefaultBufferAlignment)
      : NullBuilder(pool, alignment) {}

  /// \brief Append the specified number of null elements
  /// \param[in] length the number of nulls to append; a negative value is
  /// rejected with Status::Invalid
  Status AppendNulls(int64_t length) final {
    if (length >= 0) {
      length_ += length;
      null_count_ += length;
      return Status::OK();
    }
    return Status::Invalid("length must be positive");
  }

  /// \brief Append a single null element
  Status AppendNull() final { return AppendNulls(1); }

  /// \brief Append `length` empty values, which for this builder are nulls
  Status AppendEmptyValues(int64_t length) final { return AppendNulls(length); }

  /// \brief Append a single empty value, which for this builder is a null
  Status AppendEmptyValue() final { return AppendEmptyValues(1); }

  /// \brief Append a single null element (nullptr spelling)
  Status Append(std::nullptr_t) { return AppendNull(); }

  /// \brief Append `length` nulls; the source span and offset are ignored
  Status AppendArraySlice(const ArraySpan&, int64_t, int64_t length) override {
    return AppendNulls(length);
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Return the null data type
  std::shared_ptr<DataType> type() const override { return null(); }

  /// \brief Finish the builder into a NullArray
  Status Finish(std::shared_ptr<NullArray>* out) { return FinishTyped(out); }
};
|
||||
|
||||
/// \addtogroup numeric-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// Base class for all Builders that emit an Array of a scalar numerical type.
|
||||
template <typename T>
|
||||
class NumericBuilder
|
||||
: public ArrayBuilder,
|
||||
public internal::ArrayBuilderExtraOps<NumericBuilder<T>, typename T::c_type> {
|
||||
public:
|
||||
using TypeClass = T;
|
||||
using value_type = typename T::c_type;
|
||||
using ArrayType = typename TypeTraits<T>::ArrayType;
|
||||
|
||||
template <typename T1 = T>
|
||||
explicit NumericBuilder(
|
||||
enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment),
|
||||
type_(TypeTraits<T>::type_singleton()),
|
||||
data_builder_(pool, alignment) {}
|
||||
|
||||
NumericBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: ArrayBuilder(pool, alignment), type_(type), data_builder_(pool, alignment) {}
|
||||
|
||||
/// Append a single scalar and increase the size if necessary.
|
||||
Status Append(const value_type val) {
|
||||
ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1));
|
||||
UnsafeAppend(val);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
|
||||
/// The memory at the corresponding data slot is set to 0 to prevent
|
||||
/// uninitialized memory access
|
||||
Status AppendNulls(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(length, value_type{}); // zero
|
||||
UnsafeSetNull(length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a single null element
|
||||
Status AppendNull() final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
data_builder_.UnsafeAppend(value_type{}); // zero
|
||||
UnsafeAppendToBitmap(false);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a empty element
|
||||
Status AppendEmptyValue() final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
data_builder_.UnsafeAppend(value_type{}); // zero
|
||||
UnsafeAppendToBitmap(true);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append several empty elements
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(length, value_type{}); // zero
|
||||
UnsafeSetNotNull(length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
value_type GetValue(int64_t index) const { return data_builder_.data()[index]; }
|
||||
|
||||
value_type* GetMutableValue(int64_t index) {
|
||||
return &data_builder_.mutable_data()[index];
|
||||
}
|
||||
|
||||
void Reset() override {
|
||||
data_builder_.Reset();
|
||||
ArrayBuilder::Reset();
|
||||
}
|
||||
|
||||
Status Resize(int64_t capacity) override {
|
||||
ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
|
||||
capacity = std::max(capacity, kMinBuilderCapacity);
|
||||
ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
|
||||
return ArrayBuilder::Resize(capacity);
|
||||
}
|
||||
|
||||
value_type operator[](int64_t index) const { return GetValue(index); }
|
||||
|
||||
value_type& operator[](int64_t index) {
|
||||
return reinterpret_cast<value_type*>(data_builder_.mutable_data())[index];
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a contiguous C array of values
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
|
||||
/// indicates a valid (non-null) value
|
||||
/// \return Status
|
||||
Status AppendValues(const value_type* values, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(values, length);
|
||||
// length_ is update by these
|
||||
ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a contiguous C array of values
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] bitmap a validity bitmap to copy (may be null)
|
||||
/// \param[in] bitmap_offset an offset into the validity bitmap
|
||||
/// \return Status
|
||||
Status AppendValues(const value_type* values, int64_t length, const uint8_t* bitmap,
|
||||
int64_t bitmap_offset) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(values, length);
|
||||
// length_ is update by these
|
||||
ArrayBuilder::UnsafeAppendToBitmap(bitmap, bitmap_offset, length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a contiguous C array of values
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
|
||||
/// (0). Equal in length to values
|
||||
/// \return Status
|
||||
Status AppendValues(const value_type* values, int64_t length,
|
||||
const std::vector<bool>& is_valid) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(values, length);
|
||||
// length_ is update by these
|
||||
ArrayBuilder::UnsafeAppendToBitmap(is_valid);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a std::vector of values
|
||||
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
|
||||
/// (0). Equal in length to values
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<value_type>& values,
|
||||
const std::vector<bool>& is_valid) {
|
||||
if (values.empty()) {
|
||||
return Status::OK();
|
||||
}
|
||||
return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a std::vector of values
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<value_type>& values) {
|
||||
if (values.empty()) {
|
||||
return Status::OK();
|
||||
}
|
||||
return AppendValues(values.data(), static_cast<int64_t>(values.size()));
|
||||
}
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
|
||||
ARROW_ASSIGN_OR_RAISE(auto null_bitmap,
|
||||
null_bitmap_builder_.FinishWithLength(length_));
|
||||
ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_));
|
||||
*out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_);
|
||||
capacity_ = length_ = null_count_ = 0;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<ArrayType>* out) { return FinishTyped(out); }
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values_begin InputIterator to the beginning of the values
|
||||
/// \param[in] values_end InputIterator pointing to the end of the values
|
||||
/// \return Status
|
||||
template <typename ValuesIter>
|
||||
Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
|
||||
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(values_begin, values_end);
|
||||
// this updates the length_
|
||||
UnsafeSetNotNull(length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot, with a specified nullmap
|
||||
/// \param[in] values_begin InputIterator to the beginning of the values
|
||||
/// \param[in] values_end InputIterator pointing to the end of the values
|
||||
/// \param[in] valid_begin InputIterator with elements indication valid(1)
|
||||
/// or null(0) values.
|
||||
/// \return Status
|
||||
template <typename ValuesIter, typename ValidIter>
|
||||
enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
|
||||
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
|
||||
static_assert(!internal::is_null_pointer<ValidIter>::value,
|
||||
"Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
|
||||
"version instead");
|
||||
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(values_begin, values_end);
|
||||
null_bitmap_builder_.UnsafeAppend<true>(
|
||||
length, [&valid_begin]() -> bool { return *valid_begin++; });
|
||||
length_ = null_bitmap_builder_.length();
|
||||
null_count_ = null_bitmap_builder_.false_count();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Same as above, with a pointer type ValidIter
|
||||
template <typename ValuesIter, typename ValidIter>
|
||||
enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
|
||||
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
|
||||
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(values_begin, values_end);
|
||||
// this updates the length_
|
||||
if (valid_begin == NULLPTR) {
|
||||
UnsafeSetNotNull(length);
|
||||
} else {
|
||||
null_bitmap_builder_.UnsafeAppend<true>(
|
||||
length, [&valid_begin]() -> bool { return *valid_begin++; });
|
||||
length_ = null_bitmap_builder_.length();
|
||||
null_count_ = null_bitmap_builder_.false_count();
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length) override {
|
||||
return AppendValues(array.GetValues<value_type>(1) + offset, length,
|
||||
array.GetValues<uint8_t>(0, 0), array.offset + offset);
|
||||
}
|
||||
|
||||
/// Append a single scalar under the assumption that the underlying Buffer is
|
||||
/// large enough.
|
||||
///
|
||||
/// This method does not capacity-check; make sure to call Reserve
|
||||
/// beforehand.
|
||||
void UnsafeAppend(const value_type val) {
|
||||
ArrayBuilder::UnsafeAppendToBitmap(true);
|
||||
data_builder_.UnsafeAppend(val);
|
||||
}
|
||||
|
||||
void UnsafeAppendNull() {
|
||||
ArrayBuilder::UnsafeAppendToBitmap(false);
|
||||
data_builder_.UnsafeAppend(value_type{}); // zero
|
||||
}
|
||||
|
||||
/// Advance builder without allocating nor writing any values
|
||||
///
|
||||
/// The internal pointer is advanced by `length` values and the same number
|
||||
/// of non-null entries are appended to the validity bitmap.
|
||||
/// This method assumes that the `length` values were populated directly,
|
||||
/// for example using `GetMutableValue`.
|
||||
void UnsafeAdvance(int64_t length) {
|
||||
data_builder_.UnsafeAdvance(length);
|
||||
UnsafeAppendToBitmap(length, true);
|
||||
}
|
||||
|
||||
/// Advance builder without allocating nor writing any values
|
||||
///
|
||||
/// The internal pointer is advanced by `length` values and the same number
|
||||
/// of validity bits are appended to the validity bitmap.
|
||||
/// This method assumes that the `length` values were populated directly,
|
||||
/// for example using `GetMutableValue`.
|
||||
void UnsafeAdvance(int64_t length, const uint8_t* validity, int64_t valid_bits_offset) {
|
||||
data_builder_.UnsafeAdvance(length);
|
||||
UnsafeAppendToBitmap(validity, valid_bits_offset, length);
|
||||
}
|
||||
|
||||
std::shared_ptr<DataType> type() const override { return type_; }
|
||||
|
||||
protected:
|
||||
std::shared_ptr<DataType> type_;
|
||||
TypedBufferBuilder<value_type> data_builder_;
|
||||
};
|
||||
|
||||
// Builders
|
||||
|
||||
// Unsigned integer builders
using UInt8Builder = NumericBuilder<UInt8Type>;
using UInt16Builder = NumericBuilder<UInt16Type>;
using UInt32Builder = NumericBuilder<UInt32Type>;
using UInt64Builder = NumericBuilder<UInt64Type>;

// Signed integer builders
using Int8Builder = NumericBuilder<Int8Type>;
using Int16Builder = NumericBuilder<Int16Type>;
using Int32Builder = NumericBuilder<Int32Type>;
using Int64Builder = NumericBuilder<Int64Type>;

// Floating-point builders
using FloatBuilder = NumericBuilder<FloatType>;
using DoubleBuilder = NumericBuilder<DoubleType>;
|
||||
|
||||
/// @}
|
||||
|
||||
/// \addtogroup temporal-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
// Date and time-of-day builders
using Date32Builder = NumericBuilder<Date32Type>;
using Date64Builder = NumericBuilder<Date64Type>;
using Time32Builder = NumericBuilder<Time32Type>;
using Time64Builder = NumericBuilder<Time64Type>;

// Timestamp, month interval and duration builders
using TimestampBuilder = NumericBuilder<TimestampType>;
using MonthIntervalBuilder = NumericBuilder<MonthIntervalType>;
using DurationBuilder = NumericBuilder<DurationType>;
|
||||
|
||||
/// @}
|
||||
|
||||
/// \addtogroup numeric-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Builder for half-precision float arrays.
///
/// Extends NumericBuilder<HalfFloatType> with overloads that accept
/// arrow::util::Float16 in addition to the raw value_type representation; the
/// Float16 overloads forward the bit pattern of the value to the base class.
class ARROW_EXPORT HalfFloatBuilder : public NumericBuilder<HalfFloatType> {
 public:
  using BaseClass = NumericBuilder<HalfFloatType>;
  using Float16 = arrow::util::Float16;

  using BaseClass::Append;
  using BaseClass::AppendValues;
  using BaseClass::BaseClass;
  using BaseClass::GetValue;
  using BaseClass::UnsafeAppend;

  /// Scalar append an arrow::util::Float16
  Status Append(const Float16 val) { return Append(val.bits()); }

  /// Scalar append an arrow::util::Float16, without checking for capacity
  void UnsafeAppend(const Float16 val) { UnsafeAppend(val.bits()); }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous array of arrow::util::Float16
  /// \param[in] length the number of values to append
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const Float16* values, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR) {
    return BaseClass::AppendValues(reinterpret_cast<const uint16_t*>(values), length,
                                   valid_bytes);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous array of arrow::util::Float16
  /// \param[in] length the number of values to append
  /// \param[in] bitmap a validity bitmap to copy (may be null)
  /// \param[in] bitmap_offset an offset into the validity bitmap
  /// \return Status
  Status AppendValues(const Float16* values, int64_t length, const uint8_t* bitmap,
                      int64_t bitmap_offset) {
    return BaseClass::AppendValues(reinterpret_cast<const uint16_t*>(values), length,
                                   bitmap, bitmap_offset);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous array of arrow::util::Float16
  /// \param[in] length the number of values to append
  /// \param[in] is_valid a std::vector<bool> indicating valid (1) or null
  /// (0). Equal in length to values
  /// \return Status
  Status AppendValues(const Float16* values, int64_t length,
                      const std::vector<bool>& is_valid) {
    return BaseClass::AppendValues(reinterpret_cast<const uint16_t*>(values), length,
                                   is_valid);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a std::vector<arrow::util::Float16>
  /// \param[in] is_valid a std::vector<bool> indicating valid (1) or null
  /// (0). Equal in length to values
  /// \return Status
  Status AppendValues(const std::vector<Float16>& values,
                      const std::vector<bool>& is_valid) {
    // Same guard as the base-class vector overloads: an empty vector may hand
    // out a null data() pointer, which must not reach the raw append path.
    if (values.empty()) {
      return Status::OK();
    }
    return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a std::vector<arrow::util::Float16>
  /// \return Status
  Status AppendValues(const std::vector<Float16>& values) {
    // Same guard as the base-class vector overloads (see above).
    if (values.empty()) {
      return Status::OK();
    }
    return AppendValues(values.data(), static_cast<int64_t>(values.size()));
  }

  /// \brief Append one value many times in one shot
  /// \param[in] length the number of values to append
  /// \param[in] value an arrow::util::Float16
  /// \return Status
  Status AppendValues(int64_t length, Float16 value) {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, value.bits());
    ArrayBuilder::UnsafeSetNotNull(length);
    return Status::OK();
  }

  /// \brief Get the value at a certain index
  /// \param[in] index the zero-based index
  /// @tparam T arrow::util::Float16 or value_type (uint16_t)
  template <typename T = BaseClass::value_type>
  T GetValue(int64_t index) const {
    static_assert(std::is_same_v<T, BaseClass::value_type> ||
                  std::is_same_v<T, arrow::util::Float16>);
    if constexpr (std::is_same_v<T, BaseClass::value_type>) {
      // Raw representation requested: return the stored bits unchanged.
      return BaseClass::GetValue(index);
    } else {
      // Float16 requested: rebuild it from the stored bits.
      return Float16::FromBits(BaseClass::GetValue(index));
    }
  }
};
|
||||
|
||||
/// @}
|
||||
|
||||
class ARROW_EXPORT BooleanBuilder
|
||||
: public ArrayBuilder,
|
||||
public internal::ArrayBuilderExtraOps<BooleanBuilder, bool> {
|
||||
public:
|
||||
using TypeClass = BooleanType;
|
||||
using value_type = bool;
|
||||
|
||||
explicit BooleanBuilder(MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment);
|
||||
|
||||
BooleanBuilder(const std::shared_ptr<DataType>& type,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment);
|
||||
|
||||
/// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
|
||||
Status AppendNulls(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(length, false);
|
||||
UnsafeSetNull(length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendNull() final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
UnsafeAppendNull();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValue() final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
data_builder_.UnsafeAppend(false);
|
||||
UnsafeSetNotNull(1);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(length, false);
|
||||
UnsafeSetNotNull(length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// Scalar append
|
||||
Status Append(const bool val) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
UnsafeAppend(val);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Append(const uint8_t val) { return Append(val != 0); }
|
||||
|
||||
/// Scalar append, without checking for capacity
|
||||
void UnsafeAppend(const bool val) {
|
||||
data_builder_.UnsafeAppend(val);
|
||||
UnsafeAppendToBitmap(true);
|
||||
}
|
||||
|
||||
void UnsafeAppendNull() {
|
||||
data_builder_.UnsafeAppend(false);
|
||||
UnsafeAppendToBitmap(false);
|
||||
}
|
||||
|
||||
void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); }
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a contiguous array of bytes (non-zero is 1)
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
|
||||
/// indicates a valid (non-null) value
|
||||
/// \return Status
|
||||
Status AppendValues(const uint8_t* values, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a bitmap of values
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] validity a validity bitmap to copy (may be null)
|
||||
/// \param[in] offset an offset into the values and validity bitmaps
|
||||
/// \return Status
|
||||
Status AppendValues(const uint8_t* values, int64_t length, const uint8_t* validity,
|
||||
int64_t offset);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a contiguous C array of values
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
|
||||
/// (0). Equal in length to values
|
||||
/// \return Status
|
||||
Status AppendValues(const uint8_t* values, int64_t length,
|
||||
const std::vector<bool>& is_valid);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a std::vector of bytes
|
||||
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
|
||||
/// (0). Equal in length to values
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<uint8_t>& values,
|
||||
const std::vector<bool>& is_valid);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a std::vector of bytes
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<uint8_t>& values);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values an std::vector<bool> indicating true (1) or false
|
||||
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
|
||||
/// (0). Equal in length to values
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<bool>& values, const std::vector<bool>& is_valid);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values an std::vector<bool> indicating true (1) or false
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<bool>& values);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values_begin InputIterator to the beginning of the values
|
||||
/// \param[in] values_end InputIterator pointing to the end of the values
|
||||
/// or null(0) values
|
||||
/// \return Status
|
||||
template <typename ValuesIter>
|
||||
Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
|
||||
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend<false>(
|
||||
length, [&values_begin]() -> bool { return *values_begin++; });
|
||||
// this updates length_
|
||||
UnsafeSetNotNull(length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot, with a specified nullmap
|
||||
/// \param[in] values_begin InputIterator to the beginning of the values
|
||||
/// \param[in] values_end InputIterator pointing to the end of the values
|
||||
/// \param[in] valid_begin InputIterator with elements indication valid(1)
|
||||
/// or null(0) values
|
||||
/// \return Status
|
||||
template <typename ValuesIter, typename ValidIter>
|
||||
enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
|
||||
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
|
||||
static_assert(!internal::is_null_pointer<ValidIter>::value,
|
||||
"Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
|
||||
"version instead");
|
||||
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
|
||||
data_builder_.UnsafeAppend<false>(
|
||||
length, [&values_begin]() -> bool { return *values_begin++; });
|
||||
null_bitmap_builder_.UnsafeAppend<true>(
|
||||
length, [&valid_begin]() -> bool { return *valid_begin++; });
|
||||
length_ = null_bitmap_builder_.length();
|
||||
null_count_ = null_bitmap_builder_.false_count();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Same as above, for a pointer type ValidIter
|
||||
template <typename ValuesIter, typename ValidIter>
|
||||
enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
|
||||
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
|
||||
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend<false>(
|
||||
length, [&values_begin]() -> bool { return *values_begin++; });
|
||||
|
||||
if (valid_begin == NULLPTR) {
|
||||
UnsafeSetNotNull(length);
|
||||
} else {
|
||||
null_bitmap_builder_.UnsafeAppend<true>(
|
||||
length, [&valid_begin]() -> bool { return *valid_begin++; });
|
||||
}
|
||||
length_ = null_bitmap_builder_.length();
|
||||
null_count_ = null_bitmap_builder_.false_count();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendValues(int64_t length, bool value);
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length) override {
|
||||
return AppendValues(array.GetValues<uint8_t>(1, 0), length,
|
||||
array.GetValues<uint8_t>(0, 0), array.offset + offset);
|
||||
}
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<BooleanArray>* out) { return FinishTyped(out); }
|
||||
|
||||
void Reset() override;
|
||||
Status Resize(int64_t capacity) override;
|
||||
|
||||
std::shared_ptr<DataType> type() const override { return boolean(); }
|
||||
|
||||
protected:
|
||||
TypedBufferBuilder<bool> data_builder_;
|
||||
};
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,303 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array.h"
|
||||
#include "arrow/array/builder_base.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup run-end-encoded-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief An ArrayBuilder that deduplicates repeated values as they are
|
||||
/// appended to the inner-ArrayBuilder and reports the length of the current run
|
||||
/// of identical values.
|
||||
///
|
||||
/// The following sequence of calls
|
||||
///
|
||||
/// Append(2)
|
||||
/// Append(2)
|
||||
/// Append(2)
|
||||
/// Append(7)
|
||||
/// Append(7)
|
||||
/// Append(2)
|
||||
/// FinishInternal()
|
||||
///
|
||||
/// will cause the inner-builder to receive only 3 Append calls
|
||||
///
|
||||
/// Append(2)
|
||||
/// Append(7)
|
||||
/// Append(2)
|
||||
/// FinishInternal()
|
||||
///
|
||||
/// Note that values returned by length(), null_count() and capacity() are
|
||||
/// related to the compressed array built by the inner-ArrayBuilder.
|
||||
class RunCompressorBuilder : public ArrayBuilder {
|
||||
public:
|
||||
RunCompressorBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> inner_builder,
|
||||
std::shared_ptr<DataType> type);
|
||||
|
||||
~RunCompressorBuilder() override;
|
||||
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(RunCompressorBuilder);
|
||||
|
||||
/// \brief Called right before a run is being closed
|
||||
///
|
||||
/// Subclasses can override this function to perform an additional action when
|
||||
/// a run is closed (i.e. run-length is known and value is appended to the
|
||||
/// inner builder).
|
||||
///
|
||||
/// \param value can be NULLPTR if closing a run of NULLs
|
||||
/// \param length the greater than 0 length of the value run being closed
|
||||
virtual Status WillCloseRun(const std::shared_ptr<const Scalar>& value,
|
||||
int64_t length) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Called right before a run of empty values is being closed
|
||||
///
|
||||
/// Subclasses can override this function to perform an additional action when
|
||||
/// a run of empty values is appended (i.e. run-length is known and a single
|
||||
/// empty value is appended to the inner builder).
|
||||
///
|
||||
/// \param length the greater than 0 length of the value run being closed
|
||||
virtual Status WillCloseRunOfEmptyValues(int64_t length) { return Status::OK(); }
|
||||
|
||||
/// \brief Allocate enough memory for a given number of array elements.
|
||||
///
|
||||
/// NOTE: Conservatively resizing a run-length compressed array for a given
|
||||
/// number of logical elements is not possible, since the physical length will
|
||||
/// vary depending on the values to be appended in the future. But we can
|
||||
/// pessimistically assume that each run will contain a single value and
|
||||
/// allocate that number of runs.
|
||||
Status Resize(int64_t capacity) override { return ResizePhysical(capacity); }
|
||||
|
||||
/// \brief Allocate enough memory for a given number of runs.
|
||||
///
|
||||
/// Like Resize on non-encoded builders, it does not account for variable size
|
||||
/// data.
|
||||
Status ResizePhysical(int64_t capacity);
|
||||
|
||||
Status ReservePhysical(int64_t additional_capacity) {
|
||||
return Reserve(additional_capacity);
|
||||
}
|
||||
|
||||
void Reset() override;
|
||||
|
||||
Status AppendNull() final { return AppendNulls(1); }
|
||||
Status AppendNulls(int64_t length) override;
|
||||
|
||||
Status AppendEmptyValue() final { return AppendEmptyValues(1); }
|
||||
Status AppendEmptyValues(int64_t length) override;
|
||||
|
||||
Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override;
|
||||
Status AppendScalars(const ScalarVector& scalars) override;
|
||||
|
||||
// AppendArraySlice() is not implemented.
|
||||
|
||||
/// \brief Append a slice of an array containing values from already
|
||||
/// compressed runs.
|
||||
///
|
||||
/// NOTE: WillCloseRun() is not called as the length of each run cannot be
|
||||
/// determined at this point. Caller should ensure that !has_open_run() by
|
||||
/// calling FinishCurrentRun() before calling this.
|
||||
///
|
||||
/// Pre-condition: !has_open_run()
|
||||
Status AppendRunCompressedArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length);
|
||||
|
||||
/// \brief Forces the closing of the current run if one is currently open.
|
||||
///
|
||||
/// This can be called when one wants to ensure the current run will not be
|
||||
/// extended. This may cause identical values to appear close to each other in
|
||||
/// the underlying array (i.e. two runs that could be a single run) if more
|
||||
/// values are appended after this is called.
|
||||
///
|
||||
/// Finish() and FinishInternal() call this automatically.
|
||||
virtual Status FinishCurrentRun();
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
ArrayBuilder& inner_builder() const { return *inner_builder_; }
|
||||
|
||||
std::shared_ptr<DataType> type() const override { return inner_builder_->type(); }
|
||||
|
||||
bool has_open_run() const { return current_run_length_ > 0; }
|
||||
int64_t open_run_length() const { return current_run_length_; }
|
||||
|
||||
private:
|
||||
inline void UpdateDimensions() {
|
||||
capacity_ = inner_builder_->capacity();
|
||||
length_ = inner_builder_->length();
|
||||
null_count_ = inner_builder_->null_count();
|
||||
}
|
||||
|
||||
private:
|
||||
std::shared_ptr<ArrayBuilder> inner_builder_;
|
||||
std::shared_ptr<const Scalar> current_value_ = NULLPTR;
|
||||
int64_t current_run_length_ = 0;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// RunEndEncoded builder
|
||||
|
||||
/// \brief Run-end encoded array builder.
|
||||
///
|
||||
/// NOTE: the value returned by and capacity() is related to the
|
||||
/// compressed array (physical) and not the decoded array (logical) that is
|
||||
/// run-end encoded. null_count() always returns 0. length(), on the other hand,
|
||||
/// returns the logical length of the run-end encoded array.
|
||||
class ARROW_EXPORT RunEndEncodedBuilder : public ArrayBuilder {
|
||||
private:
|
||||
// An internal::RunCompressorBuilder that produces a run-end in the
|
||||
// RunEndEncodedBuilder every time a value-run is closed.
|
||||
class ValueRunBuilder : public internal::RunCompressorBuilder {
|
||||
public:
|
||||
ValueRunBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& value_builder,
|
||||
const std::shared_ptr<DataType>& value_type,
|
||||
RunEndEncodedBuilder& ree_builder);
|
||||
|
||||
~ValueRunBuilder() override = default;
|
||||
|
||||
Status WillCloseRun(const std::shared_ptr<const Scalar>&, int64_t length) override {
|
||||
return ree_builder_.CloseRun(length);
|
||||
}
|
||||
|
||||
Status WillCloseRunOfEmptyValues(int64_t length) override {
|
||||
return ree_builder_.CloseRun(length);
|
||||
}
|
||||
|
||||
private:
|
||||
RunEndEncodedBuilder& ree_builder_;
|
||||
};
|
||||
|
||||
public:
|
||||
RunEndEncodedBuilder(MemoryPool* pool,
|
||||
const std::shared_ptr<ArrayBuilder>& run_end_builder,
|
||||
const std::shared_ptr<ArrayBuilder>& value_builder,
|
||||
std::shared_ptr<DataType> type);
|
||||
|
||||
/// \brief Allocate enough memory for a given number of array elements.
|
||||
///
|
||||
/// NOTE: Conservatively resizing an REE for a given number of logical
|
||||
/// elements is not possible, since the physical length will vary depending on
|
||||
/// the values to be appended in the future. But we can pessimistically assume
|
||||
/// that each run will contain a single value and allocate that number of
|
||||
/// runs.
|
||||
Status Resize(int64_t capacity) override { return ResizePhysical(capacity); }
|
||||
|
||||
/// \brief Allocate enough memory for a given number of runs.
|
||||
Status ResizePhysical(int64_t capacity);
|
||||
|
||||
/// \brief Ensure that there is enough space allocated to append the indicated
|
||||
/// number of run without any further reallocation. Overallocation is
|
||||
/// used in order to minimize the impact of incremental ReservePhysical() calls.
|
||||
/// Note that additional_capacity is relative to the current number of elements
|
||||
/// rather than to the current capacity, so calls to Reserve() which are not
|
||||
/// interspersed with addition of new elements may not increase the capacity.
|
||||
///
|
||||
/// \param[in] additional_capacity the number of additional runs
|
||||
/// \return Status
|
||||
Status ReservePhysical(int64_t additional_capacity) {
|
||||
return Reserve(additional_capacity);
|
||||
}
|
||||
|
||||
void Reset() override;
|
||||
|
||||
Status AppendNull() final { return AppendNulls(1); }
|
||||
Status AppendNulls(int64_t length) override;
|
||||
|
||||
Status AppendEmptyValue() final { return AppendEmptyValues(1); }
|
||||
Status AppendEmptyValues(int64_t length) override;
|
||||
Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override;
|
||||
Status AppendScalars(const ScalarVector& scalars) override;
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length) override;
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<RunEndEncodedArray>* out) { return FinishTyped(out); }
|
||||
|
||||
/// \brief Forces the closing of the current run if one is currently open.
|
||||
///
|
||||
/// This can be called when one wants to ensure the current run will not be
|
||||
/// extended. This may cause identical values to appear close to each other in
|
||||
/// the values array (i.e. two runs that could be a single run) if more
|
||||
/// values are appended after this is called.
|
||||
Status FinishCurrentRun();
|
||||
|
||||
std::shared_ptr<DataType> type() const override;
|
||||
|
||||
private:
|
||||
/// \brief Update physical capacity and logical length
|
||||
///
|
||||
/// \param committed_logical_length number of logical values that have been
|
||||
/// committed to the values array
|
||||
/// \param open_run_length number of logical values in the currently open run if any
|
||||
inline void UpdateDimensions(int64_t committed_logical_length,
|
||||
int64_t open_run_length) {
|
||||
capacity_ = run_end_builder().capacity();
|
||||
length_ = committed_logical_length + open_run_length;
|
||||
committed_logical_length_ = committed_logical_length;
|
||||
}
|
||||
|
||||
// Pre-condition: !value_run_builder_.has_open_run()
|
||||
template <typename RunEndCType>
|
||||
Status DoAppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length);
|
||||
|
||||
template <typename RunEndCType>
|
||||
Status DoAppendRunEnd(int64_t run_end);
|
||||
|
||||
/// \brief Cast run_end to the appropriate type and appends it to the run_ends
|
||||
/// array.
|
||||
Status AppendRunEnd(int64_t run_end);
|
||||
|
||||
/// \brief Close a run by appending a value to the run_ends array and updating
|
||||
/// length_ to reflect the new run.
|
||||
///
|
||||
/// Pre-condition: run_length > 0.
|
||||
[[nodiscard]] Status CloseRun(int64_t run_length);
|
||||
|
||||
ArrayBuilder& run_end_builder();
|
||||
ArrayBuilder& value_builder();
|
||||
|
||||
private:
|
||||
std::shared_ptr<RunEndEncodedType> type_;
|
||||
ValueRunBuilder* value_run_builder_;
|
||||
// The length not counting the current open run in the value_run_builder_
|
||||
int64_t committed_logical_length_ = 0;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,66 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Contains declarations of time related Arrow builder types.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/array/builder_primitive.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup temporal-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
// TODO(ARROW-7938): this class is untested
|
||||
|
||||
class ARROW_EXPORT DayTimeIntervalBuilder : public NumericBuilder<DayTimeIntervalType> {
|
||||
public:
|
||||
using DayMilliseconds = DayTimeIntervalType::DayMilliseconds;
|
||||
|
||||
explicit DayTimeIntervalBuilder(MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: DayTimeIntervalBuilder(day_time_interval(), pool, alignment) {}
|
||||
|
||||
explicit DayTimeIntervalBuilder(std::shared_ptr<DataType> type,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: NumericBuilder<DayTimeIntervalType>(type, pool, alignment) {}
|
||||
};
|
||||
|
||||
class ARROW_EXPORT MonthDayNanoIntervalBuilder
|
||||
: public NumericBuilder<MonthDayNanoIntervalType> {
|
||||
public:
|
||||
using MonthDayNanos = MonthDayNanoIntervalType::MonthDayNanos;
|
||||
|
||||
explicit MonthDayNanoIntervalBuilder(MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: MonthDayNanoIntervalBuilder(month_day_nano_interval(), pool, alignment) {}
|
||||
|
||||
explicit MonthDayNanoIntervalBuilder(std::shared_ptr<DataType> type,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: NumericBuilder<MonthDayNanoIntervalType>(type, pool, alignment) {}
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,254 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_nested.h"
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/buffer_builder.h"
|
||||
#include "arrow/memory_pool.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup nested-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Base class for union array builds.
|
||||
///
|
||||
/// Note that while we subclass ArrayBuilder, as union types do not have a
|
||||
/// validity bitmap, the bitmap builder member of ArrayBuilder is not used.
|
||||
class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder {
|
||||
public:
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<UnionArray>* out) { return FinishTyped(out); }
|
||||
|
||||
/// \brief Make a new child builder available to the UnionArray
|
||||
///
|
||||
/// \param[in] new_child the child builder
|
||||
/// \param[in] field_name the name of the field in the union array type
|
||||
/// if type inference is used
|
||||
/// \return child index, which is the "type" argument that needs
|
||||
/// to be passed to the "Append" method to add a new element to
|
||||
/// the union array.
|
||||
int8_t AppendChild(const std::shared_ptr<ArrayBuilder>& new_child,
|
||||
const std::string& field_name = "");
|
||||
|
||||
std::shared_ptr<DataType> type() const override;
|
||||
|
||||
int64_t length() const override { return types_builder_.length(); }
|
||||
|
||||
protected:
|
||||
BasicUnionBuilder(MemoryPool* pool, int64_t alignment,
|
||||
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
|
||||
const std::shared_ptr<DataType>& type);
|
||||
|
||||
int8_t NextTypeId();
|
||||
|
||||
std::vector<std::shared_ptr<Field>> child_fields_;
|
||||
std::vector<int8_t> type_codes_;
|
||||
UnionMode::type mode_;
|
||||
|
||||
std::vector<ArrayBuilder*> type_id_to_children_;
|
||||
std::vector<int> type_id_to_child_id_;
|
||||
// for all type_id < dense_type_id_, type_id_to_children_[type_id] != nullptr
|
||||
int8_t dense_type_id_ = 0;
|
||||
TypedBufferBuilder<int8_t> types_builder_;
|
||||
};
|
||||
|
||||
/// \class DenseUnionBuilder
|
||||
///
|
||||
/// This API is EXPERIMENTAL.
|
||||
class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder {
|
||||
public:
|
||||
/// Use this constructor to initialize the UnionBuilder with no child builders,
|
||||
/// allowing type to be inferred. You will need to call AppendChild for each of the
|
||||
/// children builders you want to use.
|
||||
explicit DenseUnionBuilder(MemoryPool* pool,
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: BasicUnionBuilder(pool, alignment, {}, dense_union(FieldVector{})),
|
||||
offsets_builder_(pool, alignment) {}
|
||||
|
||||
/// Use this constructor to specify the type explicitly.
|
||||
/// You can still add child builders to the union after using this constructor
|
||||
DenseUnionBuilder(MemoryPool* pool,
|
||||
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
|
||||
const std::shared_ptr<DataType>& type,
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: BasicUnionBuilder(pool, alignment, children, type),
|
||||
offsets_builder_(pool, alignment) {}
|
||||
|
||||
Status AppendNull() final {
|
||||
const int8_t first_child_code = type_codes_[0];
|
||||
ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
|
||||
ARROW_RETURN_NOT_OK(
|
||||
offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
|
||||
// Append a null arbitrarily to the first child
|
||||
return child_builder->AppendNull();
|
||||
}
|
||||
|
||||
Status AppendNulls(int64_t length) final {
|
||||
const int8_t first_child_code = type_codes_[0];
|
||||
ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
|
||||
ARROW_RETURN_NOT_OK(
|
||||
offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
|
||||
// Append just a single null to the first child
|
||||
return child_builder->AppendNull();
|
||||
}
|
||||
|
||||
Status AppendEmptyValue() final {
|
||||
const int8_t first_child_code = type_codes_[0];
|
||||
ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
|
||||
ARROW_RETURN_NOT_OK(
|
||||
offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
|
||||
// Append an empty value arbitrarily to the first child
|
||||
return child_builder->AppendEmptyValue();
|
||||
}
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
const int8_t first_child_code = type_codes_[0];
|
||||
ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
|
||||
ARROW_RETURN_NOT_OK(
|
||||
offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
|
||||
// Append just a single empty value to the first child
|
||||
return child_builder->AppendEmptyValue();
|
||||
}
|
||||
|
||||
/// \brief Append an element to the UnionArray. This must be followed
|
||||
/// by an append to the appropriate child builder.
|
||||
///
|
||||
/// \param[in] next_type type_id of the child to which the next value will be appended.
|
||||
///
|
||||
/// The corresponding child builder must be appended to independently after this method
|
||||
/// is called.
|
||||
Status Append(int8_t next_type) {
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(next_type));
|
||||
if (type_id_to_children_[next_type]->length() == kListMaximumElements) {
|
||||
return Status::CapacityError(
|
||||
"a dense UnionArray cannot contain more than 2^31 - 1 elements from a single "
|
||||
"child");
|
||||
}
|
||||
auto offset = static_cast<int32_t>(type_id_to_children_[next_type]->length());
|
||||
return offsets_builder_.Append(offset);
|
||||
}
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length) override;
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
private:
|
||||
TypedBufferBuilder<int32_t> offsets_builder_;
|
||||
};
|
||||
|
||||
/// \class SparseUnionBuilder
|
||||
///
|
||||
/// This API is EXPERIMENTAL.
|
||||
class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder {
|
||||
public:
|
||||
/// Use this constructor to initialize the UnionBuilder with no child builders,
|
||||
/// allowing type to be inferred. You will need to call AppendChild for each of the
|
||||
/// children builders you want to use.
|
||||
explicit SparseUnionBuilder(MemoryPool* pool,
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: BasicUnionBuilder(pool, alignment, {}, sparse_union(FieldVector{})) {}
|
||||
|
||||
/// Use this constructor to specify the type explicitly.
|
||||
/// You can still add child builders to the union after using this constructor
|
||||
SparseUnionBuilder(MemoryPool* pool,
|
||||
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
|
||||
const std::shared_ptr<DataType>& type,
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: BasicUnionBuilder(pool, alignment, children, type) {}
|
||||
|
||||
/// \brief Append a null value.
|
||||
///
|
||||
/// A null is appended to the first child, empty values to the other children.
|
||||
Status AppendNull() final {
|
||||
const auto first_child_code = type_codes_[0];
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
|
||||
ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNull());
|
||||
for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
|
||||
ARROW_RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendEmptyValue());
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append multiple null values.
|
||||
///
|
||||
/// Nulls are appended to the first child, empty values to the other children.
|
||||
Status AppendNulls(int64_t length) final {
|
||||
const auto first_child_code = type_codes_[0];
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
|
||||
ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNulls(length));
|
||||
for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
|
||||
ARROW_RETURN_NOT_OK(
|
||||
type_id_to_children_[type_codes_[i]]->AppendEmptyValues(length));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValue() final {
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(type_codes_[0]));
|
||||
for (int8_t code : type_codes_) {
|
||||
ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValue());
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0]));
|
||||
for (int8_t code : type_codes_) {
|
||||
ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValues(length));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append an element to the UnionArray. This must be followed
|
||||
/// by an append to the appropriate child builder.
|
||||
///
|
||||
/// \param[in] next_type type_id of the child to which the next value will be appended.
|
||||
///
|
||||
/// The corresponding child builder must be appended to independently after this method
|
||||
/// is called, and all other child builders must have null or empty value appended.
|
||||
Status Append(int8_t next_type) { return types_builder_.Append(next_type); }
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length) override;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,53 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
/// \brief Concatenate arrays
|
||||
///
|
||||
/// \param[in] arrays a vector of arrays to be concatenated
|
||||
/// \param[in] pool memory to store the result will be allocated from this memory pool
|
||||
/// \param[out] out_suggested_cast if a non-OK Result is returned, the function might set
|
||||
/// out_suggested_cast to a cast suggestion that would allow concatenating the arrays
|
||||
/// without overflow of offsets (e.g. string to large_string)
|
||||
///
|
||||
/// \return the concatenated array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays, MemoryPool* pool,
|
||||
std::shared_ptr<DataType>* out_suggested_cast);
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \brief Concatenate arrays
|
||||
///
|
||||
/// \param[in] arrays a vector of arrays to be concatenated
|
||||
/// \param[in] pool memory to store the result will be allocated from this memory pool
|
||||
/// \return the concatenated array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,750 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <atomic> // IWYU pragma: export
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/statistics.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/span.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace internal {
|
||||
// ----------------------------------------------------------------------
|
||||
// Null handling for types without a validity bitmap and the dictionary type
|
||||
|
||||
// Per-element logical-null queries. Only the declarations are visible here.
// NOTE(review): presumably each reports whether element `i` of `data` is
// logically null for the named type -- confirm against the definitions.
ARROW_EXPORT bool IsNullSparseUnion(const ArrayData& data, int64_t i);
ARROW_EXPORT bool IsNullDenseUnion(const ArrayData& data, int64_t i);
ARROW_EXPORT bool IsNullRunEndEncoded(const ArrayData& data, int64_t i);

// Whole-array queries.
// NOTE(review): judging by the names, these report whether `data` may contain
// logical nulls for the named type -- confirm against the definitions.
ARROW_EXPORT bool UnionMayHaveLogicalNulls(const ArrayData& data);
ARROW_EXPORT bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data);
ARROW_EXPORT bool DictionaryMayHaveLogicalNulls(const ArrayData& data);
|
||||
|
||||
} // namespace internal
|
||||
|
||||
// When slicing, we do not know the null count of the sliced range without
|
||||
// doing some computation. To avoid doing this eagerly, we set the null count
|
||||
// to -1 (any negative number will do). When Array::null_count is called the
|
||||
// first time, the null count will be computed. See ARROW-33
|
||||
// Sentinel: a negative null count marks the value as not yet computed.
constexpr int64_t kUnknownNullCount{-1};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Generic array data container
|
||||
|
||||
/// \class ArrayData
|
||||
/// \brief Mutable container for generic Arrow array data
|
||||
///
|
||||
/// This data structure is a self-contained representation of the memory and
|
||||
/// metadata inside an Arrow array data structure (called vectors in Java). The
|
||||
/// Array class and its concrete subclasses provide strongly-typed accessors
|
||||
/// with support for the visitor pattern and other affordances.
|
||||
///
|
||||
/// This class is designed for easy internal data manipulation, analytical data
|
||||
/// processing, and data transport to and from IPC messages.
|
||||
///
|
||||
/// This class is also useful in an analytics setting where memory may be
|
||||
/// efficiently reused. For example, computing the Abs of a numeric array
|
||||
/// should return null iff the input is null: therefore, an Abs function can
|
||||
/// reuse the validity bitmap (a Buffer) of its input as the validity bitmap
|
||||
/// of its output.
|
||||
///
|
||||
/// This class is meant mostly for immutable data access. Any mutable access
|
||||
/// (either to ArrayData members or to the contents of its Buffers) should take
|
||||
/// into account the fact that ArrayData instances are typically wrapped in a
|
||||
/// shared_ptr and can therefore have multiple owners at any given time.
|
||||
/// Therefore, mutable access is discouraged except when initially populating
|
||||
/// the ArrayData.
|
||||
struct ARROW_EXPORT ArrayData {
|
||||
ArrayData() = default;
|
||||
|
||||
ArrayData(std::shared_ptr<DataType> type, int64_t length,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
|
||||
: type(std::move(type)), length(length), null_count(null_count), offset(offset) {}
|
||||
|
||||
ArrayData(std::shared_ptr<DataType> type, int64_t length,
|
||||
std::vector<std::shared_ptr<Buffer>> buffers,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
|
||||
: ArrayData(std::move(type), length, null_count, offset) {
|
||||
this->buffers = std::move(buffers);
|
||||
#ifndef NDEBUG
|
||||
// in debug mode, call the `device_type` function to trigger
|
||||
// the DCHECKs that validate all the buffers are on the same device
|
||||
ARROW_UNUSED(this->device_type());
|
||||
#endif
|
||||
}
|
||||
|
||||
ArrayData(std::shared_ptr<DataType> type, int64_t length,
|
||||
std::vector<std::shared_ptr<Buffer>> buffers,
|
||||
std::vector<std::shared_ptr<ArrayData>> child_data,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
|
||||
: ArrayData(std::move(type), length, null_count, offset) {
|
||||
this->buffers = std::move(buffers);
|
||||
this->child_data = std::move(child_data);
|
||||
#ifndef NDEBUG
|
||||
// in debug mode, call the `device_type` function to trigger
|
||||
// the DCHECKs that validate all the buffers (including children)
|
||||
// are on the same device
|
||||
ARROW_UNUSED(this->device_type());
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
|
||||
std::vector<std::shared_ptr<Buffer>> buffers,
|
||||
int64_t null_count = kUnknownNullCount,
|
||||
int64_t offset = 0);
|
||||
|
||||
static std::shared_ptr<ArrayData> Make(
|
||||
std::shared_ptr<DataType> type, int64_t length,
|
||||
std::vector<std::shared_ptr<Buffer>> buffers,
|
||||
std::vector<std::shared_ptr<ArrayData>> child_data,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
static std::shared_ptr<ArrayData> Make(
|
||||
std::shared_ptr<DataType> type, int64_t length,
|
||||
std::vector<std::shared_ptr<Buffer>> buffers,
|
||||
std::vector<std::shared_ptr<ArrayData>> child_data,
|
||||
std::shared_ptr<ArrayData> dictionary, int64_t null_count = kUnknownNullCount,
|
||||
int64_t offset = 0);
|
||||
|
||||
static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
|
||||
int64_t null_count = kUnknownNullCount,
|
||||
int64_t offset = 0);
|
||||
|
||||
// Move constructor
|
||||
ArrayData(ArrayData&& other) noexcept
|
||||
: type(std::move(other.type)),
|
||||
length(other.length),
|
||||
null_count(other.null_count.load()),
|
||||
offset(other.offset),
|
||||
buffers(std::move(other.buffers)),
|
||||
child_data(std::move(other.child_data)),
|
||||
dictionary(std::move(other.dictionary)),
|
||||
statistics(std::move(other.statistics)) {}
|
||||
|
||||
// Copy constructor
|
||||
ArrayData(const ArrayData& other) noexcept
|
||||
: type(other.type),
|
||||
length(other.length),
|
||||
null_count(other.null_count.load()),
|
||||
offset(other.offset),
|
||||
buffers(other.buffers),
|
||||
child_data(other.child_data),
|
||||
dictionary(other.dictionary),
|
||||
statistics(other.statistics) {}
|
||||
|
||||
// Move assignment
|
||||
// Takes over `type`, `buffers`, `child_data`, `dictionary` and `statistics`
// from `other` and copies `length`, `offset` and the cached null count.
//
// Declared noexcept: every step (shared_ptr / vector move assignment, integer
// copies, atomic store) is non-throwing, and a noexcept move lets standard
// containers move ArrayData objects instead of copying them when they
// reallocate.
//
// \param other the object to move from; left in a valid but unspecified state
// \return a reference to this object
ArrayData& operator=(ArrayData&& other) noexcept {
  // Self-move guard: move-assigning a std::vector onto itself leaves it in an
  // unspecified state (typically emptied), which would silently drop this
  // object's buffers and children.
  if (this != &other) {
    type = std::move(other.type);
    length = other.length;
    // std::atomic is not assignable from another atomic; read the value
    // (implicit load) and store it.
    SetNullCount(other.null_count);
    offset = other.offset;
    buffers = std::move(other.buffers);
    child_data = std::move(other.child_data);
    dictionary = std::move(other.dictionary);
    statistics = std::move(other.statistics);
  }
  return *this;
}
|
||||
|
||||
// Copy assignment
|
||||
// Member-wise copy assignment. The shared_ptr members are copied, so the
// underlying objects end up shared with `source`.
ArrayData& operator=(const ArrayData& source) {
  this->type = source.type;
  this->length = source.length;
  // std::atomic has no copy assignment: read the value, then store it.
  this->null_count.store(source.null_count.load());
  this->offset = source.offset;
  this->buffers = source.buffers;
  this->child_data = source.child_data;
  this->dictionary = source.dictionary;
  this->statistics = source.statistics;
  return *this;
}
|
||||
|
||||
/// \brief Return a shallow copy of this ArrayData
|
||||
std::shared_ptr<ArrayData> Copy() const { return std::make_shared<ArrayData>(*this); }
|
||||
|
||||
/// \brief Deep copy this ArrayData to destination memory manager
|
||||
///
|
||||
/// Returns a new ArrayData object with buffers and all child buffers
|
||||
/// copied to the destination memory manager. This includes dictionaries
|
||||
/// if applicable.
|
||||
Result<std::shared_ptr<ArrayData>> CopyTo(
|
||||
const std::shared_ptr<MemoryManager>& to) const;
|
||||
|
||||
/// \brief View or copy this ArrayData to destination memory manager
|
||||
///
|
||||
/// Tries to view the buffer contents on the given memory manager's device
|
||||
/// if possible (to avoid a copy) but falls back to copying if a no-copy view
|
||||
/// isn't supported.
|
||||
Result<std::shared_ptr<ArrayData>> ViewOrCopyTo(
|
||||
const std::shared_ptr<MemoryManager>& to) const;
|
||||
|
||||
/// \brief Return the null-ness of a given array element
|
||||
///
|
||||
/// Calling `IsNull(i)` is the same as `!IsValid(i)`.
|
||||
bool IsNull(int64_t i) const { return !IsValid(i); }
|
||||
|
||||
/// \brief Return the validity of a given array element
|
||||
///
|
||||
/// For most data types, this will simply query the validity bitmap.
|
||||
/// For union and run-end-encoded arrays, the underlying child data is
|
||||
/// queried instead.
|
||||
/// For dictionary arrays, this reflects the validity of the dictionary
|
||||
/// index, but the corresponding dictionary value might still be null.
|
||||
/// For null arrays, this always returns false.
|
||||
bool IsValid(int64_t i) const {
|
||||
if (buffers[0] != NULLPTR) {
|
||||
return bit_util::GetBit(buffers[0]->data(), i + offset);
|
||||
}
|
||||
const auto type = this->type->id();
|
||||
if (type == Type::SPARSE_UNION) {
|
||||
return !internal::IsNullSparseUnion(*this, i);
|
||||
}
|
||||
if (type == Type::DENSE_UNION) {
|
||||
return !internal::IsNullDenseUnion(*this, i);
|
||||
}
|
||||
if (type == Type::RUN_END_ENCODED) {
|
||||
return !internal::IsNullRunEndEncoded(*this, i);
|
||||
}
|
||||
return null_count.load() != length;
|
||||
}
|
||||
|
||||
/// \brief Access a buffer's data as a typed C pointer
|
||||
///
|
||||
/// \param i the buffer index
|
||||
/// \param absolute_offset the offset into the buffer
|
||||
///
|
||||
/// If `absolute_offset` is non-zero, the type `T` must match the
|
||||
/// layout of buffer number `i` for the array's data type; otherwise
|
||||
/// offset computation would be incorrect.
|
||||
///
|
||||
/// If the given buffer is bit-packed (such as a validity bitmap, or
|
||||
/// the data buffer of a boolean array), then `absolute_offset` must be
|
||||
/// zero for correct results, and any bit offset must be applied manually
|
||||
/// by the caller.
|
||||
template <typename T>
|
||||
inline const T* GetValues(int i, int64_t absolute_offset) const {
|
||||
if (buffers[i]) {
|
||||
return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
|
||||
} else {
|
||||
return NULLPTR;
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Access a buffer's data as a typed C pointer
|
||||
///
|
||||
/// \param i the buffer index
|
||||
///
|
||||
/// This method uses the array's offset to index into buffer number `i`.
|
||||
///
|
||||
/// Calling this method on a bit-packed buffer (such as a validity bitmap, or
|
||||
/// the data buffer of a boolean array) will lead to incorrect results.
|
||||
/// You should instead call `GetValues(i, 0)` and apply the bit offset manually.
|
||||
template <typename T>
|
||||
inline const T* GetValues(int i) const {
|
||||
return GetValues<T>(i, offset);
|
||||
}
|
||||
|
||||
/// \brief Access a buffer's data as a typed C pointer
|
||||
///
|
||||
/// \param i the buffer index
|
||||
/// \param absolute_offset the offset into the buffer
|
||||
///
|
||||
/// Like `GetValues(i, absolute_offset)`, but returns nullptr if the given buffer
|
||||
/// is not a CPU buffer.
|
||||
template <typename T>
|
||||
inline const T* GetValuesSafe(int i, int64_t absolute_offset) const {
|
||||
if (buffers[i] && buffers[i]->is_cpu()) {
|
||||
return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
|
||||
} else {
|
||||
return NULLPTR;
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Access a buffer's data as a typed C pointer
|
||||
///
|
||||
/// \param i the buffer index
|
||||
///
|
||||
/// Like `GetValues(i)`, but returns nullptr if the given buffer is not a CPU buffer.
|
||||
template <typename T>
|
||||
inline const T* GetValuesSafe(int i) const {
|
||||
return GetValuesSafe<T>(i, offset);
|
||||
}
|
||||
|
||||
/// \brief Access a buffer's data as a mutable typed C pointer
|
||||
///
|
||||
/// \param i the buffer index
|
||||
/// \param absolute_offset the offset into the buffer
|
||||
///
|
||||
/// Like `GetValues(i, absolute_offset)`, but allows mutating buffer contents.
|
||||
/// This should only be used when initially populating the ArrayData, before
|
||||
/// it is attached to an Array instance.
|
||||
template <typename T>
|
||||
inline T* GetMutableValues(int i, int64_t absolute_offset) {
|
||||
if (buffers[i]) {
|
||||
return reinterpret_cast<T*>(buffers[i]->mutable_data()) + absolute_offset;
|
||||
} else {
|
||||
return NULLPTR;
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Access a buffer's data as a mutable typed C pointer
|
||||
///
|
||||
/// \param i the buffer index
|
||||
///
|
||||
/// Like `GetValues(i)`, but allows mutating buffer contents.
|
||||
/// This should only be used when initially populating the ArrayData, before
|
||||
/// it is attached to an Array instance.
|
||||
template <typename T>
|
||||
inline T* GetMutableValues(int i) {
|
||||
return GetMutableValues<T>(i, offset);
|
||||
}
|
||||
|
||||
/// \brief Construct a zero-copy slice of the data with the given offset and length
|
||||
///
|
||||
/// This method applies the given slice to this ArrayData, taking into account
|
||||
/// its existing offset and length.
|
||||
/// If the given `length` is too large, the slice length is clamped so as not
|
||||
/// to go past the offset end.
|
||||
/// If the given `offset` is too large, or if either `offset` or `length` is negative,
|
||||
/// behavior is undefined.
|
||||
///
|
||||
/// The associated ArrayStatistics is always discarded in a sliced
|
||||
/// ArrayData, even if the slice is trivially equal to the original ArrayData.
|
||||
/// If you want to reuse the statistics from the original ArrayData, you must
|
||||
/// explicitly reattach them.
|
||||
std::shared_ptr<ArrayData> Slice(int64_t offset, int64_t length) const;
|
||||
|
||||
/// \brief Construct a zero-copy slice of the data with the given offset and length
|
||||
///
|
||||
/// Like `Slice(offset, length)`, but returns an error if the requested slice
|
||||
/// falls out of bounds.
|
||||
/// Unlike Slice, `length` isn't clamped to the available buffer size.
|
||||
Result<std::shared_ptr<ArrayData>> SliceSafe(int64_t offset, int64_t length) const;
|
||||
|
||||
/// \brief Set the cached physical null count
|
||||
///
|
||||
/// \param v the number of nulls in the ArrayData
|
||||
///
|
||||
/// This should only be used when initially populating the ArrayData, if
|
||||
/// it is possible to compute the null count without visiting the entire validity
|
||||
/// bitmap. In most cases, relying on `GetNullCount` is sufficient.
|
||||
void SetNullCount(int64_t v) {
  // Assigning to a std::atomic performs the same sequentially-consistent
  // store as calling store(v).
  null_count = v;
}
|
||||
|
||||
/// \brief Return the physical null count
|
||||
///
|
||||
/// This method returns the number of array elements for which `IsValid` would
|
||||
/// return false.
|
||||
///
|
||||
/// A cached value is returned if already available, otherwise it is first
|
||||
/// computed and stored.
|
||||
/// How it is computed depends on the data type, see `IsValid` for details.
|
||||
///
|
||||
/// Note that this method is typically much faster than calling `IsValid`
|
||||
/// for all elements. Therefore, it helps avoid per-element validity bitmap
|
||||
/// lookups in the common cases where the array contains zero or only nulls.
|
||||
int64_t GetNullCount() const;
|
||||
|
||||
/// \brief Return true if the array may have nulls in its validity bitmap
|
||||
///
|
||||
/// This method returns true if the data has a validity bitmap, and the physical
|
||||
/// null count is either known to be non-zero or not yet known.
|
||||
///
|
||||
/// Unlike `MayHaveLogicalNulls`, this does not check for the presence of nulls
|
||||
/// in child data for data types such as unions and run-end encoded types.
|
||||
///
|
||||
/// \see HasValidityBitmap
|
||||
/// \see MayHaveLogicalNulls
|
||||
bool MayHaveNulls() const {
|
||||
// If an ArrayData is slightly malformed it may have kUnknownNullCount set
|
||||
// but no buffer
|
||||
return null_count.load() != 0 && buffers[0] != NULLPTR;
|
||||
}
|
||||
|
||||
/// \brief Return true if the array has a validity bitmap
|
||||
bool HasValidityBitmap() const {
  // The validity bitmap is buffer 0; it is present iff that pointer is set.
  return static_cast<bool>(buffers[0]);
}
|
||||
|
||||
/// \brief Return true if the array may have logical nulls
|
||||
///
|
||||
/// Unlike `MayHaveNulls`, this method checks for null child values
|
||||
/// for types without a validity bitmap, such as unions and run-end encoded
|
||||
/// types, and for null dictionary values for dictionary types.
|
||||
///
|
||||
/// This implies that `MayHaveLogicalNulls` may return true for arrays that
|
||||
/// don't have a top-level validity bitmap. It is therefore necessary
|
||||
/// to call `HasValidityBitmap` before accessing a top-level validity bitmap.
|
||||
///
|
||||
/// Code that previously used MayHaveNulls and then dealt with the validity
|
||||
/// bitmap directly can be fixed to handle all types correctly without
|
||||
/// performance degradation when handling most types by adopting
|
||||
/// HasValidityBitmap and MayHaveLogicalNulls.
|
||||
///
|
||||
/// Before:
|
||||
///
|
||||
/// uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
|
||||
/// for (int64_t i = 0; i < array.length; ++i) {
|
||||
/// if (validity && !bit_util::GetBit(validity, i)) {
|
||||
/// continue; // skip a NULL
|
||||
/// }
|
||||
/// ...
|
||||
/// }
|
||||
///
|
||||
/// After:
|
||||
///
|
||||
/// bool all_valid = !array.MayHaveLogicalNulls();
|
||||
/// uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR;
|
||||
/// for (int64_t i = 0; i < array.length; ++i) {
|
||||
/// bool is_valid = all_valid ||
|
||||
/// (validity && bit_util::GetBit(validity, i)) ||
|
||||
/// array.IsValid(i);
|
||||
/// if (!is_valid) {
|
||||
/// continue; // skip a NULL
|
||||
/// }
|
||||
/// ...
|
||||
/// }
|
||||
bool MayHaveLogicalNulls() const {
|
||||
if (buffers[0] != NULLPTR) {
|
||||
return null_count.load() != 0;
|
||||
}
|
||||
const auto t = type->id();
|
||||
if (t == Type::SPARSE_UNION || t == Type::DENSE_UNION) {
|
||||
return internal::UnionMayHaveLogicalNulls(*this);
|
||||
}
|
||||
if (t == Type::RUN_END_ENCODED) {
|
||||
return internal::RunEndEncodedMayHaveLogicalNulls(*this);
|
||||
}
|
||||
if (t == Type::DICTIONARY) {
|
||||
return internal::DictionaryMayHaveLogicalNulls(*this);
|
||||
}
|
||||
return null_count.load() != 0;
|
||||
}
|
||||
|
||||
/// \brief Compute the logical null count for arrays of all types
|
||||
///
|
||||
/// If the array has a validity bitmap, this function behaves the same as
|
||||
/// GetNullCount. For arrays that have no validity bitmap but whose values
|
||||
/// may be logically null (such as union arrays and run-end encoded arrays),
|
||||
/// this function recomputes the null count every time it is called.
|
||||
///
|
||||
/// \see GetNullCount
|
||||
int64_t ComputeLogicalNullCount() const;
|
||||
|
||||
/// \brief Return the device_type of the underlying buffers and children
|
||||
///
|
||||
/// If there are no buffers in this ArrayData object, it just returns
|
||||
/// DeviceAllocationType::kCPU as a default. We also assume that all buffers
|
||||
/// should be allocated on the same device type and perform DCHECKs to confirm
|
||||
/// this in debug mode.
|
||||
///
|
||||
/// \return DeviceAllocationType
|
||||
DeviceAllocationType device_type() const;
|
||||
|
||||
std::shared_ptr<DataType> type;
|
||||
int64_t length = 0;
|
||||
mutable std::atomic<int64_t> null_count{0};
|
||||
// The logical start point into the physical buffers (in values, not bytes).
|
||||
// Note that, for child data, this must be *added* to the child data's own offset.
|
||||
int64_t offset = 0;
|
||||
std::vector<std::shared_ptr<Buffer>> buffers;
|
||||
std::vector<std::shared_ptr<ArrayData>> child_data;
|
||||
|
||||
// The dictionary for this Array, if any. Only used for dictionary type
|
||||
std::shared_ptr<ArrayData> dictionary;
|
||||
|
||||
// The statistics for this Array.
|
||||
std::shared_ptr<ArrayStatistics> statistics;
|
||||
};
|
||||
|
||||
/// \brief A non-owning Buffer reference
|
||||
struct ARROW_EXPORT BufferSpan {
|
||||
// It is the user of this class's responsibility to ensure that
|
||||
// buffers that were const originally are not written to
|
||||
// accidentally.
|
||||
uint8_t* data = NULLPTR;
|
||||
int64_t size = 0;
|
||||
// Pointer back to buffer that owns this memory
|
||||
const std::shared_ptr<Buffer>* owner = NULLPTR;
|
||||
|
||||
template <typename T>
|
||||
const T* data_as() const {
|
||||
return reinterpret_cast<const T*>(data);
|
||||
}
|
||||
template <typename T>
|
||||
T* mutable_data_as() {
|
||||
return reinterpret_cast<T*>(data);
|
||||
}
|
||||
};
|
||||
|
||||
/// \brief EXPERIMENTAL: A non-owning array data container
|
||||
///
|
||||
/// Unlike ArrayData, this class doesn't own its referenced data type nor data buffers.
|
||||
/// It is cheaply copyable and can therefore be suitable for use cases where
|
||||
/// shared_ptr overhead is not acceptable. However, care should be taken to
|
||||
/// keep alive the referenced objects and memory while the ArraySpan object is in use.
|
||||
/// For this reason, this should not be exposed in most public APIs (apart from
|
||||
/// compute kernel interfaces).
|
||||
struct ARROW_EXPORT ArraySpan {
|
||||
const DataType* type = NULLPTR;
|
||||
int64_t length = 0;
|
||||
mutable int64_t null_count = kUnknownNullCount;
|
||||
int64_t offset = 0;
|
||||
BufferSpan buffers[3];
|
||||
|
||||
ArraySpan() = default;
|
||||
|
||||
// Sets only `type` and `length`; the other members keep their in-class
// defaults.
explicit ArraySpan(const DataType* type, int64_t length)
    : type{type}, length{length} {}
|
||||
|
||||
ArraySpan(const ArrayData& data) { // NOLINT implicit conversion
|
||||
SetMembers(data);
|
||||
}
|
||||
explicit ArraySpan(const Scalar& data) { FillFromScalar(data); }
|
||||
|
||||
/// If dictionary-encoded, put dictionary in the first entry
|
||||
std::vector<ArraySpan> child_data;
|
||||
|
||||
/// \brief Populate ArraySpan to look like an array of length 1 pointing at
|
||||
/// the data members of a Scalar value
|
||||
void FillFromScalar(const Scalar& value);
|
||||
|
||||
void SetMembers(const ArrayData& data);
|
||||
|
||||
void SetBuffer(int index, const std::shared_ptr<Buffer>& buffer) {
|
||||
this->buffers[index].data = const_cast<uint8_t*>(buffer->data());
|
||||
this->buffers[index].size = buffer->size();
|
||||
this->buffers[index].owner = &buffer;
|
||||
}
|
||||
|
||||
const ArraySpan& dictionary() const {
  // The dictionary is stored as the first child entry.
  return child_data.front();
}
|
||||
|
||||
/// \brief Return the number of buffers (out of 3) that are used to
|
||||
/// constitute this array
|
||||
int num_buffers() const;
|
||||
|
||||
// Access a buffer's data as a typed C pointer
|
||||
template <typename T>
|
||||
inline T* GetValues(int i, int64_t absolute_offset) {
|
||||
return reinterpret_cast<T*>(buffers[i].data) + absolute_offset;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline T* GetValues(int i) {
|
||||
return GetValues<T>(i, this->offset);
|
||||
}
|
||||
|
||||
// Access a buffer's data as a typed C pointer
|
||||
template <typename T>
|
||||
inline const T* GetValues(int i, int64_t absolute_offset) const {
|
||||
return reinterpret_cast<const T*>(buffers[i].data) + absolute_offset;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline const T* GetValues(int i) const {
|
||||
return GetValues<T>(i, this->offset);
|
||||
}
|
||||
|
||||
/// \brief Access a buffer's data as a span
|
||||
///
|
||||
/// \param i The buffer index
|
||||
/// \param length The required length (in number of typed values) of the requested span
|
||||
/// \pre i > 0
|
||||
/// \pre length <= the length of the buffer (in number of values) that's expected for
|
||||
/// this array type
|
||||
/// \return A span<const T> of the requested length
|
||||
template <typename T>
|
||||
util::span<const T> GetSpan(int i, int64_t length) const {
|
||||
const int64_t buffer_length = buffers[i].size / static_cast<int64_t>(sizeof(T));
|
||||
assert(i > 0 && length + offset <= buffer_length);
|
||||
ARROW_UNUSED(buffer_length);
|
||||
return util::span<const T>(buffers[i].data_as<T>() + this->offset, length);
|
||||
}
|
||||
|
||||
/// \brief Access a buffer's data as a span
|
||||
///
|
||||
/// \param i The buffer index
|
||||
/// \param length The required length (in number of typed values) of the requested span
|
||||
/// \pre i > 0
|
||||
/// \pre length <= the length of the buffer (in number of values) that's expected for
|
||||
/// this array type
|
||||
/// \return A span<T> of the requested length
|
||||
template <typename T>
|
||||
util::span<T> GetSpan(int i, int64_t length) {
|
||||
const int64_t buffer_length = buffers[i].size / static_cast<int64_t>(sizeof(T));
|
||||
assert(i > 0 && length + offset <= buffer_length);
|
||||
ARROW_UNUSED(buffer_length);
|
||||
return util::span<T>(buffers[i].mutable_data_as<T>() + this->offset, length);
|
||||
}
|
||||
|
||||
inline bool IsNull(int64_t i) const { return !IsValid(i); }
|
||||
|
||||
inline bool IsValid(int64_t i) const {
|
||||
if (this->buffers[0].data != NULLPTR) {
|
||||
return bit_util::GetBit(this->buffers[0].data, i + this->offset);
|
||||
} else {
|
||||
const auto type = this->type->id();
|
||||
if (type == Type::SPARSE_UNION) {
|
||||
return !IsNullSparseUnion(i);
|
||||
}
|
||||
if (type == Type::DENSE_UNION) {
|
||||
return !IsNullDenseUnion(i);
|
||||
}
|
||||
if (type == Type::RUN_END_ENCODED) {
|
||||
return !IsNullRunEndEncoded(i);
|
||||
}
|
||||
return this->null_count != this->length;
|
||||
}
|
||||
}
|
||||
|
||||
std::shared_ptr<ArrayData> ToArrayData() const;
|
||||
|
||||
std::shared_ptr<Array> ToArray() const;
|
||||
|
||||
std::shared_ptr<Buffer> GetBuffer(int index) const {
|
||||
const BufferSpan& buf = this->buffers[index];
|
||||
if (buf.owner) {
|
||||
return *buf.owner;
|
||||
} else if (buf.data != NULLPTR) {
|
||||
// Buffer points to some memory without an owning buffer
|
||||
return std::make_shared<Buffer>(buf.data, buf.size);
|
||||
} else {
|
||||
return NULLPTR;
|
||||
}
|
||||
}
|
||||
|
||||
void SetSlice(int64_t offset, int64_t length) {
|
||||
this->offset = offset;
|
||||
this->length = length;
|
||||
if (this->type->id() == Type::NA) {
|
||||
this->null_count = this->length;
|
||||
} else if (this->MayHaveNulls()) {
|
||||
this->null_count = kUnknownNullCount;
|
||||
} else {
|
||||
this->null_count = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Return physical null count, or compute and set it if it's not known
|
||||
int64_t GetNullCount() const;
|
||||
|
||||
/// \brief Return true if the array has a validity bitmap and the physical null
|
||||
/// count is known to be non-zero or not yet known
|
||||
///
|
||||
/// Note that this is not the same as MayHaveLogicalNulls, which also checks
|
||||
/// for the presence of nulls in child data for types like unions and run-end
|
||||
/// encoded types.
|
||||
///
|
||||
/// \see HasValidityBitmap
|
||||
/// \see MayHaveLogicalNulls
|
||||
bool MayHaveNulls() const {
|
||||
// If an ArrayData is slightly malformed it may have kUnknownNullCount set
|
||||
// but no buffer
|
||||
return null_count != 0 && buffers[0].data != NULLPTR;
|
||||
}
|
||||
|
||||
/// \brief Return true if the array has a validity bitmap
|
||||
bool HasValidityBitmap() const {
  // The validity bitmap is buffer 0; it is present iff its data pointer is set.
  const uint8_t* bitmap = buffers[0].data;
  return bitmap != NULLPTR;
}
|
||||
|
||||
/// \brief Return true if the validity bitmap may have 0's in it, or if the
|
||||
/// child arrays (in the case of types without a validity bitmap) may have
|
||||
/// nulls, or if the dictionary of a dictionary array may have nulls.
|
||||
///
|
||||
/// \see ArrayData::MayHaveLogicalNulls
|
||||
bool MayHaveLogicalNulls() const {
|
||||
if (buffers[0].data != NULLPTR) {
|
||||
return null_count != 0;
|
||||
}
|
||||
const auto t = type->id();
|
||||
if (t == Type::SPARSE_UNION || t == Type::DENSE_UNION) {
|
||||
return UnionMayHaveLogicalNulls();
|
||||
}
|
||||
if (t == Type::RUN_END_ENCODED) {
|
||||
return RunEndEncodedMayHaveLogicalNulls();
|
||||
}
|
||||
if (t == Type::DICTIONARY) {
|
||||
return DictionaryMayHaveLogicalNulls();
|
||||
}
|
||||
return null_count != 0;
|
||||
}
|
||||
|
||||
/// \brief Compute the logical null count for arrays of all types including
|
||||
/// those that do not have a validity bitmap like union and run-end encoded
|
||||
/// arrays
|
||||
///
|
||||
/// If the array has a validity bitmap, this function behaves the same as
|
||||
/// GetNullCount. For types that have no validity bitmap, this function will
|
||||
/// recompute the logical null count every time it is called.
|
||||
///
|
||||
/// \see GetNullCount
|
||||
int64_t ComputeLogicalNullCount() const;
|
||||
|
||||
/// Some DataTypes (StringView, BinaryView) may have an arbitrary number of variadic
|
||||
/// buffers. Since ArraySpan only has 3 buffers, we pack the variadic buffers into
|
||||
/// buffers[2]; IE buffers[2].data points to the first shared_ptr<Buffer> of the
|
||||
/// variadic set and buffers[2].size is the number of variadic buffers times
|
||||
/// sizeof(shared_ptr<Buffer>).
|
||||
///
|
||||
/// \see HasVariadicBuffers
|
||||
util::span<const std::shared_ptr<Buffer>> GetVariadicBuffers() const;
|
||||
bool HasVariadicBuffers() const;
|
||||
|
||||
private:
|
||||
ARROW_FRIEND_EXPORT friend bool internal::IsNullRunEndEncoded(const ArrayData& data,
|
||||
int64_t i);
|
||||
|
||||
bool IsNullSparseUnion(int64_t i) const;
|
||||
bool IsNullDenseUnion(int64_t i) const;
|
||||
|
||||
/// \brief Return true if the value at logical index i is null
|
||||
///
|
||||
/// This function uses binary-search, so it has a O(log N) cost.
|
||||
/// Iterating over the whole array and calling IsNull is O(N log N), so
|
||||
/// for better performance it is recommended to use a
|
||||
/// ree_util::RunEndEncodedArraySpan to iterate run by run instead.
|
||||
bool IsNullRunEndEncoded(int64_t i) const;
|
||||
|
||||
bool UnionMayHaveLogicalNulls() const;
|
||||
bool RunEndEncodedMayHaveLogicalNulls() const;
|
||||
bool DictionaryMayHaveLogicalNulls() const;
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
void FillZeroLengthArray(const DataType* type, ArraySpan* span);
|
||||
|
||||
/// Construct a zero-copy view of this ArrayData with the given type.
|
||||
///
|
||||
/// This method checks if the types are layout-compatible.
|
||||
/// Nested types are traversed in depth-first order. Data buffers must have
|
||||
/// the same item sizes, even though the logical types may be different.
|
||||
/// An error is returned if the types are not layout-compatible.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<ArrayData>> GetArrayView(const std::shared_ptr<ArrayData>& data,
|
||||
const std::shared_ptr<DataType>& type);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,76 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <iosfwd>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/array_nested.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \brief Compare two arrays, returning an edit script which expresses the difference
|
||||
/// between them
|
||||
///
|
||||
/// An edit script is an array of struct(insert: bool, run_length: int64_t).
|
||||
/// Each element of "insert" determines whether an element was inserted into (true)
|
||||
/// or deleted from (false) base. Each insertion or deletion is followed by a run of
|
||||
/// elements which are unchanged from base to target; the length of this run is stored
|
||||
/// in "run_length". (Note that the edit script begins and ends with a run of shared
|
||||
/// elements but both fields of the struct must have the same length. To accommodate this
|
||||
/// the first element of "insert" should be ignored.)
|
||||
///
|
||||
/// For example for base "hlloo" and target "hello", the edit script would be
|
||||
/// [
|
||||
/// {"insert": false, "run_length": 1}, // leading run of length 1 ("h")
|
||||
/// {"insert": true, "run_length": 3}, // insert("e") then a run of length 3 ("llo")
|
||||
/// {"insert": false, "run_length": 0} // delete("o") then an empty run
|
||||
/// ]
|
||||
///
|
||||
/// Diffing arrays containing nulls is not currently supported.
|
||||
///
|
||||
/// \param[in] base baseline for comparison
|
||||
/// \param[in] target an array of identical type to base whose elements differ from base's
|
||||
/// \param[in] pool memory to store the result will be allocated from this memory pool
|
||||
/// \return an edit script array which can be applied to base to produce target
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief visitor interface for easy traversal of an edit script
|
||||
///
|
||||
/// visitor will be called for each hunk of insertions and deletions.
|
||||
ARROW_EXPORT Status VisitEditScript(
|
||||
const Array& edits,
|
||||
const std::function<Status(int64_t delete_begin, int64_t delete_end,
|
||||
int64_t insert_begin, int64_t insert_end)>& visitor);
|
||||
|
||||
/// \brief return a function which will format an edit script in unified
|
||||
/// diff format to os, given base and target arrays of type
|
||||
ARROW_EXPORT Result<
|
||||
std::function<Status(const Array& edits, const Array& base, const Array& target)>>
|
||||
MakeUnifiedDiffFormatter(const DataType& type, std::ostream* os);
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,167 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <variant>
|
||||
|
||||
#include "arrow/compare.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \class ArrayStatistics
|
||||
/// \brief Statistics for an Array
|
||||
///
|
||||
/// The Apache Arrow format doesn't have statistics, but data sources such
|
||||
/// as Apache Parquet may have statistics. Statistics associated with
|
||||
/// a data source can be read through a unified API via this class.
|
||||
struct ARROW_EXPORT ArrayStatistics {
|
||||
/// \brief The type for maximum and minimum values. If the target
|
||||
/// value exists, one of them is used. `std::nullopt` is used
|
||||
/// otherwise.
|
||||
using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>;
|
||||
using NumericType = std::variant<int64_t, double>;
|
||||
using CountType = NumericType;
|
||||
using SizeType = NumericType;
|
||||
|
||||
static const std::shared_ptr<DataType>& ValueToArrowType(
|
||||
const std::optional<ValueType>& value,
|
||||
const std::shared_ptr<DataType>& array_type) {
|
||||
if (!value.has_value()) {
|
||||
return null();
|
||||
}
|
||||
|
||||
struct Visitor {
|
||||
const std::shared_ptr<DataType>& array_type;
|
||||
|
||||
const std::shared_ptr<DataType>& operator()(const bool&) { return boolean(); }
|
||||
const std::shared_ptr<DataType>& operator()(const int64_t&) { return int64(); }
|
||||
const std::shared_ptr<DataType>& operator()(const uint64_t&) { return uint64(); }
|
||||
const std::shared_ptr<DataType>& operator()(const double&) { return float64(); }
|
||||
const std::shared_ptr<DataType>& operator()(const std::string&) {
|
||||
switch (array_type->id()) {
|
||||
case Type::STRING:
|
||||
case Type::BINARY:
|
||||
case Type::FIXED_SIZE_BINARY:
|
||||
case Type::LARGE_STRING:
|
||||
case Type::LARGE_BINARY:
|
||||
case Type::BINARY_VIEW:
|
||||
case Type::STRING_VIEW:
|
||||
return array_type;
|
||||
default:
|
||||
return utf8();
|
||||
}
|
||||
}
|
||||
} visitor{array_type};
|
||||
return std::visit(visitor, value.value());
|
||||
}
|
||||
|
||||
/// \brief The number of null values, may not be set
|
||||
std::optional<int64_t> null_count = std::nullopt;
|
||||
|
||||
/// \brief The number of distinct values, may not be set
|
||||
/// Note: when set to `int64_t`, it represents `exact_distinct_count`,
|
||||
/// and when set to `double`, it represents `approximate_distinct_count`.
|
||||
std::optional<CountType> distinct_count = std::nullopt;
|
||||
|
||||
/// \brief The maximum length in bytes of the rows in an array; may not be set
|
||||
/// Note: when the type is `int64_t`, it represents `max_byte_width_exact`,
|
||||
/// and when the type is `double`, it represents `max_byte_width_approximate`.
|
||||
std::optional<SizeType> max_byte_width = std::nullopt;
|
||||
|
||||
/// \brief The average size in bytes of a row in an array, may not be set.
|
||||
std::optional<double> average_byte_width = std::nullopt;
|
||||
|
||||
/// \brief Whether the average size in bytes is exact or not.
|
||||
bool is_average_byte_width_exact = false;
|
||||
|
||||
/// \brief The minimum value, may not be set
|
||||
std::optional<ValueType> min = std::nullopt;
|
||||
|
||||
/// \brief Compute Arrow type of the minimum value.
|
||||
///
|
||||
/// If \ref ValueType is `std::string`, `array_type` may be
|
||||
/// used. If `array_type` is a binary-like type such as \ref
|
||||
/// arrow::binary and \ref arrow::large_utf8, `array_type` is
|
||||
/// returned. \ref arrow::utf8 is returned otherwise.
|
||||
///
|
||||
/// If \ref ValueType isn't `std::string`, `array_type` isn't used.
|
||||
///
|
||||
/// \param array_type The Arrow type of the associated array.
|
||||
///
|
||||
/// \return \ref arrow::null if the minimum value is `std::nullopt`,
|
||||
/// Arrow type based on \ref ValueType of the \ref min
|
||||
/// otherwise.
|
||||
const std::shared_ptr<DataType>& MinArrowType(
|
||||
const std::shared_ptr<DataType>& array_type) {
|
||||
return ValueToArrowType(min, array_type);
|
||||
}
|
||||
|
||||
/// \brief Whether the minimum value is exact or not
|
||||
bool is_min_exact = false;
|
||||
|
||||
/// \brief The maximum value, may not be set
|
||||
std::optional<ValueType> max = std::nullopt;
|
||||
|
||||
/// \brief Compute Arrow type of the maximum value.
|
||||
///
|
||||
/// If \ref ValueType is `std::string`, `array_type` may be
|
||||
/// used. If `array_type` is a binary-like type such as \ref
|
||||
/// arrow::binary and \ref arrow::large_utf8, `array_type` is
|
||||
/// returned. \ref arrow::utf8 is returned otherwise.
|
||||
///
|
||||
/// If \ref ValueType isn't `std::string`, `array_type` isn't used.
|
||||
///
|
||||
/// \param array_type The Arrow type of the associated array.
|
||||
///
|
||||
/// \return \ref arrow::null if the maximum value is `std::nullopt`,
|
||||
/// Arrow type based on \ref ValueType of the \ref max
|
||||
/// otherwise.
|
||||
const std::shared_ptr<DataType>& MaxArrowType(
|
||||
const std::shared_ptr<DataType>& array_type) {
|
||||
return ValueToArrowType(max, array_type);
|
||||
}
|
||||
|
||||
/// \brief Whether the maximum value is exact or not
|
||||
bool is_max_exact = false;
|
||||
|
||||
/// \brief Check two \ref arrow::ArrayStatistics for equality
|
||||
///
|
||||
/// \param other The \ref arrow::ArrayStatistics instance to compare against.
|
||||
///
|
||||
/// \param equal_options Options used to compare double values for equality.
|
||||
///
|
||||
/// \return True if the two \ref arrow::ArrayStatistics instances are equal; otherwise,
|
||||
/// false.
|
||||
bool Equals(const ArrayStatistics& other,
|
||||
const EqualOptions& equal_options = EqualOptions::Defaults()) const {
|
||||
return ArrayStatisticsEquals(*this, other, equal_options);
|
||||
}
|
||||
|
||||
/// \brief Check two statistics for equality
|
||||
bool operator==(const ArrayStatistics& other) const { return Equals(other); }
|
||||
|
||||
/// \brief Check two statistics for not equality
|
||||
bool operator!=(const ArrayStatistics& other) const { return !Equals(other); }
|
||||
};
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,96 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/compare.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \defgroup array-factories Array factory functions
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Create a strongly-typed Array instance from generic ArrayData
///
/// The Array is returned directly rather than wrapped in a Result, so this
/// declaration offers no Status channel for reporting a failure.
|
||||
/// \param[in] data the array contents
|
||||
/// \return the resulting Array instance
|
||||
ARROW_EXPORT
|
||||
std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
/// \brief Create a strongly-typed Array instance with all elements null
|
||||
/// \param[in] type the array type
|
||||
/// \param[in] length the array length
|
||||
/// \param[in] pool the memory pool to allocate memory from
/// (defaults to default_memory_pool())
/// \return the resulting Array instance, wrapped in a Result
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> MakeArrayOfNull(const std::shared_ptr<DataType>& type,
|
||||
int64_t length,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Create an Array instance whose slots are the given scalar
|
||||
/// \param[in] scalar the value with which to fill the array
|
||||
/// \param[in] length the array length
|
||||
/// \param[in] pool the memory pool to allocate memory from
/// (defaults to default_memory_pool())
/// \return the resulting Array instance, wrapped in a Result
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> MakeArrayFromScalar(
|
||||
const Scalar& scalar, int64_t length, MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Create an empty Array of a given type
|
||||
///
|
||||
/// The output Array will be of the given type.
|
||||
///
|
||||
/// \param[in] type the data type of the empty Array (taken by value)
|
||||
/// \param[in] pool the memory pool to allocate memory from
/// (defaults to default_memory_pool())
|
||||
/// \return the resulting Array, wrapped in a Result
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> MakeEmptyArray(std::shared_ptr<DataType> type,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// @}
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief Swap the endianness of each element in a generic ArrayData
|
||||
///
|
||||
/// As dictionaries are often shared between different arrays, dictionaries
|
||||
/// are not swapped by this function and should be handled separately.
|
||||
///
|
||||
/// \param[in] data the array contents
|
||||
/// \param[in] pool the memory pool to allocate memory from
/// (defaults to default_memory_pool())
|
||||
/// \return the resulting ArrayData whose elements were swapped, wrapped in a
/// Result
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
|
||||
const std::shared_ptr<ArrayData>& data, MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Rechunk several ArrayVectors so that they share the same chunking
///
/// Given a number of ArrayVectors, treat each ArrayVector as the
|
||||
/// chunks of a chunked array. Then rechunk each ArrayVector such that
|
||||
/// all ArrayVectors are chunked identically. It is mandatory that
|
||||
/// all ArrayVectors contain the same total number of elements.
///
/// \return the rechunked ArrayVectors
/// NOTE(review): presumably one output ArrayVector per input, in the same
/// order - confirm against the implementation.
|
||||
ARROW_EXPORT
|
||||
std::vector<ArrayVector> RechunkArraysConsistently(const std::vector<ArrayVector>&);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,56 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
// Internal functions implementing Array::Validate() and friends.
|
||||
|
||||
// O(1) array metadata validation
// Two overloads are provided, one taking an Array and one taking an
// ArrayData; both report the outcome as a Status.
|
||||
|
||||
ARROW_EXPORT
|
||||
Status ValidateArray(const Array& array);
|
||||
|
||||
ARROW_EXPORT
|
||||
Status ValidateArray(const ArrayData& data);
|
||||
|
||||
// O(N) array data validation.
|
||||
// Note that, starting from 7.0.0, "full" routines also validate metadata.
|
||||
// Before, ValidateArray() needed to be called before ValidateArrayFull()
|
||||
// to ensure metadata correctness, otherwise invalid memory accesses
|
||||
// may occur.
// Two overloads are provided, one taking an Array and one taking an
// ArrayData; both report the outcome as a Status.
|
||||
|
||||
ARROW_EXPORT
|
||||
Status ValidateArrayFull(const Array& array);
|
||||
|
||||
ARROW_EXPORT
|
||||
Status ValidateArrayFull(const ArrayData& data);
|
||||
|
||||
// UTF-8 validation of an Array or an ArrayData, reported as a Status.
// NOTE(review): which array types are accepted and the cost of the check
// are not visible from these declarations - confirm against the
// implementation.
ARROW_EXPORT
|
||||
Status ValidateUTF8(const Array& array);
|
||||
|
||||
ARROW_EXPORT
|
||||
Status ValidateUTF8(const ArrayData& data);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
Reference in New Issue
Block a user