Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,323 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <iosfwd>
#include <memory>
#include <string>
#include <vector>
#include "arrow/array/data.h"
#include "arrow/buffer.h"
#include "arrow/compare.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
#include "arrow/visitor.h"
namespace arrow {
// ----------------------------------------------------------------------
// User array accessor types
/// \brief Array base type
/// Immutable data array with some logical type and some length.
///
/// Any memory is owned by the respective Buffer instance (or its parents).
///
/// The base class is only required to have a null bitmap buffer if the null
/// count is greater than 0
///
/// If known, the null count can be provided in the base Array constructor. If
/// the null count is not known, pass -1 to indicate that the null count is to
/// be computed on the first call to null_count()
class ARROW_EXPORT Array {
public:
virtual ~Array() = default;
/// \brief Return true if value at index is null. Does not boundscheck
bool IsNull(int64_t i) const { return !IsValid(i); }
/// \brief Return true if value at index is valid (not null). Does not
/// boundscheck
bool IsValid(int64_t i) const {
// Fast path: a validity bitmap pointer is cached, so test the bit for
// logical index i shifted by the slice offset of the underlying ArrayData.
if (null_bitmap_data_ != NULLPTR) {
return bit_util::GetBit(null_bitmap_data_, i + data_->offset);
}
// Dispatching with a few conditionals like this makes IsNull more
// efficient for how it is used in practice. Making IsNull virtual
// would add a vtable lookup to every call and prevent inlining +
// a potential inner-branch removal.
if (type_id() == Type::SPARSE_UNION) {
return !internal::IsNullSparseUnion(*data_, i);
}
if (type_id() == Type::DENSE_UNION) {
return !internal::IsNullDenseUnion(*data_, i);
}
if (type_id() == Type::RUN_END_ENCODED) {
return !internal::IsNullRunEndEncoded(*data_, i);
}
// No cached bitmap and none of the types handled above: the answer no
// longer depends on i. Slots are reported valid unless null_count equals
// length (i.e. every slot is null).
return data_->null_count != data_->length;
}
/// \brief Return a Scalar containing the value of this array at i
Result<std::shared_ptr<Scalar>> GetScalar(int64_t i) const;
/// Size in the number of elements this array contains.
int64_t length() const { return data_->length; }
/// A relative position into another array's data, to enable zero-copy
/// slicing. This value defaults to zero
int64_t offset() const { return data_->offset; }
/// The number of null entries in the array. If the null count was not known
/// at time of construction (and set to a negative value), then the null
/// count will be computed and cached on the first invocation of this
/// function
int64_t null_count() const;
/// \brief Computes the logical null count for arrays of all types including
/// those that do not have a validity bitmap like union and run-end encoded
/// arrays
///
/// If the array has a validity bitmap, this function behaves the same as
/// null_count(). For types that have no validity bitmap, this function will
/// recompute the null count every time it is called.
///
/// \see GetNullCount
int64_t ComputeLogicalNullCount() const;
/// \brief The logical type of this array, as stored in the underlying ArrayData
const std::shared_ptr<DataType>& type() const { return data_->type; }
/// \brief The type id of this array's logical type (shorthand for type()->id())
Type::type type_id() const { return data_->type->id(); }
/// Buffer for the validity (null) bitmap, if any. Note that Union types
/// never have a null bitmap.
///
/// Note that for `null_count == 0` or for null type, this will be null.
/// This buffer does not account for any slice offset
const std::shared_ptr<Buffer>& null_bitmap() const { return data_->buffers[0]; }
/// Raw pointer to the null bitmap.
///
/// Note that for `null_count == 0` or for null type, this will be null.
/// This buffer does not account for any slice offset
const uint8_t* null_bitmap_data() const { return null_bitmap_data_; }
/// Equality comparison with another array
///
/// Note that arrow::ArrayStatistics is not included in the comparison.
bool Equals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const;
/// Overload of Equals taking the other array as a shared_ptr
bool Equals(const std::shared_ptr<Array>& arr,
const EqualOptions& = EqualOptions::Defaults()) const;
/// \brief Return the formatted unified diff of arrow::Diff between this
/// Array and another Array
std::string Diff(const Array& other) const;
/// Approximate equality comparison with another array
///
/// epsilon is only used if this is FloatArray or DoubleArray
///
/// Note that arrow::ArrayStatistics is not included in the comparison.
bool ApproxEquals(const std::shared_ptr<Array>& arr,
const EqualOptions& = EqualOptions::Defaults()) const;
/// Overload of ApproxEquals taking the other array by reference
bool ApproxEquals(const Array& arr,
const EqualOptions& = EqualOptions::Defaults()) const;
/// Compare if the range of slots specified are equal for the given array and
/// this array. end_idx exclusive. This method does not bounds check.
///
/// Note that arrow::ArrayStatistics is not included in the comparison.
bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
const Array& other,
const EqualOptions& = EqualOptions::Defaults()) const;
/// Overload of RangeEquals taking the other array as a shared_ptr
bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
const std::shared_ptr<Array>& other,
const EqualOptions& = EqualOptions::Defaults()) const;
/// Overload of RangeEquals with the other array passed as the first argument
bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
int64_t other_start_idx,
const EqualOptions& = EqualOptions::Defaults()) const;
/// Overload of RangeEquals with the other array passed first, as a shared_ptr
bool RangeEquals(const std::shared_ptr<Array>& other, int64_t start_idx,
int64_t end_idx, int64_t other_start_idx,
const EqualOptions& = EqualOptions::Defaults()) const;
/// \brief Apply the ArrayVisitor::Visit() method specialized to the array type
Status Accept(ArrayVisitor* visitor) const;
/// Construct a zero-copy view of this array with the given type.
///
/// This method checks if the types are layout-compatible.
/// Nested types are traversed in depth-first order. Data buffers must have
/// the same item sizes, even though the logical types may be different.
/// An error is returned if the types are not layout-compatible.
Result<std::shared_ptr<Array>> View(const std::shared_ptr<DataType>& type) const;
/// \brief Construct a copy of the array with all buffers on destination
/// Memory Manager
///
/// This method recursively copies the array's buffers and those of its children
/// onto the destination MemoryManager device and returns the new Array.
Result<std::shared_ptr<Array>> CopyTo(const std::shared_ptr<MemoryManager>& to) const;
/// \brief Construct a new array attempting to zero-copy view if possible.
///
/// Like CopyTo this method recursively goes through all of the array's buffers
/// and those of its children and first attempts to create zero-copy
/// views on the destination MemoryManager device. If it can't, it falls back
/// to performing a copy. See Buffer::ViewOrCopy.
Result<std::shared_ptr<Array>> ViewOrCopyTo(
const std::shared_ptr<MemoryManager>& to) const;
/// Construct a zero-copy slice of the array with the indicated offset and
/// length
///
/// \param[in] offset the position of the first element in the constructed
/// slice
/// \param[in] length the length of the slice. If there are not enough
/// elements in the array, the length will be adjusted accordingly
///
/// \return a new object wrapped in std::shared_ptr<Array>
std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const;
/// Slice from offset until end of the array
std::shared_ptr<Array> Slice(int64_t offset) const;
/// Input-checking variant of Array::Slice
Result<std::shared_ptr<Array>> SliceSafe(int64_t offset, int64_t length) const;
/// Input-checking variant of Array::Slice
Result<std::shared_ptr<Array>> SliceSafe(int64_t offset) const;
/// \brief The underlying ArrayData that backs this Array
const std::shared_ptr<ArrayData>& data() const { return data_; }
/// \brief The number of child ArrayData entries (child_data.size()), as an int
int num_fields() const { return static_cast<int>(data_->child_data.size()); }
/// \return PrettyPrint representation of array suitable for debugging
std::string ToString() const;
/// \brief Perform cheap validation checks to determine obvious inconsistencies
/// within the array's internal data.
///
/// This is O(k) where k is the number of descendants.
///
/// \return Status
Status Validate() const;
/// \brief Perform extensive validation checks to determine inconsistencies
/// within the array's internal data.
///
/// This is potentially O(k*n) where k is the number of descendants and n
/// is the array length.
///
/// \return Status
Status ValidateFull() const;
/// \brief Return the device_type that this array's data is allocated on
///
/// This just delegates to calling device_type on the underlying ArrayData
/// object which backs this Array.
///
/// \return DeviceAllocationType
DeviceAllocationType device_type() const { return data_->device_type(); }
/// \brief Return the statistics of this Array
///
/// This just delegates to calling statistics on the underlying ArrayData
/// object which backs this Array.
///
/// \return const std::shared_ptr<ArrayStatistics>&
const std::shared_ptr<ArrayStatistics>& statistics() const { return data_->statistics; }
protected:
// Only subclasses may default-construct; they are expected to call SetData.
Array() = default;
// NOTE(review): per the macro names, moves are defaulted here and copies are
// disabled in the private section below -- confirm in arrow/util/macros.h.
ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);
/// The ArrayData backing this array (type, length, offset, buffers, children)
std::shared_ptr<ArrayData> data_;
/// Cached raw pointer used by IsValid(); NULLPTR when no bitmap is cached
const uint8_t* null_bitmap_data_ = NULLPTR;
/// Protected method for constructors
///
/// Caches the pointer returned by GetValuesSafe for buffer 0 with a zero
/// offset (or NULLPTR when `data` has no buffers at all), then stores `data`
/// as the backing ArrayData.
void SetData(const std::shared_ptr<ArrayData>& data) {
if (data->buffers.size() > 0) {
null_bitmap_data_ = data->GetValuesSafe<uint8_t>(0, /*offset=*/0);
} else {
null_bitmap_data_ = NULLPTR;
}
data_ = data;
}
private:
ARROW_DISALLOW_COPY_AND_ASSIGN(Array);
};
/// \brief Print the given array to the given output stream
///
/// NOTE(review): presumably the GoogleTest `PrintTo` value-printer hook; the
/// definition is not visible in this header, so the exact output format
/// should be confirmed in the implementation file.
ARROW_EXPORT void PrintTo(const Array& x, std::ostream* os);
/// \brief Stream insertion for arrays: writes Array::ToString() to the stream
///
/// \param[in,out] os the destination stream
/// \param[in] x the array to print
/// \return `os`, so that insertions can be chained
static inline std::ostream& operator<<(std::ostream& os, const Array& x) {
  return os << x.ToString();
}
/// \brief Base class for non-nested arrays
///
/// Declares no members of its own; it only re-exposes the Array constructors
/// (as protected) to its subclasses.
class ARROW_EXPORT FlatArray : public Array {
protected:
using Array::Array;
};
/// \brief Base class for arrays of fixed-size logical types
///
/// In addition to what Array caches, this class keeps a raw pointer obtained
/// from buffer 1 (the values buffer) of the underlying ArrayData.
class ARROW_EXPORT PrimitiveArray : public FlatArray {
 public:
  /// \brief The values buffer (buffer 1 of the underlying ArrayData)
  ///
  /// Does not account for any slice offset
  const std::shared_ptr<Buffer>& values() const { return data_->buffers[1]; }

 protected:
  /// \brief Default constructor for subclasses; the values pointer starts null
  PrimitiveArray() : raw_values_(NULLPTR) {}

  /// \brief Construct from an ArrayData instance via SetData()
  explicit PrimitiveArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }

  /// \brief Construct from individual buffers (defined out of line)
  PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t length,
                 const std::shared_ptr<Buffer>& data,
                 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                 int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Install `data` as the backing ArrayData
  ///
  /// Runs the base Array::SetData first, then caches the buffer-1 pointer
  /// requested with an explicit zero offset.
  void SetData(const std::shared_ptr<ArrayData>& data) {
    Array::SetData(data);
    raw_values_ = data->GetValuesSafe<uint8_t>(1, /*offset=*/0);
  }

  /// Raw pointer into the values buffer, as cached by SetData()
  const uint8_t* raw_values_;
};
/// \brief Degenerate array of NullType
///
/// Its SetData forces the null count to equal the length, so every slot is
/// reported as null.
class ARROW_EXPORT NullArray : public FlatArray {
 public:
  using TypeClass = NullType;

  /// \brief Construct NullArray from an ArrayData instance
  explicit NullArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }

  /// \brief Construct a NullArray of the given length (defined out of line)
  explicit NullArray(int64_t length);

 private:
  /// Adopt `data` as the backing ArrayData, overwriting its null_count with
  /// its length and clearing the cached validity bitmap pointer.
  void SetData(const std::shared_ptr<ArrayData>& data) {
    data->null_count = data->length;
    data_ = data;
    null_bitmap_data_ = NULLPTR;
  }
};
} // namespace arrow

View File

@@ -0,0 +1,321 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Array accessor classes for Binary, LargeBinary, String, LargeString,
// BinaryView, StringView, and FixedSizeBinary
#pragma once
#include <cstdint>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <vector>
#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer.h"
#include "arrow/stl_iterator.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup binary-arrays
///
/// @{
// ----------------------------------------------------------------------
// Binary and String
/// Base class for variable-sized binary arrays, regardless of offset size
/// and logical interpretation.
template <typename TYPE>
class BaseBinaryArray : public FlatArray {
public:
using TypeClass = TYPE;
using offset_type = typename TypeClass::offset_type;
using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>;
/// Return the pointer to the given elements bytes
// XXX should GetValue(int64_t i) return a string_view?
const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
const offset_type pos = raw_value_offsets_[i];
*out_length = raw_value_offsets_[i + 1] - pos;
return raw_data_ + pos;
}
/// \brief Get binary value as a string_view
///
/// \param i the value index
/// \return the view over the selected value
std::string_view GetView(int64_t i) const {
const offset_type pos = raw_value_offsets_[i];
return std::string_view(reinterpret_cast<const char*>(raw_data_ + pos),
raw_value_offsets_[i + 1] - pos);
}
std::optional<std::string_view> operator[](int64_t i) const {
return *IteratorType(*this, i);
}
/// \brief Get binary value as a string_view
/// Provided for consistency with other arrays.
///
/// \param i the value index
/// \return the view over the selected value
std::string_view Value(int64_t i) const { return GetView(i); }
/// \brief Get binary value as a std::string
///
/// \param i the value index
/// \return the value copied into a std::string
std::string GetString(int64_t i) const { return std::string(GetView(i)); }
/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; }
const offset_type* raw_value_offsets() const { return raw_value_offsets_; }
const uint8_t* raw_data() const { return raw_data_; }
/// \brief Return the data buffer absolute offset of the data for the value
/// at the passed index.
///
/// Does not perform boundschecking
offset_type value_offset(int64_t i) const { return raw_value_offsets_[i]; }
/// \brief Return the length of the data for the value at the passed index.
///
/// Does not perform boundschecking
offset_type value_length(int64_t i) const {
return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
}
/// \brief Return the total length of the memory in the data buffer
/// referenced by this array. If the array has been sliced then this may be
/// less than the size of the data buffer (data_->buffers[2]).
offset_type total_values_length() const {
if (data_->length > 0) {
return raw_value_offsets_[data_->length] - raw_value_offsets_[0];
} else {
return 0;
}
}
IteratorType begin() const { return IteratorType(*this); }
IteratorType end() const { return IteratorType(*this, length()); }
protected:
// For subclasses
BaseBinaryArray() = default;
// Protected method for constructors
void SetData(const std::shared_ptr<ArrayData>& data) {
this->Array::SetData(data);
raw_value_offsets_ = data->GetValuesSafe<offset_type>(1);
raw_data_ = data->GetValuesSafe<uint8_t>(2, /*offset=*/0);
}
const offset_type* raw_value_offsets_ = NULLPTR;
const uint8_t* raw_data_ = NULLPTR;
};
/// \brief Concrete Array class for variable-size binary data
///
/// Instantiates BaseBinaryArray with BinaryType.
class ARROW_EXPORT BinaryArray : public BaseBinaryArray<BinaryType> {
public:
/// \brief Construct BinaryArray from an ArrayData instance
explicit BinaryArray(const std::shared_ptr<ArrayData>& data);
/// \brief Construct BinaryArray from individual buffers
///
/// NOTE(review): parameter roles below are inferred from their names; the
/// definition lives outside this header.
///
/// \param[in] length the number of elements
/// \param[in] value_offsets the buffer of value offsets
/// \param[in] data the buffer of value bytes
/// \param[in] null_bitmap the validity bitmap; defaults to NULLPTR
/// \param[in] null_count the number of nulls; defaults to kUnknownNullCount
/// \param[in] offset the slice offset; defaults to 0
BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
protected:
// For subclasses such as StringArray
BinaryArray() : BaseBinaryArray() {}
};
/// \brief Concrete Array class for variable-size string (utf-8) data
///
/// Derives from BinaryArray and redefines TypeClass as StringType.
class ARROW_EXPORT StringArray : public BinaryArray {
public:
using TypeClass = StringType;
/// \brief Construct StringArray from an ArrayData instance
explicit StringArray(const std::shared_ptr<ArrayData>& data);
/// \brief Construct StringArray from individual buffers
///
/// The parameter list has the same shape as the BinaryArray buffer
/// constructor.
StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Validate that this array contains only valid UTF8 entries
///
/// This check is also implied by ValidateFull()
Status ValidateUTF8() const;
};
/// \brief Concrete Array class for large variable-size binary data
///
/// Instantiates BaseBinaryArray with LargeBinaryType.
class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray<LargeBinaryType> {
public:
/// \brief Construct LargeBinaryArray from an ArrayData instance
explicit LargeBinaryArray(const std::shared_ptr<ArrayData>& data);
/// \brief Construct LargeBinaryArray from individual buffers
///
/// NOTE(review): parameter roles below are inferred from their names; the
/// definition lives outside this header.
///
/// \param[in] length the number of elements
/// \param[in] value_offsets the buffer of value offsets
/// \param[in] data the buffer of value bytes
/// \param[in] null_bitmap the validity bitmap; defaults to NULLPTR
/// \param[in] null_count the number of nulls; defaults to kUnknownNullCount
/// \param[in] offset the slice offset; defaults to 0
LargeBinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
protected:
// For subclasses such as LargeStringArray
LargeBinaryArray() : BaseBinaryArray() {}
};
/// \brief Concrete Array class for large variable-size string (utf-8) data
///
/// Derives from LargeBinaryArray and redefines TypeClass as LargeStringType.
class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
public:
using TypeClass = LargeStringType;
/// \brief Construct LargeStringArray from an ArrayData instance
explicit LargeStringArray(const std::shared_ptr<ArrayData>& data);
/// \brief Construct LargeStringArray from individual buffers
///
/// The parameter list has the same shape as the LargeBinaryArray buffer
/// constructor.
LargeStringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Validate that this array contains only valid UTF8 entries
///
/// This check is also implied by ValidateFull()
Status ValidateUTF8() const;
};
// ----------------------------------------------------------------------
// BinaryView and StringView
/// \brief Concrete Array class for variable-size binary view data
///
/// Uses the BinaryViewType::c_type struct to reference in-line or
/// out-of-line string values. A raw pointer to those structs (buffer 1) is
/// cached by SetData().
class ARROW_EXPORT BinaryViewArray : public FlatArray {
 public:
  using TypeClass = BinaryViewType;
  using IteratorType = stl::ArrayIterator<BinaryViewArray>;
  using c_type = BinaryViewType::c_type;

  /// \brief Construct BinaryViewArray from an ArrayData instance
  explicit BinaryViewArray(std::shared_ptr<ArrayData> data);

  /// \brief Construct BinaryViewArray from individual buffers (defined out
  /// of line)
  BinaryViewArray(std::shared_ptr<DataType> type, int64_t length,
                  std::shared_ptr<Buffer> views, BufferVector data_buffers,
                  std::shared_ptr<Buffer> null_bitmap = NULLPTR,
                  int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  // For API compatibility with BinaryArray etc.
  std::string_view GetView(int64_t i) const;

  /// \brief Get the value at index i copied into a std::string
  std::string GetString(int64_t i) const { return std::string(GetView(i)); }

  /// \brief Buffer 1 of the underlying ArrayData
  const auto& values() const { return data_->buffers[1]; }

  /// \brief The cached raw pointer to the view structs
  const c_type* raw_values() const { return raw_values_; }

  /// \brief Element access by dereferencing an iterator positioned at i
  std::optional<std::string_view> operator[](int64_t i) const {
    return *IteratorType(*this, i);
  }

  IteratorType begin() const { return IteratorType(*this); }

  IteratorType end() const { return IteratorType(*this, length()); }

 protected:
  using FlatArray::FlatArray;

  /// Install `data` through the base-class SetData, then cache the buffer-1
  /// pointer read back from the stored ArrayData (`data` itself has been
  /// moved from at that point).
  void SetData(std::shared_ptr<ArrayData> data) {
    Array::SetData(std::move(data));
    raw_values_ = data_->GetValuesSafe<c_type>(1);
  }

  const c_type* raw_values_;
};
/// \brief Concrete Array class for variable-size string view (utf-8) data
///
/// Uses BinaryViewType::c_type to reference in-line or out-of-line string
/// values. Derives from BinaryViewArray and redefines TypeClass as
/// StringViewType.
class ARROW_EXPORT StringViewArray : public BinaryViewArray {
public:
using TypeClass = StringViewType;
/// \brief Construct StringViewArray from an ArrayData instance
explicit StringViewArray(std::shared_ptr<ArrayData> data);
// Re-expose the BinaryViewArray constructors
using BinaryViewArray::BinaryViewArray;
/// \brief Validate that this array contains only valid UTF8 entries
///
/// This check is also implied by ValidateFull()
Status ValidateUTF8() const;
};
// ----------------------------------------------------------------------
// Fixed width binary
/// \brief Concrete Array class for fixed-size binary data
///
/// Every value occupies byte_width() bytes. SetData() caches the byte width
/// of the type and a values pointer already advanced by the slice offset.
class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
 public:
  using TypeClass = FixedSizeBinaryType;
  using IteratorType = stl::ArrayIterator<FixedSizeBinaryArray>;

  /// \brief Construct FixedSizeBinaryArray from an ArrayData instance
  explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct FixedSizeBinaryArray from individual buffers (defined
  /// out of line)
  FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
                       const std::shared_ptr<Buffer>& data,
                       const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                       int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Return a pointer to the first byte of the value at index i
  ///
  /// No bounds check is performed on i.
  const uint8_t* GetValue(int64_t i) const {
    const int64_t byte_pos = i * byte_width_;
    return values_ + byte_pos;
  }

  /// \brief Same as GetValue(i)
  const uint8_t* Value(int64_t i) const { return GetValue(i); }

  /// \brief Get the value at index i as a string_view of byte_width() bytes
  std::string_view GetView(int64_t i) const {
    const auto* bytes = reinterpret_cast<const char*>(GetValue(i));
    return std::string_view(bytes, byte_width_);
  }

  /// \brief Element access by dereferencing an iterator positioned at i
  std::optional<std::string_view> operator[](int64_t i) const {
    return *IteratorType(*this, i);
  }

  /// \brief Get the value at index i copied into a std::string
  std::string GetString(int64_t i) const { return std::string(GetView(i)); }

  /// \brief The width in bytes of each value
  int32_t byte_width() const { return byte_width_; }

  /// \brief The values pointer, already advanced by the slice offset
  const uint8_t* raw_values() const { return values_; }

  IteratorType begin() const { return IteratorType(*this); }

  IteratorType end() const { return IteratorType(*this, length()); }

 protected:
  /// Install `data`, then derive the cached members from it.
  ///
  /// The base-class SetData must run first: type() and data_ are only
  /// meaningful once the ArrayData has been stored.
  void SetData(const std::shared_ptr<ArrayData>& data) {
    PrimitiveArray::SetData(data);
    const auto& fixed_type =
        internal::checked_cast<const FixedSizeBinaryType&>(*type());
    byte_width_ = fixed_type.byte_width();
    values_ = raw_values_ + data_->offset * byte_width_;
  }

  const uint8_t* values_;
  int32_t byte_width_;
};
/// @}
} // namespace arrow

View File

@@ -0,0 +1,104 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include "arrow/array/array_binary.h"
#include "arrow/array/data.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup numeric-arrays
///
/// @{
// ----------------------------------------------------------------------
// Decimal32Array
/// \brief Concrete Array class for 32-bit decimal data
///
/// Derives from FixedSizeBinaryArray and re-exposes its constructors.
class ARROW_EXPORT Decimal32Array : public FixedSizeBinaryArray {
public:
using TypeClass = Decimal32Type;
using FixedSizeBinaryArray::FixedSizeBinaryArray;
/// \brief Construct Decimal32Array from ArrayData instance
explicit Decimal32Array(const std::shared_ptr<ArrayData>& data);
/// \brief Return the value at index i formatted as a string
///
/// NOTE(review): the exact output format is determined by the out-of-line
/// definition, which is not visible here -- confirm there.
///
/// \param[in] i the value index
std::string FormatValue(int64_t i) const;
};
// ----------------------------------------------------------------------
// Decimal64Array
/// \brief Concrete Array class for 64-bit decimal data
///
/// Derives from FixedSizeBinaryArray and re-exposes its constructors.
class ARROW_EXPORT Decimal64Array : public FixedSizeBinaryArray {
public:
using TypeClass = Decimal64Type;
using FixedSizeBinaryArray::FixedSizeBinaryArray;
/// \brief Construct Decimal64Array from ArrayData instance
explicit Decimal64Array(const std::shared_ptr<ArrayData>& data);
/// \brief Return the value at index i formatted as a string
///
/// NOTE(review): the exact output format is determined by the out-of-line
/// definition, which is not visible here -- confirm there.
///
/// \param[in] i the value index
std::string FormatValue(int64_t i) const;
};
// ----------------------------------------------------------------------
// Decimal128Array
/// \brief Concrete Array class for 128-bit decimal data
///
/// Derives from FixedSizeBinaryArray and re-exposes its constructors.
class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray {
public:
using TypeClass = Decimal128Type;
using FixedSizeBinaryArray::FixedSizeBinaryArray;
/// \brief Construct Decimal128Array from ArrayData instance
explicit Decimal128Array(const std::shared_ptr<ArrayData>& data);
/// \brief Return the value at index i formatted as a string
///
/// NOTE(review): the exact output format is determined by the out-of-line
/// definition, which is not visible here -- confirm there.
///
/// \param[in] i the value index
std::string FormatValue(int64_t i) const;
};
// Backward compatibility: DecimalArray remains available as an alias of
// Decimal128Array.
using DecimalArray = Decimal128Array;
// ----------------------------------------------------------------------
// Decimal256Array
/// \brief Concrete Array class for 256-bit decimal data
///
/// Derives from FixedSizeBinaryArray and re-exposes its constructors.
class ARROW_EXPORT Decimal256Array : public FixedSizeBinaryArray {
public:
using TypeClass = Decimal256Type;
using FixedSizeBinaryArray::FixedSizeBinaryArray;
/// \brief Construct Decimal256Array from ArrayData instance
explicit Decimal256Array(const std::shared_ptr<ArrayData>& data);
/// \brief Return the value at index i formatted as a string
///
/// NOTE(review): the exact output format is determined by the out-of-line
/// definition, which is not visible here -- confirm there.
///
/// \param[in] i the value index
std::string FormatValue(int64_t i) const;
};
/// @}
} // namespace arrow

View File

@@ -0,0 +1,182 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
// ----------------------------------------------------------------------
// DictionaryArray
/// \brief Array type for dictionary-encoded data with a
/// data-dependent dictionary
///
/// A dictionary array contains an array of non-negative integers (the
/// "dictionary indices") along with a data type containing a "dictionary"
/// corresponding to the distinct values represented in the data.
///
/// For example, the array
///
/// ["foo", "bar", "foo", "bar", "foo", "bar"]
///
/// with dictionary ["bar", "foo"], would have dictionary array representation
///
/// indices: [1, 0, 1, 0, 1, 0]
/// dictionary: ["bar", "foo"]
///
/// The indices in principle may be any integer type.
class ARROW_EXPORT DictionaryArray : public Array {
public:
using TypeClass = DictionaryType;
/// \brief Construct DictionaryArray from an ArrayData instance
explicit DictionaryArray(const std::shared_ptr<ArrayData>& data);
/// \brief Construct DictionaryArray from a dictionary type, an indices array
/// and a dictionary array
///
/// NOTE(review): unlike FromArrays this returns no Result, so presumably it
/// does not validate the indices -- confirm in the implementation.
DictionaryArray(const std::shared_ptr<DataType>& type,
const std::shared_ptr<Array>& indices,
const std::shared_ptr<Array>& dictionary);
/// \brief Construct DictionaryArray from dictionary and indices
/// array and validate
///
/// This function does the validation of the indices and input type. It checks if
/// all indices are non-negative and smaller than the size of the dictionary.
///
/// \param[in] type a dictionary type
/// \param[in] indices an array of non-negative integers smaller than the
/// size of the dictionary
/// \param[in] dictionary the dictionary with same value type as the
/// type object
static Result<std::shared_ptr<Array>> FromArrays(
const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices,
const std::shared_ptr<Array>& dictionary);
/// \brief Overload of FromArrays that derives the dictionary type
///
/// The type is built with ::arrow::dictionary() from the types of `indices`
/// and `dictionary`, then the three-argument overload is called.
static Result<std::shared_ptr<Array>> FromArrays(
const std::shared_ptr<Array>& indices, const std::shared_ptr<Array>& dictionary) {
return FromArrays(::arrow::dictionary(indices->type(), dictionary->type()), indices,
dictionary);
}
/// \brief Transpose this DictionaryArray
///
/// This method constructs a new dictionary array with the given dictionary
/// type, transposing indices using the transpose map. The type and the
/// transpose map are typically computed using DictionaryUnifier.
///
/// \param[in] type the new type object
/// \param[in] dictionary the new dictionary
/// \param[in] transpose_map transposition array of this array's indices
/// into the target array's indices
/// \param[in] pool a pool to allocate the array data from
Result<std::shared_ptr<Array>> Transpose(
const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const;
/// \brief Compact this DictionaryArray
///
/// NOTE(review): presumably returns an equivalent array whose dictionary
/// keeps only the values referenced by the indices -- the definition is not
/// visible in this header, confirm in the implementation.
///
/// \param[in] pool a pool to allocate from; defaults to default_memory_pool()
Result<std::shared_ptr<Array>> Compact(MemoryPool* pool = default_memory_pool()) const;
/// \brief Determine whether dictionary arrays may be compared without unification
bool CanCompareIndices(const DictionaryArray& other) const;
/// \brief Return the dictionary for this array, which is stored as
/// a member of the ArrayData internal structure
const std::shared_ptr<Array>& dictionary() const;
/// \brief Return the array of dictionary indices
const std::shared_ptr<Array>& indices() const;
/// \brief Return the ith value of indices, cast to int64_t. Not recommended
/// for use in performance-sensitive code. Does not validate whether the
/// value is null or out-of-bounds.
int64_t GetValueIndex(int64_t i) const;
/// \brief Return the cached pointer to this array's DictionaryType
const DictionaryType* dict_type() const { return dict_type_; }
private:
void SetData(const std::shared_ptr<ArrayData>& data);
// Cached pointer to the dictionary type, returned by dict_type()
const DictionaryType* dict_type_;
std::shared_ptr<Array> indices_;
// Lazily initialized when invoking dictionary()
mutable std::shared_ptr<Array> dictionary_;
};
/// \brief Helper class for incremental dictionary unification
///
/// Abstract interface: instances are obtained through Make(), dictionaries
/// are fed in with Unify(), and the unified dictionary is retrieved with
/// GetResult() or GetResultWithIndexType().
class ARROW_EXPORT DictionaryUnifier {
public:
virtual ~DictionaryUnifier() = default;
/// \brief Construct a DictionaryUnifier
/// \param[in] value_type the data type of the dictionaries
/// \param[in] pool MemoryPool to use for memory allocations
static Result<std::unique_ptr<DictionaryUnifier>> Make(
std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());
/// \brief Unify dictionaries across array chunks
///
/// The dictionaries in the array chunks will be unified, their indices
/// accordingly transposed.
///
/// Only dictionaries with a primitive value type are currently supported.
/// However, dictionaries nested inside a more complex type are correctly unified.
///
/// \param[in] array the chunked array whose dictionaries are unified
/// \param[in] pool MemoryPool to use for memory allocations
static Result<std::shared_ptr<ChunkedArray>> UnifyChunkedArray(
const std::shared_ptr<ChunkedArray>& array,
MemoryPool* pool = default_memory_pool());
/// \brief Unify dictionaries across the chunks of each table column
///
/// The dictionaries in each table column will be unified, their indices
/// accordingly transposed.
///
/// Only dictionaries with a primitive value type are currently supported.
/// However, dictionaries nested inside a more complex type are correctly unified.
///
/// \param[in] table the table whose columns are unified
/// \param[in] pool MemoryPool to use for memory allocations
static Result<std::shared_ptr<Table>> UnifyTable(
const Table& table, MemoryPool* pool = default_memory_pool());
/// \brief Append dictionary to the internal memo
/// \param[in] dictionary the dictionary values to unify
virtual Status Unify(const Array& dictionary) = 0;
/// \brief Append dictionary and compute transpose indices
/// \param[in] dictionary the dictionary values to unify
/// \param[out] out_transpose a Buffer containing computed transpose indices
/// as int32_t values equal in length to the passed dictionary. The value in
/// each slot corresponds to the new index value for each original index
/// for a DictionaryArray with the old dictionary
virtual Status Unify(const Array& dictionary,
std::shared_ptr<Buffer>* out_transpose) = 0;
/// \brief Return a result DictionaryType with the smallest possible index
/// type to accommodate the unified dictionary. The unifier cannot be used
/// after this is called
/// \param[out] out_type the resulting dictionary type
/// \param[out] out_dict the unified dictionary
virtual Status GetResult(std::shared_ptr<DataType>* out_type,
std::shared_ptr<Array>* out_dict) = 0;
/// \brief Return a unified dictionary with the given index type. If
/// the index type is not large enough then an invalid status will be returned.
/// The unifier cannot be used after this is called
/// \param[in] index_type the index type requested for the result
/// \param[out] out_dict the unified dictionary
virtual Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
std::shared_ptr<Array>* out_dict) = 0;
};
} // namespace arrow

View File

@@ -0,0 +1,887 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Array accessor classes for List, LargeList, ListView, LargeListView, FixedSizeList,
// Map, Struct, and Union
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup nested-arrays
///
/// @{
// ----------------------------------------------------------------------
// VarLengthListLikeArray
// Forward declaration: internal::SetListData() (declared just below) takes a
// VarLengthListLikeArray<TYPE>* before the class template itself is defined.
template <typename TYPE>
class VarLengthListLikeArray;
namespace internal {
// Private helper for [Large]List[View]Array::SetData.
// Unfortunately, trying to define VarLengthListLikeArray::SetData outside of this header
// doesn't play well with MSVC.
//
// Only the declaration is visible here.
// - self: the array object to be set up from `data`
// - data: the ArrayData to install
// - expected_type_id: defaults to TYPE::type_id
//   NOTE(review): presumably used to check data's type id, and callers with a
//   different type id than TYPE's pass it explicitly -- confirm in the definition.
template <typename TYPE>
void SetListData(VarLengthListLikeArray<TYPE>* self,
const std::shared_ptr<ArrayData>& data,
Type::type expected_type_id = TYPE::type_id);
/// \brief A version of Flatten that keeps recursively flattening until an array of
/// non-list values is reached.
///
/// Array types considered to be lists by this function:
/// - list
/// - large_list
/// - list_view
/// - large_list_view
/// - fixed_size_list
///
/// \param[in] in_array the array to flatten
/// \param[in] memory_pool MemoryPool argument handed to the flattening routine
/// \return the flattened array, wrapped in a Result
///
/// \see ListArray::Flatten
ARROW_EXPORT Result<std::shared_ptr<Array>> FlattenLogicalListRecursively(
const Array& in_array, MemoryPool* memory_pool);
} // namespace internal
/// Common base of the variable-length list and list-view array classes.
///
/// The class is parameterized on the list type, which also fixes the width of
/// the offsets through TYPE::offset_type.
template <typename TYPE>
class VarLengthListLikeArray : public Array {
 public:
  using TypeClass = TYPE;
  using offset_type = typename TypeClass::offset_type;

  /// \brief Return the list type this array was set up with
  const TypeClass* var_length_list_like_type() const { return list_type_; }

  /// \brief Return array object containing the list's values
  ///
  /// Note that this buffer does not account for any slice offset or length.
  const std::shared_ptr<Array>& values() const { return values_; }

  /// \brief Return the offsets buffer (buffer #1 of the underlying ArrayData)
  ///
  /// Note that this buffer does not account for any slice offset or length.
  const std::shared_ptr<Buffer>& value_offsets() const {
    const auto& all_buffers = data_->buffers;
    return all_buffers[1];
  }

  /// \brief Return the data type of the list values
  const std::shared_ptr<DataType>& value_type() const { return list_type_->value_type(); }

  /// Return pointer to raw value offsets accounting for any slice offset
  const offset_type* raw_value_offsets() const { return raw_value_offsets_; }

  // None of the accessors below perform bounds checking.

  /// \brief Return the raw offset stored for slot i
  offset_type value_offset(int64_t i) const { return raw_value_offsets()[i]; }

  /// \brief Return the size of the value at a particular index
  ///
  /// Since non-empty null lists and list-views are possible, avoid calling this
  /// function when the list at slot i is null.
  ///
  /// \pre IsValid(i)
  virtual offset_type value_length(int64_t i) const = 0;

  /// \brief Return the slice of values() that belongs to slot i
  ///
  /// \pre IsValid(i)
  std::shared_ptr<Array> value_slice(int64_t i) const {
    const offset_type start = value_offset(i);
    const offset_type count = value_length(i);
    return values_->Slice(start, count);
  }

  /// \brief Flatten all levels recursively until a non-list type is reached, and
  /// return an Array of that non-list type.
  ///
  /// \see internal::FlattenLogicalListRecursively
  Result<std::shared_ptr<Array>> FlattenRecursively(
      MemoryPool* memory_pool = default_memory_pool()) const {
    const Array& self = *this;
    return internal::FlattenLogicalListRecursively(self, memory_pool);
  }

 protected:
  // SetListData is granted access to the protected members declared below.
  friend void internal::SetListData<TYPE>(VarLengthListLikeArray<TYPE>* self,
                                          const std::shared_ptr<ArrayData>& data,
                                          Type::type expected_type_id);

  const TypeClass* list_type_ = NULLPTR;
  std::shared_ptr<Array> values_;
  const offset_type* raw_value_offsets_ = NULLPTR;
};
// ----------------------------------------------------------------------
// ListArray / LargeListArray
/// Offsets-based list implementation layered on VarLengthListLikeArray: the
/// length of a slot is the difference between two consecutive offsets.
template <typename TYPE>
class BaseListArray : public VarLengthListLikeArray<TYPE> {
 public:
  using TypeClass = TYPE;
  using offset_type = typename TYPE::offset_type;

  /// \brief Return the list type of this array
  const TypeClass* list_type() const {
    const TypeClass* type = this->var_length_list_like_type();
    return type;
  }

  /// \brief Return the size of the value at a particular index
  ///
  /// Since non-empty null lists are possible, avoid calling this
  /// function when the list at slot i is null.
  ///
  /// \pre IsValid(i)
  offset_type value_length(int64_t i) const final {
    const offset_type* const offsets = this->raw_value_offsets_;
    // Slot i spans [offsets[i], offsets[i + 1]).
    return offsets[i + 1] - offsets[i];
  }
};
/// Concrete Array class for list data
///
/// Built on BaseListArray<ListType>; the offsets are exposed by offsets() as an
/// Int32Array.
class ARROW_EXPORT ListArray : public BaseListArray<ListType> {
public:
/// \brief Construct a ListArray from existing ArrayData
explicit ListArray(std::shared_ptr<ArrayData> data);
/// \brief Construct a ListArray from its type, length, offsets buffer and
/// child values array
///
/// \param[in] type the data type of the array
/// \param[in] length the number of list slots
/// \param[in] value_offsets buffer holding the list offsets
/// \param[in] values Array containing list values
/// \param[in] null_bitmap Optional validity bitmap
/// \param[in] null_count Optional null count in null_bitmap
/// \param[in] offset Optional array offset (defaults to 0)
ListArray(std::shared_ptr<DataType> type, int64_t length,
std::shared_ptr<Buffer> value_offsets, std::shared_ptr<Array> values,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Construct ListArray from array of offsets and child value array
///
/// This function does the bare minimum of validation of the offsets and
/// input types, and will allocate a new offsets array if necessary (i.e. if
/// the offsets contain any nulls). If the offsets do not have nulls, they
/// are assumed to be well-formed.
///
/// If a null_bitmap is not provided, the nulls will be inferred from the offsets'
/// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls.
///
/// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an
/// array with offset() > 0).
///
/// \param[in] offsets Array containing n + 1 offsets encoding length and
/// size. Must be of int32 type
/// \param[in] values Array containing list values
/// \param[in] pool MemoryPool in case new offsets array needs to be
/// allocated because of null values
/// \param[in] null_bitmap Optional validity bitmap
/// \param[in] null_count Optional null count in null_bitmap
static Result<std::shared_ptr<ListArray>> FromArrays(
const Array& offsets, const Array& values, MemoryPool* pool = default_memory_pool(),
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount);
/// \brief Overload of FromArrays() that additionally takes the array's data type
///
/// \param[in] type the data type to give the resulting array
/// The remaining parameters have the same names as in the overload above.
static Result<std::shared_ptr<ListArray>> FromArrays(
std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
MemoryPool* pool = default_memory_pool(),
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount);
/// \brief Build a ListArray from a ListViewArray
///
/// \param[in] source the list-view array to convert
/// \param[in] pool MemoryPool argument (no default is provided)
static Result<std::shared_ptr<ListArray>> FromListView(const ListViewArray& source,
MemoryPool* pool);
/// \brief Return an Array that is a concatenation of the lists in this array.
///
/// Note that it's different from `values()` in that it takes into
/// consideration of this array's offsets as well as null elements backed
/// by non-empty lists (they are skipped, thus copying may be needed).
///
/// \param[in] memory_pool MemoryPool used if copying is needed
Result<std::shared_ptr<Array>> Flatten(
MemoryPool* memory_pool = default_memory_pool()) const;
/// \brief Return list offsets as an Int32Array
///
/// The returned array will not have a validity bitmap, so you cannot expect
/// to pass it to ListArray::FromArrays() and get back the same list array
/// if the original one has nulls.
std::shared_ptr<Array> offsets() const;
protected:
// This constructor defers SetData to a derived array class
ListArray() = default;
// Installs `data` into this array; defined outside this header section.
void SetData(const std::shared_ptr<ArrayData>& data);
};
/// Concrete Array class for large list data (with 64-bit offsets)
///
/// Built on BaseListArray<LargeListType>; the offsets are exposed by offsets()
/// as an Int64Array.
class ARROW_EXPORT LargeListArray : public BaseListArray<LargeListType> {
public:
/// \brief Construct a LargeListArray from existing ArrayData
explicit LargeListArray(const std::shared_ptr<ArrayData>& data);
/// \brief Construct a LargeListArray from its type, length, offsets buffer and
/// child values array
///
/// \param[in] type the data type of the array
/// \param[in] length the number of list slots
/// \param[in] value_offsets buffer holding the list offsets
/// \param[in] values Array containing list values
/// \param[in] null_bitmap Optional validity bitmap
/// \param[in] null_count Optional null count in null_bitmap
/// \param[in] offset Optional array offset (defaults to 0)
LargeListArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Array>& values,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Construct LargeListArray from array of offsets and child value array
///
/// This function does the bare minimum of validation of the offsets and
/// input types, and will allocate a new offsets array if necessary (i.e. if
/// the offsets contain any nulls). If the offsets do not have nulls, they
/// are assumed to be well-formed.
///
/// If a null_bitmap is not provided, the nulls will be inferred from the offsets'
/// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls.
///
/// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an
/// array with offset() > 0).
///
/// \param[in] offsets Array containing n + 1 offsets encoding length and
/// size. Must be of int64 type
/// \param[in] values Array containing list values
/// \param[in] pool MemoryPool in case new offsets array needs to be
/// allocated because of null values
/// \param[in] null_bitmap Optional validity bitmap
/// \param[in] null_count Optional null count in null_bitmap
static Result<std::shared_ptr<LargeListArray>> FromArrays(
const Array& offsets, const Array& values, MemoryPool* pool = default_memory_pool(),
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount);
/// \brief Overload of FromArrays() that additionally takes the array's data type
///
/// \param[in] type the data type to give the resulting array
/// The remaining parameters have the same names as in the overload above.
static Result<std::shared_ptr<LargeListArray>> FromArrays(
std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
MemoryPool* pool = default_memory_pool(),
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount);
/// \brief Build a LargeListArray from a LargeListViewArray
///
/// \param[in] source the large list-view array to convert
/// \param[in] pool MemoryPool argument (no default is provided)
static Result<std::shared_ptr<LargeListArray>> FromListView(
const LargeListViewArray& source, MemoryPool* pool);
/// \brief Return an Array that is a concatenation of the lists in this array.
///
/// Note that it's different from `values()` in that it takes into
/// consideration of this array's offsets as well as null elements backed
/// by non-empty lists (they are skipped, thus copying may be needed).
///
/// \param[in] memory_pool MemoryPool used if copying is needed
Result<std::shared_ptr<Array>> Flatten(
MemoryPool* memory_pool = default_memory_pool()) const;
/// \brief Return list offsets as an Int64Array
std::shared_ptr<Array> offsets() const;
protected:
// Installs `data` into this array; defined outside this header section.
void SetData(const std::shared_ptr<ArrayData>& data);
};
// ----------------------------------------------------------------------
// ListViewArray / LargeListViewArray
/// Offsets-and-sizes list-view implementation layered on VarLengthListLikeArray:
/// the length of a slot is read directly from the sizes.
template <typename TYPE>
class BaseListViewArray : public VarLengthListLikeArray<TYPE> {
 public:
  using TypeClass = TYPE;
  using offset_type = typename TYPE::offset_type;

  /// \brief Return the list-view type of this array
  const TypeClass* list_view_type() const {
    const TypeClass* type = this->var_length_list_like_type();
    return type;
  }

  /// \brief Return the sizes buffer (buffer #2 of the underlying ArrayData)
  ///
  /// Note that this buffer does not account for any slice offset or length.
  const std::shared_ptr<Buffer>& value_sizes() const {
    const auto& all_buffers = this->data_->buffers;
    return all_buffers[2];
  }

  /// \brief Return pointer to raw value sizes accounting for any slice offset
  const offset_type* raw_value_sizes() const { return this->raw_value_sizes_; }

  /// \brief Return the size of the value at a particular index
  ///
  /// This should not be called if the list-view at slot i is null.
  /// The returned size in those cases could be any value from 0 to the
  /// length of the child values array.
  ///
  /// \pre IsValid(i)
  offset_type value_length(int64_t i) const final { return raw_value_sizes()[i]; }

 protected:
  const offset_type* raw_value_sizes_ = NULLPTR;
};
/// \brief Concrete Array class for list-view data
///
/// Built on BaseListViewArray<ListViewType>; offsets() and sizes() expose the
/// offsets and sizes as Int32Array.
class ARROW_EXPORT ListViewArray : public BaseListViewArray<ListViewType> {
public:
/// \brief Construct a ListViewArray from existing ArrayData
explicit ListViewArray(std::shared_ptr<ArrayData> data);
/// \brief Construct a ListViewArray from its type, length, offsets and sizes
/// buffers, and child values array
///
/// \param[in] type the data type of the array
/// \param[in] length the number of list-view slots
/// \param[in] value_offsets buffer holding the list-view offsets
/// \param[in] value_sizes buffer holding the list-view sizes
/// \param[in] values Array containing list values
/// \param[in] null_bitmap Optional validity bitmap
/// \param[in] null_count Optional null count in null_bitmap
/// \param[in] offset Optional array offset (defaults to 0)
ListViewArray(std::shared_ptr<DataType> type, int64_t length,
std::shared_ptr<Buffer> value_offsets,
std::shared_ptr<Buffer> value_sizes, std::shared_ptr<Array> values,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Construct ListViewArray from array of offsets, sizes, and child
/// value array
///
/// Construct a ListViewArray using buffers from offsets and sizes arrays
/// that project views into the child values array.
///
/// This function does the bare minimum of validation of the offsets/sizes and
/// input types. The offset and length of the offsets and sizes arrays must
/// match and that will be checked, but their contents will be assumed to be
/// well-formed.
///
/// If a null_bitmap is not provided, the nulls will be inferred from the
/// offsets' null bitmap. But if a null_bitmap is provided, the offsets array
/// can't have nulls.
///
/// And when a null_bitmap is provided, neither the offsets nor the sizes array
/// can be a slice (i.e. an array with offset() > 0).
///
/// \param[in] offsets An array of int32 offsets into the values array. NULL values are
/// supported if the corresponding values in sizes is NULL or 0.
/// \param[in] sizes An array containing the int32 sizes of every view. NULL values are
/// taken to represent a NULL list-view in the array being created.
/// \param[in] values Array containing list values
/// \param[in] pool MemoryPool
/// \param[in] null_bitmap Optional validity bitmap
/// \param[in] null_count Optional null count in null_bitmap
static Result<std::shared_ptr<ListViewArray>> FromArrays(
const Array& offsets, const Array& sizes, const Array& values,
MemoryPool* pool = default_memory_pool(),
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount);
/// \brief Overload of FromArrays() that additionally takes the array's data type
///
/// \param[in] type the data type to give the resulting array
/// The remaining parameters have the same names as in the overload above.
static Result<std::shared_ptr<ListViewArray>> FromArrays(
std::shared_ptr<DataType> type, const Array& offsets, const Array& sizes,
const Array& values, MemoryPool* pool = default_memory_pool(),
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount);
/// \brief Build a ListViewArray from a ListArray
///
/// \param[in] list_array the list array to convert
/// \param[in] pool MemoryPool argument (no default is provided)
static Result<std::shared_ptr<ListViewArray>> FromList(const ListArray& list_array,
MemoryPool* pool);
/// \brief Return an Array that is a concatenation of the list-views in this array.
///
/// Note that it's different from `values()` in that it takes into
/// consideration this array's offsets (which can be in any order)
/// and sizes. Nulls are skipped.
///
/// This function invokes Concatenate() if list-views are non-contiguous. It
/// will try to minimize the number of array slices passed to Concatenate() by
/// maximizing the size of each slice (containing as many contiguous
/// list-views as possible).
Result<std::shared_ptr<Array>> Flatten(
MemoryPool* memory_pool = default_memory_pool()) const;
/// \brief Return list-view offsets as an Int32Array
///
/// The returned array will not have a validity bitmap, so you cannot expect
/// to pass it to ListViewArray::FromArrays() and get back the same list-view
/// array if the original one has nulls.
std::shared_ptr<Array> offsets() const;
/// \brief Return list-view sizes as an Int32Array
///
/// The returned array will not have a validity bitmap, so you cannot expect
/// to pass it to ListViewArray::FromArrays() and get back the same list-view
/// array if the original one has nulls.
std::shared_ptr<Array> sizes() const;
protected:
// This constructor defers SetData to a derived array class
ListViewArray() = default;
// Installs `data` into this array; defined outside this header section.
void SetData(const std::shared_ptr<ArrayData>& data);
};
/// \brief Concrete Array class for large list-view data (with 64-bit offsets
/// and sizes)
///
/// Built on BaseListViewArray<LargeListViewType>; offsets() and sizes() expose
/// the offsets and sizes as Int64Array.
class ARROW_EXPORT LargeListViewArray : public BaseListViewArray<LargeListViewType> {
public:
/// \brief Construct a LargeListViewArray from existing ArrayData
explicit LargeListViewArray(std::shared_ptr<ArrayData> data);
/// \brief Construct a LargeListViewArray from its type, length, offsets and
/// sizes buffers, and child values array
///
/// \param[in] type the data type of the array
/// \param[in] length the number of list-view slots
/// \param[in] value_offsets buffer holding the list-view offsets
/// \param[in] value_sizes buffer holding the list-view sizes
/// \param[in] values Array containing list values
/// \param[in] null_bitmap Optional validity bitmap
/// \param[in] null_count Optional null count in null_bitmap
/// \param[in] offset Optional array offset (defaults to 0)
LargeListViewArray(std::shared_ptr<DataType> type, int64_t length,
std::shared_ptr<Buffer> value_offsets,
std::shared_ptr<Buffer> value_sizes, std::shared_ptr<Array> values,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Construct LargeListViewArray from array of offsets, sizes, and child
/// value array
///
/// Construct a LargeListViewArray using buffers from offsets and sizes arrays
/// that project views into the values array.
///
/// This function does the bare minimum of validation of the offsets/sizes and
/// input types. The offset and length of the offsets and sizes arrays must
/// match and that will be checked, but their contents will be assumed to be
/// well-formed.
///
/// If a null_bitmap is not provided, the nulls will be inferred from the offsets' or
/// sizes' null bitmap. Only one of these two is allowed to have a null bitmap. But if a
/// null_bitmap is provided, the offsets array and the sizes array can't have nulls.
///
/// And when a null_bitmap is provided, neither the offsets nor the sizes array
/// can be a slice (i.e. an array with offset() > 0).
///
/// \param[in] offsets An array of int64 offsets into the values array. NULL values are
/// supported if the corresponding values in sizes is NULL or 0.
/// \param[in] sizes An array containing the int64 sizes of every view. NULL values are
/// taken to represent a NULL list-view in the array being created.
/// \param[in] values Array containing list values
/// \param[in] pool MemoryPool
/// \param[in] null_bitmap Optional validity bitmap
/// \param[in] null_count Optional null count in null_bitmap
static Result<std::shared_ptr<LargeListViewArray>> FromArrays(
const Array& offsets, const Array& sizes, const Array& values,
MemoryPool* pool = default_memory_pool(),
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount);
/// \brief Overload of FromArrays() that additionally takes the array's data type
///
/// \param[in] type the data type to give the resulting array
/// The remaining parameters have the same names as in the overload above.
static Result<std::shared_ptr<LargeListViewArray>> FromArrays(
std::shared_ptr<DataType> type, const Array& offsets, const Array& sizes,
const Array& values, MemoryPool* pool = default_memory_pool(),
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount);
/// \brief Build a LargeListViewArray from a LargeListArray
///
/// \param[in] list_array the large list array to convert
/// \param[in] pool MemoryPool argument (no default is provided)
static Result<std::shared_ptr<LargeListViewArray>> FromList(
const LargeListArray& list_array, MemoryPool* pool);
/// \brief Return an Array that is a concatenation of the large list-views in this
/// array.
///
/// Note that it's different from `values()` in that it takes into
/// consideration this array's offsets (which can be in any order)
/// and sizes. Nulls are skipped.
Result<std::shared_ptr<Array>> Flatten(
MemoryPool* memory_pool = default_memory_pool()) const;
/// \brief Return list-view offsets as an Int64Array
///
/// The returned array will not have a validity bitmap, so you cannot expect
/// to pass it to LargeListViewArray::FromArrays() and get back the same
/// list-view array if the original one has nulls.
std::shared_ptr<Array> offsets() const;
/// \brief Return list-view sizes as an Int64Array
///
/// The returned array will not have a validity bitmap, so you cannot expect
/// to pass it to LargeListViewArray::FromArrays() and get back the same
/// list-view array if the original one has nulls.
std::shared_ptr<Array> sizes() const;
protected:
// This constructor defers SetData to a derived array class
LargeListViewArray() = default;
// Installs `data` into this array; defined outside this header section.
void SetData(const std::shared_ptr<ArrayData>& data);
};
// ----------------------------------------------------------------------
// MapArray
/// Concrete Array class for map data
///
/// NB: "value" in this context refers to a pair of a key and the corresponding item
class ARROW_EXPORT MapArray : public ListArray {
public:
using TypeClass = MapType;
/// \brief Construct a MapArray from existing ArrayData
explicit MapArray(const std::shared_ptr<ArrayData>& data);
/// \brief Construct a MapArray from an offsets buffer and separate key and
/// item arrays
MapArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Construct a MapArray from a vector of buffers and separate key and
/// item arrays
MapArray(const std::shared_ptr<DataType>& type, int64_t length, BufferVector buffers,
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Construct a MapArray from an offsets buffer and a single values
/// array (see the class note above on what "value" means here)
MapArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Array>& values,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Construct MapArray from array of offsets and child key, item arrays
///
/// This function does the bare minimum of validation of the offsets and
/// input types, and will allocate a new offsets array if necessary (i.e. if
/// the offsets contain any nulls). If the offsets do not have nulls, they
/// are assumed to be well-formed
///
/// \param[in] offsets Array containing n + 1 offsets encoding length and
/// size. Must be of int32 type
/// \param[in] keys Array containing key values
/// \param[in] items Array containing item values
/// \param[in] pool MemoryPool in case new offsets array needs to be
/// allocated because of null values
/// \param[in] null_bitmap Optional validity bitmap
static Result<std::shared_ptr<Array>> FromArrays(
const std::shared_ptr<Array>& offsets, const std::shared_ptr<Array>& keys,
const std::shared_ptr<Array>& items, MemoryPool* pool = default_memory_pool(),
std::shared_ptr<Buffer> null_bitmap = NULLPTR);
/// \brief Overload of FromArrays() that additionally takes the array's data type
///
/// \param[in] type the data type to give the resulting array
/// The remaining parameters have the same names as in the overload above.
static Result<std::shared_ptr<Array>> FromArrays(
std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
MemoryPool* pool = default_memory_pool(),
std::shared_ptr<Buffer> null_bitmap = NULLPTR);
/// \brief Return the map type of this array
const MapType* map_type() const { return map_type_; }
/// \brief Return array object containing all map keys
const std::shared_ptr<Array>& keys() const { return keys_; }
/// \brief Return array object containing all mapped items
const std::shared_ptr<Array>& items() const { return items_; }
/// Validate child data before constructing the actual MapArray.
static Status ValidateChildData(
const std::vector<std::shared_ptr<ArrayData>>& child_data);
protected:
// Installs `data` into this array; defined outside this header section.
void SetData(const std::shared_ptr<ArrayData>& data);
// Shared worker taking the same arguments as the public FromArrays() overloads,
// with no default for `pool`.
static Result<std::shared_ptr<Array>> FromArraysInternal(
std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
MemoryPool* pool, std::shared_ptr<Buffer> null_bitmap = NULLPTR);
private:
// No in-class initializer here; presumably assigned by SetData() -- confirm.
const MapType* map_type_;
std::shared_ptr<Array> keys_, items_;
};
// ----------------------------------------------------------------------
// FixedSizeListArray
/// Array class for lists that all share one fixed size
class ARROW_EXPORT FixedSizeListArray : public Array {
 public:
  using TypeClass = FixedSizeListType;
  using offset_type = TypeClass::offset_type;

  explicit FixedSizeListArray(const std::shared_ptr<ArrayData>& data);

  FixedSizeListArray(const std::shared_ptr<DataType>& type, int64_t length,
                     const std::shared_ptr<Array>& values,
                     const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                     int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  const FixedSizeListType* list_type() const;

  /// \brief Return array object containing the list's values
  const std::shared_ptr<Array>& values() const;

  const std::shared_ptr<DataType>& value_type() const;

  // None of the accessors below perform bounds checking.

  /// \brief Return the position in values() at which slot i starts
  int64_t value_offset(int64_t i) const {
    // Shift by the offset recorded in the ArrayData, then scale by the list size.
    return list_size_ * (i + data_->offset);
  }

  /// \brief Return the fixed-size of the values
  ///
  /// No matter the value of the index parameter, the result is the same.
  /// So even when the value at slot i is null, this function will return a
  /// non-zero size.
  ///
  /// \pre IsValid(i)
  int32_t value_length(int64_t i = 0) const {
    ARROW_UNUSED(i);
    return list_size_;
  }

  /// \brief Return the slice of the values array that belongs to slot i
  ///
  /// \pre IsValid(i)
  std::shared_ptr<Array> value_slice(int64_t i) const {
    const int64_t start = value_offset(i);
    const int32_t count = value_length(i);
    return values_->Slice(start, count);
  }

  /// \brief Return an Array that is a concatenation of the lists in this array.
  ///
  /// Note that it's different from `values()` in that it takes into
  /// consideration null elements (they are skipped, thus copying may be needed).
  Result<std::shared_ptr<Array>> Flatten(
      MemoryPool* memory_pool = default_memory_pool()) const;

  /// \brief Flatten all levels recursively until a non-list type is reached, and
  /// return an Array of that non-list type.
  ///
  /// \see internal::FlattenLogicalListRecursively
  Result<std::shared_ptr<Array>> FlattenRecursively(
      MemoryPool* memory_pool = default_memory_pool()) const {
    const Array& self = *this;
    return internal::FlattenLogicalListRecursively(self, memory_pool);
  }

  /// \brief Construct FixedSizeListArray from child value array and value_length
  ///
  /// \param[in] values Array containing list values
  /// \param[in] list_size The fixed length of each list
  /// \param[in] null_bitmap Optional validity bitmap
  /// \param[in] null_count Optional null count in null_bitmap
  /// \return Will have length equal to values.length() / list_size
  static Result<std::shared_ptr<Array>> FromArrays(
      const std::shared_ptr<Array>& values, int32_t list_size,
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount);

  /// \brief Construct FixedSizeListArray from child value array and type
  ///
  /// \param[in] values Array containing list values
  /// \param[in] type The fixed sized list type
  /// \param[in] null_bitmap Optional validity bitmap
  /// \param[in] null_count Optional null count in null_bitmap
  /// \return Will have length equal to values.length() / type.list_size()
  static Result<std::shared_ptr<Array>> FromArrays(
      const std::shared_ptr<Array>& values, std::shared_ptr<DataType> type,
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount);

 protected:
  void SetData(const std::shared_ptr<ArrayData>& data);

  int32_t list_size_;

 private:
  std::shared_ptr<Array> values_;
};
// ----------------------------------------------------------------------
// Struct
/// Concrete Array class for struct data
class ARROW_EXPORT StructArray : public Array {
public:
using TypeClass = StructType;
/// \brief Construct a StructArray from existing ArrayData
explicit StructArray(const std::shared_ptr<ArrayData>& data);
/// \brief Construct a StructArray from its type, length and child arrays
///
/// \param[in] type the data type of the array
/// \param[in] length the number of struct slots
/// \param[in] children the child arrays
/// \param[in] null_bitmap Optional validity bitmap
/// \param[in] null_count Optional null count in null_bitmap
/// \param[in] offset Optional array offset (defaults to 0)
StructArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::vector<std::shared_ptr<Array>>& children,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Return a StructArray from child arrays and field names.
///
/// The length and data type are automatically inferred from the arguments.
/// There should be at least one child array.
///
/// \param[in] children the child arrays
/// \param[in] field_names the names of the fields
/// \param[in] null_bitmap Optional validity bitmap
/// \param[in] null_count Optional null count in null_bitmap
/// \param[in] offset Optional array offset (defaults to 0)
static Result<std::shared_ptr<StructArray>> Make(
const ArrayVector& children, const std::vector<std::string>& field_names,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Return a StructArray from child arrays and fields.
///
/// The length is automatically inferred from the arguments.
/// There should be at least one child array. This method does not
/// check that field types and child array types are consistent.
///
/// \param[in] children the child arrays
/// \param[in] fields the fields of the struct
/// \param[in] null_bitmap Optional validity bitmap
/// \param[in] null_count Optional null count in null_bitmap
/// \param[in] offset Optional array offset (defaults to 0)
static Result<std::shared_ptr<StructArray>> Make(
const ArrayVector& children, const FieldVector& fields,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Return the struct type of this array
const StructType* struct_type() const;
// Return a shared pointer in case the requestor desires to share ownership
// with this array. The returned array has its offset, length and null
// count adjusted.
const std::shared_ptr<Array>& field(int pos) const;
/// \brief Return all child arrays
const ArrayVector& fields() const;
/// Returns null if name not found
std::shared_ptr<Array> GetFieldByName(const std::string& name) const;
/// Indicate if field named `name` can be found unambiguously in the struct.
Status CanReferenceFieldByName(const std::string& name) const;
/// Indicate if fields named `names` can be found unambiguously in the struct.
Status CanReferenceFieldsByNames(const std::vector<std::string>& names) const;
/// \brief Flatten this array as a vector of arrays, one for each field
///
/// \param[in] pool The pool to allocate null bitmaps from, if necessary
Result<ArrayVector> Flatten(MemoryPool* pool = default_memory_pool()) const;
/// \brief Get one of the child arrays, combining its null bitmap
/// with the parent struct array's bitmap.
///
/// \param[in] index Which child array to get
/// \param[in] pool The pool to allocate null bitmaps from, if necessary
Result<std::shared_ptr<Array>> GetFlattenedField(
int index, MemoryPool* pool = default_memory_pool()) const;
private:
// For caching boxed child data
// XXX This is not handled in a thread-safe manner.
// Declared mutable so that const member functions can fill the cache.
mutable ArrayVector boxed_fields_;
};
// ----------------------------------------------------------------------
// Union
/// Common base of the sparse and dense union array classes
class ARROW_EXPORT UnionArray : public Array {
 public:
  using type_code_t = int8_t;

  /// \brief Return the type codes buffer (buffer #1 of the underlying ArrayData)
  ///
  /// Note that this buffer does not account for any slice offset
  const std::shared_ptr<Buffer>& type_codes() const {
    const auto& all_buffers = data_->buffers;
    return all_buffers[1];
  }

  /// \brief Return the raw pointer to the type codes
  const type_code_t* raw_type_codes() const { return raw_type_codes_; }

  /// The logical type code of the value at index.
  type_code_t type_code(int64_t i) const { return raw_type_codes()[i]; }

  /// The physical child id containing value at index.
  int child_id(int64_t i) const {
    // Look up the slot's type code, then map it through the type's child_ids().
    const type_code_t code = raw_type_codes_[i];
    return union_type_->child_ids()[code];
  }

  /// \brief Return the union type of this array
  const UnionType* union_type() const { return union_type_; }

  /// \brief Return the union mode reported by the union type
  UnionMode::type mode() const { return union_type_->mode(); }

  /// \brief Return the given field as an individual array.
  ///
  /// For sparse unions, the returned array has its offset, length and null
  /// count adjusted.
  std::shared_ptr<Array> field(int pos) const;

 protected:
  void SetData(std::shared_ptr<ArrayData> data);

  const type_code_t* raw_type_codes_;
  const UnionType* union_type_;

  // For caching boxed child data
  mutable std::vector<std::shared_ptr<Array>> boxed_fields_;
};
/// Concrete Array class for sparse union data
class ARROW_EXPORT SparseUnionArray : public UnionArray {
 public:
  using TypeClass = SparseUnionType;

  /// \brief Construct a SparseUnionArray from existing ArrayData
  explicit SparseUnionArray(std::shared_ptr<ArrayData> data);

  /// \brief Construct a SparseUnionArray from its type, length, child arrays
  /// and type ids buffer
  SparseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
                   std::shared_ptr<Buffer> type_ids, int64_t offset = 0);

  /// \brief Construct SparseUnionArray from type_ids and children
  ///
  /// This function does the bare minimum of validation of the input types.
  /// It forwards to the overload below with an empty vector of field names.
  ///
  /// \param[in] type_ids An array of logical type ids for the union type
  /// \param[in] children Vector of children Arrays containing the data for each type.
  /// \param[in] type_codes Vector of type codes.
  static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
                                             std::vector<type_code_t> type_codes) {
    // type_ids is a const reference and is passed through unchanged: wrapping it
    // in std::move() would only yield a const rvalue that binds to the very same
    // `const Array&` parameter, so nothing could ever be moved.
    return Make(type_ids, std::move(children), std::vector<std::string>{},
                std::move(type_codes));
  }

  /// \brief Construct SparseUnionArray with custom field names from type_ids and children
  ///
  /// This function does the bare minimum of validation of the input types.
  ///
  /// \param[in] type_ids An array of logical type ids for the union type
  /// \param[in] children Vector of children Arrays containing the data for each type.
  /// \param[in] field_names Vector of strings containing the name of each field.
  /// \param[in] type_codes Vector of type codes.
  static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
                                             std::vector<std::string> field_names = {},
                                             std::vector<type_code_t> type_codes = {});

  /// \brief Return the union type, cast to SparseUnionType
  const SparseUnionType* union_type() const {
    return internal::checked_cast<const SparseUnionType*>(union_type_);
  }

  /// \brief Get one of the child arrays, adjusting its null bitmap
  /// where the union array type code does not match.
  ///
  /// \param[in] index Which child array to get (i.e. the physical index, not the type
  /// code)
  /// \param[in] pool The pool to allocate null bitmaps from, if necessary
  Result<std::shared_ptr<Array>> GetFlattenedField(
      int index, MemoryPool* pool = default_memory_pool()) const;

 protected:
  void SetData(std::shared_ptr<ArrayData> data);
};
/// \brief Concrete Array class for dense union data
///
/// Note that union types do not have a validity bitmap
class ARROW_EXPORT DenseUnionArray : public UnionArray {
 public:
  using TypeClass = DenseUnionType;

  explicit DenseUnionArray(const std::shared_ptr<ArrayData>& data);

  DenseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
                  std::shared_ptr<Buffer> type_ids,
                  std::shared_ptr<Buffer> value_offsets = NULLPTR, int64_t offset = 0);

  /// \brief Build a DenseUnionArray out of type_ids, value_offsets and children
  ///
  /// Only the bare minimum of validation is done on the offsets and input
  /// types. No field names are supplied: an empty name vector is handed to the
  /// overload that accepts field names.
  ///
  /// \param[in] type_ids An array of logical type ids for the union type
  /// \param[in] value_offsets An array of signed int32 values indicating the
  /// relative offset into the respective child array for the type in a given slot.
  /// The respective offsets for each child value array must be in order / increasing.
  /// \param[in] children Vector of children Arrays containing the data for each type.
  /// \param[in] type_codes Vector of type codes.
  static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
                                             const Array& value_offsets,
                                             ArrayVector children,
                                             std::vector<type_code_t> type_codes) {
    std::vector<std::string> unnamed_fields;
    return Make(type_ids, value_offsets, std::move(children), std::move(unnamed_fields),
                std::move(type_codes));
  }

  /// \brief Build a DenseUnionArray with custom field names out of type_ids,
  /// value_offsets and children
  ///
  /// Only the bare minimum of validation is done on the offsets and input
  /// types.
  ///
  /// \param[in] type_ids An array of logical type ids for the union type
  /// \param[in] value_offsets An array of signed int32 values indicating the
  /// relative offset into the respective child array for the type in a given slot.
  /// The respective offsets for each child value array must be in order / increasing.
  /// \param[in] children Vector of children Arrays containing the data for each type.
  /// \param[in] field_names Vector of strings containing the name of each field.
  /// \param[in] type_codes Vector of type codes.
  static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
                                             const Array& value_offsets,
                                             ArrayVector children,
                                             std::vector<std::string> field_names = {},
                                             std::vector<type_code_t> type_codes = {});

  /// \brief The union type of this array, viewed as a DenseUnionType
  const DenseUnionType* union_type() const {
    const auto* dense_type = internal::checked_cast<const DenseUnionType*>(union_type_);
    return dense_type;
  }

  /// Note that this buffer does not account for any slice offset
  const std::shared_ptr<Buffer>& value_offsets() const {
    const auto& all_buffers = data_->buffers;
    return all_buffers[2];
  }

  /// \brief The value offset stored at index i of raw_value_offsets()
  int32_t value_offset(int64_t i) const { return *(raw_value_offsets_ + i); }

  /// \brief The raw pointer through which value_offset() reads
  const int32_t* raw_value_offsets() const { return raw_value_offsets_; }

 protected:
  void SetData(const std::shared_ptr<ArrayData>& data);

  const int32_t* raw_value_offsets_;
};
/// @}
} // namespace arrow

View File

@@ -0,0 +1,220 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Array accessor types for primitive/C-type-based arrays, such as numbers,
// boolean, and temporal types.
#pragma once
#include <cstdint>
#include <memory>
#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/stl_iterator.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h" // IWYU pragma: export
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \brief Concrete Array class for boolean data
class ARROW_EXPORT BooleanArray : public PrimitiveArray {
 public:
  using TypeClass = BooleanType;
  using IteratorType = stl::ArrayIterator<BooleanArray>;

  explicit BooleanArray(const std::shared_ptr<ArrayData>& data);

  BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data,
               const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
               int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Read the bit for slot i; the array offset is added to i before
  /// the bit lookup.
  bool Value(int64_t i) const {
    const auto* bits = reinterpret_cast<const uint8_t*>(raw_values_);
    const auto bit_position = i + data_->offset;
    return bit_util::GetBit(bits, bit_position);
  }

  /// \brief Same as Value(i)
  bool GetView(int64_t i) const { return Value(i); }

  /// \brief Dereference an iterator positioned at slot i
  std::optional<bool> operator[](int64_t i) const {
    IteratorType at_slot(*this, i);
    return *at_slot;
  }

  /// \brief Return the number of false (0) values among the valid
  /// values. Result is not cached.
  int64_t false_count() const;

  /// \brief Return the number of true (1) values among the valid
  /// values. Result is not cached.
  int64_t true_count() const;

  IteratorType begin() const { return IteratorType(*this); }

  IteratorType end() const {
    const int64_t past_last = length();
    return IteratorType(*this, past_last);
  }

 protected:
  using PrimitiveArray::PrimitiveArray;
};
/// \addtogroup numeric-arrays
///
/// @{
/// \brief Concrete Array class for numeric data with a corresponding C type
///
/// This class is templated on the corresponding DataType subclass for the
/// given data, for example NumericArray<Int8Type> or NumericArray<Date32Type>.
///
/// Note that convenience aliases are available for all accepted types
/// (for example Int8Array for NumericArray<Int8Type>).
template <typename TYPE>
class NumericArray : public PrimitiveArray {
public:
using TypeClass = TYPE;
using value_type = typename TypeClass::c_type;
using IteratorType = stl::ArrayIterator<NumericArray<TYPE>>;
explicit NumericArray(const std::shared_ptr<ArrayData>& data) {
NumericArray::SetData(data);
}
// Only enable this constructor without a type argument for types without additional
// metadata
template <typename T1 = TYPE>
NumericArray(enable_if_parameter_free<T1, int64_t> length,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0) {
NumericArray::SetData(ArrayData::Make(TypeTraits<T1>::type_singleton(), length,
{null_bitmap, data}, null_count, offset));
}
NumericArray(std::shared_ptr<DataType> type, int64_t length,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0) {
NumericArray::SetData(ArrayData::Make(std::move(type), length, {null_bitmap, data},
null_count, offset));
}
const value_type* raw_values() const { return values_; }
value_type Value(int64_t i) const { return values_[i]; }
// For API compatibility with BinaryArray etc.
value_type GetView(int64_t i) const { return values_[i]; }
std::optional<value_type> operator[](int64_t i) const {
return *IteratorType(*this, i);
}
IteratorType begin() const { return IteratorType(*this); }
IteratorType end() const { return IteratorType(*this, length()); }
protected:
NumericArray() : values_(NULLPTR) {}
void SetData(const std::shared_ptr<ArrayData>& data) {
this->PrimitiveArray::SetData(data);
values_ = raw_values_
? (reinterpret_cast<const value_type*>(raw_values_) + data_->offset)
: NULLPTR;
}
const value_type* values_;
};
/// \brief Array of Day and Millisecond values.
class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray {
 public:
  using TypeClass = DayTimeIntervalType;
  using IteratorType = stl::ArrayIterator<DayTimeIntervalArray>;

  explicit DayTimeIntervalArray(const std::shared_ptr<ArrayData>& data);

  DayTimeIntervalArray(const std::shared_ptr<DataType>& type, int64_t length,
                       const std::shared_ptr<Buffer>& data,
                       const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                       int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  DayTimeIntervalArray(int64_t length, const std::shared_ptr<Buffer>& data,
                       const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                       int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Return the value stored in slot i
  TypeClass::DayMilliseconds GetValue(int64_t i) const;

  /// \brief Same as GetValue(i)
  TypeClass::DayMilliseconds Value(int64_t i) const { return GetValue(i); }

  // For compatibility with Take kernel.
  TypeClass::DayMilliseconds GetView(int64_t i) const { return GetValue(i); }

  IteratorType begin() const { return IteratorType(*this); }

  IteratorType end() const { return IteratorType(*this, length()); }

  std::optional<TypeClass::DayMilliseconds> operator[](int64_t i) const {
    return *IteratorType(*this, i);
  }

  /// \brief Size in bytes of one element
  int32_t byte_width() const {
    return static_cast<int32_t>(sizeof(TypeClass::DayMilliseconds));
  }

  /// \brief Pointer to the first byte of this array's values, advanced by the
  /// array offset, or null when there is no values pointer.
  const uint8_t* raw_values() const {
    // Adding a non-zero offset to a null pointer is undefined behaviour, so
    // return null directly (the same guard NumericArray::SetData applies).
    if (raw_values_ == NULLPTR) {
      return NULLPTR;
    }
    return raw_values_ + data_->offset * byte_width();
  }
};
/// \brief Array of Month, Day and nanosecond values.
class ARROW_EXPORT MonthDayNanoIntervalArray : public PrimitiveArray {
 public:
  using TypeClass = MonthDayNanoIntervalType;
  using IteratorType = stl::ArrayIterator<MonthDayNanoIntervalArray>;

  explicit MonthDayNanoIntervalArray(const std::shared_ptr<ArrayData>& data);

  MonthDayNanoIntervalArray(const std::shared_ptr<DataType>& type, int64_t length,
                            const std::shared_ptr<Buffer>& data,
                            const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                            int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  MonthDayNanoIntervalArray(int64_t length, const std::shared_ptr<Buffer>& data,
                            const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                            int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Return the value stored in slot i
  TypeClass::MonthDayNanos GetValue(int64_t i) const;

  /// \brief Same as GetValue(i)
  TypeClass::MonthDayNanos Value(int64_t i) const { return GetValue(i); }

  // For compatibility with Take kernel.
  TypeClass::MonthDayNanos GetView(int64_t i) const { return GetValue(i); }

  IteratorType begin() const { return IteratorType(*this); }

  IteratorType end() const { return IteratorType(*this, length()); }

  std::optional<TypeClass::MonthDayNanos> operator[](int64_t i) const {
    return *IteratorType(*this, i);
  }

  /// \brief Size in bytes of one element
  int32_t byte_width() const {
    return static_cast<int32_t>(sizeof(TypeClass::MonthDayNanos));
  }

  /// \brief Pointer to the first byte of this array's values, advanced by the
  /// array offset, or null when there is no values pointer.
  const uint8_t* raw_values() const {
    // Adding a non-zero offset to a null pointer is undefined behaviour, so
    // return null directly (the same guard NumericArray::SetData applies).
    if (raw_values_ == NULLPTR) {
      return NULLPTR;
    }
    return raw_values_ + data_->offset * byte_width();
  }
};
/// @}
} // namespace arrow

View File

@@ -0,0 +1,133 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Array accessor classes for run-end encoded arrays
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup run-end-encoded-arrays
///
/// @{
// ----------------------------------------------------------------------
// RunEndEncoded
/// \brief Array type for run-end encoded data
///
/// The array exposes two child arrays: the run ends (see run_ends()) and the
/// values of each run (see values()).
class ARROW_EXPORT RunEndEncodedArray : public Array {
private:
// Child arrays handed out by run_ends() and values() respectively.
std::shared_ptr<Array> run_ends_array_;
std::shared_ptr<Array> values_array_;
public:
using TypeClass = RunEndEncodedType;
/// \brief Construct a RunEndEncodedArray from existing ArrayData
explicit RunEndEncodedArray(const std::shared_ptr<ArrayData>& data);
/// \brief Construct a RunEndEncodedArray from all parameters
///
/// The length and offset parameters refer to the dimensions of the logical
/// array which is the array we would get after expanding all the runs into
/// repeated values. As such, length can be much greater than the length of
/// the child run_ends and values arrays.
RunEndEncodedArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Array>& run_ends,
const std::shared_ptr<Array>& values, int64_t offset = 0);
/// \brief Construct a RunEndEncodedArray from all parameters
///
/// The logical_length and logical_offset parameters refer to the dimensions
/// of the logical array which is the array we would get after expanding all
/// the runs into repeated values. As such, logical_length can be much greater
/// than the length of the child run_ends and values arrays.
///
/// \return the new array, or an error carried by the Result
static Result<std::shared_ptr<RunEndEncodedArray>> Make(
const std::shared_ptr<DataType>& type, int64_t logical_length,
const std::shared_ptr<Array>& run_ends, const std::shared_ptr<Array>& values,
int64_t logical_offset = 0);
/// \brief Construct a RunEndEncodedArray from values and run ends arrays
///
/// The data type is automatically inferred from the arguments.
/// The run_ends and values arrays must have the same length.
///
/// \return the new array, or an error carried by the Result
static Result<std::shared_ptr<RunEndEncodedArray>> Make(
int64_t logical_length, const std::shared_ptr<Array>& run_ends,
const std::shared_ptr<Array>& values, int64_t logical_offset = 0);
protected:
void SetData(const std::shared_ptr<ArrayData>& data);
public:
/// \brief Returns an array holding the logical indexes of each run-end
///
/// The physical offset to the array is applied.
const std::shared_ptr<Array>& run_ends() const { return run_ends_array_; }
/// \brief Returns an array holding the values of each run
///
/// The physical offset to the array is applied.
const std::shared_ptr<Array>& values() const { return values_array_; }
/// \brief Returns an array holding the logical indexes of each run end
///
/// If a non-zero logical offset is set, this function allocates a new
/// array and rewrites all the run end values to be relative to the logical
/// offset and cuts the end of the array to the logical length.
///
/// \param[in] pool the MemoryPool passed in by the caller
/// (NOTE(review): presumably used for the allocation described above; confirm
/// against the implementation)
Result<std::shared_ptr<Array>> LogicalRunEnds(MemoryPool* pool) const;
/// \brief Returns an array holding the values of each run
///
/// If a non-zero logical offset is set, this function allocates a new
/// array containing only the values within the logical range.
std::shared_ptr<Array> LogicalValues() const;
/// \brief Find the physical offset of this REE array
///
/// This function uses binary-search, so it has a O(log N) cost.
int64_t FindPhysicalOffset() const;
/// \brief Find the physical length of this REE array
///
/// The physical length of an REE is the number of physical values (and
/// run-ends) necessary to represent the logical range of values from offset
/// to length.
///
/// Avoid calling this function if the physical length can be established in
/// some other way (e.g. when iterating over the runs sequentially until the
/// end). This function uses binary-search, so it has a O(log N) cost.
int64_t FindPhysicalLength() const;
};
/// @}
} // namespace arrow

View File

@@ -0,0 +1,215 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <cstring>
#include <memory>
#include <type_traits>
#include "arrow/array/builder_base.h"
#include "arrow/buffer.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup numeric-builders
///
/// @{
namespace internal {
/// \brief Shared base of the adaptive integer builders.
///
/// Single-element appends are staged in fixed-size pending arrays; once the
/// pending arrays are full, CommitPendingData() (supplied by the subclass) is
/// called.
class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
 public:
  AdaptiveIntBuilderBase(uint8_t start_int_size, MemoryPool* pool,
                         int64_t alignment = kDefaultBufferAlignment);

  /// Delegates to the constructor above with a starting integer size of
  /// sizeof(uint8_t).
  explicit AdaptiveIntBuilderBase(MemoryPool* pool,
                                  int64_t alignment = kDefaultBufferAlignment)
      : AdaptiveIntBuilderBase(sizeof(uint8_t), pool, alignment) {}

  /// \brief Append multiple nulls
  /// \param[in] length the number of nulls to append
  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(CommitPendingData());
    if (ARROW_PREDICT_FALSE(length <= 0)) {
      return Status::OK();
    }
    ARROW_RETURN_NOT_OK(Reserve(length));
    // Zero the value slots that back the new nulls, then mark them as null.
    std::memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
    UnsafeSetNull(length);
    return Status::OK();
  }

  /// \brief Stage a single null in the pending arrays
  Status AppendNull() final {
    const int32_t slot = pending_pos_++;
    pending_data_[slot] = 0;
    pending_valid_[slot] = 0;
    pending_has_nulls_ = true;
    ++length_;
    ++null_count_;
    if (ARROW_PREDICT_TRUE(pending_pos_ < pending_size_)) {
      return Status::OK();
    }
    return CommitPendingData();
  }

  /// \brief Append multiple zero-initialized, non-null values
  /// \param[in] length the number of values to append
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(CommitPendingData());
    if (ARROW_PREDICT_FALSE(length <= 0)) {
      return Status::OK();
    }
    ARROW_RETURN_NOT_OK(Reserve(length));
    // Zero the new value slots, then mark them as valid.
    std::memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
    UnsafeSetNotNull(length);
    return Status::OK();
  }

  /// \brief Stage a single zero, non-null value in the pending arrays
  Status AppendEmptyValue() final {
    const int32_t slot = pending_pos_++;
    pending_data_[slot] = 0;
    pending_valid_[slot] = 1;
    ++length_;
    if (ARROW_PREDICT_TRUE(pending_pos_ < pending_size_)) {
      return Status::OK();
    }
    return CommitPendingData();
  }

  void Reset() override;

  Status Resize(int64_t capacity) override;

 protected:
  /// \brief Stage a single valid value in the pending arrays
  Status AppendInternal(const uint64_t val) {
    const int32_t slot = pending_pos_++;
    pending_data_[slot] = val;
    pending_valid_[slot] = 1;
    ++length_;
    if (ARROW_PREDICT_TRUE(pending_pos_ < pending_size_)) {
      return Status::OK();
    }
    return CommitPendingData();
  }

  /// Called before every bulk append and whenever the pending arrays fill up.
  virtual Status CommitPendingData() = 0;

  // Overload selected when old_type is at least as wide as new_type.
  template <typename new_type, typename old_type>
  typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
  ExpandIntSizeInternal();

  // Overload selected when old_type is narrower than new_type.
  template <typename new_type, typename old_type>
  typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type
  ExpandIntSizeInternal();

  std::shared_ptr<ResizableBuffer> data_;
  uint8_t* raw_data_ = NULLPTR;

  const uint8_t start_int_size_;
  uint8_t int_size_;

  // Capacity of the pending arrays below.
  static constexpr int32_t pending_size_ = 1024;
  uint8_t pending_valid_[pending_size_];
  uint64_t pending_data_[pending_size_];
  // Index of the next free slot in the pending arrays.
  int32_t pending_pos_ = 0;
  bool pending_has_nulls_ = false;
};
} // namespace internal
/// \brief Adaptive integer builder taking unsigned 64-bit input values.
///
/// Built on internal::AdaptiveIntBuilderBase; scalar appends go through
/// AppendInternal().
class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase {
public:
/// \brief Construct with an explicit starting integer size
/// \param[in] start_int_size the starting integer size, forwarded to the base
/// class (NOTE(review): presumably a width in bytes; confirm against the
/// implementation)
/// \param[in] pool the MemoryPool to use
explicit AdaptiveUIntBuilder(uint8_t start_int_size,
MemoryPool* pool = default_memory_pool());
/// \brief Construct with a starting integer size of sizeof(uint8_t)
explicit AdaptiveUIntBuilder(MemoryPool* pool = default_memory_pool())
: AdaptiveUIntBuilder(sizeof(uint8_t), pool) {}
using internal::AdaptiveIntBuilderBase::Reset;
/// Scalar append
Status Append(const uint64_t val) { return AppendInternal(val); }
/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous C array of values
/// \param[in] length the number of values to append
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
/// indicates a valid (non-null) value
/// \return Status
Status AppendValues(const uint64_t* values, int64_t length,
const uint8_t* valid_bytes = NULLPTR);
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
std::shared_ptr<DataType> type() const override;
protected:
Status CommitPendingData() override;
Status ExpandIntSize(uint8_t new_int_size);
Status AppendValuesInternal(const uint64_t* values, int64_t length,
const uint8_t* valid_bytes);
template <typename new_type>
Status ExpandIntSizeN();
};
/// \brief Adaptive integer builder taking signed 64-bit input values.
///
/// Built on internal::AdaptiveIntBuilderBase; scalar appends are cast to
/// uint64_t and go through AppendInternal().
class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase {
public:
/// \brief Construct with an explicit starting integer size
/// \param[in] start_int_size the starting integer size, forwarded to the base
/// class (NOTE(review): presumably a width in bytes; confirm against the
/// implementation)
/// \param[in] pool the MemoryPool to use
/// \param[in] alignment the buffer alignment to use
explicit AdaptiveIntBuilder(uint8_t start_int_size,
MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment);
/// \brief Construct with a starting integer size of sizeof(uint8_t)
explicit AdaptiveIntBuilder(MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment)
: AdaptiveIntBuilder(sizeof(uint8_t), pool, alignment) {}
using internal::AdaptiveIntBuilderBase::Reset;
/// Scalar append
Status Append(const int64_t val) { return AppendInternal(static_cast<uint64_t>(val)); }
/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous C array of values
/// \param[in] length the number of values to append
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
/// indicates a valid (non-null) value
/// \return Status
Status AppendValues(const int64_t* values, int64_t length,
const uint8_t* valid_bytes = NULLPTR);
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
std::shared_ptr<DataType> type() const override;
protected:
Status CommitPendingData() override;
Status ExpandIntSize(uint8_t new_int_size);
Status AppendValuesInternal(const int64_t* values, int64_t length,
const uint8_t* valid_bytes);
template <typename new_type>
Status ExpandIntSizeN();
};
/// @}
} // namespace arrow

View File

@@ -0,0 +1,371 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm> // IWYU pragma: keep
#include <cstdint>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include "arrow/array/array_base.h"
#include "arrow/array/array_primitive.h"
#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace internal {
/// \brief Mixin adding optional-aware append helpers to a builder.
///
/// Builder is the class deriving from this mixin; `this` is cast to Builder*
/// to reach its Append / AppendNull / UnsafeAppend / UnsafeAppendNull methods.
template <class Builder, class V>
class ArrayBuilderExtraOps {
 public:
  /// \brief Append the value held by an optional, or a null when it is empty.
  Status AppendOrNull(const std::optional<V>& value) {
    Builder& builder = *static_cast<Builder*>(this);
    if (!value.has_value()) {
      return builder.AppendNull();
    }
    return builder.Append(*value);
  }

  /// \brief Append the value held by an optional, or a null when it is empty.
  ///
  /// Unsafe methods don't check existing size.
  void UnsafeAppendOrNull(const std::optional<V>& value) {
    Builder& builder = *static_cast<Builder*>(this);
    if (value.has_value()) {
      builder.UnsafeAppend(*value);
    } else {
      builder.UnsafeAppendNull();
    }
  }
};
} // namespace internal
/// \defgroup numeric-builders Concrete builder subclasses for numeric types
/// @{
/// @}
/// \defgroup temporal-builders Concrete builder subclasses for temporal types
/// @{
/// @}
/// \defgroup binary-builders Concrete builder subclasses for binary types
/// @{
/// @}
/// \defgroup nested-builders Concrete builder subclasses for nested types
/// @{
/// @}
/// \defgroup dictionary-builders Concrete builder subclasses for dictionary types
/// @{
/// @}
/// \defgroup run-end-encoded-builders Concrete builder subclasses for run-end encoded
/// arrays
/// @{
/// @}
// Declared `inline` so that every translation unit including this header
// refers to one single definition of each constant, instead of getting its
// own internal-linkage copy.

/// Minimum builder capacity constant: 1 << 5 == 32.
inline constexpr int64_t kMinBuilderCapacity = 1 << 5;
/// Maximum list element count constant: one less than the largest int32_t value.
inline constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1;
/// Base class for all data array builders.
///
/// This class provides a facilities for incrementally building the null bitmap
/// (see Append methods) and as a side effect the current number of slots and
/// the null count.
///
/// \note Users are expected to use builders as one of the concrete types below.
/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use.
class ARROW_EXPORT ArrayBuilder {
public:
explicit ArrayBuilder(MemoryPool* pool, int64_t alignment = kDefaultBufferAlignment)
: pool_(pool), alignment_(alignment), null_bitmap_builder_(pool, alignment) {}
ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder);
virtual ~ArrayBuilder() = default;
/// For nested types. Since the objects are owned by this class instance, we
/// skip shared pointers and just return a raw pointer
ArrayBuilder* child(int i) { return children_[i].get(); }
const std::shared_ptr<ArrayBuilder>& child_builder(int i) const { return children_[i]; }
int num_children() const { return static_cast<int>(children_.size()); }
virtual int64_t length() const { return length_; }
int64_t null_count() const { return null_count_; }
int64_t capacity() const { return capacity_; }
/// \brief Ensure that enough memory has been allocated to fit the indicated
/// number of total elements in the builder, including any that have already
/// been appended. Does not account for reallocations that may be due to
/// variable size data, like binary values. To make space for incremental
/// appends, use Reserve instead.
///
/// \param[in] capacity the minimum number of total array values to
/// accommodate. Must be greater than the current capacity.
/// \return Status
virtual Status Resize(int64_t capacity);
/// \brief Ensure that there is enough space allocated to append the indicated
/// number of elements without any further reallocation. Overallocation is
/// used in order to minimize the impact of incremental Reserve() calls.
/// Note that additional_capacity is relative to the current number of elements
/// rather than to the current capacity, so calls to Reserve() which are not
/// interspersed with addition of new elements may not increase the capacity.
///
/// \param[in] additional_capacity the number of additional array values
/// \return Status
Status Reserve(int64_t additional_capacity) {
auto current_capacity = capacity();
auto min_capacity = length() + additional_capacity;
if (min_capacity <= current_capacity) return Status::OK();
// leave growth factor up to BufferBuilder
auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity);
return Resize(new_capacity);
}
/// Reset the builder.
virtual void Reset();
/// \brief Append a null value to builder
virtual Status AppendNull() = 0;
/// \brief Append a number of null values to builder
virtual Status AppendNulls(int64_t length) = 0;
/// \brief Append a non-null value to builder
///
/// The appended value is an implementation detail, but the corresponding
/// memory slot is guaranteed to be initialized.
/// This method is useful when appending a null value to a parent nested type.
virtual Status AppendEmptyValue() = 0;
/// \brief Append a number of non-null values to builder
///
/// The appended values are an implementation detail, but the corresponding
/// memory slot is guaranteed to be initialized.
/// This method is useful when appending null values to a parent nested type.
virtual Status AppendEmptyValues(int64_t length) = 0;
/// \brief Append a value from a scalar
Status AppendScalar(const Scalar& scalar) { return AppendScalar(scalar, 1); }
virtual Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
virtual Status AppendScalars(const ScalarVector& scalars);
/// \brief Append a range of values from an array.
///
/// The given array must be the same type as the builder.
virtual Status AppendArraySlice(const ArraySpan& ARROW_ARG_UNUSED(array),
int64_t ARROW_ARG_UNUSED(offset),
int64_t ARROW_ARG_UNUSED(length)) {
return Status::NotImplemented("AppendArraySlice for builder for ", *type());
}
/// \brief Return result of builder as an internal generic ArrayData
/// object. Resets builder except for dictionary builder
///
/// \param[out] out the finalized ArrayData object
/// \return Status
virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;
/// \brief Return result of builder as an Array object.
///
/// The builder is reset except for DictionaryBuilder.
///
/// \param[out] out the finalized Array object
/// \return Status
Status Finish(std::shared_ptr<Array>* out);
/// \brief Return result of builder as an Array object.
///
/// The builder is reset except for DictionaryBuilder.
///
/// \return The finalized Array object
Result<std::shared_ptr<Array>> Finish();
/// \brief Return the type of the built Array
virtual std::shared_ptr<DataType> type() const = 0;
protected:
/// Append to null bitmap
Status AppendToBitmap(bool is_valid);
/// Vector append. Treat each zero byte as a null. If valid_bytes is null
/// assume all of length bits are valid.
Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);
/// Uniform append. Append N times the same validity bit.
Status AppendToBitmap(int64_t num_bits, bool value);
/// Set the next length bits to not null (i.e. valid).
Status SetNotNull(int64_t length);
// Unsafe operations (don't check capacity/don't resize)
void UnsafeAppendNull() { UnsafeAppendToBitmap(false); }
// Append to null bitmap, update the length
void UnsafeAppendToBitmap(bool is_valid) {
null_bitmap_builder_.UnsafeAppend(is_valid);
++length_;
if (!is_valid) ++null_count_;
}
// Vector append. Treat each zero byte as a nullzero. If valid_bytes is null
// assume all of length bits are valid.
void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) {
if (valid_bytes == NULLPTR) {
return UnsafeSetNotNull(length);
}
null_bitmap_builder_.UnsafeAppend(valid_bytes, length);
length_ += length;
null_count_ = null_bitmap_builder_.false_count();
}
// Vector append. Copy from a given bitmap. If bitmap is null assume
// all of length bits are valid.
void UnsafeAppendToBitmap(const uint8_t* bitmap, int64_t offset, int64_t length) {
if (bitmap == NULLPTR) {
return UnsafeSetNotNull(length);
}
null_bitmap_builder_.UnsafeAppend(bitmap, offset, length);
length_ += length;
null_count_ = null_bitmap_builder_.false_count();
}
// Append the same validity value a given number of times.
void UnsafeAppendToBitmap(const int64_t num_bits, bool value) {
if (value) {
UnsafeSetNotNull(num_bits);
} else {
UnsafeSetNull(num_bits);
}
}
void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);
// Set the next validity bits to not null (i.e. valid).
void UnsafeSetNotNull(int64_t length);
// Set the next validity bits to null (i.e. invalid).
void UnsafeSetNull(int64_t length);
static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer);
/// \brief Finish to an array of the specified ArrayType
template <typename ArrayType>
Status FinishTyped(std::shared_ptr<ArrayType>* out) {
std::shared_ptr<Array> out_untyped;
ARROW_RETURN_NOT_OK(Finish(&out_untyped));
*out = std::static_pointer_cast<ArrayType>(std::move(out_untyped));
return Status::OK();
}
// Check the requested capacity for validity.
// Fails with Status::Invalid when the capacity is negative or smaller than
// the number of elements already appended; zero is accepted.
Status CheckCapacity(int64_t new_capacity) {
  const bool is_negative = new_capacity < 0;
  if (ARROW_PREDICT_FALSE(is_negative)) {
    return Status::Invalid(
        "Resize capacity must be positive (requested: ", new_capacity, ")");
  }
  const bool would_shrink = new_capacity < length_;
  if (ARROW_PREDICT_FALSE(would_shrink)) {
    return Status::Invalid("Resize cannot downsize (requested: ", new_capacity,
                           ", current length: ", length_, ")");
  }
  return Status::OK();
}
// Check for array type
Status CheckArrayType(const std::shared_ptr<DataType>& expected_type,
const Array& array, const char* message);
Status CheckArrayType(Type::type expected_type, const Array& array,
const char* message);
MemoryPool* pool_;
int64_t alignment_;
TypedBufferBuilder<bool> null_bitmap_builder_;
int64_t null_count_ = 0;
// Array length, so far. Also, the index of the next element to be added
int64_t length_ = 0;
int64_t capacity_ = 0;
// Child value array builders. These are owned by this class
std::vector<std::shared_ptr<ArrayBuilder>> children_;
private:
ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
};
/// \brief Construct an empty ArrayBuilder corresponding to the data
/// type
/// \param[in] pool the MemoryPool to use for allocations
/// \param[in] type the data type to create the builder for
/// \param[out] out the created ArrayBuilder
ARROW_EXPORT
Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
std::unique_ptr<ArrayBuilder>* out);
/// \brief Result-returning overload of MakeBuilder
///
/// Forwards to the out-parameter overload and returns the created builder,
/// or the error it reported.
inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilder(
    const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
  std::unique_ptr<ArrayBuilder> builder;
  ARROW_RETURN_NOT_OK(MakeBuilder(pool, type, &builder));
  return builder;
}
/// \brief Construct an empty ArrayBuilder corresponding to the data
/// type, where any top-level or nested dictionary builders return the
/// exact index type specified by the type.
ARROW_EXPORT
Status MakeBuilderExactIndex(MemoryPool* pool, const std::shared_ptr<DataType>& type,
std::unique_ptr<ArrayBuilder>* out);
/// \brief Result-returning overload of MakeBuilderExactIndex
///
/// Forwards to the out-parameter overload and returns the created builder,
/// or the error it reported.
inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilderExactIndex(
    const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
  std::unique_ptr<ArrayBuilder> builder;
  ARROW_RETURN_NOT_OK(MakeBuilderExactIndex(pool, type, &builder));
  return builder;
}
/// \brief Construct an empty DictionaryBuilder initialized optionally
/// with a preexisting dictionary
/// \param[in] pool the MemoryPool to use for allocations
/// \param[in] type the dictionary type to create the builder for
/// \param[in] dictionary the initial dictionary, if any. May be nullptr
/// \param[out] out the created ArrayBuilder
ARROW_EXPORT
Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
const std::shared_ptr<Array>& dictionary,
std::unique_ptr<ArrayBuilder>* out);
/// \brief Result-returning overload of MakeDictionaryBuilder
///
/// Forwards to the out-parameter overload and returns the created builder,
/// or the error it reported.
inline Result<std::unique_ptr<ArrayBuilder>> MakeDictionaryBuilder(
    const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
    MemoryPool* pool = default_memory_pool()) {
  std::unique_ptr<ArrayBuilder> builder;
  ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, type, dictionary, &builder));
  return builder;
}
} // namespace arrow

View File

@@ -0,0 +1,993 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <numeric>
#include <string>
#include <string_view>
#include <vector>
#include "arrow/array/array_base.h"
#include "arrow/array/array_binary.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/binary_view_util.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup binary-builders
///
/// @{
// ----------------------------------------------------------------------
// Binary and String
/// \brief Common implementation of the offset-based binary builders
///
/// Values are accumulated back-to-back in a data buffer, while an offsets
/// buffer records the data-buffer position at which each element starts.
/// The final offset is only written by FinishInternal().
///
/// \tparam TYPE the binary type class; it provides `offset_type`
template <typename TYPE>
class BaseBinaryBuilder
    : public ArrayBuilder,
      public internal::ArrayBuilderExtraOps<BaseBinaryBuilder<TYPE>, std::string_view> {
 public:
  using TypeClass = TYPE;
  using offset_type = typename TypeClass::offset_type;

  explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool(),
                             int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        offsets_builder_(pool, alignment),
        value_data_builder_(pool, alignment) {}

  /// The `type` argument is ignored; only the pool is used.
  BaseBinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
      : BaseBinaryBuilder(pool) {}

  /// \brief Append one value of `length` bytes
  ///
  /// Every fallible step (slot reservation, overflow check, data reservation)
  /// runs before the offset is written, so a failed call leaves the offsets,
  /// data and validity buffers in step with each other.
  ///
  /// \return CapacityError if the data buffer would exceed memory_limit(),
  /// or any error from the underlying reservations
  Status Append(const uint8_t* value, offset_type length) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    // Safety check for UBSAN.
    if (ARROW_PREDICT_TRUE(length > 0)) {
      ARROW_RETURN_NOT_OK(ValidateOverflow(length));
      ARROW_RETURN_NOT_OK(value_data_builder_.Reserve(length));
      // The offset must be recorded before the bytes are added, as it is the
      // current data length.
      UnsafeAppendNextOffset();
      value_data_builder_.UnsafeAppend(value, length);
    } else {
      UnsafeAppendNextOffset();
    }
    UnsafeAppendToBitmap(true);
    return Status::OK();
  }

  Status Append(const char* value, offset_type length) {
    return Append(reinterpret_cast<const uint8_t*>(value), length);
  }

  /// \brief Append the bytes of a string view
  ///
  /// A view too large for this builder is rejected with CapacityError rather
  /// than having its size silently narrowed to offset_type.
  Status Append(std::string_view value) {
    ARROW_RETURN_NOT_OK(CheckValueLength(value.size()));
    return Append(value.data(), static_cast<offset_type>(value.size()));
  }

  /// Extend the last appended value by appending more data at the end
  ///
  /// Unlike Append, this does not create a new offset.
  Status ExtendCurrent(const uint8_t* value, offset_type length) {
    // Safety check for UBSAN.
    if (ARROW_PREDICT_TRUE(length > 0)) {
      ARROW_RETURN_NOT_OK(ValidateOverflow(length));
      ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
    }
    return Status::OK();
  }

  /// Same as above for a string view; oversized views are rejected with
  /// CapacityError before their size is narrowed to offset_type.
  Status ExtendCurrent(std::string_view value) {
    ARROW_RETURN_NOT_OK(CheckValueLength(value.size()));
    return ExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
                         static_cast<offset_type>(value.size()));
  }

  /// \brief Append `length` nulls; each one gets the current data length as
  /// its offset and adds no bytes
  Status AppendNulls(int64_t length) final {
    const int64_t num_bytes = value_data_builder_.length();
    ARROW_RETURN_NOT_OK(Reserve(length));
    for (int64_t i = 0; i < length; ++i) {
      offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
    }
    UnsafeAppendToBitmap(length, false);
    return Status::OK();
  }

  /// \brief Append a single null
  Status AppendNull() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppendNextOffset();
    UnsafeAppendToBitmap(false);
    return Status::OK();
  }

  /// \brief Append a single valid, zero-length value
  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppendNextOffset();
    UnsafeAppendToBitmap(true);
    return Status::OK();
  }

  /// \brief Append `length` valid, zero-length values
  Status AppendEmptyValues(int64_t length) final {
    const int64_t num_bytes = value_data_builder_.length();
    ARROW_RETURN_NOT_OK(Reserve(length));
    for (int64_t i = 0; i < length; ++i) {
      offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
    }
    UnsafeAppendToBitmap(length, true);
    return Status::OK();
  }

  /// \brief Append without checking capacity
  ///
  /// Offsets and data should have been presized using Reserve() and
  /// ReserveData(), respectively.
  void UnsafeAppend(const uint8_t* value, offset_type length) {
    UnsafeAppendNextOffset();
    value_data_builder_.UnsafeAppend(value, length);
    UnsafeAppendToBitmap(true);
  }

  void UnsafeAppend(const char* value, offset_type length) {
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
  }

  void UnsafeAppend(const std::string& value) {
    UnsafeAppend(value.c_str(), static_cast<offset_type>(value.size()));
  }

  void UnsafeAppend(std::string_view value) {
    UnsafeAppend(value.data(), static_cast<offset_type>(value.size()));
  }

  /// Like ExtendCurrent, but do not check capacity
  void UnsafeExtendCurrent(const uint8_t* value, offset_type length) {
    value_data_builder_.UnsafeAppend(value, length);
  }

  void UnsafeExtendCurrent(std::string_view value) {
    UnsafeExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
                        static_cast<offset_type>(value.size()));
  }

  /// Append a null without checking capacity
  void UnsafeAppendNull() {
    const int64_t num_bytes = value_data_builder_.length();
    offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
    UnsafeAppendToBitmap(false);
  }

  /// Append a valid, zero-length value without checking capacity
  void UnsafeAppendEmptyValue() {
    const int64_t num_bytes = value_data_builder_.length();
    offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
    UnsafeAppendToBitmap(true);
  }

  /// \brief Append a sequence of strings in one shot.
  ///
  /// \param[in] values a vector of strings
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const std::vector<std::string>& values,
                      const uint8_t* valid_bytes = NULLPTR) {
    // Sum of all string sizes, including strings that valid_bytes marks as
    // null; those bytes are reserved but never copied.
    std::size_t total_length = std::accumulate(
        values.begin(), values.end(), 0ULL,
        [](uint64_t sum, const std::string& str) { return sum + str.size(); });
    ARROW_RETURN_NOT_OK(Reserve(values.size()));
    ARROW_RETURN_NOT_OK(ReserveData(total_length));

    if (valid_bytes != NULLPTR) {
      for (std::size_t i = 0; i < values.size(); ++i) {
        UnsafeAppendNextOffset();
        if (valid_bytes[i]) {
          value_data_builder_.UnsafeAppend(
              reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
        }
      }
    } else {
      for (const auto& value : values) {
        UnsafeAppendNextOffset();
        value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()),
                                         value.size());
      }
    }

    UnsafeAppendToBitmap(valid_bytes, values.size());
    return Status::OK();
  }

  /// \brief Append a sequence of nul-terminated strings in one shot.
  ///        If one of the values is NULL, it is processed as a null
  ///        value even if the corresponding valid_bytes entry is 1.
  ///
  /// \param[in] values a contiguous C array of nul-terminated char *
  /// \param[in] length the number of values to append
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const char** values, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR) {
    // First pass: measure every non-NULL string so both buffers can be
    // reserved up front.
    std::size_t total_length = 0;
    std::vector<std::size_t> value_lengths(length);
    bool have_null_value = false;
    for (int64_t i = 0; i < length; ++i) {
      if (values[i] != NULLPTR) {
        auto value_length = strlen(values[i]);
        value_lengths[i] = value_length;
        total_length += value_length;
      } else {
        have_null_value = true;
      }
    }
    ARROW_RETURN_NOT_OK(Reserve(length));
    ARROW_RETURN_NOT_OK(ReserveData(total_length));

    if (valid_bytes) {
      // Validity is copied from valid_bytes in runs. A run is cut whenever an
      // entry is flagged valid but its pointer is NULL: the run before it is
      // flushed and an explicit null bit is appended for that entry.
      int64_t valid_bytes_offset = 0;
      for (int64_t i = 0; i < length; ++i) {
        UnsafeAppendNextOffset();
        if (valid_bytes[i]) {
          if (values[i]) {
            value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
                                             value_lengths[i]);
          } else {
            UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset,
                                 i - valid_bytes_offset);
            UnsafeAppendToBitmap(false);
            valid_bytes_offset = i + 1;
          }
        }
      }
      // Flush whatever remains after the last cut.
      UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
    } else {
      if (have_null_value) {
        // No validity given but some pointers are NULL: derive the validity
        // bytes from the pointers themselves.
        std::vector<uint8_t> valid_vector(length, 0);
        for (int64_t i = 0; i < length; ++i) {
          UnsafeAppendNextOffset();
          if (values[i]) {
            value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
                                             value_lengths[i]);
            valid_vector[i] = 1;
          }
        }
        UnsafeAppendToBitmap(valid_vector.data(), length);
      } else {
        for (int64_t i = 0; i < length; ++i) {
          UnsafeAppendNextOffset();
          value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
                                           value_lengths[i]);
        }
        UnsafeAppendToBitmap(NULLPTR, length);
      }
    }
    return Status::OK();
  }

  /// \brief Append `length` elements of `array`, starting at element `offset`
  ///
  /// Buffers 0, 1 and 2 of the span are read as validity bitmap, offsets and
  /// value data respectively.
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override {
    auto bitmap = array.GetValues<uint8_t>(0, 0);
    auto offsets = array.GetValues<offset_type>(1);
    auto data = array.GetValues<uint8_t>(2, 0);
    // Total number of bytes covered by the slice, nulls included.
    auto total_length = offsets[offset + length] - offsets[offset];
    ARROW_RETURN_NOT_OK(Reserve(length));
    ARROW_RETURN_NOT_OK(ReserveData(total_length));
    for (int64_t i = 0; i < length; i++) {
      if (!bitmap || bit_util::GetBit(bitmap, array.offset + offset + i)) {
        const offset_type start = offsets[offset + i];
        const offset_type end = offsets[offset + i + 1];
        UnsafeAppend(data + start, end - start);
      } else {
        UnsafeAppendNull();
      }
    }
    return Status::OK();
  }

  void Reset() override {
    ArrayBuilder::Reset();
    offsets_builder_.Reset();
    value_data_builder_.Reset();
  }

  /// \brief Check that `new_bytes` more bytes fit below memory_limit()
  /// \return CapacityError if the resulting data length would exceed the limit
  Status ValidateOverflow(int64_t new_bytes) {
    auto new_size = value_data_builder_.length() + new_bytes;
    if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
      return Status::CapacityError("array cannot contain more than ", memory_limit(),
                                   " bytes, have ", new_size);
    } else {
      return Status::OK();
    }
  }

  Status Resize(int64_t capacity) override {
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
    // One more than requested for offsets
    ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
    return ArrayBuilder::Resize(capacity);
  }

  /// \brief Ensures there is enough allocated capacity to append the indicated
  /// number of bytes to the value data buffer without additional allocations
  Status ReserveData(int64_t elements) {
    ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
    return value_data_builder_.Reserve(elements);
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
    // Write final offset (values length)
    ARROW_RETURN_NOT_OK(AppendNextOffset());

    // These buffers' padding zeroed by BufferBuilder
    std::shared_ptr<Buffer> offsets, value_data, null_bitmap;
    ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
    ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
    ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));

    *out = ArrayData::Make(type(), length_, {null_bitmap, offsets, value_data},
                           null_count_, 0);
    Reset();
    return Status::OK();
  }

  /// \return data pointer of the value data builder
  const uint8_t* value_data() const { return value_data_builder_.data(); }
  /// \return size of values buffer so far
  int64_t value_data_length() const { return value_data_builder_.length(); }
  /// \return capacity of values buffer
  int64_t value_data_capacity() const { return value_data_builder_.capacity(); }

  /// \return data pointer of the offsets builder
  const offset_type* offsets_data() const { return offsets_builder_.data(); }

  /// Temporary access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
    const offset_type* offsets = offsets_builder_.data();
    const auto offset = offsets[i];
    if (i == (length_ - 1)) {
      // The end offset of the last element is not written until
      // FinishInternal(); use the current data length instead.
      *out_length = static_cast<offset_type>(value_data_builder_.length()) - offset;
    } else {
      *out_length = offsets[i + 1] - offset;
    }
    return value_data_builder_.data() + offset;
  }

  /// \return the start offset of element `i` in the value data buffer
  offset_type offset(int64_t i) const { return offsets_data()[i]; }

  /// Temporary access to a value.
  ///
  /// This view becomes invalid on the next modifying operation.
  std::string_view GetView(int64_t i) const {
    offset_type value_length;
    const uint8_t* value_data = GetValue(i, &value_length);
    return std::string_view(reinterpret_cast<const char*>(value_data), value_length);
  }

  // Cannot make this a static attribute because of linking issues
  static constexpr int64_t memory_limit() {
    return std::numeric_limits<offset_type>::max() - 1;
  }

 protected:
  TypedBufferBuilder<offset_type> offsets_builder_;
  TypedBufferBuilder<uint8_t> value_data_builder_;

  /// Reject a value whose byte size exceeds memory_limit(). Meant to be called
  /// before a size_t is narrowed to offset_type.
  static Status CheckValueLength(std::size_t size) {
    if (ARROW_PREDICT_FALSE(static_cast<uint64_t>(size) >
                            static_cast<uint64_t>(memory_limit()))) {
      return Status::CapacityError("array cannot contain more than ", memory_limit(),
                                   " bytes, have ", size);
    }
    return Status::OK();
  }

  /// Append the current data length as the next offset (may allocate)
  Status AppendNextOffset() {
    const int64_t num_bytes = value_data_builder_.length();
    return offsets_builder_.Append(static_cast<offset_type>(num_bytes));
  }

  /// Append the current data length as the next offset without checking
  /// capacity
  void UnsafeAppendNextOffset() {
    const int64_t num_bytes = value_data_builder_.length();
    offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
  }
};
/// \class BinaryBuilder
/// \brief Builder class for variable-length binary data
class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder<BinaryType> {
 public:
  using BaseBinaryBuilder::BaseBinaryBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the builder and store the result as a BinaryArray in `out`
  Status Finish(std::shared_ptr<BinaryArray>* out) {
    return FinishTyped(out);
  }

  /// \return the binary() data type
  std::shared_ptr<DataType> type() const override {
    return binary();
  }
};
/// \class StringBuilder
/// \brief Builder class for UTF8 strings
class ARROW_EXPORT StringBuilder : public BinaryBuilder {
 public:
  using BinaryBuilder::BinaryBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the builder and store the result as a StringArray in `out`
  Status Finish(std::shared_ptr<StringArray>* out) {
    return FinishTyped(out);
  }

  /// \return the utf8() data type
  std::shared_ptr<DataType> type() const override {
    return utf8();
  }
};
/// \class LargeBinaryBuilder
/// \brief Builder class for large variable-length binary data
class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder<LargeBinaryType> {
 public:
  using BaseBinaryBuilder::BaseBinaryBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the builder and store the result as a LargeBinaryArray in
  /// `out`
  Status Finish(std::shared_ptr<LargeBinaryArray>* out) {
    return FinishTyped(out);
  }

  /// \return the large_binary() data type
  std::shared_ptr<DataType> type() const override {
    return large_binary();
  }
};
/// \class LargeStringBuilder
/// \brief Builder class for large UTF8 strings
class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
 public:
  using LargeBinaryBuilder::LargeBinaryBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the builder and store the result as a LargeStringArray in
  /// `out`
  Status Finish(std::shared_ptr<LargeStringArray>* out) {
    return FinishTyped(out);
  }

  /// \return the large_utf8() data type
  std::shared_ptr<DataType> type() const override {
    return large_utf8();
  }
};
// ----------------------------------------------------------------------
// BinaryViewBuilder, StringViewBuilder
//
// These builders do not support building raw pointer view arrays.
namespace internal {
// We allocate medium-sized memory chunks and accumulate data in those, which
// may result in some waste if there are many large-ish strings. If a string
// comes along that does not fit into a block, we allocate a new block and
// write into that.
//
// Later we can implement optimizations to continuing filling underfull blocks
// after encountering a large string that required allocating a new block.
class ARROW_EXPORT StringHeapBuilder {
 public:
  static constexpr int64_t kDefaultBlocksize = 32 << 10;  // 32KB

  StringHeapBuilder(MemoryPool* pool, int64_t alignment)
      : pool_(pool), alignment_(alignment) {}

  /// Set the minimum size used for blocks allocated from now on.
  void SetBlockSize(int64_t blocksize) { blocksize_ = blocksize; }

  using c_type = BinaryViewType::c_type;

  /// \brief Produce the view for a value, copying its bytes into the current
  /// block when it is too long to be stored inline
  ///
  /// With Safe=true, space is reserved first and errors are reported through
  /// the Result. With Safe=false no reservation is made: the caller must have
  /// called Reserve() for the value beforehand.
  template <bool Safe>
  std::conditional_t<Safe, Result<c_type>, c_type> Append(const uint8_t* value,
                                                          int64_t length) {
    if (length <= BinaryViewType::kInlineSize) {
      return util::ToInlineBinaryView(value, static_cast<int32_t>(length));
    }

    if constexpr (Safe) {
      ARROW_RETURN_NOT_OK(Reserve(length));
    }

    // The view refers to the most recently allocated block and to the
    // position inside it where the bytes are about to be written.
    auto v = util::ToNonInlineBinaryView(value, static_cast<int32_t>(length),
                                         static_cast<int32_t>(blocks_.size() - 1),
                                         current_offset_);

    memcpy(current_out_buffer_, value, static_cast<size_t>(length));
    current_out_buffer_ += length;
    current_remaining_bytes_ -= length;
    current_offset_ += static_cast<int32_t>(length);
    return v;
  }

  static constexpr int64_t ValueSizeLimit() {
    return std::numeric_limits<int32_t>::max();
  }

  /// \brief Ensure that the indicated number of bytes can be appended via
  /// UnsafeAppend operations without the need to allocate more memory
  ///
  /// If the current block is too small it is sealed and a new block of
  /// max(num_bytes, block size) bytes is allocated. The write cursor is only
  /// moved to the new block once the allocation has succeeded; on failure the
  /// builder reports zero remaining bytes instead of a size it does not own.
  Status Reserve(int64_t num_bytes) {
    if (ARROW_PREDICT_FALSE(num_bytes > ValueSizeLimit())) {
      return Status::CapacityError(
          "BinaryView or StringView elements cannot reference "
          "strings larger than 2GB");
    }
    if (num_bytes > current_remaining_bytes_) {
      ARROW_RETURN_NOT_OK(FinishLastBlock());
      // The previous block (if any) has been trimmed to its used size, so
      // nothing more may be written to it.
      current_remaining_bytes_ = 0;
      current_out_buffer_ = NULLPTR;

      const int64_t new_block_size = num_bytes > blocksize_ ? num_bytes : blocksize_;
      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ResizableBuffer> new_block,
                            AllocateResizableBuffer(new_block_size, alignment_, pool_));
      current_offset_ = 0;
      current_out_buffer_ = new_block->mutable_data();
      current_remaining_bytes_ = new_block_size;
      blocks_.emplace_back(std::move(new_block));
    }
    return Status::OK();
  }

  /// Drop all blocks and rewind the write cursor. The block size is kept.
  void Reset() {
    current_offset_ = 0;
    current_out_buffer_ = NULLPTR;
    current_remaining_bytes_ = 0;
    blocks_.clear();
  }

  int64_t current_remaining_bytes() const { return current_remaining_bytes_; }

  /// \brief Seal the last block and hand over all accumulated blocks
  Result<std::vector<std::shared_ptr<ResizableBuffer>>> Finish() {
    if (!blocks_.empty()) {
      ARROW_RETURN_NOT_OK(FinishLastBlock());
    }
    current_offset_ = 0;
    current_out_buffer_ = NULLPTR;
    current_remaining_bytes_ = 0;
    return std::move(blocks_);
  }

 private:
  /// Trim the unused tail of the last block, if it has one.
  Status FinishLastBlock() {
    if (current_remaining_bytes_ > 0) {
      // Avoid leaking uninitialized bytes from the allocator
      ARROW_RETURN_NOT_OK(
          blocks_.back()->Resize(blocks_.back()->size() - current_remaining_bytes_,
                                 /*shrink_to_fit=*/true));
      blocks_.back()->ZeroPadding();
    }
    return Status::OK();
  }

  MemoryPool* pool_;
  int64_t alignment_;
  int64_t blocksize_ = kDefaultBlocksize;
  std::vector<std::shared_ptr<ResizableBuffer>> blocks_;

  // Write cursor into the last block: byte offset, raw pointer and bytes left
  int32_t current_offset_ = 0;
  uint8_t* current_out_buffer_ = NULLPTR;
  int64_t current_remaining_bytes_ = 0;
};
} // namespace internal
/// \brief Builder for binary_view arrays
///
/// Each element is a fixed-size view struct. Values longer than the inline
/// size have their bytes copied into blocks owned by a StringHeapBuilder.
class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
 public:
  using TypeClass = BinaryViewType;

  // this constructor provided for MakeBuilder compatibility
  BinaryViewBuilder(const std::shared_ptr<DataType>&, MemoryPool* pool);

  explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(),
                             int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        data_builder_(pool, alignment),
        data_heap_builder_(pool, alignment) {}

  /// Set the size for future preallocated data buffers.
  ///
  /// The default size is 32KB, so after each 32KB of string data appended to the builder
  /// a new data buffer will be allocated. Adjust this to a larger value to decrease the
  /// frequency of allocation, or to a smaller value to lower the overhead of each
  /// allocation.
  void SetBlockSize(int64_t blocksize) { data_heap_builder_.SetBlockSize(blocksize); }

  /// The number of bytes which can be appended to this builder without allocating another
  /// data buffer.
  int64_t current_block_bytes_remaining() const {
    return data_heap_builder_.current_remaining_bytes();
  }

  /// \brief Append one value of `length` bytes
  ///
  /// The view is built first, because that step can fail (value larger than
  /// 2GB, or allocation failure). The validity bit and the view are only
  /// recorded afterwards, so a failed call does not change the builder length.
  Status Append(const uint8_t* value, int64_t length) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    ARROW_ASSIGN_OR_RAISE(auto v,
                          data_heap_builder_.Append</*Safe=*/true>(value, length));
    UnsafeAppendToBitmap(true);
    data_builder_.UnsafeAppend(v);
    return Status::OK();
  }

  Status Append(const char* value, int64_t length) {
    return Append(reinterpret_cast<const uint8_t*>(value), length);
  }

  Status Append(std::string_view value) {
    return Append(value.data(), static_cast<int64_t>(value.size()));
  }

  /// \brief Append without checking capacity
  ///
  /// Builder should have been presized using Reserve() and ReserveData(),
  /// respectively, and the value must not be larger than 2GB
  void UnsafeAppend(const uint8_t* value, int64_t length) {
    UnsafeAppendToBitmap(true);
    auto v = data_heap_builder_.Append</*Safe=*/false>(value, length);
    data_builder_.UnsafeAppend(v);
  }

  void UnsafeAppend(const char* value, int64_t length) {
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
  }

  void UnsafeAppend(const std::string& value) {
    UnsafeAppend(value.c_str(), static_cast<int64_t>(value.size()));
  }

  void UnsafeAppend(std::string_view value) {
    UnsafeAppend(value.data(), static_cast<int64_t>(value.size()));
  }

  /// \brief Ensures there is enough allocated available capacity in the
  /// out-of-line data heap to append the indicated number of bytes without
  /// additional allocations
  Status ReserveData(int64_t length);

  /// \brief Append several null elements (each stored as a default view)
  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, BinaryViewType::c_type{});
    UnsafeSetNull(length);
    return Status::OK();
  }

  /// \brief Append a single null element
  Status AppendNull() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(false);
    return Status::OK();
  }

  /// \brief Append an empty element (length-0 inline string)
  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(true);
    return Status::OK();
  }

  /// \brief Append several empty elements
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, BinaryViewType::c_type{});
    UnsafeSetNotNull(length);
    return Status::OK();
  }

  /// Append a null without checking capacity
  void UnsafeAppendNull() {
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(false);
  }

  /// Append an empty value without checking capacity
  void UnsafeAppendEmptyValue() {
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(true);
  }

  /// \brief Append a slice of a BinaryViewArray passed as an ArraySpan. Copies
  /// the underlying out-of-line string memory to avoid memory lifetime issues
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override;

  void Reset() override;

  Status Resize(int64_t capacity) override {
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
    // Never size the builder below the minimum builder capacity.
    capacity = std::max(capacity, kMinBuilderCapacity);
    ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
    return ArrayBuilder::Resize(capacity);
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  std::shared_ptr<DataType> type() const override { return binary_view(); }

 protected:
  // One view struct per element
  TypedBufferBuilder<BinaryViewType::c_type> data_builder_;

  // Accumulates out-of-line data in fixed-size chunks which are then attached
  // to the resulting ArrayData
  internal::StringHeapBuilder data_heap_builder_;
};
/// \brief Builder for utf8_view arrays
///
/// Inherits the constructors and all append logic from BinaryViewBuilder;
/// only the reported data type differs.
class ARROW_EXPORT StringViewBuilder : public BinaryViewBuilder {
 public:
  using BinaryViewBuilder::BinaryViewBuilder;

  /// \return the utf8_view() data type
  std::shared_ptr<DataType> type() const override {
    return utf8_view();
  }
};
// ----------------------------------------------------------------------
// FixedSizeBinaryBuilder
/// \brief Builder for fixed_size_binary arrays
///
/// Every element occupies exactly byte_width() bytes in a single contiguous
/// buffer. The pointer-based Append/UnsafeAppend overloads read byte_width()
/// bytes from the given address without any length check.
class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
 public:
  using TypeClass = FixedSizeBinaryType;

  explicit FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
                                  MemoryPool* pool = default_memory_pool(),
                                  int64_t alignment = kDefaultBufferAlignment);

  /// \brief Append one value, reading byte_width() bytes from `value`
  Status Append(const uint8_t* value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(value);
    return Status::OK();
  }

  Status Append(const char* value) {
    return Append(reinterpret_cast<const uint8_t*>(value));
  }

  Status Append(std::string_view view) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(view);
    return Status::OK();
  }

  Status Append(const std::string& s) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(s);
    return Status::OK();
  }

  Status Append(const Buffer& s) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(s);
    return Status::OK();
  }

  Status Append(const std::shared_ptr<Buffer>& s) { return Append(*s); }

  template <size_t NBYTES>
  Status Append(const std::array<uint8_t, NBYTES>& value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(
        std::string_view(reinterpret_cast<const char*>(value.data()), value.size()));
    return Status::OK();
  }

  Status AppendValues(const uint8_t* data, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* validity,
                      int64_t bitmap_offset);

  Status AppendNull() final;
  Status AppendNulls(int64_t length) final;

  Status AppendEmptyValue() final;
  Status AppendEmptyValues(int64_t length) final;

  /// \brief Append `length` elements of `array`, starting at element `offset`
  ///
  /// Buffer 1 of the span is read as the value bytes and buffer 0 as the
  /// validity bitmap; both are addressed from `array.offset + offset`.
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override {
    return AppendValues(
        array.GetValues<uint8_t>(1, 0) + ((array.offset + offset) * byte_width_), length,
        array.GetValues<uint8_t>(0, 0), array.offset + offset);
  }

  /// Append one value without checking capacity
  void UnsafeAppend(const uint8_t* value) {
    UnsafeAppendToBitmap(true);
    // Zero-width values contribute no bytes; skip the data append entirely.
    if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
      byte_builder_.UnsafeAppend(value, byte_width_);
    }
  }

  void UnsafeAppend(const char* value) {
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value));
  }

  /// The size of the view is only checked in debug builds (NDEBUG not
  /// defined); byte_width() bytes are read from it regardless.
  void UnsafeAppend(std::string_view value) {
#ifndef NDEBUG
    CheckValueSize(static_cast<int64_t>(value.size()));
#endif
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()));
  }

  void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view{s}); }

  void UnsafeAppend(const std::shared_ptr<Buffer>& s) { UnsafeAppend(*s); }

  /// Append a null without checking capacity; its slot is zero-filled
  void UnsafeAppendNull() {
    UnsafeAppendToBitmap(false);
    // Same zero-width guard as UnsafeAppend(const uint8_t*): nothing to write
    // when values have no bytes.
    if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
      byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
    }
  }

  /// \brief Check that `new_bytes` more bytes fit below memory_limit()
  /// \return CapacityError if the resulting data length would exceed the limit
  Status ValidateOverflow(int64_t new_bytes) const {
    auto new_size = byte_builder_.length() + new_bytes;
    if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
      return Status::CapacityError("array cannot contain more than ", memory_limit(),
                                   " bytes, have ", new_size);
    } else {
      return Status::OK();
    }
  }

  /// \brief Ensures there is enough allocated capacity to append the indicated
  /// number of bytes to the value data buffer without additional allocations
  Status ReserveData(int64_t elements) {
    ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
    return byte_builder_.Reserve(elements);
  }

  void Reset() override;
  Status Resize(int64_t capacity) override;
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<FixedSizeBinaryArray>* out) { return FinishTyped(out); }

  /// \return size of values buffer so far
  int64_t value_data_length() const { return byte_builder_.length(); }

  /// \return the number of bytes occupied by each value
  int32_t byte_width() const { return byte_width_; }

  /// Temporary access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  const uint8_t* GetValue(int64_t i) const;

  /// Temporary mutable access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  uint8_t* GetMutableValue(int64_t i) {
    uint8_t* data_ptr = byte_builder_.mutable_data();
    return data_ptr + i * byte_width_;
  }

  /// Temporary access to a value.
  ///
  /// This view becomes invalid on the next modifying operation.
  std::string_view GetView(int64_t i) const;

  /// Advance builder without allocating nor writing any values
  ///
  /// The internal pointer is advanced by `length` values and the same number
  /// of non-null entries are appended to the validity bitmap.
  /// This method assumes that the `length` values were populated directly,
  /// for example using `GetMutableValue`.
  void UnsafeAdvance(int64_t length) {
    byte_builder_.UnsafeAdvance(length * byte_width_);
    UnsafeAppendToBitmap(length, true);
  }

  /// Advance builder without allocating nor writing any values
  ///
  /// The internal pointer is advanced by `length` values and the same number
  /// of validity bits are appended to the validity bitmap.
  /// This method assumes that the `length` values were populated directly,
  /// for example using `GetMutableValue`.
  void UnsafeAdvance(int64_t length, const uint8_t* validity, int64_t valid_bits_offset) {
    byte_builder_.UnsafeAdvance(length * byte_width_);
    UnsafeAppendToBitmap(validity, valid_bits_offset, length);
  }

  static constexpr int64_t memory_limit() {
    return std::numeric_limits<int64_t>::max() - 1;
  }

  std::shared_ptr<DataType> type() const override {
    return fixed_size_binary(byte_width_);
  }

 protected:
  int32_t byte_width_;
  BufferBuilder byte_builder_;

  void CheckValueSize(int64_t size);
};
/// @}
// ----------------------------------------------------------------------
// Chunked builders: build a sequence of BinaryArray or StringArray that are
// limited to a particular size (to the upper limit of 2GB)
namespace internal {
/// \brief Builds a sequence of binary chunks, starting a new chunk whenever
/// the current one would exceed its value-data or element-count limit
class ARROW_EXPORT ChunkedBinaryBuilder {
 public:
  explicit ChunkedBinaryBuilder(int32_t max_chunk_value_length,
                                MemoryPool* pool = default_memory_pool());

  ChunkedBinaryBuilder(int32_t max_chunk_value_length, int32_t max_chunk_length,
                       MemoryPool* pool = default_memory_pool());

  virtual ~ChunkedBinaryBuilder() = default;

  /// \brief Append one value, moving on to a new chunk when a limit is hit
  Status Append(const uint8_t* value, int32_t length) {
    if (ARROW_PREDICT_FALSE(length + builder_->value_data_length() >
                            max_chunk_value_length_)) {
      if (builder_->value_data_length() == 0) {
        // The current item is larger than max_chunk_value_length_;
        // this chunk will be oversize and hold *only* this item
        ARROW_RETURN_NOT_OK(builder_->Append(value, length));
        return NextChunk();
      }
      // The current item would cause builder_->value_data_length() to exceed
      // max_chunk_value_length_, so finish this chunk and append the current
      // item to the next chunk
      ARROW_RETURN_NOT_OK(NextChunk());
      return Append(value, length);
    }

    if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
      // The current item would cause builder_->length() to exceed max_chunk_length_, so
      // finish this chunk and append the current item to the next chunk
      ARROW_RETURN_NOT_OK(NextChunk());
    }

    return builder_->Append(value, length);
  }

  /// \brief Append the bytes of a string view
  ///
  /// A view whose size does not fit in int32_t is rejected with CapacityError
  /// rather than having its size silently narrowed.
  Status Append(std::string_view value) {
    if (ARROW_PREDICT_FALSE(
            value.size() >
            static_cast<std::size_t>(std::numeric_limits<int32_t>::max()))) {
      return Status::CapacityError(
          "ChunkedBinaryBuilder cannot append a value larger than 2GB, have ",
          value.size(), " bytes");
    }
    return Append(reinterpret_cast<const uint8_t*>(value.data()),
                  static_cast<int32_t>(value.size()));
  }

  /// \brief Append a null, starting a new chunk first if the current one
  /// already holds max_chunk_length_ elements
  Status AppendNull() {
    if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
      ARROW_RETURN_NOT_OK(NextChunk());
    }
    return builder_->AppendNull();
  }

  Status Reserve(int64_t values);

  virtual Status Finish(ArrayVector* out);

 protected:
  Status NextChunk();

  // maximum total character data size per chunk
  int64_t max_chunk_value_length_;

  // maximum elements allowed per chunk
  int64_t max_chunk_length_ = kListMaximumElements;

  // when Reserve() would cause builder_ to exceed its max_chunk_length_,
  // add to extra_capacity_ instead and wait to reserve until the next chunk
  int64_t extra_capacity_ = 0;

  // builder for the chunk currently being filled
  std::unique_ptr<BinaryBuilder> builder_;
  // chunks accumulated so far
  std::vector<std::shared_ptr<Array>> chunks_;
};
/// \brief ChunkedBinaryBuilder variant that overrides Finish(); constructors and
/// append methods are inherited unchanged.
class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder {
public:
// Inherit both ChunkedBinaryBuilder constructors.
using ChunkedBinaryBuilder::ChunkedBinaryBuilder;
// Defined out of line; presumably produces string-typed chunks -- confirm
// against the implementation file.
Status Finish(ArrayVector* out) override;
};
} // namespace internal
} // namespace arrow

View File

@@ -0,0 +1,164 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include "arrow/array/array_decimal.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/builder_binary.h"
#include "arrow/array/data.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup numeric-builders
///
/// @{
/// \brief Builder for Decimal32Array, derived from FixedSizeBinaryBuilder.
class ARROW_EXPORT Decimal32Builder : public FixedSizeBinaryBuilder {
public:
using TypeClass = Decimal32Type;
using ValueType = Decimal32;
/// \param type data type of the array to build; presumably a Decimal32Type
/// (the constructor is defined out of line -- confirm there)
/// \param pool memory pool used for allocations
/// \param alignment buffer alignment
explicit Decimal32Builder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment);
// Keep the base-class overloads visible next to the Decimal32 overloads
// declared below, which would otherwise hide them.
using FixedSizeBinaryBuilder::Append;
using FixedSizeBinaryBuilder::AppendValues;
using FixedSizeBinaryBuilder::Reset;
/// \brief Append a single decimal value
Status Append(Decimal32 val);
// Unsafe variants; presumably they skip capacity checks like other
// UnsafeAppend methods -- confirm in the implementation.
void UnsafeAppend(Decimal32 val);
void UnsafeAppend(std::string_view val);
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
/// \brief Typed convenience overload that forwards to FinishTyped
Status Finish(std::shared_ptr<Decimal32Array>* out) { return FinishTyped(out); }
/// \brief Return the decimal type stored in decimal_type_
std::shared_ptr<DataType> type() const override { return decimal_type_; }
protected:
// Concrete decimal type reported by type().
std::shared_ptr<Decimal32Type> decimal_type_;
};
/// \brief Builder for Decimal64Array, derived from FixedSizeBinaryBuilder.
class ARROW_EXPORT Decimal64Builder : public FixedSizeBinaryBuilder {
public:
using TypeClass = Decimal64Type;
using ValueType = Decimal64;
/// \param type data type of the array to build; presumably a Decimal64Type
/// (the constructor is defined out of line -- confirm there)
/// \param pool memory pool used for allocations
/// \param alignment buffer alignment
explicit Decimal64Builder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment);
// Keep the base-class overloads visible next to the Decimal64 overloads
// declared below, which would otherwise hide them.
using FixedSizeBinaryBuilder::Append;
using FixedSizeBinaryBuilder::AppendValues;
using FixedSizeBinaryBuilder::Reset;
/// \brief Append a single decimal value
Status Append(Decimal64 val);
// Unsafe variants; presumably they skip capacity checks like other
// UnsafeAppend methods -- confirm in the implementation.
void UnsafeAppend(Decimal64 val);
void UnsafeAppend(std::string_view val);
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
/// \brief Typed convenience overload that forwards to FinishTyped
Status Finish(std::shared_ptr<Decimal64Array>* out) { return FinishTyped(out); }
/// \brief Return the decimal type stored in decimal_type_
std::shared_ptr<DataType> type() const override { return decimal_type_; }
protected:
// Concrete decimal type reported by type().
std::shared_ptr<Decimal64Type> decimal_type_;
};
/// \brief Builder for Decimal128Array, derived from FixedSizeBinaryBuilder.
class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
public:
using TypeClass = Decimal128Type;
using ValueType = Decimal128;
/// \param type data type of the array to build; presumably a Decimal128Type
/// (the constructor is defined out of line -- confirm there)
/// \param pool memory pool used for allocations
/// \param alignment buffer alignment
explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment);
// Keep the base-class overloads visible next to the Decimal128 overloads
// declared below, which would otherwise hide them.
using FixedSizeBinaryBuilder::Append;
using FixedSizeBinaryBuilder::AppendValues;
using FixedSizeBinaryBuilder::Reset;
/// \brief Append a single decimal value
Status Append(Decimal128 val);
// Unsafe variants; presumably they skip capacity checks like other
// UnsafeAppend methods -- confirm in the implementation.
void UnsafeAppend(Decimal128 val);
void UnsafeAppend(std::string_view val);
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
/// \brief Typed convenience overload that forwards to FinishTyped
Status Finish(std::shared_ptr<Decimal128Array>* out) { return FinishTyped(out); }
/// \brief Return the decimal type stored in decimal_type_
std::shared_ptr<DataType> type() const override { return decimal_type_; }
protected:
// Concrete decimal type reported by type().
std::shared_ptr<Decimal128Type> decimal_type_;
};
/// \brief Builder for Decimal256Array, derived from FixedSizeBinaryBuilder.
///
/// Unlike the narrower decimal builders, values are taken by const reference.
class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder {
public:
using TypeClass = Decimal256Type;
using ValueType = Decimal256;
/// \param type data type of the array to build; presumably a Decimal256Type
/// (the constructor is defined out of line -- confirm there)
/// \param pool memory pool used for allocations
/// \param alignment buffer alignment
explicit Decimal256Builder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment);
// Keep the base-class overloads visible next to the Decimal256 overloads
// declared below, which would otherwise hide them.
using FixedSizeBinaryBuilder::Append;
using FixedSizeBinaryBuilder::AppendValues;
using FixedSizeBinaryBuilder::Reset;
/// \brief Append a single decimal value
Status Append(const Decimal256& val);
// Unsafe variants; presumably they skip capacity checks like other
// UnsafeAppend methods -- confirm in the implementation.
void UnsafeAppend(const Decimal256& val);
void UnsafeAppend(std::string_view val);
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
/// \brief Typed convenience overload that forwards to FinishTyped
Status Finish(std::shared_ptr<Decimal256Array>* out) { return FinishTyped(out); }
/// \brief Return the decimal type stored in decimal_type_
std::shared_ptr<DataType> type() const override { return decimal_type_; }
protected:
// Concrete decimal type reported by type().
std::shared_ptr<Decimal256Type> decimal_type_;
};
using DecimalBuilder = Decimal128Builder;
/// @}
} // namespace arrow

View File

@@ -0,0 +1,728 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm>
#include <cstdint>
#include <memory>
#include <type_traits>
#include "arrow/array/array_base.h"
#include "arrow/array/array_binary.h"
#include "arrow/array/builder_adaptive.h" // IWYU pragma: export
#include "arrow/array/builder_base.h" // IWYU pragma: export
#include "arrow/array/builder_primitive.h" // IWYU pragma: export
#include "arrow/array/data.h"
#include "arrow/array/util.h"
#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_block_counter.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
// ----------------------------------------------------------------------
// Dictionary builder
namespace internal {
/// \brief Maps a logical type T to the C++ value type taken by the dictionary
/// builder's Append (`type`) and to the physical type used to pick a
/// DictionaryMemoTable::GetOrInsert overload (`PhysicalType`).
///
/// Primary template: the value is T's c_type and T is its own physical type.
template <typename T, typename Enable = void>
struct DictionaryValue {
using type = typename T::c_type;
using PhysicalType = T;
};
/// Specialization for variable-length binary types: values are passed as
/// std::string_view, and the physical type follows the offset width (BinaryType
/// for int32_t offsets, LargeBinaryType otherwise).
template <typename T>
struct DictionaryValue<T, enable_if_base_binary<T>> {
  using type = std::string_view;
  using PhysicalType =
      std::conditional_t<std::is_same_v<typename T::offset_type, int32_t>, BinaryType,
                         LargeBinaryType>;
};
/// Specialization for binary-view-like types: values are passed as
/// std::string_view and memoized through the BinaryViewType overload.
template <typename T>
struct DictionaryValue<T, enable_if_binary_view_like<T>> {
using type = std::string_view;
using PhysicalType = BinaryViewType;
};
/// Specialization for fixed-size binary types: values are passed as
/// std::string_view and memoized through the BinaryType overload.
template <typename T>
struct DictionaryValue<T, enable_if_fixed_size_binary<T>> {
using type = std::string_view;
using PhysicalType = BinaryType;
};
/// \brief Memo table that assigns an int32 index to each dictionary value.
///
/// The implementation lives in the private DictionaryMemoTableImpl class; this
/// header only exposes one GetOrInsert overload per supported physical type.
class ARROW_EXPORT DictionaryMemoTable {
public:
// Create a memo table for values of the given `type`.
DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr<DataType>& type);
// Create a memo table from an existing `dictionary` array (presumably
// pre-populated with its values -- confirm in the implementation).
DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr<Array>& dictionary);
~DictionaryMemoTable();
// Export memo entries as array data into `out`; presumably only the entries
// from `start_offset` onwards -- confirm in the implementation.
Status GetArrayData(int64_t start_offset, std::shared_ptr<ArrayData>* out);
/// \brief Insert new memo values
Status InsertValues(const Array& values);
// Current number of entries in the memo table.
int32_t size() const;
/// \brief Look up `value`, inserting it if absent, and write its memo index
/// to `out`
///
/// \tparam T logical value type; DictionaryValue<T> selects the argument type
/// and the physical-type overload that is called
template <typename T>
Status GetOrInsert(typename DictionaryValue<T>::type value, int32_t* out) {
// We want to keep the DictionaryMemoTable implementation private, also we can't
// use extern template classes because of compiler issues (MinGW?). Instead,
// we expose explicit function overloads for each supported physical type.
// The null pointer is never dereferenced; it only drives overload resolution.
const typename DictionaryValue<T>::PhysicalType* physical_type = NULLPTR;
return GetOrInsert(physical_type, value, out);
}
private:
// One overload per supported physical type; the first argument is a tag only.
Status GetOrInsert(const BooleanType*, bool value, int32_t* out);
Status GetOrInsert(const Int8Type*, int8_t value, int32_t* out);
Status GetOrInsert(const Int16Type*, int16_t value, int32_t* out);
Status GetOrInsert(const Int32Type*, int32_t value, int32_t* out);
Status GetOrInsert(const Int64Type*, int64_t value, int32_t* out);
Status GetOrInsert(const UInt8Type*, uint8_t value, int32_t* out);
Status GetOrInsert(const UInt16Type*, uint16_t value, int32_t* out);
Status GetOrInsert(const UInt32Type*, uint32_t value, int32_t* out);
Status GetOrInsert(const UInt64Type*, uint64_t value, int32_t* out);
Status GetOrInsert(const DurationType*, int64_t value, int32_t* out);
Status GetOrInsert(const TimestampType*, int64_t value, int32_t* out);
Status GetOrInsert(const Date32Type*, int32_t value, int32_t* out);
Status GetOrInsert(const Date64Type*, int64_t value, int32_t* out);
Status GetOrInsert(const Time32Type*, int32_t value, int32_t* out);
Status GetOrInsert(const Time64Type*, int64_t value, int32_t* out);
Status GetOrInsert(const MonthDayNanoIntervalType*,
MonthDayNanoIntervalType::MonthDayNanos value, int32_t* out);
Status GetOrInsert(const DayTimeIntervalType*,
DayTimeIntervalType::DayMilliseconds value, int32_t* out);
Status GetOrInsert(const MonthIntervalType*, int32_t value, int32_t* out);
Status GetOrInsert(const FloatType*, float value, int32_t* out);
Status GetOrInsert(const DoubleType*, double value, int32_t* out);
Status GetOrInsert(const BinaryType*, std::string_view value, int32_t* out);
Status GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out);
Status GetOrInsert(const BinaryViewType*, std::string_view value, int32_t* out);
// Forward-declared implementation class, defined outside this header.
class DictionaryMemoTableImpl;
std::unique_ptr<DictionaryMemoTableImpl> impl_;
};
} // namespace internal
/// \addtogroup dictionary-builders
///
/// @{
namespace internal {
/// \brief Array builder for creating a dictionary-encoded DictionaryArray from
/// dense values
///
/// Unlike other builders, dictionary builder does not completely
/// reset the state on Finish calls.
///
/// \tparam BuilderType builder used for the dictionary indices
/// \tparam T logical type of the dictionary values
template <typename BuilderType, typename T>
class DictionaryBuilderBase : public ArrayBuilder {
 public:
  using TypeClass = DictionaryType;
  using Value = typename DictionaryValue<T>::type;

  // WARNING: the type given below is the value type, not the DictionaryType.
  // The DictionaryType is instantiated on the Finish() call.
  template <typename B = BuilderType, typename T1 = T>
  DictionaryBuilderBase(uint8_t start_int_size,
                        enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value &&
                                        !is_fixed_size_binary_type<T1>::value,
                                    const std::shared_ptr<DataType>&>
                            value_type,
                        MemoryPool* pool = default_memory_pool(),
                        int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        memo_table_(std::make_unique<internal::DictionaryMemoTable>(pool, value_type)),
        delta_offset_(0),
        byte_width_(-1),
        indices_builder_(start_int_size, pool, alignment),
        value_type_(value_type) {}

  template <typename T1 = T>
  explicit DictionaryBuilderBase(
      enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
          value_type,
      MemoryPool* pool = default_memory_pool(),
      int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        memo_table_(std::make_unique<internal::DictionaryMemoTable>(pool, value_type)),
        delta_offset_(0),
        byte_width_(-1),
        indices_builder_(pool, alignment),
        value_type_(value_type) {}

  template <typename T1 = T>
  explicit DictionaryBuilderBase(
      const std::shared_ptr<DataType>& index_type,
      enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
          value_type,
      MemoryPool* pool = default_memory_pool(),
      int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        memo_table_(std::make_unique<internal::DictionaryMemoTable>(pool, value_type)),
        delta_offset_(0),
        byte_width_(-1),
        indices_builder_(index_type, pool, alignment),
        value_type_(value_type) {}

  // Fixed-size binary value types: same constructors as above, but the byte
  // width is taken from the value type.
  template <typename B = BuilderType, typename T1 = T>
  DictionaryBuilderBase(uint8_t start_int_size,
                        enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value &&
                                        is_fixed_size_binary_type<T1>::value,
                                    const std::shared_ptr<DataType>&>
                            value_type,
                        MemoryPool* pool = default_memory_pool(),
                        int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        memo_table_(std::make_unique<internal::DictionaryMemoTable>(pool, value_type)),
        delta_offset_(0),
        byte_width_(static_cast<const T1&>(*value_type).byte_width()),
        indices_builder_(start_int_size, pool, alignment),
        value_type_(value_type) {}

  template <typename T1 = T>
  explicit DictionaryBuilderBase(
      enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
      MemoryPool* pool = default_memory_pool(),
      int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        memo_table_(std::make_unique<internal::DictionaryMemoTable>(pool, value_type)),
        delta_offset_(0),
        byte_width_(static_cast<const T1&>(*value_type).byte_width()),
        indices_builder_(pool, alignment),
        value_type_(value_type) {}

  template <typename T1 = T>
  explicit DictionaryBuilderBase(
      const std::shared_ptr<DataType>& index_type,
      enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
      MemoryPool* pool = default_memory_pool(),
      int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        memo_table_(std::make_unique<internal::DictionaryMemoTable>(pool, value_type)),
        delta_offset_(0),
        byte_width_(static_cast<const T1&>(*value_type).byte_width()),
        indices_builder_(index_type, pool, alignment),
        value_type_(value_type) {}

  // Parameter-free value types: the value type is the type singleton.
  template <typename T1 = T>
  explicit DictionaryBuilderBase(
      enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool())
      : DictionaryBuilderBase<BuilderType, T1>(TypeTraits<T1>::type_singleton(), pool) {}

  // This constructor doesn't check for errors. Use InsertMemoValues instead.
  // NOTE(review): byte_width_ stays at -1 here even when T is a fixed-size
  // binary type, so the pointer-only Append overloads look unsafe on a builder
  // created this way -- confirm before relying on them.
  explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
                                 MemoryPool* pool = default_memory_pool(),
                                 int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        memo_table_(std::make_unique<internal::DictionaryMemoTable>(pool, dictionary)),
        delta_offset_(0),
        byte_width_(-1),
        indices_builder_(pool, alignment),
        value_type_(dictionary->type()) {}

  ~DictionaryBuilderBase() override = default;

  /// \brief The current number of entries in the dictionary
  int64_t dictionary_length() const { return memo_table_->size(); }

  /// \brief The value byte width (for FixedSizeBinaryType)
  template <typename T1 = T>
  enable_if_fixed_size_binary<T1, int32_t> byte_width() const {
    return byte_width_;
  }

  /// \brief Append a scalar value
  Status Append(Value value) {
    ARROW_RETURN_NOT_OK(Reserve(1));

    int32_t memo_index;
    ARROW_RETURN_NOT_OK(memo_table_->GetOrInsert<T>(value, &memo_index));
    ARROW_RETURN_NOT_OK(indices_builder_.Append(memo_index));
    length_ += 1;

    return Status::OK();
  }

  /// \brief Append a fixed-width string (only for FixedSizeBinaryType)
  template <typename T1 = T>
  enable_if_fixed_size_binary<T1, Status> Append(const uint8_t* value) {
    return Append(std::string_view(reinterpret_cast<const char*>(value), byte_width_));
  }

  /// \brief Append a fixed-width string (only for FixedSizeBinaryType)
  template <typename T1 = T>
  enable_if_fixed_size_binary<T1, Status> Append(const char* value) {
    return Append(std::string_view(value, byte_width_));
  }

  /// \brief Append a string (only for binary types)
  template <typename T1 = T>
  enable_if_binary_like<T1, Status> Append(const uint8_t* value, int32_t length) {
    return Append(reinterpret_cast<const char*>(value), length);
  }

  /// \brief Append a string (only for binary types)
  template <typename T1 = T>
  enable_if_binary_like<T1, Status> Append(const char* value, int32_t length) {
    return Append(std::string_view(value, length));
  }

  /// \brief Append a string (only for string types)
  template <typename T1 = T>
  enable_if_string_like<T1, Status> Append(const char* value, int32_t length) {
    return Append(std::string_view(value, length));
  }

  /// \brief Append a decimal (only for Decimal32/64/128/256 Type)
  template <typename T1 = T, typename CType = typename TypeTraits<T1>::CType>
  enable_if_decimal<T1, Status> Append(const CType& value) {
    auto bytes = value.ToBytes();
    return Append(bytes.data(), static_cast<int32_t>(bytes.size()));
  }

  /// \brief Append a scalar null value
  Status AppendNull() final {
    // Update the counters only once the index append has succeeded, so a
    // failure leaves them unchanged.
    ARROW_RETURN_NOT_OK(indices_builder_.AppendNull());
    length_ += 1;
    null_count_ += 1;
    return Status::OK();
  }

  /// \brief Append `length` null values
  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(indices_builder_.AppendNulls(length));
    length_ += length;
    null_count_ += length;
    return Status::OK();
  }

  /// \brief Append one empty (non-null) index slot
  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(indices_builder_.AppendEmptyValue());
    length_ += 1;
    return Status::OK();
  }

  /// \brief Append `length` empty (non-null) index slots
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(indices_builder_.AppendEmptyValues(length));
    length_ += length;
    return Status::OK();
  }

  /// \brief Append the value of a dictionary scalar `n_repeats` times
  ///
  /// An invalid scalar appends `n_repeats` nulls. Returns TypeError if the
  /// scalar's index type is not an 8- to 64-bit signed or unsigned integer.
  Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override {
    if (!scalar.is_valid) return AppendNulls(n_repeats);

    const auto& dict_ty = internal::checked_cast<const DictionaryType&>(*scalar.type);
    const DictionaryScalar& dict_scalar =
        internal::checked_cast<const DictionaryScalar&>(scalar);
    const auto& dict = internal::checked_cast<const typename TypeTraits<T>::ArrayType&>(
        *dict_scalar.value.dictionary);
    ARROW_RETURN_NOT_OK(Reserve(n_repeats));
    // Every case returns, so nothing follows the switch.
    switch (dict_ty.index_type()->id()) {
      case Type::UINT8:
        return AppendScalarImpl<UInt8Type>(dict, *dict_scalar.value.index, n_repeats);
      case Type::INT8:
        return AppendScalarImpl<Int8Type>(dict, *dict_scalar.value.index, n_repeats);
      case Type::UINT16:
        return AppendScalarImpl<UInt16Type>(dict, *dict_scalar.value.index, n_repeats);
      case Type::INT16:
        return AppendScalarImpl<Int16Type>(dict, *dict_scalar.value.index, n_repeats);
      case Type::UINT32:
        return AppendScalarImpl<UInt32Type>(dict, *dict_scalar.value.index, n_repeats);
      case Type::INT32:
        return AppendScalarImpl<Int32Type>(dict, *dict_scalar.value.index, n_repeats);
      case Type::UINT64:
        return AppendScalarImpl<UInt64Type>(dict, *dict_scalar.value.index, n_repeats);
      case Type::INT64:
        return AppendScalarImpl<Int64Type>(dict, *dict_scalar.value.index, n_repeats);
      default:
        return Status::TypeError("Invalid index type: ", dict_ty);
    }
  }

  /// \brief Append each scalar of `scalars` once, stopping at the first error
  Status AppendScalars(const ScalarVector& scalars) override {
    for (const auto& scalar : scalars) {
      ARROW_RETURN_NOT_OK(AppendScalar(*scalar, /*n_repeats=*/1));
    }
    return Status::OK();
  }

  /// \brief Append `length` elements of a dictionary-encoded `array`, starting
  /// at `offset`, by decoding each index and re-inserting the value
  Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final {
    // Visit the indices and insert the unpacked values.
    const auto& dict_ty = internal::checked_cast<const DictionaryType&>(*array.type);
    // See if possible to avoid using ToArrayData here
    const typename TypeTraits<T>::ArrayType dict(array.dictionary().ToArrayData());
    ARROW_RETURN_NOT_OK(Reserve(length));
    // Every case returns, so nothing follows the switch.
    switch (dict_ty.index_type()->id()) {
      case Type::UINT8:
        return AppendArraySliceImpl<uint8_t>(dict, array, offset, length);
      case Type::INT8:
        return AppendArraySliceImpl<int8_t>(dict, array, offset, length);
      case Type::UINT16:
        return AppendArraySliceImpl<uint16_t>(dict, array, offset, length);
      case Type::INT16:
        return AppendArraySliceImpl<int16_t>(dict, array, offset, length);
      case Type::UINT32:
        return AppendArraySliceImpl<uint32_t>(dict, array, offset, length);
      case Type::INT32:
        return AppendArraySliceImpl<int32_t>(dict, array, offset, length);
      case Type::UINT64:
        return AppendArraySliceImpl<uint64_t>(dict, array, offset, length);
      case Type::INT64:
        return AppendArraySliceImpl<int64_t>(dict, array, offset, length);
      default:
        return Status::TypeError("Invalid index type: ", dict_ty);
    }
  }

  /// \brief Insert values into the dictionary's memo, but do not append any
  /// indices. Can be used to initialize a new builder with known dictionary
  /// values
  /// \param[in] values dictionary values to add to memo. Type must match
  /// builder type
  Status InsertMemoValues(const Array& values) {
    return memo_table_->InsertValues(values);
  }

  /// \brief Append a whole dense array to the builder
  template <typename T1 = T>
  enable_if_t<!is_fixed_size_binary_type<T1>::value, Status> AppendArray(
      const Array& array) {
    using ArrayType = typename TypeTraits<T>::ArrayType;

#ifndef NDEBUG
    ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
        value_type_, array, "Wrong value type of array to be appended"));
#endif

    const auto& concrete_array = static_cast<const ArrayType&>(array);
    for (int64_t i = 0; i < array.length(); i++) {
      if (array.IsNull(i)) {
        ARROW_RETURN_NOT_OK(AppendNull());
      } else {
        ARROW_RETURN_NOT_OK(Append(concrete_array.GetView(i)));
      }
    }
    return Status::OK();
  }

  /// \brief Append a whole dense fixed-size binary array to the builder
  template <typename T1 = T>
  enable_if_fixed_size_binary<T1, Status> AppendArray(const Array& array) {
#ifndef NDEBUG
    ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
        value_type_, array, "Wrong value type of array to be appended"));
#endif

    const auto& concrete_array = static_cast<const FixedSizeBinaryArray&>(array);
    for (int64_t i = 0; i < array.length(); i++) {
      if (array.IsNull(i)) {
        ARROW_RETURN_NOT_OK(AppendNull());
      } else {
        ARROW_RETURN_NOT_OK(Append(concrete_array.GetValue(i)));
      }
    }
    return Status::OK();
  }

  void Reset() override {
    // Perform a partial reset. Call ResetFull to also reset the accumulated
    // dictionary values
    ArrayBuilder::Reset();
    indices_builder_.Reset();
  }

  /// \brief Reset and also clear accumulated dictionary values in memo table
  void ResetFull() {
    Reset();
    memo_table_ = std::make_unique<internal::DictionaryMemoTable>(pool_, value_type_);
  }

  /// \brief Resize the indices builder to at least `capacity` slots and mirror
  /// the resulting capacity in this builder
  Status Resize(int64_t capacity) override {
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
    capacity = std::max(capacity, kMinBuilderCapacity);
    ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity));
    capacity_ = indices_builder_.capacity();
    return Status::OK();
  }

  /// \brief Return dictionary indices and a delta dictionary since the last
  /// time that Finish or FinishDelta were called, and reset state of builder
  /// (except the memo table)
  Status FinishDelta(std::shared_ptr<Array>* out_indices,
                     std::shared_ptr<Array>* out_delta) {
    std::shared_ptr<ArrayData> indices_data;
    std::shared_ptr<ArrayData> delta_data;
    ARROW_RETURN_NOT_OK(FinishWithDictOffset(delta_offset_, &indices_data, &delta_data));
    *out_indices = MakeArray(indices_data);
    *out_delta = MakeArray(delta_data);
    return Status::OK();
  }

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }

  /// \brief The dictionary type built from the current index type and the
  /// value type
  std::shared_ptr<DataType> type() const override {
    return ::arrow::dictionary(indices_builder_.type(), value_type_);
  }

 protected:
  // Decode `length` indices of width c_type starting at `offset` and append the
  // referenced dictionary values; null indices and null dictionary entries are
  // appended as nulls.
  template <typename c_type>
  Status AppendArraySliceImpl(const typename TypeTraits<T>::ArrayType& dict,
                              const ArraySpan& array, int64_t offset, int64_t length) {
    const c_type* values = array.GetValues<c_type>(1) + offset;
    return VisitBitBlocks(
        array.buffers[0].data, array.offset + offset, length,
        [&](const int64_t position) {
          const int64_t index = static_cast<int64_t>(values[position]);
          if (dict.IsValid(index)) {
            return Append(dict.GetView(index));
          }
          return AppendNull();
        },
        [&]() { return AppendNull(); });
  }

  // Append the dictionary value referenced by `index_scalar` `n_repeats` times,
  // or `n_repeats` nulls if the index or the referenced entry is null.
  template <typename IndexType>
  Status AppendScalarImpl(const typename TypeTraits<T>::ArrayType& dict,
                          const Scalar& index_scalar, int64_t n_repeats) {
    using ScalarType = typename TypeTraits<IndexType>::ScalarType;
    const auto index = internal::checked_cast<const ScalarType&>(index_scalar).value;
    if (index_scalar.is_valid && dict.IsValid(index)) {
      const auto& value = dict.GetView(index);
      for (int64_t i = 0; i < n_repeats; i++) {
        ARROW_RETURN_NOT_OK(Append(value));
      }
      return Status::OK();
    }
    return AppendNulls(n_repeats);
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
    std::shared_ptr<ArrayData> dictionary;
    ARROW_RETURN_NOT_OK(FinishWithDictOffset(/*offset=*/0, out, &dictionary));

    // Set type of array data to the right dictionary type
    (*out)->type = type();
    (*out)->dictionary = dictionary;
    return Status::OK();
  }

  // Finish the indices, export the memo entries from `dict_offset` onwards and
  // remember the memo size for the next FinishDelta call.
  Status FinishWithDictOffset(int64_t dict_offset,
                              std::shared_ptr<ArrayData>* out_indices,
                              std::shared_ptr<ArrayData>* out_dictionary) {
    // Finalize indices array
    ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out_indices));

    // Generate dictionary array from hash table contents
    ARROW_RETURN_NOT_OK(memo_table_->GetArrayData(dict_offset, out_dictionary));
    delta_offset_ = memo_table_->size();

    // Update internals for further uses of this DictionaryBuilder
    ArrayBuilder::Reset();
    return Status::OK();
  }

  std::unique_ptr<DictionaryMemoTable> memo_table_;

  // The size of the dictionary memo at last invocation of Finish, to use in
  // FinishDelta for computing dictionary deltas
  int32_t delta_offset_;

  // Only used for FixedSizeBinaryType
  int32_t byte_width_;

  BuilderType indices_builder_;
  std::shared_ptr<DataType> value_type_;
};
/// \brief Specialization for NullType values: there is no memo table, every
/// appended element is a null index and the dictionary is an empty NullArray.
template <typename BuilderType>
class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {
 public:
  // The value_type / dictionary arguments of the constructors below are
  // accepted for signature parity with the primary template and are not used.
  template <typename B = BuilderType>
  DictionaryBuilderBase(
      enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
          start_int_size,
      const std::shared_ptr<DataType>& value_type,
      MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}

  explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& value_type,
                                 MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), indices_builder_(pool) {}

  explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& index_type,
                                 const std::shared_ptr<DataType>& value_type,
                                 MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), indices_builder_(index_type, pool) {}

  template <typename B = BuilderType>
  explicit DictionaryBuilderBase(
      enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
          start_int_size,
      MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}

  explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), indices_builder_(pool) {}

  explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
                                 MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), indices_builder_(pool) {}

  /// \brief Append a scalar null value
  Status AppendNull() final {
    // Update the counters only once the index append has succeeded, so a
    // failure leaves them unchanged.
    ARROW_RETURN_NOT_OK(indices_builder_.AppendNull());
    length_ += 1;
    null_count_ += 1;
    return Status::OK();
  }

  /// \brief Append `length` null values
  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(indices_builder_.AppendNulls(length));
    length_ += length;
    null_count_ += length;
    return Status::OK();
  }

  /// \brief Append one empty (non-null) index slot
  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(indices_builder_.AppendEmptyValue());
    length_ += 1;
    return Status::OK();
  }

  /// \brief Append `length` empty (non-null) index slots
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(indices_builder_.AppendEmptyValues(length));
    length_ += length;
    return Status::OK();
  }

  /// \brief Append a whole dense array to the builder
  ///
  /// One null is appended per element of `array`; the element values are not
  /// read.
  Status AppendArray(const Array& array) {
#ifndef NDEBUG
    ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
        Type::NA, array, "Wrong value type of array to be appended"));
#endif
    for (int64_t i = 0; i < array.length(); i++) {
      ARROW_RETURN_NOT_OK(AppendNull());
    }
    return Status::OK();
  }

  /// \brief Resize the indices builder to at least `capacity` slots and mirror
  /// the resulting capacity in this builder
  Status Resize(int64_t capacity) override {
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
    capacity = std::max(capacity, kMinBuilderCapacity);
    ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity));
    capacity_ = indices_builder_.capacity();
    return Status::OK();
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
    ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out));
    (*out)->type = dictionary((*out)->type, null());
    (*out)->dictionary = NullArray(0).data();
    // Clear this builder's own counters as the primary template does when
    // finishing; otherwise they keep counting elements already handed out.
    ArrayBuilder::Reset();
    return Status::OK();
  }

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }

  /// \brief The dictionary type built from the current index type and null()
  std::shared_ptr<DataType> type() const override {
    return ::arrow::dictionary(indices_builder_.type(), null());
  }

 protected:
  BuilderType indices_builder_;
};
} // namespace internal
/// \brief DictionaryArray builder backed by an AdaptiveIntBuilder, so the
/// resulting index type is the smallest one that can accommodate the
/// dictionary indices
template <typename T>
class DictionaryBuilder : public internal::DictionaryBuilderBase<AdaptiveIntBuilder, T> {
 public:
  using BASE = internal::DictionaryBuilderBase<AdaptiveIntBuilder, T>;
  using BASE::BASE;

  /// \brief Append dictionary indices directly without modifying memo
  ///
  /// NOTE: Experimental API
  Status AppendIndices(const int64_t* values, int64_t length,
                       const uint8_t* valid_bytes = NULLPTR) {
    auto& indices = this->indices_builder_;
    const int64_t nulls_before = indices.null_count();
    ARROW_RETURN_NOT_OK(indices.AppendValues(values, length, valid_bytes));
    // Nulls introduced by this call, as counted by the indices builder.
    const int64_t nulls_added = indices.null_count() - nulls_before;
    this->capacity_ = indices.capacity();
    this->length_ += length;
    this->null_count_ += nulls_added;
    return Status::OK();
  }
};
/// \brief DictionaryArray builder whose dictionary indices are always int32,
/// so that data cast to dictionary form has a consistent index type, e.g. for
/// creating a ChunkedArray
template <typename T>
class Dictionary32Builder : public internal::DictionaryBuilderBase<Int32Builder, T> {
 public:
  using BASE = internal::DictionaryBuilderBase<Int32Builder, T>;
  using BASE::BASE;

  /// \brief Append dictionary indices directly without modifying memo
  ///
  /// NOTE: Experimental API
  Status AppendIndices(const int32_t* values, int64_t length,
                       const uint8_t* valid_bytes = NULLPTR) {
    auto& indices = this->indices_builder_;
    const int64_t nulls_before = indices.null_count();
    ARROW_RETURN_NOT_OK(indices.AppendValues(values, length, valid_bytes));
    // Nulls introduced by this call, as counted by the indices builder.
    const int64_t nulls_added = indices.null_count() - nulls_before;
    this->capacity_ = indices.capacity();
    this->length_ += length;
    this->null_count_ += nulls_added;
    return Status::OK();
  }
};
// ----------------------------------------------------------------------
// Binary / Unicode builders
// (compatibility aliases; those used to be derived classes with additional
// Append() overloads, but they have been folded into DictionaryBuilderBase)
using BinaryDictionaryBuilder = DictionaryBuilder<BinaryType>;
using StringDictionaryBuilder = DictionaryBuilder<StringType>;
using BinaryDictionary32Builder = Dictionary32Builder<BinaryType>;
using StringDictionary32Builder = Dictionary32Builder<StringType>;
/// @}
} // namespace arrow

View File

@@ -0,0 +1,836 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include "arrow/array/array_nested.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup nested-builders
///
/// @{
// ----------------------------------------------------------------------
// VarLengthListLikeBuilder
template <typename TYPE>
class VarLengthListLikeBuilder : public ArrayBuilder {
 public:
  using TypeClass = TYPE;
  using offset_type = typename TypeClass::offset_type;

  /// \brief Construct a builder for an explicitly supplied list-like type
  ///
  /// The child values are accumulated through value_builder, while this
  /// builder accumulates the offsets and the null bitmap. Only field 0 of
  /// `type` is retained (with its type cleared); the value type is taken from
  /// value_builder each time type() is called.
  VarLengthListLikeBuilder(MemoryPool* pool,
                           const std::shared_ptr<ArrayBuilder>& value_builder,
                           const std::shared_ptr<DataType>& type,
                           int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        offsets_builder_(pool, alignment),
        value_builder_(value_builder),
        value_field_(type->field(0)->WithType(NULLPTR)) {}

  /// \brief Construct a builder whose type is TYPE wrapping the current type
  /// of value_builder
  VarLengthListLikeBuilder(MemoryPool* pool,
                           const std::shared_ptr<ArrayBuilder>& value_builder,
                           int64_t alignment = kDefaultBufferAlignment)
      : VarLengthListLikeBuilder(pool, value_builder,
                                 std::make_shared<TYPE>(value_builder->type()),
                                 alignment) {}

  ~VarLengthListLikeBuilder() override = default;

  /// \brief Resize the offsets buffer and the base builder to `capacity` slots
  ///
  /// Returns CapacityError when `capacity` is larger than maximum_elements().
  Status Resize(int64_t capacity) override {
    if (ARROW_PREDICT_FALSE(capacity > maximum_elements())) {
      return Status::CapacityError(type_name(),
                                   " array cannot reserve space for more than ",
                                   maximum_elements(), " got ", capacity);
    }
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));

    // List arrays need one offset more than the number of slots; list-views
    // need exactly one offset per slot.
    int64_t offsets_capacity = capacity;
    if constexpr (!is_list_view(TYPE::type_id)) {
      offsets_capacity = capacity + 1;
    }
    ARROW_RETURN_NOT_OK(offsets_builder_.Resize(offsets_capacity));
    return ArrayBuilder::Resize(capacity);
  }

  /// \brief Reset this builder, its offsets and the value builder
  void Reset() override {
    ArrayBuilder::Reset();
    offsets_builder_.Reset();
    value_builder_->Reset();
  }

  /// \brief Begin a new variable-length list slot
  ///
  /// Call this before appending the slot's elements to the value builder.
  /// Anything appended to the value builder before the very first call does
  /// not belong to any list value.
  ///
  /// Once called, list_length elements SHOULD be appended to the value
  /// builder. When that contract is broken, the outcome depends on the
  /// concrete builder and SHOULD NOT be relied upon unless the caller knows it
  /// is building a [Large]List or a [Large]ListView array:
  ///
  /// - [Large]List: the slot length ends up being the number of elements
  ///   appended to the value builder before the next Append* or Finish call.
  /// - [Large]ListView: the slot length is exactly list_length. If Append* is
  ///   called again before list_length elements were appended, the slot
  ///   shares elements with the following slots, or the resulting array is
  ///   invalid because the value builder holds too few elements.
  ///
  /// When only [Large]List arrays are built and compatibility with
  /// [Large]ListView is not needed, `BaseListBuilder::Append(bool is_valid)`
  /// is the simpler API.
  ///
  /// \pre if is_valid is false, list_length MUST be 0
  /// \param is_valid Whether the new list slot is valid
  /// \param list_length The number of elements in the list
  Status Append(bool is_valid, int64_t list_length) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    assert(is_valid || list_length == 0);
    UnsafeAppendToBitmap(is_valid);
    // The slot starts where the value builder currently ends.
    const int64_t slot_start = value_builder_->length();
    UnsafeAppendDimensions(/*offset=*/slot_start, /*size=*/list_length);
    return Status::OK();
  }

  /// \brief Append a null list slot
  Status AppendNull() final {
    // A null slot is appended with list_length=0.
    //
    // For [Large]List arrays, values appended to the value builder before the
    // next Append* or Finish call will make this slot longer; that is
    // acceptable since list arrays allow non-empty null slots.
    //
    // For [Large]ListView arrays the slot length simply stays zero.
    return Append(false, 0);
  }

  /// \brief Append `length` null list slots
  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeAppendToBitmap(length, false);
    UnsafeAppendEmptyDimensions(/*num_values=*/length);
    return Status::OK();
  }

  /// \brief Append one valid, empty list slot
  ///
  /// \post Call Append* or Finish again before appending to the value builder
  /// so that the slot stays empty
  Status AppendEmptyValue() final { return Append(true, 0); }

  /// \brief Append `length` valid, empty list slots
  ///
  /// \post Call Append* or Finish again before appending to the value builder
  /// so that the last slot stays empty
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeAppendToBitmap(length, true);
    UnsafeAppendEmptyDimensions(/*num_values=*/length);
    return Status::OK();
  }

  /// \brief Vector append
  ///
  /// List-array builders infer sizes from the offsets, and BaseListBuilder<T>
  /// offers an overload without sizes. This virtual overload exists so calls
  /// can be dispatched to both list-array and list-view-array builders (the
  /// latter need the sizes).
  ///
  /// \param offsets The offsets of the variable-length lists
  /// \param sizes The sizes of the variable-length lists
  /// \param length The number of offsets, sizes, and validity bits to append
  /// \param valid_bytes If passed, valid_bytes is of equal length to values,
  /// and any zero byte will be considered as a null for that slot
  virtual Status AppendValues(const offset_type* offsets, const offset_type* sizes,
                              int64_t length, const uint8_t* valid_bytes) = 0;

  /// \brief Append rows [offset, offset + length) of `array`
  ///
  /// For every row the validity bit and the slot dimensions are appended
  /// here; for valid rows the matching child values are forwarded to the
  /// value builder. Null rows are appended with size 0 and contribute no
  /// child values.
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override {
    static_assert(internal::may_have_validity_bitmap(TYPE::type_id));
    const offset_type* offsets = array.GetValues<offset_type>(1);
    // Only list-views carry a sizes buffer (buffer index 2).
    [[maybe_unused]] const offset_type* sizes = NULLPTR;
    if constexpr (is_list_view(TYPE::type_id)) {
      sizes = array.GetValues<offset_type>(2);
    }
    const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;

    ARROW_RETURN_NOT_OK(Reserve(length));
    const int64_t end_row = offset + length;
    for (int64_t row = offset; row < end_row; ++row) {
      const bool is_valid =
          validity == NULLPTR || bit_util::GetBit(validity, array.offset + row);
      int64_t size = 0;
      if (is_valid) {
        if constexpr (is_list_view(TYPE::type_id)) {
          size = sizes[row];
        } else {
          size = offsets[row + 1] - offsets[row];
        }
      }
      UnsafeAppendToBitmap(is_valid);
      UnsafeAppendDimensions(/*offset=*/value_builder_->length(), size);
      if (!is_valid) {
        continue;
      }
      ARROW_RETURN_NOT_OK(
          value_builder_->AppendArraySlice(array.child_data[0], offsets[row], size));
    }
    return Status::OK();
  }

  /// \brief Check that `new_elements` more child values can still be addressed
  ///
  /// Returns CapacityError if the value builder's length plus new_elements
  /// would exceed maximum_elements().
  Status ValidateOverflow(int64_t new_elements) const {
    const int64_t total_elements = value_builder_->length() + new_elements;
    if (ARROW_PREDICT_FALSE(total_elements > maximum_elements())) {
      return Status::CapacityError(type_name(), " array cannot contain more than ",
                                   maximum_elements(), " elements, have ", new_elements);
    }
    return Status::OK();
  }

  /// \brief The builder that accumulates the child values
  ArrayBuilder* value_builder() const { return value_builder_.get(); }

  // Cannot make this a static attribute because of linking issues
  /// \brief Largest element count accepted by Resize() and ValidateOverflow()
  static constexpr int64_t maximum_elements() {
    return std::numeric_limits<offset_type>::max() - 1;
  }

  /// \brief The type being built: the stored value field combined with the
  /// value builder's current type
  std::shared_ptr<DataType> type() const override {
    return std::make_shared<TYPE>(value_field_->WithType(value_builder_->type()));
  }

 private:
  /// \brief Name used as the prefix of capacity error messages
  static constexpr const char* type_name() {
    return is_list_view(TYPE::type_id) ? "ListView" : "List";
  }

 protected:
  /// \brief Append dimensions for num_values empty list slots.
  ///
  /// Every slot gets the current length of the value builder as its offset.
  /// ListViewBuilder overrides this to also append the sizes.
  virtual void UnsafeAppendEmptyDimensions(int64_t num_values) {
    const int64_t values_length = value_builder_->length();
    for (int64_t remaining = num_values; remaining > 0; --remaining) {
      offsets_builder_.UnsafeAppend(static_cast<offset_type>(values_length));
    }
  }

  /// \brief Append dimensions for a single list slot.
  ///
  /// Only the offset is recorded here; the size is ignored.
  /// ListViewBuilder overrides this to also append the size.
  virtual void UnsafeAppendDimensions(int64_t offset, int64_t ARROW_ARG_UNUSED(size)) {
    offsets_builder_.UnsafeAppend(static_cast<offset_type>(offset));
  }

  TypedBufferBuilder<offset_type> offsets_builder_;
  std::shared_ptr<ArrayBuilder> value_builder_;
  std::shared_ptr<Field> value_field_;
};
// ----------------------------------------------------------------------
// ListBuilder / LargeListBuilder
template <typename TYPE>
class BaseListBuilder : public VarLengthListLikeBuilder<TYPE> {
 private:
  using BASE = VarLengthListLikeBuilder<TYPE>;

 public:
  using TypeClass = TYPE;
  using offset_type = typename BASE::offset_type;

  using BASE::BASE;
  using BASE::Append;

  ~BaseListBuilder() override = default;

  /// \brief Begin a new variable-length list slot
  ///
  /// Call this before starting to append the slot's elements to the value
  /// builder.
  Status Append(bool is_valid = true) {
    // BASE::Append(bool, int64_t) does not use its length argument when a
    // list array is being built, hence 0 is passed.
    return BASE::Append(is_valid, /*list_length=*/0);
  }

  /// \brief Vector append
  ///
  /// If passed, valid_bytes is of equal length to values, and any zero byte
  /// will be considered as a null for that slot
  Status AppendValues(const offset_type* offsets, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR) {
    ARROW_RETURN_NOT_OK(this->Reserve(length));
    this->UnsafeAppendToBitmap(valid_bytes, length);
    this->offsets_builder_.UnsafeAppend(offsets, length);
    return Status::OK();
  }

  /// \brief Vector append taking sizes in addition to offsets
  ///
  /// The sizes are never stored. In debug builds (NDEBUG not defined) they
  /// are only cross-checked against the offsets.
  Status AppendValues(const offset_type* offsets, const offset_type* sizes,
                      int64_t length, const uint8_t* valid_bytes) final {
    // The offsets are trusted. The first length-1 sizes must agree with them,
    // which partially guards against a caller handing over sizes that would
    // be fine for a list-view but cannot describe a list (whose offsets must
    // be non-decreasing).
    //
    // CAUTION: the last size element (`sizes[length - 1]`) is not
    // validated and could be inconsistent with the offsets given in a
    // subsequent call to AppendValues.
#ifndef NDEBUG
    if (sizes != NULLPTR) {
      for (int64_t i = 0; i + 1 < length; ++i) {
        // Null slots are exempt from the check.
        const bool slot_is_valid = valid_bytes == NULLPTR || valid_bytes[i];
        if (!slot_is_valid) {
          continue;
        }
        if (ARROW_PREDICT_FALSE(offsets[i] != offsets[i + 1] - sizes[i])) {
          return Status::Invalid(
              "BaseListBuilder: sizes are inconsistent with offsets provided");
        }
      }
    }
#endif
    return AppendValues(offsets, length, valid_bytes);
  }

  /// \brief Vector append taking sizes, with every slot considered valid
  Status AppendValues(const offset_type* offsets, const offset_type* sizes,
                      int64_t length) {
    return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR);
  }

  /// \brief Append the value builder's current length as the next offset
  ///
  /// Fails if ValidateOverflow(0) reports that the value builder already
  /// holds too many elements.
  Status AppendNextOffset() {
    ARROW_RETURN_NOT_OK(this->ValidateOverflow(0));
    const int64_t values_length = this->value_builder_->length();
    return this->offsets_builder_.Append(static_cast<offset_type>(values_length));
  }

  /// \brief Finish the offsets, bitmap and child values into `out`, then
  /// reset this builder
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
    // Close the last slot with a final offset.
    ARROW_RETURN_NOT_OK(AppendNextOffset());

    // Offset padding zeroed by BufferBuilder
    std::shared_ptr<Buffer> offsets_buffer;
    std::shared_ptr<Buffer> validity_buffer;
    ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets_buffer));
    ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&validity_buffer));

    if (this->value_builder_->length() == 0) {
      // Try to make sure we get a non-null values buffer (ARROW-2744)
      ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0));
    }

    std::shared_ptr<ArrayData> child_values;
    ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&child_values));

    *out = ArrayData::Make(this->type(), this->length_,
                           {std::move(validity_buffer), std::move(offsets_buffer)},
                           {std::move(child_values)}, this->null_count_);
    this->Reset();
    return Status::OK();
  }
};
/// \class ListBuilder
/// \brief Builder class for variable-length list array value types
///
/// To use this class, call the Append function to start each distinct list
/// value and then append that list's values to the child array builder, or
/// use the bulk API to append a sequence of offsets and null values.
///
/// A note on types. Per arrow/type.h all types in the C++ implementation are
/// logical, so even though this class always builds a list array, it can
/// represent multiple different logical types. If no logical type is provided
/// at construction time, the class defaults to List<T> where T is taken from the
/// value_builder/values that the object is constructed with.
class ARROW_EXPORT ListBuilder : public BaseListBuilder<ListType> {
public:
// Inherit all BaseListBuilder constructors.
using BaseListBuilder::BaseListBuilder;
/// \cond FALSE
// Keep the ArrayBuilder::Finish overloads visible; the typed Finish()
// declared below would otherwise hide them.
using ArrayBuilder::Finish;
/// \endcond
/// \brief Finish the array under construction as a ListArray
///
/// Forwards to FinishTyped().
/// \param[out] out receives the finished array
Status Finish(std::shared_ptr<ListArray>* out) { return FinishTyped(out); }
};
/// \class LargeListBuilder
/// \brief Builder class for large variable-length list array value types
///
/// Like ListBuilder, but to create large list arrays (with 64-bit offsets).
class ARROW_EXPORT LargeListBuilder : public BaseListBuilder<LargeListType> {
public:
// Inherit all BaseListBuilder constructors.
using BaseListBuilder::BaseListBuilder;
/// \cond FALSE
// Keep the ArrayBuilder::Finish overloads visible; the typed Finish()
// declared below would otherwise hide them.
using ArrayBuilder::Finish;
/// \endcond
/// \brief Finish the array under construction as a LargeListArray
///
/// Forwards to FinishTyped().
/// \param[out] out receives the finished array
Status Finish(std::shared_ptr<LargeListArray>* out) { return FinishTyped(out); }
};
// ----------------------------------------------------------------------
// ListViewBuilder / LargeListViewBuilder
template <typename TYPE>
class BaseListViewBuilder : public VarLengthListLikeBuilder<TYPE> {
 private:
  using BASE = VarLengthListLikeBuilder<TYPE>;

 public:
  using TypeClass = TYPE;
  using offset_type = typename BASE::offset_type;

  using BASE::BASE;

  ~BaseListViewBuilder() override = default;

  /// \brief Resize the base builder and then the sizes buffer to `capacity`
  Status Resize(int64_t capacity) override {
    ARROW_RETURN_NOT_OK(BASE::Resize(capacity));
    return sizes_builder_.Resize(capacity);
  }

  /// \brief Reset the base builder and the sizes buffer
  void Reset() override {
    BASE::Reset();
    sizes_builder_.Reset();
  }

  /// \brief Vector append
  ///
  /// If passed, valid_bytes is of equal length to values, and any zero byte
  /// will be considered as a null for that slot
  Status AppendValues(const offset_type* offsets, const offset_type* sizes,
                      int64_t length, const uint8_t* valid_bytes) final {
    ARROW_RETURN_NOT_OK(this->Reserve(length));
    this->UnsafeAppendToBitmap(valid_bytes, length);
    this->offsets_builder_.UnsafeAppend(offsets, length);
    this->sizes_builder_.UnsafeAppend(sizes, length);
    return Status::OK();
  }

  /// \brief Vector append with every slot considered valid
  Status AppendValues(const offset_type* offsets, const offset_type* sizes,
                      int64_t length) {
    return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR);
  }

  /// \brief Finish the bitmap, offsets, sizes and child values into `out`,
  /// then reset this builder
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
    // Offset and sizes padding zeroed by BufferBuilder
    std::shared_ptr<Buffer> validity_buffer;
    std::shared_ptr<Buffer> offsets_buffer;
    std::shared_ptr<Buffer> sizes_buffer;
    ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&validity_buffer));
    ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets_buffer));
    ARROW_RETURN_NOT_OK(this->sizes_builder_.Finish(&sizes_buffer));

    if (this->value_builder_->length() == 0) {
      // Try to make sure we get a non-null values buffer (ARROW-2744)
      ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0));
    }

    std::shared_ptr<ArrayData> child_values;
    ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&child_values));

    *out = ArrayData::Make(this->type(), this->length_,
                           {std::move(validity_buffer), std::move(offsets_buffer),
                            std::move(sizes_buffer)},
                           {std::move(child_values)}, this->null_count_);
    this->Reset();
    return Status::OK();
  }

 protected:
  /// \brief Append offset 0 and size 0 for each of num_values empty slots
  void UnsafeAppendEmptyDimensions(int64_t num_values) override {
    // The offsets and sizes buffers are independent of each other, so both
    // are filled in a single pass.
    for (int64_t slot = 0; slot < num_values; ++slot) {
      this->offsets_builder_.UnsafeAppend(0);
      this->sizes_builder_.UnsafeAppend(0);
    }
  }

  /// \brief Append the offset and the size of a single list slot
  void UnsafeAppendDimensions(int64_t offset, int64_t size) override {
    const auto slot_offset = static_cast<offset_type>(offset);
    const auto slot_size = static_cast<offset_type>(size);
    this->offsets_builder_.UnsafeAppend(slot_offset);
    this->sizes_builder_.UnsafeAppend(slot_size);
  }

 private:
  TypedBufferBuilder<offset_type> sizes_builder_;
};
/// \brief Concrete BaseListViewBuilder instantiated for ListViewType
class ARROW_EXPORT ListViewBuilder final : public BaseListViewBuilder<ListViewType> {
public:
// Inherit all BaseListViewBuilder constructors.
using BaseListViewBuilder::BaseListViewBuilder;
/// \cond FALSE
// Keep the ArrayBuilder::Finish overloads visible; the typed Finish()
// declared below would otherwise hide them.
using ArrayBuilder::Finish;
/// \endcond
/// \brief Finish the array under construction as a ListViewArray
///
/// Forwards to FinishTyped().
/// \param[out] out receives the finished array
Status Finish(std::shared_ptr<ListViewArray>* out) { return FinishTyped(out); }
};
/// \brief Concrete BaseListViewBuilder instantiated for LargeListViewType
class ARROW_EXPORT LargeListViewBuilder final
: public BaseListViewBuilder<LargeListViewType> {
public:
// Inherit all BaseListViewBuilder constructors.
using BaseListViewBuilder::BaseListViewBuilder;
/// \cond FALSE
// Keep the ArrayBuilder::Finish overloads visible; the typed Finish()
// declared below would otherwise hide them.
using ArrayBuilder::Finish;
/// \endcond
/// \brief Finish the array under construction as a LargeListViewArray
///
/// Forwards to FinishTyped().
/// \param[out] out receives the finished array
Status Finish(std::shared_ptr<LargeListViewArray>* out) { return FinishTyped(out); }
};
// ----------------------------------------------------------------------
// Map builder
/// \class MapBuilder
/// \brief Builder class for arrays of variable-size maps
///
/// Call Append to start each distinct map and then append its entries through
/// the key and item array builders; alternatively use the bulk API to append
/// a sequence of offsets and null maps.
///
/// Neither the uniqueness nor the ordering of keys is validated.
class ARROW_EXPORT MapBuilder : public ArrayBuilder {
 public:
  /// \brief Construct with an explicitly supplied type for the built array
  ///
  /// If key_builder or item_builder has indeterminate type, so does this
  /// builder.
  MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
             const std::shared_ptr<ArrayBuilder>& item_builder,
             const std::shared_ptr<DataType>& type);

  /// \brief Construct with the built array's type inferred
  ///
  /// If key_builder or item_builder has indeterminate type, so does this
  /// builder.
  MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
             const std::shared_ptr<ArrayBuilder>& item_builder, bool keys_sorted = false);

  /// \brief Construct from a single builder and an explicit type
  ///
  /// NOTE(review): how the key builder is obtained is decided by the
  /// out-of-line definition; confirm there.
  MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& item_builder,
             const std::shared_ptr<DataType>& type);

  Status Resize(int64_t capacity) override;
  void Reset() override;
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the array under construction as a MapArray
  Status Finish(std::shared_ptr<MapArray>* out) { return FinishTyped(out); }

  /// \brief Vector append
  ///
  /// If passed, valid_bytes is of equal length to values, and any zero byte
  /// will be considered as a null for that slot
  Status AppendValues(const int32_t* offsets, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  /// \brief Begin a new variable-length map slot
  ///
  /// Call this before starting to append elements to the key and item
  /// builders
  Status Append();

  Status AppendNull() final;

  Status AppendNulls(int64_t length) final;

  Status AppendEmptyValue() final;

  Status AppendEmptyValues(int64_t length) final;

  /// \brief Append rows [offset, offset + length) of the map array `array`
  ///
  /// A null row becomes a null map. A valid row starts a new map whose
  /// entries are copied from the key and item children of the source
  /// entries struct.
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override {
    static_assert(internal::may_have_validity_bitmap(MapType::type_id));
    const auto* offsets = array.GetValues<int32_t>(1);
    const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
    const ArraySpan& entries = array.child_data[0];

    const int64_t end_row = offset + length;
    for (int64_t row = offset; row < end_row; ++row) {
      const bool is_null =
          validity != NULLPTR && !bit_util::GetBit(validity, array.offset + row);
      if (is_null) {
        ARROW_RETURN_NOT_OK(AppendNull());
        continue;
      }
      ARROW_RETURN_NOT_OK(Append());
      const int64_t slot_length = offsets[row + 1] - offsets[row];
      // The keys and items are addressed directly, so the offset of the
      // enclosing entries struct has to be added to the map/list offset.
      const int64_t key_value_offset = entries.offset + offsets[row];
      ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice(entries.child_data[0],
                                                         key_value_offset, slot_length));
      ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice(entries.child_data[1],
                                                          key_value_offset, slot_length));
    }
    return Status::OK();
  }

  /// \brief Get builder to append keys.
  ///
  /// Each key appended through this builder should be followed by appending
  /// an item or a null value with item_builder().
  ArrayBuilder* key_builder() const { return key_builder_.get(); }

  /// \brief Get builder to append items
  ///
  /// Each item appended through this builder should have been preceded by
  /// appending a key with key_builder().
  ArrayBuilder* item_builder() const { return item_builder_.get(); }

  /// \brief Get builder to add Map entries as struct values.
  ///
  /// This is an alternative to key_builder()/item_builder() that lets the
  /// Map be built as a list of struct values.
  ArrayBuilder* value_builder() const { return list_builder_->value_builder(); }

  /// \brief The map type being built, reconstructed from the stored field
  /// names and the current key/item builder types
  std::shared_ptr<DataType> type() const override {
    // Key and Item builder may update types, but they don't contain the field names,
    // so we need to reconstruct the type. (See ARROW-13735.)
    auto entries_type =
        struct_({field(key_name_, key_builder_->type(), false),
                 field(item_name_, item_builder_->type(), item_nullable_)});
    return std::make_shared<MapType>(
        field(entries_name_, std::move(entries_type), false), keys_sorted_);
  }

  /// \brief Forward the overflow check to the underlying list builder
  Status ValidateOverflow(int64_t new_elements) {
    return list_builder_->ValidateOverflow(new_elements);
  }

 protected:
  inline Status AdjustStructBuilderLength();

 protected:
  bool keys_sorted_ = false;
  bool item_nullable_ = false;
  std::string entries_name_;
  std::string key_name_;
  std::string item_name_;
  std::shared_ptr<ListBuilder> list_builder_;
  std::shared_ptr<ArrayBuilder> key_builder_;
  std::shared_ptr<ArrayBuilder> item_builder_;
};
// ----------------------------------------------------------------------
// FixedSizeList builder
/// \class FixedSizeListBuilder
/// \brief Builder class for fixed-length list array value types
class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
 public:
  using TypeClass = FixedSizeListType;

  /// \brief Construct from a value builder and a list size
  ///
  /// No DataType is passed to this overload. If value_builder has
  /// indeterminate type, so does this builder.
  FixedSizeListBuilder(MemoryPool* pool,
                       const std::shared_ptr<ArrayBuilder>& value_builder,
                       int32_t list_size);

  /// \brief Construct from a value builder and an explicitly supplied type
  ///
  /// If value_builder has indeterminate type, so does this builder.
  FixedSizeListBuilder(MemoryPool* pool,
                       const std::shared_ptr<ArrayBuilder>& value_builder,
                       const std::shared_ptr<DataType>& type);

  Status Resize(int64_t capacity) override;
  void Reset() override;
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the array under construction as a FixedSizeListArray
  Status Finish(std::shared_ptr<FixedSizeListArray>* out) { return FinishTyped(out); }

  /// \brief Append a valid fixed length list.
  ///
  /// Only the validity bitmap is affected; the child values have to be
  /// appended through the child array builder.
  Status Append();

  /// \brief Vector append
  ///
  /// If passed, valid_bytes will be read and any zero byte
  /// will cause the corresponding slot to be null
  ///
  /// Only the validity bitmap is affected; the child values have to be
  /// appended through the child array builder. This includes appending nulls
  /// for null lists.
  /// XXX this restriction is confusing, should this method be omitted?
  Status AppendValues(int64_t length, const uint8_t* valid_bytes = NULLPTR);

  /// \brief Append a null fixed length list.
  ///
  /// The appropriate number of nulls is appended to the child array builder
  /// automatically.
  Status AppendNull() final;

  /// \brief Append length null fixed length lists.
  ///
  /// The appropriate number of nulls is appended to the child array builder
  /// automatically.
  Status AppendNulls(int64_t length) final;

  Status ValidateOverflow(int64_t new_elements);

  Status AppendEmptyValue() final;

  Status AppendEmptyValues(int64_t length) final;

  /// \brief Append rows [offset, offset + length) of `array`
  ///
  /// For a valid row, list_size_ child values are copied into the value
  /// builder and then a valid slot is appended. A null row is appended
  /// through AppendNull().
  Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final {
    const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
    const int64_t end_row = offset + length;
    for (int64_t row = offset; row < end_row; ++row) {
      // Row position including the offset of the source array.
      const int64_t source_row = array.offset + row;
      const bool is_null = validity != NULLPTR && !bit_util::GetBit(validity, source_row);
      if (is_null) {
        ARROW_RETURN_NOT_OK(AppendNull());
        continue;
      }
      ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(
          array.child_data[0], list_size_ * source_row, list_size_));
      ARROW_RETURN_NOT_OK(Append());
    }
    return Status::OK();
  }

  /// \brief The builder that accumulates the child values
  ArrayBuilder* value_builder() const { return value_builder_.get(); }

  /// \brief The fixed-size list type being built, using the value builder's
  /// current type
  std::shared_ptr<DataType> type() const override {
    return fixed_size_list(value_field_->WithType(value_builder_->type()), list_size_);
  }

  // Cannot make this a static attribute because of linking issues
  /// \brief One less than the largest value of FixedSizeListType::offset_type
  static constexpr int64_t maximum_elements() {
    return std::numeric_limits<FixedSizeListType::offset_type>::max() - 1;
  }

 protected:
  std::shared_ptr<Field> value_field_;
  const int32_t list_size_;
  std::shared_ptr<ArrayBuilder> value_builder_;
};
// ----------------------------------------------------------------------
// Struct
// ---------------------------------------------------------------------------------
// StructArray builder
/// The Append, Resize and Reserve methods act on the StructBuilder itself.
/// Make sure the corresponding methods of all child builders are called
/// consistently so that the data structure stays consistent.
class ARROW_EXPORT StructBuilder : public ArrayBuilder {
 public:
  /// If any of field_builders has indeterminate type, this builder will also
  StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
                std::vector<std::shared_ptr<ArrayBuilder>> field_builders);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the array under construction as a StructArray
  Status Finish(std::shared_ptr<StructArray>* out) { return FinishTyped(out); }

  /// \brief Append `length` validity entries to the struct's null bitmap
  ///
  /// The null bitmap has the same length as every child field, and any zero
  /// byte in valid_bytes marks a null struct slot. Only the bitmap is
  /// touched here: users must insert the data themselves through the append
  /// or advance methods of the child builders.
  Status AppendValues(int64_t length, const uint8_t* valid_bytes) {
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeAppendToBitmap(valid_bytes, length);
    return Status::OK();
  }

  /// \brief Append one element to the struct
  ///
  /// The Append method of every child builder must be called independently
  /// to keep the data structure consistent.
  Status Append(bool is_valid = true) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppendToBitmap(is_valid);
    return Status::OK();
  }

  /// \brief Append a null value. Automatically appends an empty value to each child
  /// builder.
  Status AppendNull() final {
    for (const auto& child : children_) {
      ARROW_RETURN_NOT_OK(child->AppendEmptyValue());
    }
    return Append(false);
  }

  /// \brief Append multiple null values. Automatically appends empty values to each
  /// child builder.
  Status AppendNulls(int64_t length) final {
    for (const auto& child : children_) {
      ARROW_RETURN_NOT_OK(child->AppendEmptyValues(length));
    }
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeAppendToBitmap(length, false);
    return Status::OK();
  }

  /// \brief Append a valid value after appending an empty value to each
  /// child builder
  Status AppendEmptyValue() final {
    for (const auto& child : children_) {
      ARROW_RETURN_NOT_OK(child->AppendEmptyValue());
    }
    return Append(true);
  }

  /// \brief Append `length` valid values after appending `length` empty
  /// values to each child builder
  Status AppendEmptyValues(int64_t length) final {
    for (const auto& child : children_) {
      ARROW_RETURN_NOT_OK(child->AppendEmptyValues(length));
    }
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeAppendToBitmap(length, true);
    return Status::OK();
  }

  /// \brief Append rows [offset, offset + length) of the struct array `array`
  ///
  /// Every child builder first receives the matching rows of its child
  /// array; afterwards the struct's own validity bits are copied.
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override {
    // First row to copy, including the offset of the source array.
    const int64_t first_row = array.offset + offset;
    for (size_t i = 0; i < children_.size(); ++i) {
      ARROW_RETURN_NOT_OK(
          children_[i]->AppendArraySlice(array.child_data[i], first_row, length));
    }
    const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeAppendToBitmap(validity, first_row, length);
    return Status::OK();
  }

  void Reset() override;

  /// \brief The builder of the i-th child field
  ArrayBuilder* field_builder(int i) const { return children_[i].get(); }

  /// \brief The number of child builders
  int num_fields() const { return static_cast<int>(children_.size()); }

  std::shared_ptr<DataType> type() const override;

 private:
  std::shared_ptr<DataType> type_;
};
/// @}
} // namespace arrow

View File

@@ -0,0 +1,689 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm>
#include <memory>
#include <vector>
#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
#include "arrow/result.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/float16.h"
namespace arrow {
/// \brief Builder whose every appended element is null
///
/// Only the length and the null count are tracked by the append methods.
class ARROW_EXPORT NullBuilder : public ArrayBuilder {
 public:
  /// \brief Construct a builder using `pool`; the alignment argument is
  /// accepted but not used
  explicit NullBuilder(MemoryPool* pool = default_memory_pool(),
                       int64_t ARROW_ARG_UNUSED(alignment) = kDefaultBufferAlignment)
      : ArrayBuilder(pool) {}

  /// \brief Construct a builder using `pool`; the type argument is accepted
  /// but not used
  explicit NullBuilder(const std::shared_ptr<DataType>& ARROW_ARG_UNUSED(type),
                       MemoryPool* pool = default_memory_pool(),
                       int64_t alignment = kDefaultBufferAlignment)
      : NullBuilder(pool, alignment) {}

  /// \brief Append the specified number of null elements
  ///
  /// Returns Invalid when length is negative.
  Status AppendNulls(int64_t length) final {
    if (length < 0) {
      return Status::Invalid("length must be positive");
    }
    length_ += length;
    null_count_ += length;
    return Status::OK();
  }

  /// \brief Append a single null element
  Status AppendNull() final { return AppendNulls(1); }

  /// \brief Append `length` elements; for this builder they are nulls
  Status AppendEmptyValues(int64_t length) final { return AppendNulls(length); }

  /// \brief Append a single element; for this builder it is a null
  Status AppendEmptyValue() final { return AppendEmptyValues(1); }

  /// \brief Append a single null element, spelled with a nullptr argument
  Status Append(std::nullptr_t) { return AppendNull(); }

  /// \brief Append `length` nulls; the source array and offset are not read
  Status AppendArraySlice(const ArraySpan&, int64_t, int64_t length) override {
    return AppendNulls(length);
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief The type being built, as returned by null()
  std::shared_ptr<DataType> type() const override { return null(); }

  /// \brief Finish the array under construction as a NullArray
  Status Finish(std::shared_ptr<NullArray>* out) { return FinishTyped(out); }
};
/// \addtogroup numeric-builders
///
/// @{
/// Base class for all Builders that emit an Array of a scalar numerical type.
template <typename T>
class NumericBuilder
: public ArrayBuilder,
public internal::ArrayBuilderExtraOps<NumericBuilder<T>, typename T::c_type> {
public:
using TypeClass = T;
using value_type = typename T::c_type;
using ArrayType = typename TypeTraits<T>::ArrayType;
template <typename T1 = T>
explicit NumericBuilder(
enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment)
: ArrayBuilder(pool, alignment),
type_(TypeTraits<T>::type_singleton()),
data_builder_(pool, alignment) {}
NumericBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
int64_t alignment = kDefaultBufferAlignment)
: ArrayBuilder(pool, alignment), type_(type), data_builder_(pool, alignment) {}
/// Append a single scalar and increase the size if necessary.
Status Append(const value_type val) {
ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1));
UnsafeAppend(val);
return Status::OK();
}
/// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
/// The memory at the corresponding data slot is set to 0 to prevent
/// uninitialized memory access
Status AppendNulls(int64_t length) final {
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend(length, value_type{}); // zero
UnsafeSetNull(length);
return Status::OK();
}
/// \brief Append a single null element
Status AppendNull() final {
ARROW_RETURN_NOT_OK(Reserve(1));
data_builder_.UnsafeAppend(value_type{}); // zero
UnsafeAppendToBitmap(false);
return Status::OK();
}
/// \brief Append a empty element
Status AppendEmptyValue() final {
ARROW_RETURN_NOT_OK(Reserve(1));
data_builder_.UnsafeAppend(value_type{}); // zero
UnsafeAppendToBitmap(true);
return Status::OK();
}
/// \brief Append several empty elements
Status AppendEmptyValues(int64_t length) final {
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend(length, value_type{}); // zero
UnsafeSetNotNull(length);
return Status::OK();
}
value_type GetValue(int64_t index) const { return data_builder_.data()[index]; }
value_type* GetMutableValue(int64_t index) {
return &data_builder_.mutable_data()[index];
}
// Reset the builder: first the value buffer owned by this class, then the
// state managed by the ArrayBuilder base.
void Reset() override {
data_builder_.Reset();
ArrayBuilder::Reset();
}
Status Resize(int64_t capacity) override {
  ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
  // Requests below the minimum builder capacity are raised to that minimum.
  if (capacity < kMinBuilderCapacity) {
    capacity = kMinBuilderCapacity;
  }
  // Grow the value buffer first, then let the base class resize its own state.
  ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
  return ArrayBuilder::Resize(capacity);
}
// Read-only element access; forwards to GetValue(index).
value_type operator[](int64_t index) const { return GetValue(index); }
value_type& operator[](int64_t index) {
  // Writable reference to slot `index` of the value buffer; `index` is not
  // range-checked here.
  value_type* slots = reinterpret_cast<value_type*>(data_builder_.mutable_data());
  return slots[index];
}
/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous C array of values
/// \param[in] length the number of values to append
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
/// indicates a valid (non-null) value
/// \return Status
Status AppendValues(const value_type* values, int64_t length,
                    const uint8_t* valid_bytes = NULLPTR) {
  ARROW_RETURN_NOT_OK(Reserve(length));
  // Validity bytes go to the bitmap (this call is what updates length_);
  // the raw values go to the separate value buffer.
  ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
  data_builder_.UnsafeAppend(values, length);
  return Status::OK();
}
/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous C array of values
/// \param[in] length the number of values to append
/// \param[in] bitmap a validity bitmap to copy (may be null)
/// \param[in] bitmap_offset an offset into the validity bitmap
/// \return Status
Status AppendValues(const value_type* values, int64_t length, const uint8_t* bitmap,
                    int64_t bitmap_offset) {
  ARROW_RETURN_NOT_OK(Reserve(length));
  // Copy `length` validity bits starting at `bitmap_offset` (this call is
  // what updates length_), then copy the values themselves.
  ArrayBuilder::UnsafeAppendToBitmap(bitmap, bitmap_offset, length);
  data_builder_.UnsafeAppend(values, length);
  return Status::OK();
}
/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous C array of values
/// \param[in] length the number of values to append
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
/// (0). Equal in length to values
/// \return Status
Status AppendValues(const value_type* values, int64_t length,
                    const std::vector<bool>& is_valid) {
  ARROW_RETURN_NOT_OK(Reserve(length));
  // The bitmap append is what updates length_.
  // NOTE(review): is_valid.size() is not compared against `length` here; the
  // caller is presumably responsible for keeping them equal -- confirm.
  ArrayBuilder::UnsafeAppendToBitmap(is_valid);
  data_builder_.UnsafeAppend(values, length);
  return Status::OK();
}
/// \brief Append a sequence of elements in one shot
/// \param[in] values a std::vector of values
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
/// (0). Equal in length to values
/// \return Status
Status AppendValues(const std::vector<value_type>& values,
                    const std::vector<bool>& is_valid) {
  // Only a non-empty vector is forwarded to the pointer-based overload;
  // an empty one is a no-op.
  if (!values.empty()) {
    const auto count = static_cast<int64_t>(values.size());
    return AppendValues(values.data(), count, is_valid);
  }
  return Status::OK();
}
/// \brief Append a sequence of elements in one shot
/// \param[in] values a std::vector of values
/// \return Status
Status AppendValues(const std::vector<value_type>& values) {
  // Only a non-empty vector is forwarded to the pointer-based overload;
  // an empty one is a no-op.
  if (!values.empty()) {
    const auto count = static_cast<int64_t>(values.size());
    return AppendValues(values.data(), count);
  }
  return Status::OK();
}
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
  // Finish both buffers at the current logical length.
  ARROW_ASSIGN_OR_RAISE(auto validity_buffer,
                        null_bitmap_builder_.FinishWithLength(length_));
  ARROW_ASSIGN_OR_RAISE(auto values_buffer, data_builder_.FinishWithLength(length_));
  // Buffer order in the ArrayData: validity bitmap first, then values.
  *out = ArrayData::Make(type(), length_, {validity_buffer, values_buffer}, null_count_);
  // Zero the bookkeeping counters now that the data has been handed over.
  null_count_ = 0;
  length_ = 0;
  capacity_ = 0;
  return Status::OK();
}
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
// Typed overload of Finish: delegates to FinishTyped(out), with `out`
// receiving the concrete ArrayType for T.
Status Finish(std::shared_ptr<ArrayType>* out) { return FinishTyped(out); }
/// \brief Append a sequence of elements in one shot
/// \param[in] values_begin InputIterator to the beginning of the values
/// \param[in] values_end InputIterator pointing to the end of the values
/// \return Status
template <typename ValuesIter>
Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
  const auto count = static_cast<int64_t>(std::distance(values_begin, values_end));
  ARROW_RETURN_NOT_OK(Reserve(count));
  // Every appended slot is valid; this call is also what updates length_.
  UnsafeSetNotNull(count);
  data_builder_.UnsafeAppend(values_begin, values_end);
  return Status::OK();
}
/// \brief Append a sequence of elements in one shot, with a specified nullmap
/// \param[in] values_begin InputIterator to the beginning of the values
/// \param[in] values_end InputIterator pointing to the end of the values
/// \param[in] valid_begin InputIterator with elements indicating valid(1)
/// or null(0) values.
/// \return Status
template <typename ValuesIter, typename ValidIter>
enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
    ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
  static_assert(!internal::is_null_pointer<ValidIter>::value,
                "Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
                "version instead");
  const auto count = static_cast<int64_t>(std::distance(values_begin, values_end));
  ARROW_RETURN_NOT_OK(Reserve(count));
  data_builder_.UnsafeAppend(values_begin, values_end);
  // Pull exactly one validity flag per appended value from the caller's iterator.
  auto next_is_valid = [&valid_begin]() -> bool { return *valid_begin++; };
  null_bitmap_builder_.UnsafeAppend<true>(count, next_is_valid);
  // The bitmap builder was written directly, so refresh the counters from it.
  length_ = null_bitmap_builder_.length();
  null_count_ = null_bitmap_builder_.false_count();
  return Status::OK();
}
// Same as above, with a pointer type ValidIter
template <typename ValuesIter, typename ValidIter>
enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
    ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
  const auto count = static_cast<int64_t>(std::distance(values_begin, values_end));
  ARROW_RETURN_NOT_OK(Reserve(count));
  data_builder_.UnsafeAppend(values_begin, values_end);
  // A null validity pointer means "all valid"; that call updates length_ itself.
  if (valid_begin == NULLPTR) {
    UnsafeSetNotNull(count);
    return Status::OK();
  }
  // Otherwise read one validity flag per value through the pointer ...
  auto next_is_valid = [&valid_begin]() -> bool { return *valid_begin++; };
  null_bitmap_builder_.UnsafeAppend<true>(count, next_is_valid);
  // ... and refresh the counters from the bitmap builder that was written directly.
  length_ = null_bitmap_builder_.length();
  null_count_ = null_bitmap_builder_.false_count();
  return Status::OK();
}
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                        int64_t length) override {
  // Values come from buffer 1, advanced by `offset`.
  // NOTE(review): GetValues<value_type>(1) is assumed to already account for
  // array.offset -- confirm against ArraySpan.
  const auto* slice_values = array.GetValues<value_type>(1) + offset;
  // Validity comes from buffer 0, fetched with an absolute offset of 0 and
  // addressed by bit position instead.
  const auto* validity = array.GetValues<uint8_t>(0, 0);
  const int64_t validity_offset = array.offset + offset;
  return AppendValues(slice_values, length, validity, validity_offset);
}
/// Append a single scalar under the assumption that the underlying Buffer is
/// large enough.
///
/// This method does not capacity-check; make sure to call Reserve
/// beforehand.
void UnsafeAppend(const value_type val) {
  // Store the value, then flag its slot as valid. Neither step checks capacity.
  data_builder_.UnsafeAppend(val);
  ArrayBuilder::UnsafeAppendToBitmap(true);
}
void UnsafeAppendNull() {
ArrayBuilder::UnsafeAppendToBitmap(false);
data_builder_.UnsafeAppend(value_type{}); // zero
}
/// Advance builder without allocating nor writing any values
///
/// The internal pointer is advanced by `length` values and the same number
/// of non-null entries are appended to the validity bitmap.
/// This method assumes that the `length` values were populated directly,
/// for example using `GetMutableValue`.
void UnsafeAdvance(int64_t length) {
  // Record `length` valid entries in the bitmap, then advance the value
  // buffer by the same number of elements.
  UnsafeAppendToBitmap(length, true);
  data_builder_.UnsafeAdvance(length);
}
/// Advance builder without allocating nor writing any values
///
/// The internal pointer is advanced by `length` values and the same number
/// of validity bits are appended to the validity bitmap.
/// This method assumes that the `length` values were populated directly,
/// for example using `GetMutableValue`.
void UnsafeAdvance(int64_t length, const uint8_t* validity, int64_t valid_bits_offset) {
  // Copy `length` validity bits starting at `valid_bits_offset`, then advance
  // the value buffer by the same number of elements.
  UnsafeAppendToBitmap(validity, valid_bits_offset, length);
  data_builder_.UnsafeAdvance(length);
}
// Returns the DataType stored at construction (type_).
std::shared_ptr<DataType> type() const override { return type_; }
protected:
std::shared_ptr<DataType> type_;
TypedBufferBuilder<value_type> data_builder_;
};
// Builders
// Aliases binding NumericBuilder to each integer and floating-point type class.
using UInt8Builder = NumericBuilder<UInt8Type>;
using UInt16Builder = NumericBuilder<UInt16Type>;
using UInt32Builder = NumericBuilder<UInt32Type>;
using UInt64Builder = NumericBuilder<UInt64Type>;
using Int8Builder = NumericBuilder<Int8Type>;
using Int16Builder = NumericBuilder<Int16Type>;
using Int32Builder = NumericBuilder<Int32Type>;
using Int64Builder = NumericBuilder<Int64Type>;
using FloatBuilder = NumericBuilder<FloatType>;
using DoubleBuilder = NumericBuilder<DoubleType>;
/// @}
/// \addtogroup temporal-builders
///
/// @{
// Aliases binding NumericBuilder to the date, time, timestamp, month-interval
// and duration type classes.
// NOTE(review): some of these types (e.g. Time32/Time64/Timestamp/Duration)
// presumably carry parameters such as a unit, in which case the builder has to
// be constructed with an explicit DataType -- confirm.
using Date32Builder = NumericBuilder<Date32Type>;
using Date64Builder = NumericBuilder<Date64Type>;
using Time32Builder = NumericBuilder<Time32Type>;
using Time64Builder = NumericBuilder<Time64Type>;
using TimestampBuilder = NumericBuilder<TimestampType>;
using MonthIntervalBuilder = NumericBuilder<MonthIntervalType>;
using DurationBuilder = NumericBuilder<DurationType>;
/// @}
/// \addtogroup numeric-builders
///
/// @{
/// \brief Builder for half-precision float arrays
///
/// Values can be appended either as raw bit patterns (the uint16_t-based
/// overloads inherited from NumericBuilder<HalfFloatType>) or as
/// arrow::util::Float16 objects (the overloads declared here).
class ARROW_EXPORT HalfFloatBuilder : public NumericBuilder<HalfFloatType> {
 public:
  using BaseClass = NumericBuilder<HalfFloatType>;
  using Float16 = arrow::util::Float16;

  // Keep the base-class overloads visible next to the Float16 ones below.
  using BaseClass::Append;
  using BaseClass::AppendValues;
  using BaseClass::BaseClass;
  using BaseClass::GetValue;
  using BaseClass::UnsafeAppend;

  /// Scalar append of an arrow::util::Float16
  Status Append(const Float16 val) { return Append(val.bits()); }

  /// Scalar append of an arrow::util::Float16, without checking for capacity
  void UnsafeAppend(const Float16 val) { UnsafeAppend(val.bits()); }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous array of arrow::util::Float16
  /// \param[in] length the number of values to append
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const Float16* values, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR) {
    // The Float16 array is handed to the base class as raw 16-bit patterns.
    return BaseClass::AppendValues(reinterpret_cast<const uint16_t*>(values), length,
                                   valid_bytes);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous array of arrow::util::Float16
  /// \param[in] length the number of values to append
  /// \param[in] bitmap a validity bitmap to copy (may be null)
  /// \param[in] bitmap_offset an offset into the validity bitmap
  /// \return Status
  Status AppendValues(const Float16* values, int64_t length, const uint8_t* bitmap,
                      int64_t bitmap_offset) {
    return BaseClass::AppendValues(reinterpret_cast<const uint16_t*>(values), length,
                                   bitmap, bitmap_offset);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous array of arrow::util::Float16
  /// \param[in] length the number of values to append
  /// \param[in] is_valid a std::vector<bool> indicating valid (1) or null
  /// (0). Equal in length to values
  /// \return Status
  Status AppendValues(const Float16* values, int64_t length,
                      const std::vector<bool>& is_valid) {
    return BaseClass::AppendValues(reinterpret_cast<const uint16_t*>(values), length,
                                   is_valid);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a std::vector<arrow::util::Float16>
  /// \param[in] is_valid a std::vector<bool> indicating valid (1) or null
  /// (0). Equal in length to values
  /// \return Status (OK without appending anything when `values` is empty)
  Status AppendValues(const std::vector<Float16>& values,
                      const std::vector<bool>& is_valid) {
    // Same guard as the std::vector overloads of NumericBuilder: an empty
    // vector is a no-op, so values.data() (which may be null for an empty
    // vector) is never forwarded to the pointer-based overload.
    if (values.empty()) {
      return Status::OK();
    }
    return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a std::vector<arrow::util::Float16>
  /// \return Status (OK without appending anything when `values` is empty)
  Status AppendValues(const std::vector<Float16>& values) {
    // See the two-vector overload: never forward a possibly-null data().
    if (values.empty()) {
      return Status::OK();
    }
    return AppendValues(values.data(), static_cast<int64_t>(values.size()));
  }

  /// \brief Append one value many times in one shot
  /// \param[in] length the number of values to append
  /// \param[in] value a arrow::util::Float16
  /// \return Status
  Status AppendValues(int64_t length, Float16 value) {
    RETURN_NOT_OK(Reserve(length));
    // `length` copies of the bit pattern, all flagged as valid.
    data_builder_.UnsafeAppend(length, value.bits());
    ArrayBuilder::UnsafeSetNotNull(length);
    return Status::OK();
  }

  /// \brief Get the value at a certain index
  /// \param[in] index the zero-based index
  /// @tparam T arrow::util::Float16 or value_type (uint16_t)
  template <typename T = BaseClass::value_type>
  T GetValue(int64_t index) const {
    static_assert(std::is_same_v<T, BaseClass::value_type> ||
                  std::is_same_v<T, arrow::util::Float16>);
    if constexpr (std::is_same_v<T, BaseClass::value_type>) {
      // Raw bit pattern, exactly as stored.
      return BaseClass::GetValue(index);
    } else {
      // Re-wrap the stored bits as a Float16.
      return Float16::FromBits(BaseClass::GetValue(index));
    }
  }
};
/// @}
class ARROW_EXPORT BooleanBuilder
    : public ArrayBuilder,
      public internal::ArrayBuilderExtraOps<BooleanBuilder, bool> {
 public:
  using TypeClass = BooleanType;
  using value_type = bool;

  explicit BooleanBuilder(MemoryPool* pool = default_memory_pool(),
                          int64_t alignment = kDefaultBufferAlignment);

  BooleanBuilder(const std::shared_ptr<DataType>& type,
                 MemoryPool* pool = default_memory_pool(),
                 int64_t alignment = kDefaultBufferAlignment);

  /// \brief Append `length` null slots, each backed by a `false` data bit
  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeSetNull(length);
    data_builder_.UnsafeAppend(length, false);
    return Status::OK();
  }

  /// \brief Append a single null slot
  Status AppendNull() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppendNull();
    return Status::OK();
  }

  /// \brief Append a single valid slot holding `false`
  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeSetNotNull(1);
    data_builder_.UnsafeAppend(false);
    return Status::OK();
  }

  /// \brief Append `length` valid slots holding `false`
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeSetNotNull(length);
    data_builder_.UnsafeAppend(length, false);
    return Status::OK();
  }

  /// Scalar append
  Status Append(const bool val) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(val);
    return Status::OK();
  }

  /// Scalar append of a byte; any non-zero byte is appended as `true`
  Status Append(const uint8_t val) {
    const bool as_bool = (val != 0);
    return Append(as_bool);
  }

  /// Scalar append, without checking for capacity
  void UnsafeAppend(const bool val) {
    UnsafeAppendToBitmap(true);
    data_builder_.UnsafeAppend(val);
  }

  /// Null append, without checking for capacity; the data bit is `false`
  void UnsafeAppendNull() {
    UnsafeAppendToBitmap(false);
    data_builder_.UnsafeAppend(false);
  }

  /// Scalar append of a byte (non-zero is `true`), without checking for capacity
  void UnsafeAppend(const uint8_t val) {
    const bool as_bool = (val != 0);
    UnsafeAppend(as_bool);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous array of bytes (non-zero is 1)
  /// \param[in] length the number of values to append
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const uint8_t* values, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a bitmap of values
  /// \param[in] length the number of values to append
  /// \param[in] validity a validity bitmap to copy (may be null)
  /// \param[in] offset an offset into the values and validity bitmaps
  /// \return Status
  Status AppendValues(const uint8_t* values, int64_t length, const uint8_t* validity,
                      int64_t offset);

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous C array of values
  /// \param[in] length the number of values to append
  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
  /// (0). Equal in length to values
  /// \return Status
  Status AppendValues(const uint8_t* values, int64_t length,
                      const std::vector<bool>& is_valid);

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a std::vector of bytes
  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
  /// (0). Equal in length to values
  /// \return Status
  Status AppendValues(const std::vector<uint8_t>& values,
                      const std::vector<bool>& is_valid);

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a std::vector of bytes
  /// \return Status
  Status AppendValues(const std::vector<uint8_t>& values);

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values an std::vector<bool> indicating true (1) or false
  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
  /// (0). Equal in length to values
  /// \return Status
  Status AppendValues(const std::vector<bool>& values, const std::vector<bool>& is_valid);

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values an std::vector<bool> indicating true (1) or false
  /// \return Status
  Status AppendValues(const std::vector<bool>& values);

  /// \brief Append a sequence of elements in one shot; all of them are valid
  /// \param[in] values_begin InputIterator to the beginning of the values
  /// \param[in] values_end InputIterator pointing to the end of the values
  /// \return Status
  template <typename ValuesIter>
  Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
    const auto count = static_cast<int64_t>(std::distance(values_begin, values_end));
    ARROW_RETURN_NOT_OK(Reserve(count));
    // One data bit per element, read through the caller's iterator.
    auto next_value = [&values_begin]() -> bool { return *values_begin++; };
    data_builder_.UnsafeAppend<false>(count, next_value);
    // All slots valid; this call also updates length_.
    UnsafeSetNotNull(count);
    return Status::OK();
  }

  /// \brief Append a sequence of elements in one shot, with a specified nullmap
  /// \param[in] values_begin InputIterator to the beginning of the values
  /// \param[in] values_end InputIterator pointing to the end of the values
  /// \param[in] valid_begin InputIterator with elements indicating valid(1)
  /// or null(0) values
  /// \return Status
  template <typename ValuesIter, typename ValidIter>
  enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
    static_assert(!internal::is_null_pointer<ValidIter>::value,
                  "Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
                  "version instead");
    const auto count = static_cast<int64_t>(std::distance(values_begin, values_end));
    ARROW_RETURN_NOT_OK(Reserve(count));

    auto next_value = [&values_begin]() -> bool { return *values_begin++; };
    data_builder_.UnsafeAppend<false>(count, next_value);

    auto next_is_valid = [&valid_begin]() -> bool { return *valid_begin++; };
    null_bitmap_builder_.UnsafeAppend<true>(count, next_is_valid);

    // The bitmap builder was written directly, so refresh the counters from it.
    length_ = null_bitmap_builder_.length();
    null_count_ = null_bitmap_builder_.false_count();
    return Status::OK();
  }

  // Same as above, for a pointer type ValidIter; a null pointer means that
  // every appended value is valid.
  template <typename ValuesIter, typename ValidIter>
  enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
    const auto count = static_cast<int64_t>(std::distance(values_begin, values_end));
    ARROW_RETURN_NOT_OK(Reserve(count));

    auto next_value = [&values_begin]() -> bool { return *values_begin++; };
    data_builder_.UnsafeAppend<false>(count, next_value);

    if (valid_begin != NULLPTR) {
      auto next_is_valid = [&valid_begin]() -> bool { return *valid_begin++; };
      null_bitmap_builder_.UnsafeAppend<true>(count, next_is_valid);
    } else {
      UnsafeSetNotNull(count);
    }

    // Counters are refreshed from the bitmap builder on both paths.
    length_ = null_bitmap_builder_.length();
    null_count_ = null_bitmap_builder_.false_count();
    return Status::OK();
  }

  Status AppendValues(int64_t length, bool value);

  /// \brief Append `length` elements of `array`, starting `offset` elements in
  ///
  /// Both buffers are fetched with an absolute offset of 0 and addressed via
  /// the bit offset `array.offset + offset`.
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override {
    const auto* value_bits = array.GetValues<uint8_t>(1, 0);
    const auto* validity_bits = array.GetValues<uint8_t>(0, 0);
    const int64_t bit_offset = array.offset + offset;
    return AppendValues(value_bits, length, validity_bits, bit_offset);
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// Typed overload of Finish; delegates to FinishTyped(out)
  Status Finish(std::shared_ptr<BooleanArray>* out) { return FinishTyped(out); }

  void Reset() override;
  Status Resize(int64_t capacity) override;

  std::shared_ptr<DataType> type() const override { return boolean(); }

 protected:
  // Value storage (a TypedBufferBuilder<bool>), kept separately from the
  // validity bitmap owned by the ArrayBuilder base.
  TypedBufferBuilder<bool> data_builder_;
};
} // namespace arrow

View File

@@ -0,0 +1,303 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include "arrow/array.h"
#include "arrow/array/builder_base.h"
namespace arrow {
/// \addtogroup run-end-encoded-builders
///
/// @{
namespace internal {
/// \brief An ArrayBuilder that deduplicates repeated values as they are
/// appended to the inner-ArrayBuilder and reports the length of the current run
/// of identical values.
///
/// The following sequence of calls
///
/// Append(2)
/// Append(2)
/// Append(2)
/// Append(7)
/// Append(7)
/// Append(2)
/// FinishInternal()
///
/// will cause the inner-builder to receive only 3 Append calls
///
/// Append(2)
/// Append(7)
/// Append(2)
/// FinishInternal()
///
/// Note that values returned by length(), null_count() and capacity() are
/// related to the compressed array built by the inner-ArrayBuilder.
class RunCompressorBuilder : public ArrayBuilder {
 public:
  RunCompressorBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> inner_builder,
                       std::shared_ptr<DataType> type);
  ~RunCompressorBuilder() override;

  ARROW_DISALLOW_COPY_AND_ASSIGN(RunCompressorBuilder);

  /// \brief Hook invoked right before a run is closed
  ///
  /// A run is closed once its length is known and its value is appended to
  /// the inner builder. Subclasses can override this to perform an additional
  /// action at that point; this default implementation does nothing.
  ///
  /// \param value the value of the run; can be NULLPTR if closing a run of NULLs
  /// \param length the length of the run being closed, always greater than 0
  virtual Status WillCloseRun(const std::shared_ptr<const Scalar>& value,
                              int64_t length) {
    return Status::OK();
  }

  /// \brief Hook invoked right before a run of empty values is closed
  ///
  /// A run of empty values is closed once its length is known and a single
  /// empty value is appended to the inner builder. Subclasses can override
  /// this to perform an additional action at that point; this default
  /// implementation does nothing.
  ///
  /// \param length the length of the run being closed, always greater than 0
  virtual Status WillCloseRunOfEmptyValues(int64_t length) { return Status::OK(); }

  /// \brief Allocate enough memory for a given number of array elements.
  ///
  /// NOTE: Conservatively resizing a run-length compressed array for a given
  /// number of logical elements is not possible, since the physical length
  /// depends on the values that will be appended in the future. Instead, it is
  /// pessimistically assumed that each run will contain a single value, and
  /// that number of runs is allocated.
  Status Resize(int64_t capacity) override { return ResizePhysical(capacity); }

  /// \brief Allocate enough memory for a given number of runs.
  ///
  /// Like Resize on non-encoded builders, it does not account for variable
  /// size data.
  Status ResizePhysical(int64_t capacity);

  /// \brief Reserve room for `additional_capacity` more runs; forwards to Reserve()
  Status ReservePhysical(int64_t additional_capacity) {
    return Reserve(additional_capacity);
  }

  void Reset() override;

  // The single-element appends are expressed through the bulk versions.
  Status AppendNull() final { return AppendNulls(1); }
  Status AppendNulls(int64_t length) override;

  Status AppendEmptyValue() final { return AppendEmptyValues(1); }
  Status AppendEmptyValues(int64_t length) override;

  Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override;
  Status AppendScalars(const ScalarVector& scalars) override;

  // AppendArraySlice() is not implemented.

  /// \brief Append a slice of an array containing values from already
  /// compressed runs.
  ///
  /// NOTE: WillCloseRun() is not called, as the length of each run cannot be
  /// determined at this point. The caller should ensure that !has_open_run()
  /// by calling FinishCurrentRun() before calling this.
  ///
  /// Pre-condition: !has_open_run()
  Status AppendRunCompressedArraySlice(const ArraySpan& array, int64_t offset,
                                       int64_t length);

  /// \brief Force the closing of the current run if one is currently open.
  ///
  /// Call this to ensure the current run will not be extended. If more values
  /// are appended afterwards, identical values may end up next to each other
  /// in the underlying array (i.e. two runs that could have been a single
  /// run).
  ///
  /// Finish() and FinishInternal() call this automatically.
  virtual Status FinishCurrentRun();

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// The builder that receives one value per closed run
  ArrayBuilder& inner_builder() const { return *inner_builder_; }

  /// The type reported is the inner builder's type
  std::shared_ptr<DataType> type() const override { return inner_builder_->type(); }

  /// True while values have been accumulated into a run that is not yet closed
  bool has_open_run() const { return current_run_length_ > 0; }

  /// Number of values accumulated into the currently open run (0 if none)
  int64_t open_run_length() const { return current_run_length_; }

 private:
  // Mirror the inner builder's capacity, length and null count on this builder.
  void UpdateDimensions() {
    capacity_ = inner_builder_->capacity();
    length_ = inner_builder_->length();
    null_count_ = inner_builder_->null_count();
  }

  std::shared_ptr<ArrayBuilder> inner_builder_;
  std::shared_ptr<const Scalar> current_value_ = NULLPTR;
  int64_t current_run_length_ = 0;
};
} // namespace internal
// ----------------------------------------------------------------------
// RunEndEncoded builder
/// \brief Run-end encoded array builder.
///
/// NOTE: the value returned by capacity() is related to the
/// compressed array (physical) and not the decoded array (logical) that is
/// run-end encoded. null_count() always returns 0. length(), on the other hand,
/// returns the logical length of the run-end encoded array.
class ARROW_EXPORT RunEndEncodedBuilder : public ArrayBuilder {
 private:
  // An internal::RunCompressorBuilder that produces a run-end in the owning
  // RunEndEncodedBuilder every time a value-run is closed.
  class ValueRunBuilder : public internal::RunCompressorBuilder {
   public:
    ValueRunBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& value_builder,
                    const std::shared_ptr<DataType>& value_type,
                    RunEndEncodedBuilder& ree_builder);

    ~ValueRunBuilder() override = default;

    // Both hooks forward the run length to the owning builder's CloseRun();
    // the value of the run itself is not needed for that.
    Status WillCloseRun(const std::shared_ptr<const Scalar>&, int64_t length) override {
      return ree_builder_.CloseRun(length);
    }

    Status WillCloseRunOfEmptyValues(int64_t length) override {
      return ree_builder_.CloseRun(length);
    }

   private:
    RunEndEncodedBuilder& ree_builder_;
  };

 public:
  RunEndEncodedBuilder(MemoryPool* pool,
                       const std::shared_ptr<ArrayBuilder>& run_end_builder,
                       const std::shared_ptr<ArrayBuilder>& value_builder,
                       std::shared_ptr<DataType> type);

  /// \brief Allocate enough memory for a given number of array elements.
  ///
  /// NOTE: Conservatively resizing an REE for a given number of logical
  /// elements is not possible, since the physical length depends on the values
  /// that will be appended in the future. Instead, it is pessimistically
  /// assumed that each run will contain a single value, and that number of
  /// runs is allocated.
  Status Resize(int64_t capacity) override { return ResizePhysical(capacity); }

  /// \brief Allocate enough memory for a given number of runs.
  Status ResizePhysical(int64_t capacity);

  /// \brief Ensure that there is enough space allocated to append the indicated
  /// number of runs without any further reallocation.
  ///
  /// Overallocation is used in order to minimize the impact of incremental
  /// ReservePhysical() calls. Note that additional_capacity is relative to the
  /// current number of elements rather than to the current capacity, so calls
  /// to Reserve() which are not interspersed with addition of new elements may
  /// not increase the capacity.
  ///
  /// \param[in] additional_capacity the number of additional runs
  /// \return Status
  Status ReservePhysical(int64_t additional_capacity) {
    return Reserve(additional_capacity);
  }

  void Reset() override;

  // The single-element appends are expressed through the bulk versions.
  Status AppendNull() final { return AppendNulls(1); }
  Status AppendNulls(int64_t length) override;

  Status AppendEmptyValue() final { return AppendEmptyValues(1); }
  Status AppendEmptyValues(int64_t length) override;

  Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override;
  Status AppendScalars(const ScalarVector& scalars) override;
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override;
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// Typed overload of Finish; delegates to FinishTyped(out)
  Status Finish(std::shared_ptr<RunEndEncodedArray>* out) { return FinishTyped(out); }

  /// \brief Force the closing of the current run if one is currently open.
  ///
  /// Call this to ensure the current run will not be extended. If more values
  /// are appended afterwards, identical values may end up next to each other
  /// in the values array (i.e. two runs that could have been a single run).
  Status FinishCurrentRun();

  std::shared_ptr<DataType> type() const override;

 private:
  /// \brief Update physical capacity and logical length
  ///
  /// \param committed_logical_length number of logical values that have been
  /// committed to the values array
  /// \param open_run_length number of logical values in the currently open run if any
  void UpdateDimensions(int64_t committed_logical_length, int64_t open_run_length) {
    // Capacity mirrors the run-ends builder; length_ is the logical length,
    // including the values of the still-open run.
    capacity_ = run_end_builder().capacity();
    length_ = committed_logical_length + open_run_length;
    committed_logical_length_ = committed_logical_length;
  }

  // Pre-condition: !value_run_builder_.has_open_run()
  template <typename RunEndCType>
  Status DoAppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length);

  template <typename RunEndCType>
  Status DoAppendRunEnd(int64_t run_end);

  /// \brief Cast run_end to the appropriate type and append it to the run_ends
  /// array.
  Status AppendRunEnd(int64_t run_end);

  /// \brief Close a run by appending a value to the run_ends array and updating
  /// length_ to reflect the new run.
  ///
  /// Pre-condition: run_length > 0.
  [[nodiscard]] Status CloseRun(int64_t run_length);

  ArrayBuilder& run_end_builder();
  ArrayBuilder& value_builder();

  std::shared_ptr<RunEndEncodedType> type_;
  ValueRunBuilder* value_run_builder_;
  // The length not counting the current open run in the value_run_builder_
  int64_t committed_logical_length_ = 0;
};
/// @}
} // namespace arrow

View File

@@ -0,0 +1,66 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Contains declarations of time related Arrow builder types.
#pragma once
#include <memory>
#include "arrow/array/builder_base.h"
#include "arrow/array/builder_primitive.h"
namespace arrow {
/// \addtogroup temporal-builders
///
/// @{
// TODO(ARROW-7938): this class is untested
class ARROW_EXPORT DayTimeIntervalBuilder : public NumericBuilder<DayTimeIntervalType> {
 public:
  using DayMilliseconds = DayTimeIntervalType::DayMilliseconds;

  /// Build with the type returned by day_time_interval(); delegates to the
  /// constructor taking an explicit type.
  explicit DayTimeIntervalBuilder(MemoryPool* pool = default_memory_pool(),
                                  int64_t alignment = kDefaultBufferAlignment)
      : DayTimeIntervalBuilder(day_time_interval(), pool, alignment) {}

  /// Build with an explicitly supplied type, which is forwarded to the
  /// NumericBuilder base together with `pool` and `alignment`.
  explicit DayTimeIntervalBuilder(std::shared_ptr<DataType> type,
                                  MemoryPool* pool = default_memory_pool(),
                                  int64_t alignment = kDefaultBufferAlignment)
      : NumericBuilder(type, pool, alignment) {}
};
class ARROW_EXPORT MonthDayNanoIntervalBuilder
    : public NumericBuilder<MonthDayNanoIntervalType> {
 public:
  using MonthDayNanos = MonthDayNanoIntervalType::MonthDayNanos;

  /// Build with the type returned by month_day_nano_interval(); delegates to
  /// the constructor taking an explicit type.
  explicit MonthDayNanoIntervalBuilder(MemoryPool* pool = default_memory_pool(),
                                       int64_t alignment = kDefaultBufferAlignment)
      : MonthDayNanoIntervalBuilder(month_day_nano_interval(), pool, alignment) {}

  /// Build with an explicitly supplied type, which is forwarded to the
  /// NumericBuilder base together with `pool` and `alignment`.
  explicit MonthDayNanoIntervalBuilder(std::shared_ptr<DataType> type,
                                       MemoryPool* pool = default_memory_pool(),
                                       int64_t alignment = kDefaultBufferAlignment)
      : NumericBuilder(type, pool, alignment) {}
};
/// @}
} // namespace arrow

View File

@@ -0,0 +1,254 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "arrow/array/array_nested.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer_builder.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup nested-builders
///
/// @{
/// \brief Base class for union array builders.
///
/// Note that while we subclass ArrayBuilder, as union types do not have a
/// validity bitmap, the bitmap builder member of ArrayBuilder is not used.
///
/// The builder's length is the number of type codes appended so far (see
/// length()), not the length of any child builder.
class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder {
public:
// Overrides ArrayBuilder::FinishInternal; defined out of line.
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
/// \brief Typed overload of Finish; forwards to FinishTyped(out).
Status Finish(std::shared_ptr<UnionArray>* out) { return FinishTyped(out); }
/// \brief Make a new child builder available to the UnionArray
///
/// \param[in] new_child the child builder
/// \param[in] field_name the name of the field in the union array type
/// if type inference is used
/// \return child index, which is the "type" argument that needs
/// to be passed to the "Append" method to add a new element to
/// the union array.
int8_t AppendChild(const std::shared_ptr<ArrayBuilder>& new_child,
const std::string& field_name = "");
/// \brief Return the data type of the array being built (defined out of line).
std::shared_ptr<DataType> type() const override;
/// \brief Number of elements appended so far, i.e. the number of type codes
/// held by the types buffer builder.
int64_t length() const override { return types_builder_.length(); }
protected:
// Only constructible through the concrete dense/sparse builders.
BasicUnionBuilder(MemoryPool* pool, int64_t alignment,
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
const std::shared_ptr<DataType>& type);
// NOTE(review): presumably returns the next unused type id for AppendChild —
// confirm against the out-of-line definition.
int8_t NextTypeId();
// Fields of the union type, one per child.
std::vector<std::shared_ptr<Field>> child_fields_;
// Type codes of the children; derived builders use type_codes_[0] as the
// child that receives nulls and empty values.
std::vector<int8_t> type_codes_;
UnionMode::type mode_;
// Child builders indexed by type id (derived builders look up
// type_id_to_children_[type_code]). Raw, non-owning pointers.
std::vector<ArrayBuilder*> type_id_to_children_;
// NOTE(review): presumably maps a type id to the child's position in the
// union type — confirm against the out-of-line definition.
std::vector<int> type_id_to_child_id_;
// for all type_id < dense_type_id_, type_id_to_children_[type_id] != nullptr
int8_t dense_type_id_ = 0;
// One int8 type code per appended union element.
TypedBufferBuilder<int8_t> types_builder_;
};
/// \class DenseUnionBuilder
///
/// \brief Builder for dense union arrays.
///
/// On top of the per-element type codes kept by BasicUnionBuilder, this builder
/// records one int32 offset per element: the length of the selected child
/// builder at the time the element was appended, i.e. the slot the element
/// occupies in that child.
///
/// This API is EXPERIMENTAL.
class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder {
 public:
  /// Use this constructor to initialize the UnionBuilder with no child builders,
  /// allowing type to be inferred. You will need to call AppendChild for each of the
  /// children builders you want to use.
  explicit DenseUnionBuilder(MemoryPool* pool,
                             int64_t alignment = kDefaultBufferAlignment)
      : BasicUnionBuilder(pool, alignment, {}, dense_union(FieldVector{})),
        offsets_builder_(pool, alignment) {}

  /// Use this constructor to specify the type explicitly.
  /// You can still add child builders to the union after using this constructor
  DenseUnionBuilder(MemoryPool* pool,
                    const std::vector<std::shared_ptr<ArrayBuilder>>& children,
                    const std::shared_ptr<DataType>& type,
                    int64_t alignment = kDefaultBufferAlignment)
      : BasicUnionBuilder(pool, alignment, children, type),
        offsets_builder_(pool, alignment) {}

  /// \brief Append a null value.
  ///
  /// The union element is tagged with the first type code and a null is
  /// appended to the corresponding child builder.
  Status AppendNull() final {
    const int8_t first_child_code = type_codes_[0];
    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
    ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
    ARROW_RETURN_NOT_OK(
        offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
    // Append a null arbitrarily to the first child
    return child_builder->AppendNull();
  }

  /// \brief Append `length` null values.
  ///
  /// All `length` union elements share the same offset, so only a single null
  /// is appended to the first child.
  Status AppendNulls(int64_t length) final {
    const int8_t first_child_code = type_codes_[0];
    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
    ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
    ARROW_RETURN_NOT_OK(
        offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
    // Append just a single null to the first child
    return child_builder->AppendNull();
  }

  /// \brief Append an empty (non-null placeholder) value.
  ///
  /// The union element is tagged with the first type code and an empty value
  /// is appended to the corresponding child builder.
  Status AppendEmptyValue() final {
    const int8_t first_child_code = type_codes_[0];
    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
    ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
    ARROW_RETURN_NOT_OK(
        offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
    // Append an empty value arbitrarily to the first child
    return child_builder->AppendEmptyValue();
  }

  /// \brief Append `length` empty values.
  ///
  /// All `length` union elements share the same offset, so only a single empty
  /// value is appended to the first child.
  Status AppendEmptyValues(int64_t length) final {
    const int8_t first_child_code = type_codes_[0];
    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
    ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
    ARROW_RETURN_NOT_OK(
        offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
    // Append just a single empty value to the first child
    return child_builder->AppendEmptyValue();
  }

  /// \brief Append an element to the UnionArray. This must be followed
  ///        by an append to the appropriate child builder.
  ///
  /// \param[in] next_type type_id of the child to which the next value will be appended.
  /// \return CapacityError if the selected child already holds
  ///         kListMaximumElements values; otherwise the status of the buffer
  ///         appends.
  ///
  /// The corresponding child builder must be appended to independently after this method
  /// is called.
  Status Append(int8_t next_type) {
    // Validate capacity before mutating anything, so that a CapacityError
    // leaves the types and offsets buffers with the same number of entries.
    const int64_t child_length = type_id_to_children_[next_type]->length();
    if (child_length == kListMaximumElements) {
      return Status::CapacityError(
          "a dense UnionArray cannot contain more than 2^31 - 1 elements from a single "
          "child");
    }
    ARROW_RETURN_NOT_OK(types_builder_.Append(next_type));
    return offsets_builder_.Append(static_cast<int32_t>(child_length));
  }

  // Overrides of ArrayBuilder entry points; defined out of line.
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override;

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

 private:
  // One int32 offset into the selected child per appended union element.
  TypedBufferBuilder<int32_t> offsets_builder_;
};
/// \class SparseUnionBuilder
///
/// \brief Builder for sparse union arrays.
///
/// Every child builder is expected to grow in lockstep with the union itself,
/// which is why the null/empty helpers below append to all children.
///
/// This API is EXPERIMENTAL.
class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder {
 public:
  /// Use this constructor to initialize the UnionBuilder with no child builders,
  /// allowing type to be inferred. You will need to call AppendChild for each of the
  /// children builders you want to use.
  explicit SparseUnionBuilder(MemoryPool* pool,
                              int64_t alignment = kDefaultBufferAlignment)
      : BasicUnionBuilder(pool, alignment, {}, sparse_union(FieldVector{})) {}

  /// Use this constructor to specify the type explicitly.
  /// You can still add child builders to the union after using this constructor
  SparseUnionBuilder(MemoryPool* pool,
                     const std::vector<std::shared_ptr<ArrayBuilder>>& children,
                     const std::shared_ptr<DataType>& type,
                     int64_t alignment = kDefaultBufferAlignment)
      : BasicUnionBuilder(pool, alignment, children, type) {}

  /// \brief Append a null value.
  ///
  /// The element is tagged with the first type code; that child receives the
  /// null and every remaining child receives an empty value.
  Status AppendNull() final {
    const int8_t null_code = type_codes_[0];
    ARROW_RETURN_NOT_OK(types_builder_.Append(null_code));
    ARROW_RETURN_NOT_OK(type_id_to_children_[null_code]->AppendNull());
    const int num_codes = static_cast<int>(type_codes_.size());
    for (int pos = 1; pos < num_codes; ++pos) {
      ArrayBuilder* sibling = type_id_to_children_[type_codes_[pos]];
      ARROW_RETURN_NOT_OK(sibling->AppendEmptyValue());
    }
    return Status::OK();
  }

  /// \brief Append multiple null values.
  ///
  /// The elements are tagged with the first type code; that child receives
  /// `length` nulls and every remaining child receives `length` empty values.
  Status AppendNulls(int64_t length) final {
    const int8_t null_code = type_codes_[0];
    ARROW_RETURN_NOT_OK(types_builder_.Append(length, null_code));
    ARROW_RETURN_NOT_OK(type_id_to_children_[null_code]->AppendNulls(length));
    const int num_codes = static_cast<int>(type_codes_.size());
    for (int pos = 1; pos < num_codes; ++pos) {
      ArrayBuilder* sibling = type_id_to_children_[type_codes_[pos]];
      ARROW_RETURN_NOT_OK(sibling->AppendEmptyValues(length));
    }
    return Status::OK();
  }

  /// \brief Append an empty value.
  ///
  /// The element is tagged with the first type code and every child, the
  /// first one included, receives an empty value.
  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(types_builder_.Append(type_codes_[0]));
    for (const int8_t child_code : type_codes_) {
      ArrayBuilder* child = type_id_to_children_[child_code];
      ARROW_RETURN_NOT_OK(child->AppendEmptyValue());
    }
    return Status::OK();
  }

  /// \brief Append multiple empty values.
  ///
  /// The elements are tagged with the first type code and every child, the
  /// first one included, receives `length` empty values.
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0]));
    for (const int8_t child_code : type_codes_) {
      ArrayBuilder* child = type_id_to_children_[child_code];
      ARROW_RETURN_NOT_OK(child->AppendEmptyValues(length));
    }
    return Status::OK();
  }

  /// \brief Append an element to the UnionArray. This must be followed
  ///        by an append to the appropriate child builder.
  ///
  /// \param[in] next_type type_id of the child to which the next value will be appended.
  ///
  /// The corresponding child builder must be appended to independently after this method
  /// is called, and all other child builders must have null or empty value appended.
  Status Append(int8_t next_type) { return types_builder_.Append(next_type); }

  // Override of the ArrayBuilder entry point; defined out of line.
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override;
};
/// @}
} // namespace arrow

View File

@@ -0,0 +1,53 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace internal {
/// \brief Concatenate arrays, optionally reporting a cast that would make a
/// failed concatenation succeed
///
/// \param[in] arrays a vector of arrays to be concatenated
/// \param[in] pool memory to store the result will be allocated from this memory pool
/// \param[out] out_suggested_cast if a non-OK Result is returned, the function might set
/// out_suggested_cast to a cast suggestion that would allow concatenating the arrays
/// without overflow of offsets (e.g. string to large_string)
///
/// \return the concatenated array, or a non-OK Result on failure
ARROW_EXPORT
Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays, MemoryPool* pool,
std::shared_ptr<DataType>* out_suggested_cast);
} // namespace internal
/// \brief Concatenate arrays
///
/// \param[in] arrays a vector of arrays to be concatenated
/// \param[in] pool memory to store the result will be allocated from this memory pool
/// (defaults to default_memory_pool())
/// \return the concatenated array, or a non-OK Result on failure
ARROW_EXPORT
Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays,
MemoryPool* pool = default_memory_pool());
} // namespace arrow

View File

@@ -0,0 +1,750 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <atomic> // IWYU pragma: export
#include <cassert>
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>
#include "arrow/array/statistics.h"
#include "arrow/buffer.h"
#include "arrow/result.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/macros.h"
#include "arrow/util/span.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace internal {
// ----------------------------------------------------------------------
// Null handling for types without a validity bitmap and the dictionary type
//
// These helpers are only declared here. ArrayData::IsValid and
// ArrayData::MayHaveLogicalNulls fall back to them when the ArrayData has no
// validity bitmap (buffers[0] is null). ArrayData::IsValid passes its index `i`
// through unchanged, without adding the ArrayData's offset.

// Whether element `i` of the sparse-union `data` is null.
ARROW_EXPORT bool IsNullSparseUnion(const ArrayData& data, int64_t i);
// Whether element `i` of the dense-union `data` is null.
ARROW_EXPORT bool IsNullDenseUnion(const ArrayData& data, int64_t i);
// Whether element `i` of the run-end-encoded `data` is null.
ARROW_EXPORT bool IsNullRunEndEncoded(const ArrayData& data, int64_t i);
// Whether the union `data` (sparse or dense) may contain logical nulls.
ARROW_EXPORT bool UnionMayHaveLogicalNulls(const ArrayData& data);
// Whether the run-end-encoded `data` may contain logical nulls.
ARROW_EXPORT bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data);
// Whether the dictionary-typed `data` may contain logical nulls.
ARROW_EXPORT bool DictionaryMayHaveLogicalNulls(const ArrayData& data);
} // namespace internal
// Sentinel meaning "null count not computed yet".
//
// When slicing, we do not know the null count of the sliced range without
// doing some computation. To avoid doing this eagerly, we set the null count
// to -1 (any negative number will do). When Array::null_count is called the
// first time, the null count will be computed. See ARROW-33
constexpr int64_t kUnknownNullCount = -1;
// ----------------------------------------------------------------------
// Generic array data container
/// \class ArrayData
/// \brief Mutable container for generic Arrow array data
///
/// This data structure is a self-contained representation of the memory and
/// metadata inside an Arrow array data structure (called vectors in Java). The
/// Array class and its concrete subclasses provide strongly-typed accessors
/// with support for the visitor pattern and other affordances.
///
/// This class is designed for easy internal data manipulation, analytical data
/// processing, and data transport to and from IPC messages.
///
/// This class is also useful in an analytics setting where memory may be
/// efficiently reused. For example, computing the Abs of a numeric array
/// should return null iff the input is null: therefore, an Abs function can
/// reuse the validity bitmap (a Buffer) of its input as the validity bitmap
/// of its output.
///
/// This class is meant mostly for immutable data access. Any mutable access
/// (either to ArrayData members or to the contents of its Buffers) should take
/// into account the fact that ArrayData instances are typically wrapped in a
/// shared_ptr and can therefore have multiple owners at any given time.
/// Therefore, mutable access is discouraged except when initially populating
/// the ArrayData.
struct ARROW_EXPORT ArrayData {
  /// \brief Construct an empty ArrayData (no type, zero length, no buffers)
  ArrayData() = default;

  /// \brief Construct an ArrayData without buffers or children
  ///
  /// \param type the data type
  /// \param length the number of logical elements
  /// \param null_count the null count, or kUnknownNullCount if not yet computed
  /// \param offset the logical start point into the buffers (in values)
  ArrayData(std::shared_ptr<DataType> type, int64_t length,
            int64_t null_count = kUnknownNullCount, int64_t offset = 0)
      : type(std::move(type)), length(length), null_count(null_count), offset(offset) {}

  /// \brief Construct an ArrayData with buffers but no children
  ArrayData(std::shared_ptr<DataType> type, int64_t length,
            std::vector<std::shared_ptr<Buffer>> buffers,
            int64_t null_count = kUnknownNullCount, int64_t offset = 0)
      : ArrayData(std::move(type), length, null_count, offset) {
    this->buffers = std::move(buffers);
#ifndef NDEBUG
    // in debug mode, call the `device_type` function to trigger
    // the DCHECKs that validate all the buffers are on the same device
    ARROW_UNUSED(this->device_type());
#endif
  }

  /// \brief Construct an ArrayData with buffers and child data
  ArrayData(std::shared_ptr<DataType> type, int64_t length,
            std::vector<std::shared_ptr<Buffer>> buffers,
            std::vector<std::shared_ptr<ArrayData>> child_data,
            int64_t null_count = kUnknownNullCount, int64_t offset = 0)
      : ArrayData(std::move(type), length, null_count, offset) {
    this->buffers = std::move(buffers);
    this->child_data = std::move(child_data);
#ifndef NDEBUG
    // in debug mode, call the `device_type` function to trigger
    // the DCHECKs that validate all the buffers (including children)
    // are on the same device
    ARROW_UNUSED(this->device_type());
#endif
  }

  // Factory overloads returning a shared ArrayData; they take the same
  // components as the constructors above (plus an optional dictionary) and are
  // defined out of line.
  static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
                                         std::vector<std::shared_ptr<Buffer>> buffers,
                                         int64_t null_count = kUnknownNullCount,
                                         int64_t offset = 0);

  static std::shared_ptr<ArrayData> Make(
      std::shared_ptr<DataType> type, int64_t length,
      std::vector<std::shared_ptr<Buffer>> buffers,
      std::vector<std::shared_ptr<ArrayData>> child_data,
      int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  static std::shared_ptr<ArrayData> Make(
      std::shared_ptr<DataType> type, int64_t length,
      std::vector<std::shared_ptr<Buffer>> buffers,
      std::vector<std::shared_ptr<ArrayData>> child_data,
      std::shared_ptr<ArrayData> dictionary, int64_t null_count = kUnknownNullCount,
      int64_t offset = 0);

  static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
                                         int64_t null_count = kUnknownNullCount,
                                         int64_t offset = 0);

  // Move constructor
  //
  // Written by hand because std::atomic is neither copyable nor movable: the
  // null count is transferred with an explicit load.
  ArrayData(ArrayData&& other) noexcept
      : type(std::move(other.type)),
        length(other.length),
        null_count(other.null_count.load()),
        offset(other.offset),
        buffers(std::move(other.buffers)),
        child_data(std::move(other.child_data)),
        dictionary(std::move(other.dictionary)),
        statistics(std::move(other.statistics)) {}

  // Copy constructor
  //
  // NOTE(review): declared noexcept although copying the vectors may allocate;
  // an allocation failure here would terminate the program.
  ArrayData(const ArrayData& other) noexcept
      : type(other.type),
        length(other.length),
        null_count(other.null_count.load()),
        offset(other.offset),
        buffers(other.buffers),
        child_data(other.child_data),
        dictionary(other.dictionary),
        statistics(other.statistics) {}

  // Move assignment
  //
  // noexcept like the move constructor: every step is a shared_ptr/vector move,
  // a scalar copy or an atomic store, none of which throws.
  ArrayData& operator=(ArrayData&& other) noexcept {
    type = std::move(other.type);
    length = other.length;
    SetNullCount(other.null_count.load());
    offset = other.offset;
    buffers = std::move(other.buffers);
    child_data = std::move(other.child_data);
    dictionary = std::move(other.dictionary);
    statistics = std::move(other.statistics);
    return *this;
  }

  // Copy assignment
  ArrayData& operator=(const ArrayData& other) {
    type = other.type;
    length = other.length;
    SetNullCount(other.null_count.load());
    offset = other.offset;
    buffers = other.buffers;
    child_data = other.child_data;
    dictionary = other.dictionary;
    statistics = other.statistics;
    return *this;
  }

  /// \brief Return a shallow copy of this ArrayData
  std::shared_ptr<ArrayData> Copy() const { return std::make_shared<ArrayData>(*this); }

  /// \brief Deep copy this ArrayData to destination memory manager
  ///
  /// Returns a new ArrayData object with buffers and all child buffers
  /// copied to the destination memory manager. This includes dictionaries
  /// if applicable.
  Result<std::shared_ptr<ArrayData>> CopyTo(
      const std::shared_ptr<MemoryManager>& to) const;

  /// \brief View or copy this ArrayData to destination memory manager
  ///
  /// Tries to view the buffer contents on the given memory manager's device
  /// if possible (to avoid a copy) but falls back to copying if a no-copy view
  /// isn't supported.
  Result<std::shared_ptr<ArrayData>> ViewOrCopyTo(
      const std::shared_ptr<MemoryManager>& to) const;

  /// \brief Return the null-ness of a given array element
  ///
  /// Calling `IsNull(i)` is the same as `!IsValid(i)`.
  bool IsNull(int64_t i) const { return !IsValid(i); }

  /// \brief Return the validity of a given array element
  ///
  /// For most data types, this will simply query the validity bitmap.
  /// For union and run-end-encoded arrays, the underlying child data is
  /// queried instead.
  /// For dictionary arrays, this reflects the validity of the dictionary
  /// index, but the corresponding dictionary value might still be null.
  /// For null arrays, this always returns false.
  bool IsValid(int64_t i) const {
    if (buffers[0] != NULLPTR) {
      return bit_util::GetBit(buffers[0]->data(), i + offset);
    }
    // No validity bitmap: dispatch on the type id.
    const auto type_id = this->type->id();
    if (type_id == Type::SPARSE_UNION) {
      return !internal::IsNullSparseUnion(*this, i);
    }
    if (type_id == Type::DENSE_UNION) {
      return !internal::IsNullDenseUnion(*this, i);
    }
    if (type_id == Type::RUN_END_ENCODED) {
      return !internal::IsNullRunEndEncoded(*this, i);
    }
    // Without a bitmap the elements are either all valid or all null.
    return null_count.load() != length;
  }

  /// \brief Access a buffer's data as a typed C pointer
  ///
  /// \param i the buffer index
  /// \param absolute_offset the offset into the buffer
  ///
  /// If `absolute_offset` is non-zero, the type `T` must match the
  /// layout of buffer number `i` for the array's data type; otherwise
  /// offset computation would be incorrect.
  ///
  /// If the given buffer is bit-packed (such as a validity bitmap, or
  /// the data buffer of a boolean array), then `absolute_offset` must be
  /// zero for correct results, and any bit offset must be applied manually
  /// by the caller.
  ///
  /// Returns nullptr if buffer number `i` is null.
  template <typename T>
  inline const T* GetValues(int i, int64_t absolute_offset) const {
    if (buffers[i]) {
      return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
    } else {
      return NULLPTR;
    }
  }

  /// \brief Access a buffer's data as a typed C pointer
  ///
  /// \param i the buffer index
  ///
  /// This method uses the array's offset to index into buffer number `i`.
  ///
  /// Calling this method on a bit-packed buffer (such as a validity bitmap, or
  /// the data buffer of a boolean array) will lead to incorrect results.
  /// You should instead call `GetValues(i, 0)` and apply the bit offset manually.
  template <typename T>
  inline const T* GetValues(int i) const {
    return GetValues<T>(i, offset);
  }

  /// \brief Access a buffer's data as a typed C pointer
  ///
  /// \param i the buffer index
  /// \param absolute_offset the offset into the buffer
  ///
  /// Like `GetValues(i, absolute_offset)`, but returns nullptr if the given buffer
  /// is not a CPU buffer.
  template <typename T>
  inline const T* GetValuesSafe(int i, int64_t absolute_offset) const {
    if (buffers[i] && buffers[i]->is_cpu()) {
      return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
    } else {
      return NULLPTR;
    }
  }

  /// \brief Access a buffer's data as a typed C pointer
  ///
  /// \param i the buffer index
  ///
  /// Like `GetValues(i)`, but returns nullptr if the given buffer is not a CPU buffer.
  template <typename T>
  inline const T* GetValuesSafe(int i) const {
    return GetValuesSafe<T>(i, offset);
  }

  /// \brief Access a buffer's data as a mutable typed C pointer
  ///
  /// \param i the buffer index
  /// \param absolute_offset the offset into the buffer
  ///
  /// Like `GetValues(i, absolute_offset)`, but allows mutating buffer contents.
  /// This should only be used when initially populating the ArrayData, before
  /// it is attached to an Array instance.
  template <typename T>
  inline T* GetMutableValues(int i, int64_t absolute_offset) {
    if (buffers[i]) {
      return reinterpret_cast<T*>(buffers[i]->mutable_data()) + absolute_offset;
    } else {
      return NULLPTR;
    }
  }

  /// \brief Access a buffer's data as a mutable typed C pointer
  ///
  /// \param i the buffer index
  ///
  /// Like `GetValues(i)`, but allows mutating buffer contents.
  /// This should only be used when initially populating the ArrayData, before
  /// it is attached to an Array instance.
  template <typename T>
  inline T* GetMutableValues(int i) {
    return GetMutableValues<T>(i, offset);
  }

  /// \brief Construct a zero-copy slice of the data with the given offset and length
  ///
  /// This method applies the given slice to this ArrayData, taking into account
  /// its existing offset and length.
  /// If the given `length` is too large, the slice length is clamped so as not
  /// to go past the offset end.
  /// If the given `offset` is too large, or if either `offset` or `length` is negative,
  /// behavior is undefined.
  ///
  /// The associated ArrayStatistics is always discarded in a sliced
  /// ArrayData, even if the slice is trivially equal to the original ArrayData.
  /// If you want to reuse the statistics from the original ArrayData, you must
  /// explicitly reattach them.
  std::shared_ptr<ArrayData> Slice(int64_t offset, int64_t length) const;

  /// \brief Construct a zero-copy slice of the data with the given offset and length
  ///
  /// Like `Slice(offset, length)`, but returns an error if the requested slice
  /// falls out of bounds.
  /// Unlike Slice, `length` isn't clamped to the available buffer size.
  Result<std::shared_ptr<ArrayData>> SliceSafe(int64_t offset, int64_t length) const;

  /// \brief Set the cached physical null count
  ///
  /// \param v the number of nulls in the ArrayData
  ///
  /// This should only be used when initially populating the ArrayData, if
  /// it is possible to compute the null count without visiting the entire validity
  /// bitmap. In most cases, relying on `GetNullCount` is sufficient.
  void SetNullCount(int64_t v) { null_count.store(v); }

  /// \brief Return the physical null count
  ///
  /// This method returns the number of array elements for which `IsValid` would
  /// return false.
  ///
  /// A cached value is returned if already available, otherwise it is first
  /// computed and stored.
  /// How it is computed depends on the data type, see `IsValid` for details.
  ///
  /// Note that this method is typically much faster than calling `IsValid`
  /// for all elements. Therefore, it helps avoid per-element validity bitmap
  /// lookups in the common cases where the array contains zero or only nulls.
  int64_t GetNullCount() const;

  /// \brief Return true if the array may have nulls in its validity bitmap
  ///
  /// This method returns true if the data has a validity bitmap, and the physical
  /// null count is either known to be non-zero or not yet known.
  ///
  /// Unlike `MayHaveLogicalNulls`, this does not check for the presence of nulls
  /// in child data for data types such as unions and run-end encoded types.
  ///
  /// \see HasValidityBitmap
  /// \see MayHaveLogicalNulls
  bool MayHaveNulls() const {
    // If an ArrayData is slightly malformed it may have kUnknownNullCount set
    // but no buffer
    return null_count.load() != 0 && buffers[0] != NULLPTR;
  }

  /// \brief Return true if the array has a validity bitmap
  bool HasValidityBitmap() const { return buffers[0] != NULLPTR; }

  /// \brief Return true if the array may have logical nulls
  ///
  /// Unlike `MayHaveNulls`, this method checks for null child values
  /// for types without a validity bitmap, such as unions and run-end encoded
  /// types, and for null dictionary values for dictionary types.
  ///
  /// This implies that `MayHaveLogicalNulls` may return true for arrays that
  /// don't have a top-level validity bitmap. It is therefore necessary
  /// to call `HasValidityBitmap` before accessing a top-level validity bitmap.
  ///
  /// Code that previously used MayHaveNulls and then dealt with the validity
  /// bitmap directly can be fixed to handle all types correctly without
  /// performance degradation when handling most types by adopting
  /// HasValidityBitmap and MayHaveLogicalNulls.
  ///
  /// Before:
  ///
  ///     uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
  ///     for (int64_t i = 0; i < array.length; ++i) {
  ///       if (validity && !bit_util::GetBit(validity, i)) {
  ///         continue;  // skip a NULL
  ///       }
  ///       ...
  ///     }
  ///
  /// After:
  ///
  ///     bool all_valid = !array.MayHaveLogicalNulls();
  ///     uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR;
  ///     for (int64_t i = 0; i < array.length; ++i) {
  ///       bool is_valid = all_valid ||
  ///                       (validity && bit_util::GetBit(validity, i)) ||
  ///                       array.IsValid(i);
  ///       if (!is_valid) {
  ///         continue;  // skip a NULL
  ///       }
  ///       ...
  ///     }
  bool MayHaveLogicalNulls() const {
    if (buffers[0] != NULLPTR) {
      return null_count.load() != 0;
    }
    const auto t = type->id();
    if (t == Type::SPARSE_UNION || t == Type::DENSE_UNION) {
      return internal::UnionMayHaveLogicalNulls(*this);
    }
    if (t == Type::RUN_END_ENCODED) {
      return internal::RunEndEncodedMayHaveLogicalNulls(*this);
    }
    if (t == Type::DICTIONARY) {
      return internal::DictionaryMayHaveLogicalNulls(*this);
    }
    return null_count.load() != 0;
  }

  /// \brief Compute the logical null count for arrays of all types
  ///
  /// If the array has a validity bitmap, this function behaves the same as
  /// GetNullCount. For arrays that have no validity bitmap but whose values
  /// may be logically null (such as union arrays and run-end encoded arrays),
  /// this function recomputes the null count every time it is called.
  ///
  /// \see GetNullCount
  int64_t ComputeLogicalNullCount() const;

  /// \brief Return the device_type of the underlying buffers and children
  ///
  /// If there are no buffers in this ArrayData object, it just returns
  /// DeviceAllocationType::kCPU as a default. We also assume that all buffers
  /// should be allocated on the same device type and perform DCHECKs to confirm
  /// this in debug mode.
  ///
  /// \return DeviceAllocationType
  DeviceAllocationType device_type() const;

  // The logical data type of the array.
  std::shared_ptr<DataType> type;
  // The number of logical elements.
  int64_t length = 0;
  // Cached null count; kUnknownNullCount means "not computed yet". Mutable so
  // that it can be updated on a const ArrayData.
  mutable std::atomic<int64_t> null_count{0};
  // The logical start point into the physical buffers (in values, not bytes).
  // Note that, for child data, this must be *added* to the child data's own offset.
  int64_t offset = 0;
  // Physical buffers; buffers[0] is the validity bitmap when present.
  std::vector<std::shared_ptr<Buffer>> buffers;
  // Child arrays.
  std::vector<std::shared_ptr<ArrayData>> child_data;

  // The dictionary for this Array, if any. Only used for dictionary type
  std::shared_ptr<ArrayData> dictionary;

  // The statistics for this Array.
  std::shared_ptr<ArrayStatistics> statistics;
};
/// \brief A non-owning Buffer reference
///
/// Holds a raw pointer and a size, plus an optional pointer back to the
/// shared_ptr<Buffer> that owns the memory. Nothing here extends the lifetime
/// of the referenced memory or of the referenced shared_ptr object.
struct ARROW_EXPORT BufferSpan {
// It is the user of this class's responsibility to ensure that
// buffers that were const originally are not written to
// accidentally.
uint8_t* data = NULLPTR;
// Size of the referenced memory; ArraySpan::SetBuffer fills it from
// Buffer::size() and ArraySpan::GetSpan divides it by sizeof(T).
int64_t size = 0;
// Pointer back to buffer that owns this memory
const std::shared_ptr<Buffer>* owner = NULLPTR;
/// \brief Reinterpret `data` as a pointer to const `T` (no offset applied)
template <typename T>
const T* data_as() const {
return reinterpret_cast<const T*>(data);
}
/// \brief Reinterpret `data` as a pointer to mutable `T` (no offset applied)
template <typename T>
T* mutable_data_as() {
return reinterpret_cast<T*>(data);
}
};
/// \brief EXPERIMENTAL: A non-owning array data container
///
/// Unlike ArrayData, this class doesn't own its referenced data type nor data buffers.
/// It is cheaply copyable and can therefore be suitable for use cases where
/// shared_ptr overhead is not acceptable. However, care should be taken to
/// keep alive the referenced objects and memory while the ArraySpan object is in use.
/// For this reason, this should not be exposed in most public APIs (apart from
/// compute kernel interfaces).
struct ARROW_EXPORT ArraySpan {
const DataType* type = NULLPTR;
int64_t length = 0;
mutable int64_t null_count = kUnknownNullCount;
int64_t offset = 0;
BufferSpan buffers[3];
ArraySpan() = default;
explicit ArraySpan(const DataType* type, int64_t length) : type(type), length(length) {}
ArraySpan(const ArrayData& data) { // NOLINT implicit conversion
SetMembers(data);
}
explicit ArraySpan(const Scalar& data) { FillFromScalar(data); }
/// If dictionary-encoded, put dictionary in the first entry
std::vector<ArraySpan> child_data;
/// \brief Populate ArraySpan to look like an array of length 1 pointing at
/// the data members of a Scalar value
void FillFromScalar(const Scalar& value);
void SetMembers(const ArrayData& data);
void SetBuffer(int index, const std::shared_ptr<Buffer>& buffer) {
this->buffers[index].data = const_cast<uint8_t*>(buffer->data());
this->buffers[index].size = buffer->size();
this->buffers[index].owner = &buffer;
}
const ArraySpan& dictionary() const { return child_data[0]; }
/// \brief Return the number of buffers (out of 3) that are used to
/// constitute this array
int num_buffers() const;
// Access a buffer's data as a typed C pointer
template <typename T>
inline T* GetValues(int i, int64_t absolute_offset) {
return reinterpret_cast<T*>(buffers[i].data) + absolute_offset;
}
template <typename T>
inline T* GetValues(int i) {
return GetValues<T>(i, this->offset);
}
// Access a buffer's data as a typed C pointer
template <typename T>
inline const T* GetValues(int i, int64_t absolute_offset) const {
return reinterpret_cast<const T*>(buffers[i].data) + absolute_offset;
}
template <typename T>
inline const T* GetValues(int i) const {
return GetValues<T>(i, this->offset);
}
/// \brief Access a buffer's data as a span
///
/// \param i The buffer index
/// \param length The required length (in number of typed values) of the requested span
/// \pre i > 0
/// \pre length <= the length of the buffer (in number of values) that's expected for
/// this array type
/// \return A span<const T> of the requested length
template <typename T>
util::span<const T> GetSpan(int i, int64_t length) const {
const int64_t buffer_length = buffers[i].size / static_cast<int64_t>(sizeof(T));
assert(i > 0 && length + offset <= buffer_length);
ARROW_UNUSED(buffer_length);
return util::span<const T>(buffers[i].data_as<T>() + this->offset, length);
}
/// \brief Access a buffer's data as a span
///
/// \param i The buffer index
/// \param length The required length (in number of typed values) of the requested span
/// \pre i > 0
/// \pre length <= the length of the buffer (in number of values) that's expected for
/// this array type
/// \return A span<T> of the requested length
template <typename T>
util::span<T> GetSpan(int i, int64_t length) {
const int64_t buffer_length = buffers[i].size / static_cast<int64_t>(sizeof(T));
assert(i > 0 && length + offset <= buffer_length);
ARROW_UNUSED(buffer_length);
return util::span<T>(buffers[i].mutable_data_as<T>() + this->offset, length);
}
inline bool IsNull(int64_t i) const { return !IsValid(i); }
inline bool IsValid(int64_t i) const {
if (this->buffers[0].data != NULLPTR) {
return bit_util::GetBit(this->buffers[0].data, i + this->offset);
} else {
const auto type = this->type->id();
if (type == Type::SPARSE_UNION) {
return !IsNullSparseUnion(i);
}
if (type == Type::DENSE_UNION) {
return !IsNullDenseUnion(i);
}
if (type == Type::RUN_END_ENCODED) {
return !IsNullRunEndEncoded(i);
}
return this->null_count != this->length;
}
}
std::shared_ptr<ArrayData> ToArrayData() const;
std::shared_ptr<Array> ToArray() const;
std::shared_ptr<Buffer> GetBuffer(int index) const {
const BufferSpan& buf = this->buffers[index];
if (buf.owner) {
return *buf.owner;
} else if (buf.data != NULLPTR) {
// Buffer points to some memory without an owning buffer
return std::make_shared<Buffer>(buf.data, buf.size);
} else {
return NULLPTR;
}
}
void SetSlice(int64_t offset, int64_t length) {
this->offset = offset;
this->length = length;
if (this->type->id() == Type::NA) {
this->null_count = this->length;
} else if (this->MayHaveNulls()) {
this->null_count = kUnknownNullCount;
} else {
this->null_count = 0;
}
}
/// \brief Return physical null count, or compute and set it if it's not known
int64_t GetNullCount() const;
/// \brief Return true if the array has a validity bitmap and the physical null
/// count is known to be non-zero or not yet known
///
/// Note that this is not the same as MayHaveLogicalNulls, which also checks
/// for the presence of nulls in child data for types like unions and run-end
/// encoded types.
///
/// \see HasValidityBitmap
/// \see MayHaveLogicalNulls
bool MayHaveNulls() const {
// If an ArrayData is slightly malformed it may have kUnknownNullCount set
// but no buffer
return null_count != 0 && buffers[0].data != NULLPTR;
}
/// \brief Return true if the array has a validity bitmap
bool HasValidityBitmap() const { return buffers[0].data != NULLPTR; }
/// \brief Return true if the validity bitmap may have 0's in it, or if the
/// child arrays (in the case of types without a validity bitmap) may have
/// nulls, or if the dictionary of dictionay array may have nulls.
///
/// \see ArrayData::MayHaveLogicalNulls
bool MayHaveLogicalNulls() const {
if (buffers[0].data != NULLPTR) {
return null_count != 0;
}
const auto t = type->id();
if (t == Type::SPARSE_UNION || t == Type::DENSE_UNION) {
return UnionMayHaveLogicalNulls();
}
if (t == Type::RUN_END_ENCODED) {
return RunEndEncodedMayHaveLogicalNulls();
}
if (t == Type::DICTIONARY) {
return DictionaryMayHaveLogicalNulls();
}
return null_count != 0;
}
/// \brief Compute the logical null count for arrays of all types including
/// those that do not have a validity bitmap like union and run-end encoded
/// arrays
///
/// If the array has a validity bitmap, this function behaves the same as
/// GetNullCount. For types that have no validity bitmap, this function will
/// recompute the logical null count every time it is called.
///
/// \see GetNullCount
int64_t ComputeLogicalNullCount() const;
/// Some DataTypes (StringView, BinaryView) may have an arbitrary number of variadic
/// buffers. Since ArraySpan only has 3 buffers, we pack the variadic buffers into
/// buffers[2]; IE buffers[2].data points to the first shared_ptr<Buffer> of the
/// variadic set and buffers[2].size is the number of variadic buffers times
/// sizeof(shared_ptr<Buffer>).
///
/// \see HasVariadicBuffers
util::span<const std::shared_ptr<Buffer>> GetVariadicBuffers() const;
bool HasVariadicBuffers() const;
private:
ARROW_FRIEND_EXPORT friend bool internal::IsNullRunEndEncoded(const ArrayData& data,
int64_t i);
bool IsNullSparseUnion(int64_t i) const;
bool IsNullDenseUnion(int64_t i) const;
/// \brief Return true if the value at logical index i is null
///
/// This function uses binary-search, so it has a O(log N) cost.
/// Iterating over the whole array and calling IsNull is O(N log N), so
/// for better performance it is recommended to use a
/// ree_util::RunEndEncodedArraySpan to iterate run by run instead.
bool IsNullRunEndEncoded(int64_t i) const;
bool UnionMayHaveLogicalNulls() const;
bool RunEndEncodedMayHaveLogicalNulls() const;
bool DictionaryMayHaveLogicalNulls() const;
};
namespace internal {
/// NOTE(review): only the declaration is visible here. Judging by the name,
/// this presumably populates `span` so that it describes a zero-length array
/// of `type` -- confirm against the definition.
///
/// \param[in] type the data type passed for the span
/// \param[in,out] span the ArraySpan passed by pointer
void FillZeroLengthArray(const DataType* type, ArraySpan* span);
/// \brief Construct a zero-copy view of the given ArrayData with the given type.
///
/// This function checks if the types are layout-compatible.
/// Nested types are traversed in depth-first order. Data buffers must have
/// the same item sizes, even though the logical types may be different.
/// An error is returned if the types are not layout-compatible.
///
/// \param[in] data the ArrayData to view
/// \param[in] type the type to view `data` as
/// \return the viewing ArrayData, or an error if the types are not
/// layout-compatible
ARROW_EXPORT
Result<std::shared_ptr<ArrayData>> GetArrayView(const std::shared_ptr<ArrayData>& data,
const std::shared_ptr<DataType>& type);
} // namespace internal
} // namespace arrow

View File

@@ -0,0 +1,76 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <functional>
#include <iosfwd>
#include <memory>
#include "arrow/array/array_base.h"
#include "arrow/array/array_nested.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \brief Compare two arrays, returning an edit script which expresses the difference
/// between them
///
/// An edit script is an array of struct(insert: bool, run_length: int64_t).
/// Each element of "insert" determines whether an element was inserted into (true)
/// or deleted from (false) base. Each insertion or deletion is followed by a run of
/// elements which are unchanged from base to target; the length of this run is stored
/// in "run_length".
///
/// Note that the edit script begins and ends with a run of shared elements, but both
/// fields of the struct must have the same length. To accommodate this, the first
/// element of "insert" carries no meaning and should be ignored.
///
/// For example, for base "hlloo" and target "hello", the edit script would be
/// [
///   {"insert": false, "run_length": 1}, // leading run of length 1 ("h")
///   {"insert": true, "run_length": 3},  // insert("e") then a run of length 3 ("llo")
///   {"insert": false, "run_length": 0}  // delete("o") then an empty run
/// ]
///
/// Diffing arrays containing nulls is not currently supported.
///
/// \param[in] base baseline for comparison
/// \param[in] target an array of identical type to base whose elements differ from base's
/// \param[in] pool memory to store the result will be allocated from this memory pool
/// (default_memory_pool() when omitted)
/// \return an edit script array which can be applied to base to produce target;
/// failures are reported through the Result
ARROW_EXPORT
Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
MemoryPool* pool = default_memory_pool());
/// \brief Visitor interface for easy traversal of an edit script
///
/// The visitor will be called for each hunk of insertions and deletions.
///
/// \param[in] edits the edit script to traverse
/// \param[in] visitor callback invoked once per hunk with the arguments
/// (delete_begin, delete_end, insert_begin, insert_end).
/// NOTE(review): these presumably delimit the deleted range in base and the
/// inserted range in target -- confirm index semantics (e.g. whether the end
/// is exclusive) against the definition.
/// \return a Status describing the outcome of the traversal
ARROW_EXPORT Status VisitEditScript(
const Array& edits,
const std::function<Status(int64_t delete_begin, int64_t delete_end,
int64_t insert_begin, int64_t insert_end)>& visitor);
/// \brief Return a function which will format an edit script in unified
/// diff format to os, given base and target arrays of the given type
///
/// \param[in] type the type of the base and target arrays the formatter will
/// be given
/// \param[in] os the stream the formatter writes to.
/// NOTE(review): os is taken by pointer, so it presumably has to outlive the
/// returned function -- confirm.
/// \return a function taking (edits, base, target) and returning a Status, or
/// an error if no formatter could be made
ARROW_EXPORT Result<
std::function<Status(const Array& edits, const Array& base, const Array& target)>>
MakeUnifiedDiffFormatter(const DataType& type, std::ostream* os);
} // namespace arrow

View File

@@ -0,0 +1,167 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <optional>
#include <string>
#include <variant>
#include "arrow/compare.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \class ArrayStatistics
/// \brief Statistics for an Array
///
/// Apache Arrow format doesn't have statistics but data source such
/// as Apache Parquet may have statistics. Statistics associated with
/// data source can be read unified API via this class.
struct ARROW_EXPORT ArrayStatistics {
/// \brief The type for maximum and minimum values. If the target
/// value exists, one of them is used. `std::nullopt` is used
/// otherwise.
using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>;
using NumericType = std::variant<int64_t, double>;
using CountType = NumericType;
using SizeType = NumericType;
static const std::shared_ptr<DataType>& ValueToArrowType(
const std::optional<ValueType>& value,
const std::shared_ptr<DataType>& array_type) {
if (!value.has_value()) {
return null();
}
struct Visitor {
const std::shared_ptr<DataType>& array_type;
const std::shared_ptr<DataType>& operator()(const bool&) { return boolean(); }
const std::shared_ptr<DataType>& operator()(const int64_t&) { return int64(); }
const std::shared_ptr<DataType>& operator()(const uint64_t&) { return uint64(); }
const std::shared_ptr<DataType>& operator()(const double&) { return float64(); }
const std::shared_ptr<DataType>& operator()(const std::string&) {
switch (array_type->id()) {
case Type::STRING:
case Type::BINARY:
case Type::FIXED_SIZE_BINARY:
case Type::LARGE_STRING:
case Type::LARGE_BINARY:
case Type::BINARY_VIEW:
case Type::STRING_VIEW:
return array_type;
default:
return utf8();
}
}
} visitor{array_type};
return std::visit(visitor, value.value());
}
/// \brief The number of null values, may not be set
std::optional<int64_t> null_count = std::nullopt;
/// \brief The number of distinct values, may not be set
/// Note: when set to `int64_t`, it represents `exact_distinct_count`,
/// and when set to `double`, it represents `approximate_distinct_count`.
std::optional<CountType> distinct_count = std::nullopt;
/// \brief The maximum length in bytes of the rows in an array; may not be set
/// Note: when the type is `int64_t`, it represents `max_byte_width_exact`,
/// and when the type is `double`, it represents `max_byte_width_approximate`.
std::optional<SizeType> max_byte_width = std::nullopt;
/// \brief The average size in bytes of a row in an array, may not be set.
std::optional<double> average_byte_width = std::nullopt;
/// \brief Whether the average size in bytes is exact or not.
bool is_average_byte_width_exact = false;
/// \brief The minimum value, may not be set
std::optional<ValueType> min = std::nullopt;
/// \brief Compute Arrow type of the minimum value.
///
/// If \ref ValueType is `std::string`, `array_type` may be
/// used. If `array_type` is a binary-like type such as \ref
/// arrow::binary and \ref arrow::large_utf8, `array_type` is
/// returned. \ref arrow::utf8 is returned otherwise.
///
/// If \ref ValueType isn't `std::string`, `array_type` isn't used.
///
/// \param array_type The Arrow type of the associated array.
///
/// \return \ref arrow::null if the minimum value is `std::nullopt`,
/// Arrow type based on \ref ValueType of the \ref min
/// otherwise.
const std::shared_ptr<DataType>& MinArrowType(
const std::shared_ptr<DataType>& array_type) {
return ValueToArrowType(min, array_type);
}
/// \brief Whether the minimum value is exact or not
bool is_min_exact = false;
/// \brief The maximum value, may not be set
std::optional<ValueType> max = std::nullopt;
/// \brief Compute Arrow type of the maximum value.
///
/// If \ref ValueType is `std::string`, `array_type` may be
/// used. If `array_type` is a binary-like type such as \ref
/// arrow::binary and \ref arrow::large_utf8, `array_type` is
/// returned. \ref arrow::utf8 is returned otherwise.
///
/// If \ref ValueType isn't `std::string`, `array_type` isn't used.
///
/// \param array_type The Arrow type of the associated array.
///
/// \return \ref arrow::null if the maximum value is `std::nullopt`,
/// Arrow type based on \ref ValueType of the \ref max
/// otherwise.
const std::shared_ptr<DataType>& MaxArrowType(
const std::shared_ptr<DataType>& array_type) {
return ValueToArrowType(max, array_type);
}
/// \brief Whether the maximum value is exact or not
bool is_max_exact = false;
/// \brief Check two \ref arrow::ArrayStatistics for equality
///
/// \param other The \ref arrow::ArrayStatistics instance to compare against.
///
/// \param equal_options Options used to compare double values for equality.
///
/// \return True if the two \ref arrow::ArrayStatistics instances are equal; otherwise,
/// false.
bool Equals(const ArrayStatistics& other,
const EqualOptions& equal_options = EqualOptions::Defaults()) const {
return ArrayStatisticsEquals(*this, other, equal_options);
}
/// \brief Check two statistics for equality
bool operator==(const ArrayStatistics& other) const { return Equals(other); }
/// \brief Check two statistics for not equality
bool operator!=(const ArrayStatistics& other) const { return !Equals(other); }
};
} // namespace arrow

View File

@@ -0,0 +1,96 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <vector>
#include "arrow/array/data.h"
#include "arrow/compare.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \defgroup array-factories Array factory functions
///
/// @{
/// \brief Create a strongly-typed Array instance from generic ArrayData
///
/// Unlike the other factories in this group, this one returns the Array
/// directly rather than a Result, and takes no memory pool.
///
/// \param[in] data the array contents
/// \return the resulting Array instance
ARROW_EXPORT
std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data);
/// \brief Create a strongly-typed Array instance with all elements null
/// \param[in] type the array type
/// \param[in] length the array length
/// \param[in] pool the memory pool to allocate memory from
/// (default_memory_pool() when omitted)
/// \return the resulting Array; failures are reported through the Result
ARROW_EXPORT
Result<std::shared_ptr<Array>> MakeArrayOfNull(const std::shared_ptr<DataType>& type,
int64_t length,
MemoryPool* pool = default_memory_pool());
/// \brief Create an Array instance whose slots are the given scalar
/// \param[in] scalar the value with which to fill the array
/// \param[in] length the array length
/// \param[in] pool the memory pool to allocate memory from
/// (default_memory_pool() when omitted)
/// \return the resulting Array; failures are reported through the Result
ARROW_EXPORT
Result<std::shared_ptr<Array>> MakeArrayFromScalar(
const Scalar& scalar, int64_t length, MemoryPool* pool = default_memory_pool());
/// \brief Create an empty Array of a given type
///
/// The output Array will be of the given type.
///
/// \param[in] type the data type of the empty Array (the shared_ptr is taken
/// by value)
/// \param[in] pool the memory pool to allocate memory from
/// (default_memory_pool() when omitted)
/// \return the resulting Array; failures are reported through the Result
ARROW_EXPORT
Result<std::shared_ptr<Array>> MakeEmptyArray(std::shared_ptr<DataType> type,
MemoryPool* pool = default_memory_pool());
/// @}
namespace internal {
/// \brief Swap endian of each element in a generic ArrayData
///
/// As dictionaries are often shared between different arrays, dictionaries
/// are not swapped by this function and should be handled separately.
///
/// \param[in] data the array contents
/// \param[in] pool the memory pool to allocate memory from
/// (default_memory_pool() when omitted)
/// \return the resulting ArrayData whose elements were swapped; failures are
/// reported through the Result
ARROW_EXPORT
Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
const std::shared_ptr<ArrayData>& data, MemoryPool* pool = default_memory_pool());
/// \brief Rechunk several chunked arrays so that they all share the same
/// chunk layout
///
/// Given a number of ArrayVectors, treat each ArrayVector as the
/// chunks of a chunked array. Then rechunk each ArrayVector such that
/// all ArrayVectors are chunked identically. It is mandatory that
/// all ArrayVectors contain the same total number of elements.
///
/// The (unnamed) parameter is the list of ArrayVectors to rechunk.
///
/// \return the rechunked ArrayVectors.
/// NOTE(review): presumably one output ArrayVector per input, in the same
/// order -- confirm against the definition.
ARROW_EXPORT
std::vector<ArrayVector> RechunkArraysConsistently(const std::vector<ArrayVector>&);
} // namespace internal
} // namespace arrow

View File

@@ -0,0 +1,56 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace internal {
// Internal functions implementing Array::Validate() and friends.
// Every function below comes as a pair of overloads, one taking an Array and
// one taking an ArrayData, and reports its outcome as a Status.

// O(1) array metadata validation
ARROW_EXPORT
Status ValidateArray(const Array& array);
// Same as above, for an ArrayData.
ARROW_EXPORT
Status ValidateArray(const ArrayData& data);
// O(N) array data validation.
// Note that, starting from 7.0.0, "full" routines also validate metadata.
// Before, ValidateArray() needed to be called before ValidateArrayFull()
// to ensure metadata correctness, otherwise invalid memory accesses
// may occur.
ARROW_EXPORT
Status ValidateArrayFull(const Array& array);
// Same as above, for an ArrayData.
ARROW_EXPORT
Status ValidateArrayFull(const ArrayData& data);
// NOTE(review): undocumented in this header. Judging by the name, these
// presumably check that the string data of the array is valid UTF-8 --
// confirm against the definition, including which types are accepted.
ARROW_EXPORT
Status ValidateUTF8(const Array& array);
ARROW_EXPORT
Status ValidateUTF8(const ArrayData& data);
} // namespace internal
} // namespace arrow