Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,54 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// NOTE: API is EXPERIMENTAL and will change without going through a
// deprecation cycle
#pragma once
/// \defgroup compute-functions Abstract compute function API
/// @{
/// @}
/// \defgroup compute-concrete-options Concrete option classes for compute functions
/// @{
/// @}
#include "arrow/compute/api_aggregate.h" // IWYU pragma: export
#include "arrow/compute/api_scalar.h" // IWYU pragma: export
#include "arrow/compute/api_vector.h" // IWYU pragma: export
#include "arrow/compute/cast.h" // IWYU pragma: export
#include "arrow/compute/function.h" // IWYU pragma: export
#include "arrow/compute/function_options.h" // IWYU pragma: export
#include "arrow/compute/initialize.h" // IWYU pragma: export
#include "arrow/compute/kernel.h" // IWYU pragma: export
#include "arrow/compute/registry.h" // IWYU pragma: export
#include "arrow/datum.h" // IWYU pragma: export
#include "arrow/compute/expression.h" // IWYU pragma: export
/// \defgroup execnode-row Utilities for working with data in a row-major format
/// @{
/// @}
#include "arrow/compute/row/grouper.h" // IWYU pragma: export
/// \defgroup acero-internals Acero internals, useful for those extending Acero
/// @{
/// @}
#include "arrow/compute/exec.h" // IWYU pragma: export

View File

@@ -0,0 +1,596 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Eager evaluation convenience APIs for invoking common functions, including
// necessary memory allocations
#pragma once
#include <vector>
#include "arrow/compute/function_options.h"
#include "arrow/datum.h"
#include "arrow/result.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
class Array;
namespace compute {
class ExecContext;
// ----------------------------------------------------------------------
// Aggregate functions
/// \addtogroup compute-concrete-options
/// @{
/// \brief Control general scalar aggregate kernel behavior
///
/// By default, null values are ignored (skip_nulls = true).
class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
public:
explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1);
static constexpr const char kTypeName[] = "ScalarAggregateOptions";
static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; }
/// If true (the default), null values are ignored. Otherwise, if any value is null,
/// emit null.
bool skip_nulls;
/// If less than this many non-null values are observed, emit null.
uint32_t min_count;
};
/// \brief Control count aggregate kernel behavior.
///
/// By default, only non-null values are counted.
class ARROW_EXPORT CountOptions : public FunctionOptions {
public:
enum CountMode {
/// Count only non-null values.
ONLY_VALID = 0,
/// Count only null values.
ONLY_NULL,
/// Count both non-null and null values.
ALL,
};
explicit CountOptions(CountMode mode = CountMode::ONLY_VALID);
static constexpr const char kTypeName[] = "CountOptions";
static CountOptions Defaults() { return CountOptions{}; }
CountMode mode;
};
/// \brief Control Mode kernel behavior
///
/// Returns top-n common values and counts.
/// By default, returns the most common value and count.
class ARROW_EXPORT ModeOptions : public FunctionOptions {
public:
explicit ModeOptions(int64_t n = 1, bool skip_nulls = true, uint32_t min_count = 0);
static constexpr const char kTypeName[] = "ModeOptions";
static ModeOptions Defaults() { return ModeOptions{}; }
int64_t n = 1;
/// If true (the default), null values are ignored. Otherwise, if any value is null,
/// emit null.
bool skip_nulls;
/// If less than this many non-null values are observed, emit null.
uint32_t min_count;
};
/// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel
///
/// The divisor used in calculations is N - ddof, where N is the number of elements.
/// By default, ddof is zero, and population variance or stddev is returned.
class ARROW_EXPORT VarianceOptions : public FunctionOptions {
public:
explicit VarianceOptions(int ddof = 0, bool skip_nulls = true, uint32_t min_count = 0);
static constexpr const char kTypeName[] = "VarianceOptions";
static VarianceOptions Defaults() { return VarianceOptions{}; }
int ddof = 0;
/// If true (the default), null values are ignored. Otherwise, if any value is null,
/// emit null.
bool skip_nulls;
/// If less than this many non-null values are observed, emit null.
uint32_t min_count;
};
/// \brief Control Skew and Kurtosis kernel behavior
class ARROW_EXPORT SkewOptions : public FunctionOptions {
public:
explicit SkewOptions(bool skip_nulls = true, bool biased = true,
uint32_t min_count = 0);
static constexpr const char kTypeName[] = "SkewOptions";
static SkewOptions Defaults() { return SkewOptions{}; }
/// If true (the default), null values are ignored. Otherwise, if any value is null,
/// emit null.
bool skip_nulls;
/// If true (the default), the calculated value is biased. If false, the calculated
/// value includes a correction factor to reduce bias, making it more accurate for
/// small sample sizes.
bool biased;
/// If less than this many non-null values are observed, emit null.
uint32_t min_count;
};
/// \brief Control Quantile kernel behavior
///
/// By default, returns the median value.
class ARROW_EXPORT QuantileOptions : public FunctionOptions {
public:
/// Interpolation method to use when quantile lies between two data points
enum Interpolation {
LINEAR = 0,
LOWER,
HIGHER,
NEAREST,
MIDPOINT,
};
explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR,
bool skip_nulls = true, uint32_t min_count = 0);
explicit QuantileOptions(std::vector<double> q,
enum Interpolation interpolation = LINEAR,
bool skip_nulls = true, uint32_t min_count = 0);
static constexpr const char kTypeName[] = "QuantileOptions";
static QuantileOptions Defaults() { return QuantileOptions{}; }
/// probability level of quantile must be between 0 and 1 inclusive
std::vector<double> q;
enum Interpolation interpolation;
/// If true (the default), null values are ignored. Otherwise, if any value is null,
/// emit null.
bool skip_nulls;
/// If less than this many non-null values are observed, emit null.
uint32_t min_count;
};
/// \brief Control TDigest approximate quantile kernel behavior
///
/// By default, returns the median value.
class ARROW_EXPORT TDigestOptions : public FunctionOptions {
public:
explicit TDigestOptions(double q = 0.5, uint32_t delta = 100,
uint32_t buffer_size = 500, bool skip_nulls = true,
uint32_t min_count = 0);
explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100,
uint32_t buffer_size = 500, bool skip_nulls = true,
uint32_t min_count = 0);
static constexpr const char kTypeName[] = "TDigestOptions";
static TDigestOptions Defaults() { return TDigestOptions{}; }
/// probability level of quantile must be between 0 and 1 inclusive
std::vector<double> q;
/// compression parameter, default 100
uint32_t delta;
/// input buffer size, default 500
uint32_t buffer_size;
/// If true (the default), null values are ignored. Otherwise, if any value is null,
/// emit null.
bool skip_nulls;
/// If less than this many non-null values are observed, emit null.
uint32_t min_count;
};
/// \brief Control Pivot kernel behavior
///
/// These options apply to the "pivot_wider" and "hash_pivot_wider" functions.
///
/// Constraints:
/// - The corresponding `Aggregate::target` must have two FieldRef elements;
/// the first one points to the pivot key column, the second points to the
/// pivoted data column.
/// - The pivot key column can be string, binary or integer; its values will be
/// matched against `key_names` in order to dispatch the pivoted data into
/// the output. If the pivot key column is not string-like, the `key_names`
/// will be cast to the pivot key type.
///
/// "pivot_wider" example
/// ---------------------
///
/// Assuming the following two input columns with types utf8 and int16 (respectively):
/// ```
/// width | 11
/// height | 13
/// ```
/// and the options `PivotWiderOptions(.key_names = {"height", "width"})`
///
/// then the output will be a scalar with the type
/// `struct{"height": int16, "width": int16}`
/// and the value `{"height": 13, "width": 11}`.
///
/// "hash_pivot_wider" example
/// --------------------------
///
/// Assuming the following input with schema
/// `{"group": int32, "key": utf8, "value": int16}`:
/// ```
/// group | key | value
/// -----------------------------
/// 1 | height | 11
/// 1 | width | 12
/// 2 | width | 13
/// 3 | height | 14
/// 3 | depth | 15
/// ```
/// and the following settings:
/// - a hash grouping key "group"
/// - Aggregate(
/// .function = "hash_pivot_wider",
/// .options = PivotWiderOptions(.key_names = {"height", "width"}),
/// .target = {"key", "value"},
/// .name = {"properties"})
///
/// then the output will have the schema
/// `{"group": int32, "properties": struct{"height": int16, "width": int16}}`
/// and the following value:
/// ```
/// group | properties
/// | height | width
/// -----------------------------
/// 1 | 11 | 12
/// 2 | null | 13
/// 3 | 14 | null
/// ```
class ARROW_EXPORT PivotWiderOptions : public FunctionOptions {
public:
/// Configure the behavior of pivot keys not in `key_names`
enum UnexpectedKeyBehavior {
/// Unexpected pivot keys are ignored silently
kIgnore,
/// Unexpected pivot keys return a KeyError
kRaise
};
explicit PivotWiderOptions(std::vector<std::string> key_names,
UnexpectedKeyBehavior unexpected_key_behavior = kIgnore);
// Default constructor for serialization
PivotWiderOptions();
static constexpr const char kTypeName[] = "PivotWiderOptions";
static PivotWiderOptions Defaults() { return PivotWiderOptions{}; }
/// The values expected in the pivot key column
std::vector<std::string> key_names;
/// The behavior when pivot keys not in `key_names` are encountered
UnexpectedKeyBehavior unexpected_key_behavior = kIgnore;
};
/// \brief Control Index kernel behavior
class ARROW_EXPORT IndexOptions : public FunctionOptions {
public:
explicit IndexOptions(std::shared_ptr<Scalar> value);
// Default constructor for serialization
IndexOptions();
static constexpr const char kTypeName[] = "IndexOptions";
std::shared_ptr<Scalar> value;
};
/// \brief Configure a grouped aggregation
struct ARROW_EXPORT Aggregate {
Aggregate() = default;
Aggregate(std::string function, std::shared_ptr<FunctionOptions> options,
std::vector<FieldRef> target, std::string name = "")
: function(std::move(function)),
options(std::move(options)),
target(std::move(target)),
name(std::move(name)) {}
Aggregate(std::string function, std::shared_ptr<FunctionOptions> options,
FieldRef target, std::string name = "")
: Aggregate(std::move(function), std::move(options),
std::vector<FieldRef>{std::move(target)}, std::move(name)) {}
Aggregate(std::string function, FieldRef target, std::string name)
: Aggregate(std::move(function), /*options=*/NULLPTR,
std::vector<FieldRef>{std::move(target)}, std::move(name)) {}
Aggregate(std::string function, std::string name)
: Aggregate(std::move(function), /*options=*/NULLPTR,
/*target=*/std::vector<FieldRef>{}, std::move(name)) {}
/// the name of the aggregation function
std::string function;
/// options for the aggregation function
std::shared_ptr<FunctionOptions> options;
/// zero or more fields to which aggregations will be applied
std::vector<FieldRef> target;
/// optional output field name for aggregations
std::string name;
};
/// @}
/// \brief Count values in an array.
///
/// \param[in] options counting options, see CountOptions for more information
/// \param[in] datum to count
/// \param[in] ctx the function execution context, optional
/// \return out resulting datum
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Count(const Datum& datum,
const CountOptions& options = CountOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Compute the mean of a numeric array.
///
/// \param[in] value datum to compute the mean, expecting Array
/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed mean as a DoubleScalar
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Mean(
const Datum& value,
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Compute the product of values of a numeric array.
///
/// \param[in] value datum to compute product of, expecting Array or ChunkedArray
/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed sum as a Scalar
///
/// \since 6.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Product(
const Datum& value,
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Sum values of a numeric array.
///
/// \param[in] value datum to sum, expecting Array or ChunkedArray
/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed sum as a Scalar
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Sum(
const Datum& value,
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Calculate the first value of an array
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed first as Scalar
///
/// \since 13.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> First(
const Datum& value,
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Calculate the last value of an array
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed last as a Scalar
///
/// \since 13.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Last(
const Datum& value,
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Calculate the min / max of a numeric array
///
/// This function returns both the min and max as a struct scalar, with type
/// struct<min: T, max: T>, where T is the input type
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as a struct<min: T, max: T> scalar
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> MinMax(
const Datum& value,
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Test whether any element in a boolean array evaluates to true.
///
/// This function returns true if any of the elements in the array evaluates
/// to true and false otherwise. Null values are ignored by default.
/// If null values are taken into account by setting ScalarAggregateOptions
/// parameter skip_nulls = false then Kleene logic is used.
/// See KleeneOr for more details on Kleene logic.
///
/// \param[in] value input datum, expecting a boolean array
/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as a BooleanScalar
///
/// \since 3.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Any(
const Datum& value,
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Test whether all elements in a boolean array evaluate to true.
///
/// This function returns true if all of the elements in the array evaluate
/// to true and false otherwise. Null values are ignored by default.
/// If null values are taken into account by setting ScalarAggregateOptions
/// parameter skip_nulls = false then Kleene logic is used.
/// See KleeneAnd for more details on Kleene logic.
///
/// \param[in] value input datum, expecting a boolean array
/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as a BooleanScalar
/// \since 3.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> All(
const Datum& value,
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Calculate the modal (most common) value of a numeric array
///
/// This function returns top-n most common values and number of times they occur as
/// an array of `struct<mode: T, count: int64>`, where T is the input type.
/// Values with larger counts are returned before smaller ones.
/// If there are more than one values with same count, smaller value is returned first.
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see ModeOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as an array of struct<mode: T, count: int64>
///
/// \since 2.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Mode(const Datum& value,
const ModeOptions& options = ModeOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Calculate the standard deviation of a numeric array
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see VarianceOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed standard deviation as a DoubleScalar
///
/// \since 2.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Stddev(const Datum& value,
const VarianceOptions& options = VarianceOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Calculate the variance of a numeric array
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see VarianceOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed variance as a DoubleScalar
///
/// \since 2.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Variance(const Datum& value,
const VarianceOptions& options = VarianceOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Calculate the skewness of a numeric array
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see SkewOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed skewness as a DoubleScalar
///
/// \since 20.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Skew(const Datum& value,
const SkewOptions& options = SkewOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Calculate the kurtosis of a numeric array
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see SkewOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed kurtosis as a DoubleScalar
///
/// \since 20.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Kurtosis(const Datum& value,
const SkewOptions& options = SkewOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Calculate the quantiles of a numeric array
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see QuantileOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as an array
///
/// \since 4.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Quantile(const Datum& value,
const QuantileOptions& options = QuantileOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Calculate the approximate quantiles of a numeric array with T-Digest algorithm
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see TDigestOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as an array
///
/// \since 4.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> TDigest(const Datum& value,
const TDigestOptions& options = TDigestOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Find the first index of a value in an array.
///
/// \param[in] value The array to search.
/// \param[in] options The array to search for. See IndexOptions.
/// \param[in] ctx the function execution context, optional
/// \return out a Scalar containing the index (or -1 if not found).
///
/// \since 5.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Index(const Datum& value, const IndexOptions& options,
ExecContext* ctx = NULLPTR);
} // namespace compute
} // namespace arrow

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,834 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <utility>
#include "arrow/compute/function_options.h"
#include "arrow/compute/ordering.h"
#include "arrow/result.h"
#include "arrow/type_fwd.h"
namespace arrow {
namespace compute {
class ExecContext;
/// \addtogroup compute-concrete-options
/// @{
class ARROW_EXPORT FilterOptions : public FunctionOptions {
public:
/// Configure the action taken when a slot of the selection mask is null
enum NullSelectionBehavior {
/// The corresponding filtered value will be removed in the output.
DROP,
/// The corresponding filtered value will be null in the output.
EMIT_NULL,
};
explicit FilterOptions(NullSelectionBehavior null_selection = DROP);
static constexpr const char kTypeName[] = "FilterOptions";
static FilterOptions Defaults() { return FilterOptions(); }
NullSelectionBehavior null_selection_behavior = DROP;
};
class ARROW_EXPORT TakeOptions : public FunctionOptions {
public:
explicit TakeOptions(bool boundscheck = true);
static constexpr const char kTypeName[] = "TakeOptions";
static TakeOptions BoundsCheck() { return TakeOptions(true); }
static TakeOptions NoBoundsCheck() { return TakeOptions(false); }
static TakeOptions Defaults() { return BoundsCheck(); }
bool boundscheck = true;
};
/// \brief Options for the dictionary encode function
class ARROW_EXPORT DictionaryEncodeOptions : public FunctionOptions {
public:
/// Configure how null values will be encoded
enum NullEncodingBehavior {
/// The null value will be added to the dictionary with a proper index.
ENCODE,
/// The null value will be masked in the indices array.
MASK
};
explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK);
static constexpr const char kTypeName[] = "DictionaryEncodeOptions";
static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); }
NullEncodingBehavior null_encoding_behavior = MASK;
};
/// \brief Options for the run-end encode function
class ARROW_EXPORT RunEndEncodeOptions : public FunctionOptions {
public:
explicit RunEndEncodeOptions(std::shared_ptr<DataType> run_end_type = int32());
static constexpr const char kTypeName[] = "RunEndEncodeOptions";
static RunEndEncodeOptions Defaults() { return RunEndEncodeOptions(); }
std::shared_ptr<DataType> run_end_type;
};
class ARROW_EXPORT ArraySortOptions : public FunctionOptions {
public:
explicit ArraySortOptions(SortOrder order = SortOrder::Ascending,
NullPlacement null_placement = NullPlacement::AtEnd);
static constexpr const char kTypeName[] = "ArraySortOptions";
static ArraySortOptions Defaults() { return ArraySortOptions(); }
/// Sorting order
SortOrder order;
/// Whether nulls and NaNs are placed at the start or at the end
NullPlacement null_placement;
};
class ARROW_EXPORT SortOptions : public FunctionOptions {
public:
explicit SortOptions(std::vector<SortKey> sort_keys = {},
NullPlacement null_placement = NullPlacement::AtEnd);
explicit SortOptions(const Ordering& ordering);
static constexpr const char kTypeName[] = "SortOptions";
static SortOptions Defaults() { return SortOptions(); }
/// Convenience constructor to create an ordering from SortOptions
///
/// Note: Both classes contain the exact same information. However,
/// sort_options should only be used in a "function options" context while Ordering
/// is used more generally.
Ordering AsOrdering() && { return Ordering(std::move(sort_keys), null_placement); }
Ordering AsOrdering() const& { return Ordering(sort_keys, null_placement); }
/// Column key(s) to order by and how to order by these sort keys.
std::vector<SortKey> sort_keys;
/// Whether nulls and NaNs are placed at the start or at the end
NullPlacement null_placement;
};
/// \brief SelectK options
class ARROW_EXPORT SelectKOptions : public FunctionOptions {
public:
explicit SelectKOptions(int64_t k = -1, std::vector<SortKey> sort_keys = {});
static constexpr const char kTypeName[] = "SelectKOptions";
static SelectKOptions Defaults() { return SelectKOptions(); }
static SelectKOptions TopKDefault(int64_t k, std::vector<std::string> key_names = {}) {
std::vector<SortKey> keys;
for (const auto& name : key_names) {
keys.emplace_back(SortKey(name, SortOrder::Descending));
}
if (key_names.empty()) {
keys.emplace_back(SortKey("not-used", SortOrder::Descending));
}
return SelectKOptions{k, keys};
}
static SelectKOptions BottomKDefault(int64_t k,
std::vector<std::string> key_names = {}) {
std::vector<SortKey> keys;
for (const auto& name : key_names) {
keys.emplace_back(SortKey(name, SortOrder::Ascending));
}
if (key_names.empty()) {
keys.emplace_back(SortKey("not-used", SortOrder::Ascending));
}
return SelectKOptions{k, keys};
}
/// The number of `k` elements to keep.
int64_t k;
/// Column key(s) to order by and how to order by these sort keys.
std::vector<SortKey> sort_keys;
};
/// \brief Rank options
class ARROW_EXPORT RankOptions : public FunctionOptions {
public:
/// Configure how ties between equal values are handled
enum Tiebreaker {
/// Ties get the smallest possible rank in sorted order.
Min,
/// Ties get the largest possible rank in sorted order.
Max,
/// Ranks are assigned in order of when ties appear in the input.
/// This ensures the ranks are a stable permutation of the input.
First,
/// The ranks span a dense [1, M] interval where M is the number
/// of distinct values in the input.
Dense
};
explicit RankOptions(std::vector<SortKey> sort_keys = {},
NullPlacement null_placement = NullPlacement::AtEnd,
Tiebreaker tiebreaker = RankOptions::First);
/// Convenience constructor for array inputs
explicit RankOptions(SortOrder order,
NullPlacement null_placement = NullPlacement::AtEnd,
Tiebreaker tiebreaker = RankOptions::First)
: RankOptions({SortKey("", order)}, null_placement, tiebreaker) {}
static constexpr const char kTypeName[] = "RankOptions";
static RankOptions Defaults() { return RankOptions(); }
/// Column key(s) to order by and how to order by these sort keys.
std::vector<SortKey> sort_keys;
/// Whether nulls and NaNs are placed at the start or at the end
NullPlacement null_placement;
/// Tiebreaker for dealing with equal values in ranks
Tiebreaker tiebreaker;
};
/// \brief Quantile rank options
class ARROW_EXPORT RankQuantileOptions : public FunctionOptions {
public:
explicit RankQuantileOptions(std::vector<SortKey> sort_keys = {},
NullPlacement null_placement = NullPlacement::AtEnd);
/// Convenience constructor for array inputs
explicit RankQuantileOptions(SortOrder order,
NullPlacement null_placement = NullPlacement::AtEnd)
: RankQuantileOptions({SortKey("", order)}, null_placement) {}
static constexpr const char kTypeName[] = "RankQuantileOptions";
static RankQuantileOptions Defaults() { return RankQuantileOptions(); }
/// Column key(s) to order by and how to order by these sort keys.
std::vector<SortKey> sort_keys;
/// Whether nulls and NaNs are placed at the start or at the end
NullPlacement null_placement;
};
/// \brief Partitioning options for NthToIndices
class ARROW_EXPORT PartitionNthOptions : public FunctionOptions {
public:
explicit PartitionNthOptions(int64_t pivot,
NullPlacement null_placement = NullPlacement::AtEnd);
PartitionNthOptions() : PartitionNthOptions(0) {}
static constexpr const char kTypeName[] = "PartitionNthOptions";
/// The index into the equivalent sorted array of the partition pivot element.
int64_t pivot;
/// Whether nulls and NaNs are partitioned at the start or at the end
NullPlacement null_placement;
};
class ARROW_EXPORT WinsorizeOptions : public FunctionOptions {
public:
WinsorizeOptions(double lower_limit, double upper_limit);
WinsorizeOptions() : WinsorizeOptions(0, 1) {}
static constexpr const char kTypeName[] = "WinsorizeOptions";
/// The quantile below which all values are replaced with the quantile's value.
///
/// For example, if lower_limit = 0.05, then all values in the lower 5% percentile
/// will be replaced with the 5% percentile value.
double lower_limit;
/// The quantile above which all values are replaced with the quantile's value.
///
/// For example, if upper_limit = 0.95, then all values in the upper 95% percentile
/// will be replaced with the 95% percentile value.
double upper_limit;
};
/// \brief Options for cumulative functions
/// \note Also aliased as CumulativeSumOptions for backward compatibility
class ARROW_EXPORT CumulativeOptions : public FunctionOptions {
public:
explicit CumulativeOptions(bool skip_nulls = false);
explicit CumulativeOptions(double start, bool skip_nulls = false);
explicit CumulativeOptions(std::shared_ptr<Scalar> start, bool skip_nulls = false);
static constexpr const char kTypeName[] = "CumulativeOptions";
static CumulativeOptions Defaults() { return CumulativeOptions(); }
/// Optional starting value for cumulative operation computation, default depends on the
/// operation and input type.
/// - sum: 0
/// - prod: 1
/// - min: maximum of the input type
/// - max: minimum of the input type
/// - mean: start is ignored because it has no meaning for mean
std::optional<std::shared_ptr<Scalar>> start;
/// If true, nulls in the input are ignored and produce a corresponding null output.
/// When false, the first null encountered is propagated through the remaining output.
bool skip_nulls = false;
};
using CumulativeSumOptions = CumulativeOptions; // For backward compatibility
/// \brief Options for pairwise functions
class ARROW_EXPORT PairwiseOptions : public FunctionOptions {
public:
explicit PairwiseOptions(int64_t periods = 1);
static constexpr const char kTypeName[] = "PairwiseOptions";
static PairwiseOptions Defaults() { return PairwiseOptions(); }
/// Periods to shift for applying the binary operation, accepts negative values.
int64_t periods = 1;
};
/// \brief Options for list_flatten function
class ARROW_EXPORT ListFlattenOptions : public FunctionOptions {
public:
explicit ListFlattenOptions(bool recursive = false);
static constexpr const char kTypeName[] = "ListFlattenOptions";
static ListFlattenOptions Defaults() { return ListFlattenOptions(); }
/// \brief If true, the list is flattened recursively until a non-list
/// array is formed.
bool recursive = false;
};
/// \brief Options for inverse_permutation function
class ARROW_EXPORT InversePermutationOptions : public FunctionOptions {
public:
explicit InversePermutationOptions(int64_t max_index = -1,
std::shared_ptr<DataType> output_type = NULLPTR);
static constexpr const char kTypeName[] = "InversePermutationOptions";
static InversePermutationOptions Defaults() { return InversePermutationOptions(); }
/// \brief The max value in the input indices to allow. The length of the function's
/// output will be this value plus 1. If negative, this value will be set to the length
/// of the input indices minus 1 and the length of the function's output will be the
/// length of the input indices.
int64_t max_index = -1;
/// \brief The type of the output inverse permutation. If null, the output will be of
/// the same type as the input indices, otherwise must be signed integer type. An
/// invalid error will be reported if this type is not able to store the length of the
/// input indices.
std::shared_ptr<DataType> output_type = NULLPTR;
};
/// \brief Options for scatter function
class ARROW_EXPORT ScatterOptions : public FunctionOptions {
public:
explicit ScatterOptions(int64_t max_index = -1);
static constexpr const char kTypeName[] = "ScatterOptions";
static ScatterOptions Defaults() { return ScatterOptions(); }
/// \brief The max value in the input indices to allow. The length of the function's
/// output will be this value plus 1. If negative, this value will be set to the length
/// of the input indices minus 1 and the length of the function's output will be the
/// length of the input indices.
int64_t max_index = -1;
};
/// @}
/// \brief Filter with a boolean selection filter
///
/// The output will be populated with values from the input at positions
/// where the selection filter is not 0. Nulls in the filter will be handled
/// based on options.null_selection_behavior.
///
/// For example given values = ["a", "b", "c", null, "e", "f"] and
/// filter = [0, 1, 1, 0, null, 1], the output will be
/// (null_selection_behavior == DROP) = ["b", "c", "f"]
/// (null_selection_behavior == EMIT_NULL) = ["b", "c", null, "f"]
///
/// \param[in] values array to filter
/// \param[in] filter indicates which values should be filtered out
/// \param[in] options configures null_selection_behavior
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
ARROW_EXPORT
Result<Datum> Filter(const Datum& values, const Datum& filter,
const FilterOptions& options = FilterOptions::Defaults(),
ExecContext* ctx = NULLPTR);
namespace internal {
// These internal functions are implemented in kernels/vector_selection.cc
/// \brief Return the number of selected indices in the boolean filter
///
/// \param filter a plain or run-end encoded boolean array with or without nulls
/// \param null_selection how to handle nulls in the filter
ARROW_EXPORT
int64_t GetFilterOutputSize(const ArraySpan& filter,
FilterOptions::NullSelectionBehavior null_selection);
/// \brief Compute uint64 selection indices for use with Take given a boolean
/// filter
///
/// \param filter a plain or run-end encoded boolean array with or without nulls
/// \param null_selection how to handle nulls in the filter
ARROW_EXPORT
Result<std::shared_ptr<ArrayData>> GetTakeIndices(
const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection,
MemoryPool* memory_pool = default_memory_pool());
} // namespace internal
/// \brief ReplaceWithMask replaces each value in the array corresponding
/// to a true value in the mask with the next element from `replacements`.
///
/// \param[in] values Array input to replace
/// \param[in] mask Array or Scalar of Boolean mask values
/// \param[in] replacements The replacement values to draw from. There must
/// be as many replacement values as true values in the mask.
/// \param[in] ctx the function execution context, optional
///
/// \return the resulting datum
///
/// \since 5.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> ReplaceWithMask(const Datum& values, const Datum& mask,
const Datum& replacements, ExecContext* ctx = NULLPTR);
/// \brief FillNullForward fill null values in forward direction
///
/// The output array will be of the same type as the input values
/// array, with replaced null values in forward direction.
///
/// For example given values = ["a", "b", "c", null, null, "f"],
/// the output will be = ["a", "b", "c", "c", "c", "f"]
///
/// \param[in] values datum from which to take
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
ARROW_EXPORT
Result<Datum> FillNullForward(const Datum& values, ExecContext* ctx = NULLPTR);
/// \brief FillNullBackward fill null values in backward direction
///
/// The output array will be of the same type as the input values
/// array, with replaced null values in backward direction.
///
/// For example given values = ["a", "b", "c", null, null, "f"],
/// the output will be = ["a", "b", "c", "f", "f", "f"]
///
/// \param[in] values datum from which to take
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
ARROW_EXPORT
Result<Datum> FillNullBackward(const Datum& values, ExecContext* ctx = NULLPTR);
/// \brief Take from an array of values at indices in another array
///
/// The output array will be of the same type as the input values
/// array, with elements taken from the values array at the given
/// indices. If an index is null then the taken element will be null.
///
/// For example given values = ["a", "b", "c", null, "e", "f"] and
/// indices = [2, 1, null, 3], the output will be
/// = [values[2], values[1], null, values[3]]
/// = ["c", "b", null, null]
///
/// \param[in] values datum from which to take
/// \param[in] indices which values to take
/// \param[in] options options
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
ARROW_EXPORT
Result<Datum> Take(const Datum& values, const Datum& indices,
const TakeOptions& options = TakeOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Take with Array inputs and output
ARROW_EXPORT
Result<std::shared_ptr<Array>> Take(const Array& values, const Array& indices,
const TakeOptions& options = TakeOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Drop Null from an array of values
///
/// The output array will be of the same type as the input values
/// array, with elements taken from the values array without nulls.
///
/// For example given values = ["a", "b", "c", null, "e", "f"],
/// the output will be = ["a", "b", "c", "e", "f"]
///
/// \param[in] values datum from which to take
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
ARROW_EXPORT
Result<Datum> DropNull(const Datum& values, ExecContext* ctx = NULLPTR);
/// \brief DropNull with Array inputs and output
ARROW_EXPORT
Result<std::shared_ptr<Array>> DropNull(const Array& values, ExecContext* ctx = NULLPTR);
/// \brief Return indices that partition an array around n-th sorted element.
///
/// Find index of n-th(0 based) smallest value and perform indirect
/// partition of an array around that element. Output indices[0 ~ n-1]
/// holds values no greater than n-th element, and indices[n+1 ~ end]
/// holds values no less than n-th element. Elements in each partition
/// is not sorted. Nulls will be partitioned to the end of the output.
/// Output is not guaranteed to be stable.
///
/// \param[in] values array to be partitioned
/// \param[in] n pivot array around sorted n-th element
/// \param[in] ctx the function execution context, optional
/// \return offsets indices that would partition an array
ARROW_EXPORT
Result<std::shared_ptr<Array>> NthToIndices(const Array& values, int64_t n,
ExecContext* ctx = NULLPTR);
/// \brief Return indices that partition an array around n-th sorted element.
///
/// This overload takes a PartitionNthOptions specifying the pivot index
/// and the null handling.
///
/// \param[in] values array to be partitioned
/// \param[in] options options including pivot index and null handling
/// \param[in] ctx the function execution context, optional
/// \return offsets indices that would partition an array
ARROW_EXPORT
Result<std::shared_ptr<Array>> NthToIndices(const Array& values,
const PartitionNthOptions& options,
ExecContext* ctx = NULLPTR);
/// \brief Return indices that would select the first `k` elements.
///
/// Perform an indirect sort of the datum, keeping only the first `k` elements. The output
/// array will contain indices such that the item indicated by the k-th index will be in
/// the position it would be if the datum were sorted by `options.sort_keys`. However,
/// indices of null values will not be part of the output. The sort is not guaranteed to
/// be stable.
///
/// \param[in] datum datum to be partitioned
/// \param[in] options options
/// \param[in] ctx the function execution context, optional
/// \return a datum with the same schema as the input
ARROW_EXPORT
Result<std::shared_ptr<Array>> SelectKUnstable(const Datum& datum,
const SelectKOptions& options,
ExecContext* ctx = NULLPTR);
/// \brief Return the indices that would sort an array.
///
/// Perform an indirect sort of array. The output array will contain
/// indices that would sort an array, which would be the same length
/// as input. Nulls will be stably partitioned to the end of the output
/// regardless of order.
///
/// For example given array = [null, 1, 3.3, null, 2, 5.3] and order
/// = SortOrder::DESCENDING, the output will be [5, 2, 4, 1, 0,
/// 3].
///
/// \param[in] array array to sort
/// \param[in] order ascending or descending
/// \param[in] ctx the function execution context, optional
/// \return offsets indices that would sort an array
ARROW_EXPORT
Result<std::shared_ptr<Array>> SortIndices(const Array& array,
SortOrder order = SortOrder::Ascending,
ExecContext* ctx = NULLPTR);
/// \brief Return the indices that would sort an array.
///
/// This overload takes a ArraySortOptions specifying the sort order
/// and the null handling.
///
/// \param[in] array array to sort
/// \param[in] options options including sort order and null handling
/// \param[in] ctx the function execution context, optional
/// \return offsets indices that would sort an array
ARROW_EXPORT
Result<std::shared_ptr<Array>> SortIndices(const Array& array,
const ArraySortOptions& options,
ExecContext* ctx = NULLPTR);
/// \brief Return the indices that would sort a chunked array.
///
/// Perform an indirect sort of chunked array. The output array will
/// contain indices that would sort a chunked array, which would be
/// the same length as input. Nulls will be stably partitioned to the
/// end of the output regardless of order.
///
/// For example given chunked_array = [[null, 1], [3.3], [null, 2,
/// 5.3]] and order = SortOrder::DESCENDING, the output will be [5, 2,
/// 4, 1, 0, 3].
///
/// \param[in] chunked_array chunked array to sort
/// \param[in] order ascending or descending
/// \param[in] ctx the function execution context, optional
/// \return offsets indices that would sort an array
ARROW_EXPORT
Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
SortOrder order = SortOrder::Ascending,
ExecContext* ctx = NULLPTR);
/// \brief Return the indices that would sort a chunked array.
///
/// This overload takes a ArraySortOptions specifying the sort order
/// and the null handling.
///
/// \param[in] chunked_array chunked array to sort
/// \param[in] options options including sort order and null handling
/// \param[in] ctx the function execution context, optional
/// \return offsets indices that would sort an array
ARROW_EXPORT
Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
const ArraySortOptions& options,
ExecContext* ctx = NULLPTR);
/// \brief Return the indices that would sort an input in the
/// specified order. Input is one of array, chunked array record batch
/// or table.
///
/// Perform an indirect sort of input. The output array will contain
/// indices that would sort an input, which would be the same length
/// as input. Nulls will be stably partitioned to the start or to the end
/// of the output depending on SortOrder::null_placement.
///
/// For example given input (table) = {
/// "column1": [[null, 1], [ 3, null, 2, 1]],
/// "column2": [[ 5], [3, null, null, 5, 5]],
/// } and options = {
/// {"column1", SortOrder::Ascending},
/// {"column2", SortOrder::Descending},
/// }, the output will be [5, 1, 4, 2, 0, 3].
///
/// \param[in] datum array, chunked array, record batch or table to sort
/// \param[in] options options
/// \param[in] ctx the function execution context, optional
/// \return offsets indices that would sort a table
ARROW_EXPORT
Result<std::shared_ptr<Array>> SortIndices(const Datum& datum, const SortOptions& options,
ExecContext* ctx = NULLPTR);
/// \brief Compute unique elements from an array-like object
///
/// Note if a null occurs in the input it will NOT be included in the output.
///
/// \param[in] datum array-like input
/// \param[in] ctx the function execution context, optional
/// \return result as Array
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<std::shared_ptr<Array>> Unique(const Datum& datum, ExecContext* ctx = NULLPTR);
// Constants for accessing the output of ValueCounts
ARROW_EXPORT extern const char kValuesFieldName[];
ARROW_EXPORT extern const char kCountsFieldName[];
ARROW_EXPORT extern const int32_t kValuesFieldIndex;
ARROW_EXPORT extern const int32_t kCountsFieldIndex;
/// \brief Return counts of unique elements from an array-like object.
///
/// Note that the counts do not include counts for nulls in the array. These can be
/// obtained separately from metadata.
///
/// For floating point arrays there is no attempt to normalize -0.0, 0.0 and NaN values
/// which can lead to unexpected results if the input Array has these values.
///
/// \param[in] value array-like input
/// \param[in] ctx the function execution context, optional
/// \return counts An array of <input type "Values", int64_t "Counts"> structs.
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<std::shared_ptr<StructArray>> ValueCounts(const Datum& value,
ExecContext* ctx = NULLPTR);
/// \brief Dictionary-encode values in an array-like object
///
/// Any nulls encountered in the dictionary will be handled according to the
/// specified null encoding behavior.
///
/// For example, given values ["a", "b", null, "a", null] the output will be
/// (null_encoding == ENCODE) Indices: [0, 1, 2, 0, 2] / Dict: ["a", "b", null]
/// (null_encoding == MASK) Indices: [0, 1, null, 0, null] / Dict: ["a", "b"]
///
/// If the input is already dictionary encoded this function is a no-op unless
/// it needs to modify the null_encoding (TODO)
///
/// \param[in] data array-like input
/// \param[in] ctx the function execution context, optional
/// \param[in] options configures null encoding behavior
/// \return result with same shape and type as input
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> DictionaryEncode(
const Datum& data,
const DictionaryEncodeOptions& options = DictionaryEncodeOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Run-end-encode values in an array-like object
///
/// The returned run-end encoded type uses the same value type of the input and
/// run-end type defined in the options.
///
/// \param[in] value array-like input
/// \param[in] options configures encoding behavior
/// \param[in] ctx the function execution context, optional
/// \return result with same shape but run-end encoded
///
/// \since 12.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> RunEndEncode(
const Datum& value,
const RunEndEncodeOptions& options = RunEndEncodeOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Decode a Run-End Encoded array to a plain array
///
/// The output data type is the same as the values array type of run-end encoded
/// input.
///
/// \param[in] value run-end-encoded input
/// \param[in] ctx the function execution context, optional
/// \return plain array resulting from decoding the run-end encoded input
///
/// \since 12.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> RunEndDecode(const Datum& value, ExecContext* ctx = NULLPTR);
/// \brief Compute the cumulative sum of an array-like object
///
/// \param[in] values array-like input
/// \param[in] options configures cumulative sum behavior
/// \param[in] check_overflow whether to check for overflow, if true, return Invalid
/// status on overflow, otherwise wrap around on overflow
/// \param[in] ctx the function execution context, optional
ARROW_EXPORT
Result<Datum> CumulativeSum(
const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
bool check_overflow = false, ExecContext* ctx = NULLPTR);
/// \brief Compute the cumulative product of an array-like object
///
/// \param[in] values array-like input
/// \param[in] options configures cumulative prod behavior
/// \param[in] check_overflow whether to check for overflow, if true, return Invalid
/// status on overflow, otherwise wrap around on overflow
/// \param[in] ctx the function execution context, optional
ARROW_EXPORT
Result<Datum> CumulativeProd(
const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
bool check_overflow = false, ExecContext* ctx = NULLPTR);
/// \brief Compute the cumulative max of an array-like object
///
/// \param[in] values array-like input
/// \param[in] options configures cumulative max behavior
/// \param[in] ctx the function execution context, optional
ARROW_EXPORT
Result<Datum> CumulativeMax(
const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Compute the cumulative min of an array-like object
///
/// \param[in] values array-like input
/// \param[in] options configures cumulative min behavior
/// \param[in] ctx the function execution context, optional
ARROW_EXPORT
Result<Datum> CumulativeMin(
const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Compute the cumulative mean of an array-like object
///
/// \param[in] values array-like input
/// \param[in] options configures cumulative mean behavior, `start` is ignored
/// \param[in] ctx the function execution context, optional
ARROW_EXPORT
Result<Datum> CumulativeMean(
const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Return the first order difference of an array.
///
/// Computes the first order difference of an array, i.e.
/// output[i] = input[i] - input[i - p] if i >= p
/// output[i] = null otherwise
/// where p is the period. For example, with p = 1,
/// Diff([1, 4, 9, 10, 15]) = [null, 3, 5, 1, 5].
/// With p = 2,
/// Diff([1, 4, 9, 10, 15]) = [null, null, 8, 6, 6]
/// p can also be negative, in which case the diff is computed in
/// the opposite direction.
/// \param[in] array array input
/// \param[in] options options, specifying overflow behavior and period
/// \param[in] check_overflow whether to return error on overflow
/// \param[in] ctx the function execution context, optional
/// \return result as array
ARROW_EXPORT
Result<std::shared_ptr<Array>> PairwiseDiff(const Array& array,
const PairwiseOptions& options,
bool check_overflow = false,
ExecContext* ctx = NULLPTR);
/// \brief Return the inverse permutation of the given indices.
///
/// For indices[i] = x, inverse_permutation[x] = i. And inverse_permutation[x] = null if x
/// does not appear in the input indices. Indices must be in the range of [0, max_index],
/// or null, which will be ignored. If multiple indices point to the same value, the last
/// one is used.
///
/// For example, with
/// indices = [null, 0, null, 2, 4, 1, 1]
/// the inverse permutation is
/// [1, 6, 3, null, 4, null, null]
/// if max_index = 6.
///
/// \param[in] indices array-like indices
/// \param[in] options configures the max index and the output type
/// \param[in] ctx the function execution context, optional
/// \return the resulting inverse permutation
///
/// \since 20.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> InversePermutation(
const Datum& indices,
const InversePermutationOptions& options = InversePermutationOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Scatter the values into specified positions according to the indices.
///
/// For indices[i] = x, output[x] = values[i]. And output[x] = null if x does not appear
/// in the input indices. Indices must be in the range of [0, max_index], or null, in
/// which case the corresponding value will be ignored. If multiple indices point to the
/// same value, the last one is used.
///
/// For example, with
/// values = [a, b, c, d, e, f, g]
/// indices = [null, 0, null, 2, 4, 1, 1]
/// the output is
/// [b, g, d, null, e, null, null]
/// if max_index = 6.
///
/// \param[in] values datum to scatter
/// \param[in] indices array-like indices
/// \param[in] options configures the max index of to scatter
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
/// \since 20.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Scatter(const Datum& values, const Datum& indices,
const ScatterOptions& options = ScatterOptions::Defaults(),
ExecContext* ctx = NULLPTR);
} // namespace compute
} // namespace arrow

View File

@@ -0,0 +1,134 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "arrow/compute/function.h"
#include "arrow/compute/function_options.h"
#include "arrow/compute/type_fwd.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
class Array;
namespace compute {
class ExecContext;
/// \addtogroup compute-concrete-options
/// @{
class ARROW_EXPORT CastOptions : public FunctionOptions {
public:
explicit CastOptions(bool safe = true);
static constexpr const char kTypeName[] = "CastOptions";
static CastOptions Safe(TypeHolder to_type = {}) {
CastOptions safe(true);
safe.to_type = std::move(to_type);
return safe;
}
static CastOptions Unsafe(TypeHolder to_type = {}) {
CastOptions unsafe(false);
unsafe.to_type = std::move(to_type);
return unsafe;
}
// Type being casted to. May be passed separate to eager function
// compute::Cast
TypeHolder to_type;
bool allow_int_overflow;
bool allow_time_truncate;
bool allow_time_overflow;
bool allow_decimal_truncate;
bool allow_float_truncate;
// Indicate if conversions from Binary/FixedSizeBinary to string must
// validate the utf8 payload.
bool allow_invalid_utf8;
/// true if the safety options all match CastOptions::Safe
///
/// Note, if this returns false it does not mean is_unsafe will return true
bool is_safe() const;
/// true if the safety options all match CastOptions::Unsafe
///
/// Note, if this returns false it does not mean is_safe will return true
bool is_unsafe() const;
};
/// @}
/// \brief Return true if a cast function is defined
ARROW_EXPORT
bool CanCast(const DataType& from_type, const DataType& to_type);
// ----------------------------------------------------------------------
// Convenience invocation APIs for a number of kernels
/// \brief Cast from one array type to another
/// \param[in] value array to cast
/// \param[in] to_type type to cast to
/// \param[in] options casting options
/// \param[in] ctx the function execution context, optional
/// \return the resulting array
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<std::shared_ptr<Array>> Cast(const Array& value, const TypeHolder& to_type,
const CastOptions& options = CastOptions::Safe(),
ExecContext* ctx = NULLPTR);
/// \brief Cast from one array type to another
/// \param[in] value array to cast
/// \param[in] options casting options. The "to_type" field must be populated
/// \param[in] ctx the function execution context, optional
/// \return the resulting array
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Cast(const Datum& value, const CastOptions& options,
ExecContext* ctx = NULLPTR);
/// \brief Cast from one value to another
/// \param[in] value datum to cast
/// \param[in] to_type type to cast to
/// \param[in] options casting options
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Cast(const Datum& value, const TypeHolder& to_type,
const CastOptions& options = CastOptions::Safe(),
ExecContext* ctx = NULLPTR);
} // namespace compute
} // namespace arrow

View File

@@ -0,0 +1,489 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// NOTE: API is EXPERIMENTAL and will change without going through a
// deprecation cycle
#pragma once
#include <atomic>
#include <cstdint>
#include <limits>
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>
#include "arrow/array/data.h"
#include "arrow/compute/expression.h"
#include "arrow/compute/type_fwd.h"
#include "arrow/datum.h"
#include "arrow/result.h"
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace compute {
// It seems like 64K might be a good default chunksize to use for execution
// based on the experience of other query processing systems. The current
// default is not to chunk contiguous arrays, though, but this may change in
// the future once parallel execution is implemented
static constexpr int64_t kDefaultExecChunksize = UINT16_MAX;
/// \brief Context for expression-global variables and options used by
/// function evaluation
class ARROW_EXPORT ExecContext {
public:
// If no function registry passed, the default is used.
explicit ExecContext(MemoryPool* pool = default_memory_pool(),
::arrow::internal::Executor* executor = NULLPTR,
FunctionRegistry* func_registry = NULLPTR);
/// \brief The MemoryPool used for allocations, default is
/// default_memory_pool().
MemoryPool* memory_pool() const { return pool_; }
const ::arrow::internal::CpuInfo* cpu_info() const;
/// \brief An Executor which may be used to parallelize execution.
::arrow::internal::Executor* executor() const { return executor_; }
/// \brief The FunctionRegistry for looking up functions by name and
/// selecting kernels for execution. Defaults to the library-global function
/// registry provided by GetFunctionRegistry.
FunctionRegistry* func_registry() const { return func_registry_; }
// \brief Set maximum length unit of work for kernel execution. Larger
// contiguous array inputs will be split into smaller chunks, and, if
// possible and enabled, processed in parallel. The default chunksize is
// INT64_MAX, so contiguous arrays are not split.
void set_exec_chunksize(int64_t chunksize) { exec_chunksize_ = chunksize; }
// \brief Maximum length for ExecBatch data chunks processed by
// kernels. Contiguous array inputs with longer length will be split into
// smaller chunks.
int64_t exec_chunksize() const { return exec_chunksize_; }
/// \brief Set whether to use multiple threads for function execution. This
/// is not yet used.
void set_use_threads(bool use_threads = true) { use_threads_ = use_threads; }
/// \brief If true, then utilize multiple threads where relevant for function
/// execution. This is not yet used.
bool use_threads() const { return use_threads_; }
// Set the preallocation strategy for kernel execution as it relates to
// chunked execution. For chunked execution, whether via ChunkedArray inputs
// or splitting larger Array arguments into smaller pieces, contiguous
// allocation (if permitted by the kernel) will allocate one large array to
// write output into yielding it to the caller at the end. If this option is
// set to off, then preallocations will be performed independently for each
// chunk of execution
//
// TODO: At some point we might want the limit the size of contiguous
// preallocations. For example, even if the exec_chunksize is 64K or less, we
// might limit contiguous allocations to 1M records, say.
void set_preallocate_contiguous(bool preallocate) {
preallocate_contiguous_ = preallocate;
}
/// \brief If contiguous preallocations should be used when doing chunked
/// execution as specified by exec_chunksize(). See
/// set_preallocate_contiguous() for more information.
bool preallocate_contiguous() const { return preallocate_contiguous_; }
private:
MemoryPool* pool_;
::arrow::internal::Executor* executor_;
FunctionRegistry* func_registry_;
int64_t exec_chunksize_ = std::numeric_limits<int64_t>::max();
bool preallocate_contiguous_ = true;
bool use_threads_ = true;
};
// TODO: Consider standardizing on uint16 selection vectors and only use them
// when we can ensure that each value is 64K length or smaller
/// \brief Container for an array of value selection indices that were
/// materialized from a filter.
///
/// Columnar query engines (see e.g. [1]) have found that rather than
/// materializing filtered data, the filter can instead be converted to an
/// array of the "on" indices and then "fusing" these indices in operator
/// implementations. This is especially relevant for aggregations but also
/// applies to scalar operations.
///
/// We are not yet using this so this is mostly a placeholder for now.
///
/// [1]: http://cidrdb.org/cidr2005/papers/P19.pdf
class ARROW_EXPORT SelectionVector {
public:
explicit SelectionVector(std::shared_ptr<ArrayData> data);
explicit SelectionVector(const Array& arr);
/// \brief Create SelectionVector from boolean mask
static Result<std::shared_ptr<SelectionVector>> FromMask(const BooleanArray& arr);
const int32_t* indices() const { return indices_; }
int32_t length() const;
private:
std::shared_ptr<ArrayData> data_;
const int32_t* indices_;
};
/// An index to represent that a batch does not belong to an ordered stream
constexpr int64_t kUnsequencedIndex = -1;
/// \brief A unit of work for kernel execution. It contains a collection of
/// Array and Scalar values and an optional SelectionVector indicating that
/// there is an unmaterialized filter that either must be materialized, or (if
/// the kernel supports it) pushed down into the kernel implementation.
///
/// ExecBatch is semantically similar to RecordBatch in that in a SQL context
/// it represents a collection of records, but constant "columns" are
/// represented by Scalar values rather than having to be converted into arrays
/// with repeated values.
///
/// TODO: Datum uses arrow/util/variant.h which may be a bit heavier-weight
/// than is desirable for this class. Microbenchmarks would help determine for
/// sure. See ARROW-8928.
/// \addtogroup acero-internals
/// @{
struct ARROW_EXPORT ExecBatch {
ExecBatch() = default;
ExecBatch(std::vector<Datum> values, int64_t length)
: values(std::move(values)), length(length) {}
explicit ExecBatch(const RecordBatch& batch);
/// \brief Infer the ExecBatch length from values.
static Result<int64_t> InferLength(const std::vector<Datum>& values);
/// Creates an ExecBatch with length-validation.
///
/// If any value is given, then all values must have a common length. If the given
/// length is negative, then the length of the ExecBatch is set to this common length,
/// or to 1 if no values are given. Otherwise, the given length must equal the common
/// length, if any value is given.
static Result<ExecBatch> Make(std::vector<Datum> values, int64_t length = -1);
Result<std::shared_ptr<RecordBatch>> ToRecordBatch(
std::shared_ptr<Schema> schema, MemoryPool* pool = default_memory_pool()) const;
/// The values representing positional arguments to be passed to a kernel's
/// exec function for processing.
std::vector<Datum> values;
/// A deferred filter represented as an array of indices into the values.
///
/// For example, the filter [true, true, false, true] would be represented as
/// the selection vector [0, 1, 3]. When the selection vector is set,
/// ExecBatch::length is equal to the length of this array.
std::shared_ptr<SelectionVector> selection_vector;
/// A predicate Expression guaranteed to evaluate to true for all rows in this batch.
Expression guarantee = literal(true);
/// The semantic length of the ExecBatch. When the values are all scalars,
/// the length should be set to 1 for non-aggregate kernels, otherwise the
/// length is taken from the array values, except when there is a selection
/// vector. When there is a selection vector set, the length of the batch is
/// the length of the selection. Aggregate kernels can have an ExecBatch
/// formed by projecting just the partition columns from a batch in which
/// case, it would have scalar rows with length greater than 1.
///
/// If the array values are of length 0 then the length is 0 regardless of
/// whether any values are Scalar.
int64_t length = 0;
/// \brief index of this batch in a sorted stream of batches
///
/// This index must be strictly monotonic starting at 0 without gaps or
/// it can be set to kUnsequencedIndex if there is no meaningful order
int64_t index = kUnsequencedIndex;
/// \brief The sum of bytes in each buffer referenced by the batch
///
/// Note: Scalars are not counted
/// Note: Some values may referenced only part of a buffer, for
/// example, an array with an offset. The actual data
/// visible to this batch will be smaller than the total
/// buffer size in this case.
int64_t TotalBufferSize() const;
/// \brief Return the value at the i-th index
template <typename index_type>
inline const Datum& operator[](index_type i) const {
return values[i];
}
bool Equals(const ExecBatch& other) const;
/// \brief A convenience for the number of values / arguments.
int num_values() const { return static_cast<int>(values.size()); }
ExecBatch Slice(int64_t offset, int64_t length) const;
Result<ExecBatch> SelectValues(const std::vector<int>& ids) const;
/// \brief A convenience for returning the types from the batch.
std::vector<TypeHolder> GetTypes() const {
std::vector<TypeHolder> result;
for (const auto& value : this->values) {
result.emplace_back(value.type());
}
return result;
}
std::string ToString() const;
};
inline bool operator==(const ExecBatch& l, const ExecBatch& r) { return l.Equals(r); }
inline bool operator!=(const ExecBatch& l, const ExecBatch& r) { return !l.Equals(r); }
ARROW_EXPORT void PrintTo(const ExecBatch&, std::ostream*);
/// @}
/// \defgroup compute-internals Utilities for calling functions, useful for those
/// extending the function registry
///
/// @{
struct ExecValue {
ArraySpan array = {};
const Scalar* scalar = NULLPTR;
ExecValue(const Scalar* scalar) // NOLINT implicit conversion
: scalar(scalar) {}
ExecValue(ArraySpan array) // NOLINT implicit conversion
: array(std::move(array)) {}
ExecValue(const ArrayData& array) { // NOLINT implicit conversion
this->array.SetMembers(array);
}
ExecValue() = default;
ExecValue(const ExecValue& other) = default;
ExecValue& operator=(const ExecValue& other) = default;
ExecValue(ExecValue&& other) = default;
ExecValue& operator=(ExecValue&& other) = default;
int64_t length() const { return this->is_array() ? this->array.length : 1; }
bool is_array() const { return this->scalar == NULLPTR; }
bool is_scalar() const { return !this->is_array(); }
void SetArray(const ArrayData& array) {
this->array.SetMembers(array);
this->scalar = NULLPTR;
}
void SetScalar(const Scalar* scalar) { this->scalar = scalar; }
template <typename ExactType>
const ExactType& scalar_as() const {
return ::arrow::internal::checked_cast<const ExactType&>(*this->scalar);
}
/// XXX: here temporarily for compatibility with datum, see
/// e.g. MakeStructExec in scalar_nested.cc
int64_t null_count() const {
if (this->is_array()) {
return this->array.GetNullCount();
} else {
return this->scalar->is_valid ? 0 : 1;
}
}
const DataType* type() const {
if (this->is_array()) {
return array.type;
} else {
return scalar->type.get();
}
}
};
struct ARROW_EXPORT ExecResult {
// The default value of the variant is ArraySpan
std::variant<ArraySpan, std::shared_ptr<ArrayData>> value;
int64_t length() const {
if (this->is_array_span()) {
return this->array_span()->length;
} else {
return this->array_data()->length;
}
}
const DataType* type() const {
if (this->is_array_span()) {
return this->array_span()->type;
} else {
return this->array_data()->type.get();
}
}
const ArraySpan* array_span() const { return &std::get<ArraySpan>(this->value); }
ArraySpan* array_span_mutable() { return &std::get<ArraySpan>(this->value); }
bool is_array_span() const { return this->value.index() == 0; }
const std::shared_ptr<ArrayData>& array_data() const {
return std::get<std::shared_ptr<ArrayData>>(this->value);
}
ArrayData* array_data_mutable() {
return std::get<std::shared_ptr<ArrayData>>(this->value).get();
}
bool is_array_data() const { return this->value.index() == 1; }
};
/// \brief A "lightweight" column batch object which contains no
/// std::shared_ptr objects and does not have any memory ownership
/// semantics. Can represent a view onto an "owning" ExecBatch.
struct ARROW_EXPORT ExecSpan {
ExecSpan() = default;
ExecSpan(const ExecSpan& other) = default;
ExecSpan& operator=(const ExecSpan& other) = default;
ExecSpan(ExecSpan&& other) = default;
ExecSpan& operator=(ExecSpan&& other) = default;
explicit ExecSpan(std::vector<ExecValue> values, int64_t length)
: length(length), values(std::move(values)) {}
explicit ExecSpan(const ExecBatch& batch) {
this->length = batch.length;
this->values.resize(batch.values.size());
for (size_t i = 0; i < batch.values.size(); ++i) {
const Datum& in_value = batch[i];
ExecValue* out_value = &this->values[i];
if (in_value.is_array()) {
out_value->SetArray(*in_value.array());
} else {
out_value->SetScalar(in_value.scalar().get());
}
}
}
/// \brief Return the value at the i-th index
template <typename index_type>
inline const ExecValue& operator[](index_type i) const {
return values[i];
}
/// \brief A convenience for the number of values / arguments.
int num_values() const { return static_cast<int>(values.size()); }
std::vector<TypeHolder> GetTypes() const {
std::vector<TypeHolder> result;
for (const auto& value : this->values) {
result.emplace_back(value.type());
}
return result;
}
ExecBatch ToExecBatch() const {
ExecBatch result;
result.length = this->length;
for (const ExecValue& value : this->values) {
if (value.is_array()) {
result.values.push_back(value.array.ToArrayData());
} else {
result.values.push_back(value.scalar->GetSharedPtr());
}
}
return result;
}
int64_t length = 0;
std::vector<ExecValue> values;
};
/// \defgroup compute-call-function One-shot calls to compute functions
///
/// @{
/// \brief One-shot invoker for all types of functions.
///
/// Does kernel dispatch, argument checking, iteration of ChunkedArray inputs,
/// and wrapping of outputs.
ARROW_EXPORT
Result<Datum> CallFunction(const std::string& func_name, const std::vector<Datum>& args,
const FunctionOptions* options, ExecContext* ctx = NULLPTR);
/// \brief Variant of CallFunction which uses a function's default options.
///
/// NB: Some functions require FunctionOptions be provided.
ARROW_EXPORT
Result<Datum> CallFunction(const std::string& func_name, const std::vector<Datum>& args,
ExecContext* ctx = NULLPTR);
/// \brief One-shot invoker for all types of functions.
///
/// Does kernel dispatch, argument checking, iteration of ChunkedArray inputs,
/// and wrapping of outputs.
ARROW_EXPORT
Result<Datum> CallFunction(const std::string& func_name, const ExecBatch& batch,
const FunctionOptions* options, ExecContext* ctx = NULLPTR);
/// \brief Variant of CallFunction which uses a function's default options.
///
/// NB: Some functions require FunctionOptions be provided.
ARROW_EXPORT
Result<Datum> CallFunction(const std::string& func_name, const ExecBatch& batch,
ExecContext* ctx = NULLPTR);
/// @}
/// \defgroup compute-function-executor One-shot calls to obtain function executors
///
/// @{
/// \brief One-shot executor provider for all types of functions.
///
/// This function creates and initializes a `FunctionExecutor` appropriate
/// for the given function name, input types and function options.
ARROW_EXPORT
Result<std::shared_ptr<FunctionExecutor>> GetFunctionExecutor(
const std::string& func_name, std::vector<TypeHolder> in_types,
const FunctionOptions* options = NULLPTR, FunctionRegistry* func_registry = NULLPTR);
/// \brief One-shot executor provider for all types of functions.
///
/// This function creates and initializes a `FunctionExecutor` appropriate
/// for the given function name, input types (taken from the Datum arguments)
/// and function options.
ARROW_EXPORT
Result<std::shared_ptr<FunctionExecutor>> GetFunctionExecutor(
const std::string& func_name, const std::vector<Datum>& args,
const FunctionOptions* options = NULLPTR, FunctionRegistry* func_registry = NULLPTR);
/// @}
} // namespace compute
} // namespace arrow

View File

@@ -0,0 +1,295 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <memory>
#include <string>
#include <utility>
#include <variant>
#include <vector>
#include "arrow/compute/type_fwd.h"
#include "arrow/datum.h"
#include "arrow/type_fwd.h"
#include "arrow/util/small_vector.h"
namespace arrow {
namespace compute {
/// \defgroup expression-core Expressions to describe data transformations
///
/// @{
/// An unbound expression which maps a single Datum to another Datum.
/// An expression is one of
/// - A literal Datum.
/// - A reference to a single (potentially nested) field of the input Datum.
/// - A call to a compute function, with arguments specified by other Expressions.
class ARROW_EXPORT Expression {
public:
struct Call {
std::string function_name;
std::vector<Expression> arguments;
std::shared_ptr<FunctionOptions> options;
// Cached hash value
size_t hash;
// post-Bind properties:
std::shared_ptr<Function> function;
const Kernel* kernel = NULLPTR;
std::shared_ptr<KernelState> kernel_state;
TypeHolder type;
void ComputeHash();
};
std::string ToString() const;
bool Equals(const Expression& other) const;
size_t hash() const;
struct Hash {
size_t operator()(const Expression& expr) const { return expr.hash(); }
};
/// Bind this expression to the given input type, looking up Kernels and field types.
/// Some expression simplification may be performed and implicit casts will be inserted.
/// Any state necessary for execution will be initialized and returned.
Result<Expression> Bind(const TypeHolder& in, ExecContext* = NULLPTR) const;
Result<Expression> Bind(const Schema& in_schema, ExecContext* = NULLPTR) const;
// XXX someday
// Clone all KernelState in this bound expression. If any function referenced by this
// expression has mutable KernelState, it is not safe to execute or apply simplification
// passes to it (or copies of it!) from multiple threads. Cloning state produces new
// KernelStates where necessary to ensure that Expressions may be manipulated safely
// on multiple threads.
// Result<ExpressionState> CloneState() const;
// Status SetState(ExpressionState);
/// Return true if all an expression's field references have explicit types
/// and all of its functions' kernels are looked up.
bool IsBound() const;
/// Return true if this expression is composed only of Scalar literals, field
/// references, and calls to ScalarFunctions.
bool IsScalarExpression() const;
/// Return true if this expression is literal and entirely null.
bool IsNullLiteral() const;
/// Return true if this expression could evaluate to true. Will return true for any
/// unbound or non-boolean Expressions. IsSatisfiable does not (currently) do any
/// canonicalization or simplification of the expression, so even Expressions
/// which are unsatisfiable may spuriously return `true` here. This function is
/// intended for use in predicate pushdown where a filter expression is simplified
/// by a guarantee, so it assumes that trying to simplify again would be redundant.
bool IsSatisfiable() const;
// XXX someday
// Result<PipelineGraph> GetPipelines();
bool is_valid() const { return impl_ != NULLPTR; }
/// Access a Call or return nullptr if this expression is not a call
const Call* call() const;
/// Access a Datum or return nullptr if this expression is not a literal
const Datum* literal() const;
/// Access a FieldRef or return nullptr if this expression is not a field_ref
const FieldRef* field_ref() const;
/// The type to which this expression will evaluate
const DataType* type() const;
// XXX someday
// NullGeneralization::type nullable() const;
struct Parameter {
FieldRef ref;
// post-bind properties
TypeHolder type;
::arrow::internal::SmallVector<int, 2> indices;
};
const Parameter* parameter() const;
Expression() = default;
explicit Expression(Call call);
explicit Expression(Datum literal);
explicit Expression(Parameter parameter);
static bool Identical(const Expression& l, const Expression& r);
private:
using Impl = std::variant<Datum, Parameter, Call>;
std::shared_ptr<Impl> impl_;
};
inline bool operator==(const Expression& l, const Expression& r) { return l.Equals(r); }
inline bool operator!=(const Expression& l, const Expression& r) { return !l.Equals(r); }
ARROW_EXPORT void PrintTo(const Expression&, std::ostream*);
// Factories
ARROW_EXPORT
Expression literal(Datum lit);
template <typename Arg>
Expression literal(Arg&& arg) {
return literal(Datum(std::forward<Arg>(arg)));
}
ARROW_EXPORT
Expression field_ref(FieldRef ref);
ARROW_EXPORT
Expression call(std::string function, std::vector<Expression> arguments,
std::shared_ptr<FunctionOptions> options = NULLPTR);
template <typename Options, typename = typename std::enable_if<
std::is_base_of<FunctionOptions, Options>::value>::type>
Expression call(std::string function, std::vector<Expression> arguments,
Options options) {
return call(std::move(function), std::move(arguments),
std::make_shared<Options>(std::move(options)));
}
/// Assemble a list of all fields referenced by an Expression at any depth.
ARROW_EXPORT
std::vector<FieldRef> FieldsInExpression(const Expression&);
/// Check if the expression references any fields.
ARROW_EXPORT
bool ExpressionHasFieldRefs(const Expression&);
struct ARROW_EXPORT KnownFieldValues;
/// Assemble a mapping from field references to known values. This derives known values
/// from "equal" and "is_null" Expressions referencing a field and a literal.
ARROW_EXPORT
Result<KnownFieldValues> ExtractKnownFieldValues(
const Expression& guaranteed_true_predicate);
/// @}
/// \defgroup expression-passes Functions for modification of Expressions
///
/// @{
///
/// These transform bound expressions. Some transforms utilize a guarantee, which is
/// provided as an Expression which is guaranteed to evaluate to true. The
/// guaranteed_true_predicate need not be bound, but canonicalization is currently
/// deferred to producers of guarantees. For example in order to be recognized as a
/// guarantee on a field value, an Expression must be a call to "equal" with field_ref LHS
/// and literal RHS. Flipping the arguments, "is_in" with a one-long value_set, ... or
/// other semantically identical Expressions will not be recognized.
/// Weak canonicalization which establishes guarantees for subsequent passes. Even
/// equivalent Expressions may result in different canonicalized expressions.
/// TODO this could be a strong canonicalization
ARROW_EXPORT
Result<Expression> Canonicalize(Expression, ExecContext* = NULLPTR);
/// Simplify Expressions based on literal arguments (for example, add(null, x) will always
/// be null so replace the call with a null literal). Includes early evaluation of all
/// calls whose arguments are entirely literal.
ARROW_EXPORT
Result<Expression> FoldConstants(Expression);
/// Simplify Expressions by replacing with known values of the fields which it references.
ARROW_EXPORT
Result<Expression> ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values,
Expression);
/// Simplify an expression by replacing subexpressions based on a guarantee:
/// a boolean expression which is guaranteed to evaluate to `true`. For example, this is
/// used to remove redundant function calls from a filter expression or to replace a
/// reference to a constant-value field with a literal.
ARROW_EXPORT
Result<Expression> SimplifyWithGuarantee(Expression,
const Expression& guaranteed_true_predicate);
/// Replace all named field refs (e.g. "x" or "x.y") with field paths (e.g. [0] or [1,3])
///
/// This isn't usually needed and does not offer any simplification by itself. However,
/// it can be useful to normalize an expression to paths to make it simpler to work with.
ARROW_EXPORT Result<Expression> RemoveNamedRefs(Expression expression);
/// @}
// Execution
/// Create an ExecBatch suitable for passing to ExecuteScalarExpression() from a
/// RecordBatch which may have missing or incorrectly ordered columns.
/// Missing fields will be replaced with null scalars.
ARROW_EXPORT Result<ExecBatch> MakeExecBatch(const Schema& full_schema,
const Datum& partial,
Expression guarantee = literal(true));
/// Execute a scalar expression against the provided state and input ExecBatch. This
/// expression must be bound.
ARROW_EXPORT
Result<Datum> ExecuteScalarExpression(const Expression&, const ExecBatch& input,
ExecContext* = NULLPTR);
/// Convenience function for invoking against a RecordBatch
ARROW_EXPORT
Result<Datum> ExecuteScalarExpression(const Expression&, const Schema& full_schema,
const Datum& partial_input, ExecContext* = NULLPTR);
// Serialization
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> Serialize(const Expression&);
ARROW_EXPORT
Result<Expression> Deserialize(std::shared_ptr<Buffer>);
/// \defgroup expression-convenience Helpers for convenient expression creation
///
/// @{
ARROW_EXPORT Expression project(std::vector<Expression> values,
std::vector<std::string> names);
ARROW_EXPORT Expression equal(Expression lhs, Expression rhs);
ARROW_EXPORT Expression not_equal(Expression lhs, Expression rhs);
ARROW_EXPORT Expression less(Expression lhs, Expression rhs);
ARROW_EXPORT Expression less_equal(Expression lhs, Expression rhs);
ARROW_EXPORT Expression greater(Expression lhs, Expression rhs);
ARROW_EXPORT Expression greater_equal(Expression lhs, Expression rhs);
ARROW_EXPORT Expression is_null(Expression lhs, bool nan_is_null = false);
ARROW_EXPORT Expression is_valid(Expression lhs);
ARROW_EXPORT Expression and_(Expression lhs, Expression rhs);
ARROW_EXPORT Expression and_(const std::vector<Expression>&);
ARROW_EXPORT Expression or_(Expression lhs, Expression rhs);
ARROW_EXPORT Expression or_(const std::vector<Expression>&);
ARROW_EXPORT Expression not_(Expression operand);
/// @}
} // namespace compute
} // namespace arrow

View File

@@ -0,0 +1,410 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// NOTE: API is EXPERIMENTAL and will change without going through a
// deprecation cycle.
#pragma once
#include <string>
#include <utility>
#include <vector>
#include "arrow/compute/kernel.h"
#include "arrow/compute/type_fwd.h"
#include "arrow/datum.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/util/compare.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace compute {
/// \addtogroup compute-functions
/// @{
/// \brief Contains the number of required arguments for the function.
///
/// Naming conventions taken from https://en.wikipedia.org/wiki/Arity.
struct ARROW_EXPORT Arity {
/// \brief A function taking no arguments
static Arity Nullary() { return Arity(0, false); }
/// \brief A function taking 1 argument
static Arity Unary() { return Arity(1, false); }
/// \brief A function taking 2 arguments
static Arity Binary() { return Arity(2, false); }
/// \brief A function taking 3 arguments
static Arity Ternary() { return Arity(3, false); }
/// \brief A function taking a variable number of arguments
///
/// \param[in] min_args the minimum number of arguments required when
/// invoking the function
static Arity VarArgs(int min_args = 0) { return Arity(min_args, true); }
// NOTE: the 0-argument form (default constructor) is required for Cython
explicit Arity(int num_args = 0, bool is_varargs = false)
: num_args(num_args), is_varargs(is_varargs) {}
/// The number of required arguments (or the minimum number for varargs
/// functions).
int num_args;
/// If true, then the num_args is the minimum number of required arguments.
bool is_varargs = false;
};
struct ARROW_EXPORT FunctionDoc {
/// \brief A one-line summary of the function, using a verb.
///
/// For example, "Add two numeric arrays or scalars".
std::string summary;
/// \brief A detailed description of the function, meant to follow the summary.
std::string description;
/// \brief Symbolic names (identifiers) for the function arguments.
///
/// Some bindings may use this to generate nicer function signatures.
std::vector<std::string> arg_names;
// TODO add argument descriptions?
/// \brief Name of the options class, if any.
std::string options_class;
/// \brief Whether options are required for function execution
///
/// If false, then either the function does not have an options class
/// or there is a usable default options value.
bool options_required;
FunctionDoc() = default;
FunctionDoc(std::string summary, std::string description,
std::vector<std::string> arg_names, std::string options_class = "",
bool options_required = false)
: summary(std::move(summary)),
description(std::move(description)),
arg_names(std::move(arg_names)),
options_class(std::move(options_class)),
options_required(options_required) {}
static const FunctionDoc& Empty();
};
/// \brief An executor of a function with a preconfigured kernel
class ARROW_EXPORT FunctionExecutor {
public:
virtual ~FunctionExecutor() = default;
/// \brief Initialize or re-initialize the preconfigured kernel
///
/// This method may be called zero or more times. Depending on how
/// the FunctionExecutor was obtained, it may already have been initialized.
virtual Status Init(const FunctionOptions* options = NULLPTR,
ExecContext* exec_ctx = NULLPTR) = 0;
/// \brief Execute the preconfigured kernel with arguments that must fit it
///
/// The method requires the arguments be castable to the preconfigured types.
///
/// \param[in] args Arguments to execute the function on
/// \param[in] length Length of arguments batch or -1 to default it. If the
/// function has no parameters, this determines the batch length, defaulting
/// to 0. Otherwise, if the function is scalar, this must equal the argument
/// batch's inferred length or be -1 to default to it. This is ignored for
/// vector functions.
virtual Result<Datum> Execute(const std::vector<Datum>& args, int64_t length = -1) = 0;
};
/// \brief Base class for compute functions. Function implementations contain a
/// collection of "kernels" which are implementations of the function for
/// specific argument types. Selecting a viable kernel for executing a function
/// is referred to as "dispatching".
class ARROW_EXPORT Function {
public:
/// \brief The kind of function, which indicates in what contexts it is
/// valid for use.
enum Kind {
/// A function that performs scalar data operations on whole arrays of
/// data. Can generally process Array or Scalar values. The size of the
/// output will be the same as the size (or broadcasted size, in the case
/// of mixing Array and Scalar inputs) of the input.
SCALAR,
/// A function with array input and output whose behavior depends on the
/// values of the entire arrays passed, rather than the value of each scalar
/// value.
VECTOR,
/// A function that computes scalar summary statistics from array input.
SCALAR_AGGREGATE,
/// A function that computes grouped summary statistics from array input
/// and an array of group identifiers.
HASH_AGGREGATE,
/// A function that dispatches to other functions and does not contain its
/// own kernels.
META
};
virtual ~Function() = default;
/// \brief The name of the kernel. The registry enforces uniqueness of names.
const std::string& name() const { return name_; }
/// \brief The kind of kernel, which indicates in what contexts it is valid
/// for use.
Function::Kind kind() const { return kind_; }
/// \brief Contains the number of arguments the function requires, or if the
/// function accepts variable numbers of arguments.
const Arity& arity() const { return arity_; }
/// \brief Return the function documentation
const FunctionDoc& doc() const { return doc_; }
/// \brief Returns the number of registered kernels for this function.
virtual int num_kernels() const = 0;
/// \brief Return a kernel that can execute the function given the exact
/// argument types (without implicit type casts).
///
/// NB: This function is overridden in CastFunction.
virtual Result<const Kernel*> DispatchExact(const std::vector<TypeHolder>& types) const;
/// \brief Return a best-match kernel that can execute the function given the argument
/// types, after implicit casts are applied.
///
/// \param[in,out] values Argument types. An element may be modified to
/// indicate that the returned kernel only approximately matches the input
/// value descriptors; callers are responsible for casting inputs to the type
/// required by the kernel.
virtual Result<const Kernel*> DispatchBest(std::vector<TypeHolder>* values) const;
/// \brief Get a function executor with a best-matching kernel
///
/// The returned executor will by default work with the default FunctionOptions
/// and KernelContext. If you want to change that, call `FunctionExecutor::Init`.
virtual Result<std::shared_ptr<FunctionExecutor>> GetBestExecutor(
std::vector<TypeHolder> inputs) const;
/// \brief Execute the function eagerly with the passed input arguments with
/// kernel dispatch, batch iteration, and memory allocation details taken
/// care of.
///
/// If the `options` pointer is null, then `default_options()` will be used.
///
/// This function can be overridden in subclasses.
virtual Result<Datum> Execute(const std::vector<Datum>& args,
const FunctionOptions* options, ExecContext* ctx) const;
virtual Result<Datum> Execute(const ExecBatch& batch, const FunctionOptions* options,
ExecContext* ctx) const;
/// \brief Returns the default options for this function.
///
/// Whatever option semantics a Function has, implementations must guarantee
/// that default_options() is valid to pass to Execute as options.
const FunctionOptions* default_options() const { return default_options_; }
virtual Status Validate() const;
/// \brief Returns the pure property for this function.
///
/// Impure functions are those that may return different results for the same
/// input arguments. For example, a function that returns a random number is
/// not pure. An expression containing only pure functions can be simplified by
/// pre-evaluating any sub-expressions that have constant arguments.
virtual bool is_pure() const { return true; }
protected:
Function(std::string name, Function::Kind kind, const Arity& arity, FunctionDoc doc,
const FunctionOptions* default_options)
: name_(std::move(name)),
kind_(kind),
arity_(arity),
doc_(std::move(doc)),
default_options_(default_options) {}
Status CheckArity(size_t num_args) const;
std::string name_;
Function::Kind kind_;
Arity arity_;
const FunctionDoc doc_;
const FunctionOptions* default_options_ = NULLPTR;
};
namespace detail {
template <typename KernelType>
class FunctionImpl : public Function {
public:
/// \brief Return pointers to current-available kernels for inspection
std::vector<const KernelType*> kernels() const {
std::vector<const KernelType*> result;
for (const auto& kernel : kernels_) {
result.push_back(&kernel);
}
return result;
}
int num_kernels() const override { return static_cast<int>(kernels_.size()); }
protected:
FunctionImpl(std::string name, Function::Kind kind, const Arity& arity, FunctionDoc doc,
const FunctionOptions* default_options)
: Function(std::move(name), kind, arity, std::move(doc), default_options) {}
std::vector<KernelType> kernels_;
};
/// \brief Look up a kernel in a function. If no Kernel is found, nullptr is returned.
ARROW_EXPORT
const Kernel* DispatchExactImpl(const Function* func, const std::vector<TypeHolder>&);
/// \brief Return an error message if no Kernel is found.
ARROW_EXPORT
Status NoMatchingKernel(const Function* func, const std::vector<TypeHolder>&);
} // namespace detail
/// \brief A function that executes elementwise operations on arrays or
/// scalars, and therefore whose results generally do not depend on the order
/// of the values in the arguments. Accepts and returns arrays that are all of
/// the same size. These functions roughly correspond to the functions used in
/// SQL expressions.
class ARROW_EXPORT ScalarFunction : public detail::FunctionImpl<ScalarKernel> {
public:
using KernelType = ScalarKernel;
ScalarFunction(std::string name, const Arity& arity, FunctionDoc doc,
const FunctionOptions* default_options = NULLPTR, bool is_pure = true)
: detail::FunctionImpl<ScalarKernel>(std::move(name), Function::SCALAR, arity,
std::move(doc), default_options),
is_pure_(is_pure) {}
/// \brief Add a kernel with given input/output types, no required state
/// initialization, preallocation for fixed-width types, and default null
/// handling (intersect validity bitmaps of inputs).
Status AddKernel(std::vector<InputType> in_types, OutputType out_type,
ArrayKernelExec exec, KernelInit init = NULLPTR,
std::shared_ptr<MatchConstraint> constraint = NULLPTR);
/// \brief Add a kernel (function implementation). Returns error if the
/// kernel's signature does not match the function's arity.
Status AddKernel(ScalarKernel kernel);
/// \brief Returns the pure property for this function.
bool is_pure() const override { return is_pure_; }
private:
const bool is_pure_;
};
/// \brief A function that executes general array operations that may yield
/// outputs of different sizes or have results that depend on the whole array
/// contents. These functions roughly correspond to the functions found in
/// non-SQL array languages like APL and its derivatives.
class ARROW_EXPORT VectorFunction : public detail::FunctionImpl<VectorKernel> {
public:
using KernelType = VectorKernel;
VectorFunction(std::string name, const Arity& arity, FunctionDoc doc,
const FunctionOptions* default_options = NULLPTR)
: detail::FunctionImpl<VectorKernel>(std::move(name), Function::VECTOR, arity,
std::move(doc), default_options) {}
/// \brief Add a simple kernel with given input/output types, no required
/// state initialization, no data preallocation, and no preallocation of the
/// validity bitmap.
Status AddKernel(std::vector<InputType> in_types, OutputType out_type,
ArrayKernelExec exec, KernelInit init = NULLPTR);
/// \brief Add a kernel (function implementation). Returns error if the
/// kernel's signature does not match the function's arity.
Status AddKernel(VectorKernel kernel);
};
class ARROW_EXPORT ScalarAggregateFunction
: public detail::FunctionImpl<ScalarAggregateKernel> {
public:
using KernelType = ScalarAggregateKernel;
ScalarAggregateFunction(std::string name, const Arity& arity, FunctionDoc doc,
const FunctionOptions* default_options = NULLPTR)
: detail::FunctionImpl<ScalarAggregateKernel>(std::move(name),
Function::SCALAR_AGGREGATE, arity,
std::move(doc), default_options) {}
/// \brief Add a kernel (function implementation). Returns error if the
/// kernel's signature does not match the function's arity.
Status AddKernel(ScalarAggregateKernel kernel);
};
class ARROW_EXPORT HashAggregateFunction
: public detail::FunctionImpl<HashAggregateKernel> {
public:
using KernelType = HashAggregateKernel;
HashAggregateFunction(std::string name, const Arity& arity, FunctionDoc doc,
const FunctionOptions* default_options = NULLPTR)
: detail::FunctionImpl<HashAggregateKernel>(std::move(name),
Function::HASH_AGGREGATE, arity,
std::move(doc), default_options) {}
/// \brief Add a kernel (function implementation). Returns error if the
/// kernel's signature does not match the function's arity.
Status AddKernel(HashAggregateKernel kernel);
};
/// \brief A function that dispatches to other functions. Must implement
/// MetaFunction::ExecuteImpl.
///
/// For Array, ChunkedArray, and Scalar Datum kinds, may rely on the execution
/// of concrete Function types, but must handle other Datum kinds on its own.
class ARROW_EXPORT MetaFunction : public Function {
public:
int num_kernels() const override { return 0; }
Result<Datum> Execute(const std::vector<Datum>& args, const FunctionOptions* options,
ExecContext* ctx) const override;
Result<Datum> Execute(const ExecBatch& batch, const FunctionOptions* options,
ExecContext* ctx) const override;
protected:
virtual Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
const FunctionOptions* options,
ExecContext* ctx) const = 0;
MetaFunction(std::string name, const Arity& arity, FunctionDoc doc,
const FunctionOptions* default_options = NULLPTR)
: Function(std::move(name), Function::META, arity, std::move(doc),
default_options) {}
};
/// @}
} // namespace compute
} // namespace arrow

View File

@@ -0,0 +1,81 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// NOTE: API is EXPERIMENTAL and will change without going through a
// deprecation cycle.
#pragma once
#include "arrow/compute/type_fwd.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace compute {
/// \addtogroup compute-functions
/// @{
/// \brief Extension point for defining options outside libarrow (but
/// still within this project).
class ARROW_EXPORT FunctionOptionsType {
public:
virtual ~FunctionOptionsType() = default;
virtual const char* type_name() const = 0;
virtual std::string Stringify(const FunctionOptions&) const = 0;
virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0;
virtual Result<std::shared_ptr<Buffer>> Serialize(const FunctionOptions&) const;
virtual Result<std::unique_ptr<FunctionOptions>> Deserialize(
const Buffer& buffer) const;
virtual std::unique_ptr<FunctionOptions> Copy(const FunctionOptions&) const = 0;
};
/// \brief Base class for specifying options configuring a function's behavior,
/// such as error handling.
class ARROW_EXPORT FunctionOptions : public util::EqualityComparable<FunctionOptions> {
public:
virtual ~FunctionOptions() = default;
const FunctionOptionsType* options_type() const { return options_type_; }
const char* type_name() const { return options_type()->type_name(); }
bool Equals(const FunctionOptions& other) const;
std::string ToString() const;
std::unique_ptr<FunctionOptions> Copy() const;
/// \brief Serialize an options struct to a buffer.
Result<std::shared_ptr<Buffer>> Serialize() const;
/// \brief Deserialize an options struct from a buffer.
/// Note: this will only look for `type_name` in the default FunctionRegistry;
/// to use a custom FunctionRegistry, look up the FunctionOptionsType, then
/// call FunctionOptionsType::Deserialize().
static Result<std::unique_ptr<FunctionOptions>> Deserialize(
const std::string& type_name, const Buffer& buffer);
protected:
explicit FunctionOptions(const FunctionOptionsType* type) : options_type_(type) {}
const FunctionOptionsType* options_type_;
};
ARROW_EXPORT void PrintTo(const FunctionOptions&, std::ostream*);
/// @}
} // namespace compute
} // namespace arrow

View File

@@ -0,0 +1,32 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "arrow/compute/visibility.h"
#include "arrow/status.h"
namespace arrow::compute {
/// \brief Initialize the compute module.
///
/// Register the compute kernel functions to be available on the
/// global FunctionRegistry.
/// This function will only be available if ARROW_COMPUTE is enabled.
ARROW_COMPUTE_EXPORT Status Initialize();
} // namespace arrow::compute

View File

@@ -0,0 +1,772 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// NOTE: API is EXPERIMENTAL and will change without going through a
// deprecation cycle
#pragma once
#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "arrow/buffer.h"
#include "arrow/compute/exec.h"
#include "arrow/datum.h"
#include "arrow/device_allocation_type_set.h"
#include "arrow/memory_pool.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
// macOS defines PREALLOCATE as a preprocessor macro in the header sys/vnode.h.
// No other BSD seems to do so. The name is used as an identifier in MemAllocation enum.
#if defined(__APPLE__) && defined(PREALLOCATE)
# undef PREALLOCATE
#endif
namespace arrow {
namespace compute {
class FunctionOptions;
/// \brief Base class for opaque kernel-specific state. For example, if there
/// is some kind of initialization required.
struct ARROW_EXPORT KernelState {
virtual ~KernelState() = default;
};
/// \brief Context/state for the execution of a particular kernel.
class ARROW_EXPORT KernelContext {
public:
// Can pass optional backreference; not used consistently for the
// moment but will be made so in the future
explicit KernelContext(ExecContext* exec_ctx, const Kernel* kernel = NULLPTR)
: exec_ctx_(exec_ctx), kernel_(kernel) {}
/// \brief Allocate buffer from the context's memory pool. The contents are
/// not initialized.
Result<std::shared_ptr<ResizableBuffer>> Allocate(int64_t nbytes);
/// \brief Allocate buffer for bitmap from the context's memory pool. Like
/// Allocate, the contents of the buffer are not initialized but the last
/// byte is preemptively zeroed to help avoid ASAN or valgrind issues.
Result<std::shared_ptr<ResizableBuffer>> AllocateBitmap(int64_t num_bits);
/// \brief Assign the active KernelState to be utilized for each stage of
/// kernel execution. Ownership and memory lifetime of the KernelState must
/// be minded separately.
void SetState(KernelState* state) { state_ = state; }
// Set kernel that is being invoked since some kernel
// implementations will examine the kernel state.
void SetKernel(const Kernel* kernel) { kernel_ = kernel; }
KernelState* state() { return state_; }
/// \brief Configuration related to function execution that is to be shared
/// across multiple kernels.
ExecContext* exec_context() { return exec_ctx_; }
/// \brief The memory pool to use for allocations. For now, it uses the
/// MemoryPool contained in the ExecContext used to create the KernelContext.
MemoryPool* memory_pool() { return exec_ctx_->memory_pool(); }
const Kernel* kernel() const { return kernel_; }
private:
ExecContext* exec_ctx_;
KernelState* state_ = NULLPTR;
const Kernel* kernel_ = NULLPTR;
};
/// \brief An type-checking interface to permit customizable validation rules
/// for use with InputType and KernelSignature. This is for scenarios where the
/// acceptance is not an exact type instance, such as a TIMESTAMP type for a
/// specific TimeUnit, but permitting any time zone.
struct ARROW_EXPORT TypeMatcher {
virtual ~TypeMatcher() = default;
/// \brief Return true if this matcher accepts the data type.
virtual bool Matches(const DataType& type) const = 0;
/// \brief A human-interpretable string representation of what the type
/// matcher checks for, usable when printing KernelSignature or formatting
/// error messages.
virtual std::string ToString() const = 0;
/// \brief Return true if this TypeMatcher contains the same matching rule as
/// the other. Currently depends on RTTI.
virtual bool Equals(const TypeMatcher& other) const = 0;
};
namespace match {
/// \brief Match any DataType instance having the same DataType::id.
ARROW_EXPORT std::shared_ptr<TypeMatcher> SameTypeId(Type::type type_id);
/// \brief Match any TimestampType instance having the same unit, but the time
/// zones can be different.
ARROW_EXPORT std::shared_ptr<TypeMatcher> TimestampTypeUnit(TimeUnit::type unit);
ARROW_EXPORT std::shared_ptr<TypeMatcher> Time32TypeUnit(TimeUnit::type unit);
ARROW_EXPORT std::shared_ptr<TypeMatcher> Time64TypeUnit(TimeUnit::type unit);
ARROW_EXPORT std::shared_ptr<TypeMatcher> DurationTypeUnit(TimeUnit::type unit);
// \brief Match any integer type
ARROW_EXPORT std::shared_ptr<TypeMatcher> Integer();
// Match types using 32-bit varbinary representation
ARROW_EXPORT std::shared_ptr<TypeMatcher> BinaryLike();
// Match types using 64-bit varbinary representation
ARROW_EXPORT std::shared_ptr<TypeMatcher> LargeBinaryLike();
// Match any fixed binary type
ARROW_EXPORT std::shared_ptr<TypeMatcher> FixedSizeBinaryLike();
// \brief Match any primitive type (boolean or any type representable as a C
// Type)
ARROW_EXPORT std::shared_ptr<TypeMatcher> Primitive();
// \brief Match any integer type that can be used as run-end in run-end encoded
// arrays
ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndInteger();
/// \brief Match run-end encoded types that use any valid run-end type and
/// encode specific value types
///
/// @param[in] value_type_matcher a matcher that is applied to the values field
ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(
std::shared_ptr<TypeMatcher> value_type_matcher);
/// \brief Match run-end encoded types that use any valid run-end type and
/// encode specific value types
///
/// @param[in] value_type_id a type id that the type of the values field should match
ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(Type::type value_type_id);
/// \brief Match run-end encoded types that encode specific run-end and value types
///
/// @param[in] run_end_type_matcher a matcher that is applied to the run_ends field
/// @param[in] value_type_matcher a matcher that is applied to the values field
ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(
std::shared_ptr<TypeMatcher> run_end_type_matcher,
std::shared_ptr<TypeMatcher> value_type_matcher);
} // namespace match
/// \brief An object used for type-checking arguments to be passed to a kernel
/// and stored in a KernelSignature. The type-checking rule can be supplied
/// either with an exact DataType instance or a custom TypeMatcher.
class ARROW_EXPORT InputType {
public:
/// \brief The kind of type-checking rule that the InputType contains.
enum Kind {
/// \brief Accept any value type.
ANY_TYPE,
/// \brief A fixed arrow::DataType and will only exact match having this
/// exact type (e.g. same TimestampType unit, same decimal scale and
/// precision, or same nested child types).
EXACT_TYPE,
/// \brief Uses a TypeMatcher implementation to check the type.
USE_TYPE_MATCHER
};
/// \brief Accept any value type
InputType() : kind_(ANY_TYPE) {}
/// \brief Accept an exact value type.
InputType(std::shared_ptr<DataType> type) // NOLINT implicit construction
: kind_(EXACT_TYPE), type_(std::move(type)) {}
/// \brief Use the passed TypeMatcher to type check.
InputType(std::shared_ptr<TypeMatcher> type_matcher) // NOLINT implicit construction
: kind_(USE_TYPE_MATCHER), type_matcher_(std::move(type_matcher)) {}
/// \brief Match any type with the given Type::type. Uses a TypeMatcher for
/// its implementation.
InputType(Type::type type_id) // NOLINT implicit construction
: InputType(match::SameTypeId(type_id)) {}
InputType(const InputType& other) { CopyInto(other); }
void operator=(const InputType& other) { CopyInto(other); }
InputType(InputType&& other) { MoveInto(std::forward<InputType>(other)); }
void operator=(InputType&& other) { MoveInto(std::forward<InputType>(other)); }
// \brief Match any input (array, scalar of any type)
static InputType Any() { return InputType(); }
/// \brief Return true if this input type matches the same type cases as the
/// other.
bool Equals(const InputType& other) const;
bool operator==(const InputType& other) const { return this->Equals(other); }
bool operator!=(const InputType& other) const { return !(*this == other); }
/// \brief Return hash code.
size_t Hash() const;
/// \brief Render a human-readable string representation.
std::string ToString() const;
/// \brief Return true if the Datum matches this argument kind in
/// type (and only allows scalar or array-like Datums).
bool Matches(const Datum& value) const;
/// \brief Return true if the type matches this InputType
bool Matches(const DataType& type) const;
/// \brief The type matching rule that this InputType uses.
Kind kind() const { return kind_; }
/// \brief For InputType::EXACT_TYPE kind, the exact type that this InputType
/// must match. Otherwise this function should not be used and will assert in
/// debug builds.
const std::shared_ptr<DataType>& type() const;
/// \brief For InputType::USE_TYPE_MATCHER, the TypeMatcher to be used for
/// checking the type of a value. Otherwise this function should not be used
/// and will assert in debug builds.
const TypeMatcher& type_matcher() const;
private:
void CopyInto(const InputType& other) {
this->kind_ = other.kind_;
this->type_ = other.type_;
this->type_matcher_ = other.type_matcher_;
}
void MoveInto(InputType&& other) {
this->kind_ = other.kind_;
this->type_ = std::move(other.type_);
this->type_matcher_ = std::move(other.type_matcher_);
}
Kind kind_;
// For EXACT_TYPE Kind
std::shared_ptr<DataType> type_;
// For USE_TYPE_MATCHER Kind
std::shared_ptr<TypeMatcher> type_matcher_;
};
/// \brief Container to capture both exact and input-dependent output types.
class ARROW_EXPORT OutputType {
public:
/// \brief An enum indicating whether the value type is an invariant fixed
/// value or one that's computed by a kernel-defined resolver function.
enum ResolveKind { FIXED, COMPUTED };
/// Type resolution function. Given input types, return output type. This
/// function MAY may use the kernel state to decide the output type based on
/// the FunctionOptions.
///
/// This function SHOULD _not_ be used to check for arity, that is to be
/// performed one or more layers above.
using Resolver =
std::function<Result<TypeHolder>(KernelContext*, const std::vector<TypeHolder>&)>;
/// \brief Output an exact type
OutputType(std::shared_ptr<DataType> type) // NOLINT implicit construction
: kind_(FIXED), type_(std::move(type)) {}
/// \brief Output a computed type depending on actual input types
template <typename Fn>
OutputType(Fn resolver) // NOLINT implicit construction
: kind_(COMPUTED), resolver_(std::move(resolver)) {}
OutputType(const OutputType& other) {
this->kind_ = other.kind_;
this->type_ = other.type_;
this->resolver_ = other.resolver_;
}
OutputType(OutputType&& other) {
this->kind_ = other.kind_;
this->type_ = std::move(other.type_);
this->resolver_ = other.resolver_;
}
OutputType& operator=(const OutputType&) = default;
OutputType& operator=(OutputType&&) = default;
/// \brief Return the type of the expected output value of the kernel given
/// the input argument types. The resolver may make use of state information
/// kept in the KernelContext.
Result<TypeHolder> Resolve(KernelContext* ctx,
const std::vector<TypeHolder>& args) const;
/// \brief The exact output value type for the FIXED kind.
const std::shared_ptr<DataType>& type() const;
/// \brief For use with COMPUTED resolution strategy. It may be more
/// convenient to invoke this with OutputType::Resolve returned from this
/// method.
const Resolver& resolver() const;
/// \brief Render a human-readable string representation.
std::string ToString() const;
/// \brief Return the kind of type resolution of this output type, whether
/// fixed/invariant or computed by a resolver.
ResolveKind kind() const { return kind_; }
private:
ResolveKind kind_;
// For FIXED resolution
std::shared_ptr<DataType> type_;
// For COMPUTED resolution
Resolver resolver_ = NULLPTR;
};
/// \brief Additional constraints to apply to the input types of a kernel when matching a
/// specific kernel signature.
class ARROW_EXPORT MatchConstraint {
public:
virtual ~MatchConstraint() = default;
/// \brief Return true if the input types satisfy the constraint.
virtual bool Matches(const std::vector<TypeHolder>& types) const = 0;
/// \brief Convenience function to create a MatchConstraint from a match function.
static std::shared_ptr<MatchConstraint> Make(
std::function<bool(const std::vector<TypeHolder>&)> matches);
};
/// \brief Constraint that all input types are decimal types and have the same scale.
ARROW_EXPORT std::shared_ptr<MatchConstraint> DecimalsHaveSameScale();
/// \brief Holds the input types, optional match constraint and output type of the kernel.
///
/// VarArgs functions with minimum N arguments should pass up to N input types to be
/// used to validate the input types of a function invocation. The first N-1 types
/// will be matched against the first N-1 arguments, and the last type will be
/// matched against the remaining arguments.
class ARROW_EXPORT KernelSignature {
public:
KernelSignature(std::vector<InputType> in_types, OutputType out_type,
bool is_varargs = false,
std::shared_ptr<MatchConstraint> constraint = NULLPTR);
/// \brief Convenience ctor since make_shared can be awkward
static std::shared_ptr<KernelSignature> Make(
std::vector<InputType> in_types, OutputType out_type, bool is_varargs = false,
std::shared_ptr<MatchConstraint> constraint = NULLPTR);
/// \brief Return true if the signature is compatible with the list of input
/// value descriptors and satisfies the match constraint, if any.
bool MatchesInputs(const std::vector<TypeHolder>& types) const;
/// \brief Returns true if the input types of each signature are
/// equal. Well-formed functions should have a deterministic output type
/// given input types, but currently it is the responsibility of the
/// developer to ensure this.
bool Equals(const KernelSignature& other) const;
bool operator==(const KernelSignature& other) const { return this->Equals(other); }
bool operator!=(const KernelSignature& other) const { return !(*this == other); }
/// \brief Compute a hash code for the signature
size_t Hash() const;
/// \brief The input types for the kernel. For VarArgs functions, this should
/// generally contain a single validator to use for validating all of the
/// function arguments.
const std::vector<InputType>& in_types() const { return in_types_; }
/// \brief The output type for the kernel. Use Resolve to return the
/// exact output given input argument types, since many kernels'
/// output types depend on their input types (or their type
/// metadata).
const OutputType& out_type() const { return out_type_; }
/// \brief Render a human-readable string representation
std::string ToString() const;
bool is_varargs() const { return is_varargs_; }
private:
std::vector<InputType> in_types_;
OutputType out_type_;
bool is_varargs_;
std::shared_ptr<MatchConstraint> constraint_;
// For caching the hash code after it's computed the first time
mutable uint64_t hash_code_;
};
/// \brief A function may contain multiple variants of a kernel for a given
/// type combination for different SIMD levels. Based on the active system's
/// CPU info or the user's preferences, we can elect to use one over the other.
struct SimdLevel {
enum type { NONE = 0, SSE4_2, AVX, AVX2, AVX512, NEON, MAX };
};
/// \brief The strategy to use for propagating or otherwise populating the
/// validity bitmap of a kernel output.
struct NullHandling {
enum type {
/// Compute the output validity bitmap by intersecting the validity bitmaps
/// of the arguments using bitwise-and operations. This means that values
/// in the output are valid/non-null only if the corresponding values in
/// all input arguments were valid/non-null. Kernel generally need not
/// touch the bitmap thereafter, but a kernel's exec function is permitted
/// to alter the bitmap after the null intersection is computed if it needs
/// to.
INTERSECTION,
/// Kernel expects a pre-allocated buffer to write the result bitmap
/// into. The preallocated memory is not zeroed (except for the last byte),
/// so the kernel should ensure to completely populate the bitmap.
COMPUTED_PREALLOCATE,
/// Kernel allocates and sets the validity bitmap of the output.
COMPUTED_NO_PREALLOCATE,
/// Kernel output is never null and a validity bitmap does not need to be
/// allocated.
OUTPUT_NOT_NULL
};
};
/// \brief The preference for memory preallocation of fixed-width type outputs
/// in kernel execution.
struct MemAllocation {
enum type {
// For data types that support pre-allocation (i.e. fixed-width), the
// kernel expects to be provided a pre-allocated data buffer to write
// into. Non-fixed-width types must always allocate their own data
// buffers. The allocation made for the same length as the execution batch,
// so vector kernels yielding differently sized output should not use this.
//
// It is valid for the data to not be preallocated but the validity bitmap
// is (or is computed using the intersection/bitwise-and method).
//
// For variable-size output types like BinaryType or StringType, or for
// nested types, this option has no effect.
PREALLOCATE,
// The kernel is responsible for allocating its own data buffer for
// fixed-width type outputs.
NO_PREALLOCATE
};
};
struct Kernel;
/// \brief Arguments to pass to an KernelInit function. A struct is used to help
/// avoid API breakage should the arguments passed need to be expanded.
struct KernelInitArgs {
/// \brief A pointer to the kernel being initialized. The init function may
/// depend on the kernel's KernelSignature or other data contained there.
const Kernel* kernel;
/// \brief The types of the input arguments that the kernel is
/// about to be executed against.
const std::vector<TypeHolder>& inputs;
/// \brief Opaque options specific to this kernel. May be nullptr for functions
/// that do not require options.
const FunctionOptions* options;
};
/// \brief Common initializer function for all kernel types.
using KernelInit = std::function<Result<std::unique_ptr<KernelState>>(
KernelContext*, const KernelInitArgs&)>;
/// \brief Base type for kernels. Contains the function signature and
/// optionally the state initialization function, along with some common
/// attributes
struct ARROW_EXPORT Kernel {
Kernel() = default;
Kernel(std::shared_ptr<KernelSignature> sig, KernelInit init)
: signature(std::move(sig)), init(std::move(init)) {}
Kernel(std::vector<InputType> in_types, OutputType out_type, KernelInit init)
: Kernel(KernelSignature::Make(std::move(in_types), std::move(out_type)),
std::move(init)) {}
/// \brief The "signature" of the kernel containing the InputType input
/// argument validators and OutputType output type resolver.
std::shared_ptr<KernelSignature> signature;
/// \brief Create a new KernelState for invocations of this kernel, e.g. to
/// set up any options or state relevant for execution.
KernelInit init;
/// \brief Create a vector of new KernelState for invocations of this kernel.
static Status InitAll(KernelContext*, const KernelInitArgs&,
std::vector<std::unique_ptr<KernelState>>*);
/// \brief Indicates whether execution can benefit from parallelization
/// (splitting large chunks into smaller chunks and using multiple
/// threads). Some kernels may not support parallel execution at
/// all. Synchronization and concurrency-related issues are currently the
/// responsibility of the Kernel's implementation.
bool parallelizable = true;
/// \brief Indicates the level of SIMD instruction support in the host CPU is
/// required to use the function. The intention is for functions to be able to
/// contain multiple kernels with the same signature but different levels of SIMD,
/// so that the most optimized kernel supported on a host's processor can be chosen.
SimdLevel::type simd_level = SimdLevel::NONE;
// Additional kernel-specific data
std::shared_ptr<KernelState> data;
};
/// \brief The scalar kernel execution API that must be implemented for SCALAR
/// kernel types. This includes both stateless and stateful kernels. Kernels
/// depending on some execution state access that state via subclasses of
/// KernelState set on the KernelContext object. Implementations should
/// endeavor to write into pre-allocated memory if they are able, though for
/// some kernels (e.g. in cases when a builder like StringBuilder) must be
/// employed this may not be possible.
using ArrayKernelExec = Status (*)(KernelContext*, const ExecSpan&, ExecResult*);
/// \brief Kernel data structure for implementations of ScalarFunction. In
/// addition to the members found in Kernel, contains the null handling
/// and memory pre-allocation preferences.
struct ARROW_EXPORT ScalarKernel : public Kernel {
ScalarKernel() = default;
ScalarKernel(std::shared_ptr<KernelSignature> sig, ArrayKernelExec exec,
KernelInit init = NULLPTR)
: Kernel(std::move(sig), init), exec(exec) {}
ScalarKernel(std::vector<InputType> in_types, OutputType out_type, ArrayKernelExec exec,
KernelInit init = NULLPTR)
: Kernel(std::move(in_types), std::move(out_type), std::move(init)), exec(exec) {}
/// \brief Perform a single invocation of this kernel. Depending on the
/// implementation, it may only write into preallocated memory, while in some
/// cases it will allocate its own memory. Any required state is managed
/// through the KernelContext.
ArrayKernelExec exec;
/// \brief Writing execution results into larger contiguous allocations
/// requires that the kernel be able to write into sliced output ArrayData*,
/// including sliced output validity bitmaps. Some kernel implementations may
/// not be able to do this, so setting this to false disables this
/// functionality.
bool can_write_into_slices = true;
// For scalar functions preallocated data and intersecting arg validity
// bitmaps is a reasonable default
NullHandling::type null_handling = NullHandling::INTERSECTION;
MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE;
};
// ----------------------------------------------------------------------
// VectorKernel (for VectorFunction)
/// \brief Kernel data structure for implementations of VectorFunction. In
/// contains an optional finalizer function, the null handling and memory
/// pre-allocation preferences (which have different defaults from
/// ScalarKernel), and some other execution-related options.
struct ARROW_EXPORT VectorKernel : public Kernel {
/// \brief See VectorKernel::finalize member for usage
using FinalizeFunc = std::function<Status(KernelContext*, std::vector<Datum>*)>;
/// \brief Function for executing a stateful VectorKernel against a
/// ChunkedArray input. Does not need to be defined for all VectorKernels
using ChunkedExec = Status (*)(KernelContext*, const ExecBatch&, Datum* out);
VectorKernel() = default;
VectorKernel(std::vector<InputType> in_types, OutputType out_type, ArrayKernelExec exec,
KernelInit init = NULLPTR, FinalizeFunc finalize = NULLPTR)
: Kernel(std::move(in_types), std::move(out_type), std::move(init)),
exec(exec),
finalize(std::move(finalize)) {}
VectorKernel(std::shared_ptr<KernelSignature> sig, ArrayKernelExec exec,
KernelInit init = NULLPTR, FinalizeFunc finalize = NULLPTR)
: Kernel(std::move(sig), std::move(init)),
exec(exec),
finalize(std::move(finalize)) {}
/// \brief Perform a single invocation of this kernel. Any required state is
/// managed through the KernelContext.
ArrayKernelExec exec;
/// \brief Execute the kernel on a ChunkedArray. Does not need to be defined
ChunkedExec exec_chunked = NULLPTR;
/// \brief For VectorKernel, convert intermediate results into finalized
/// results. Mutates input argument. Some kernels may accumulate state
/// (example: hashing-related functions) through processing chunked inputs, and
/// then need to attach some accumulated state to each of the outputs of
/// processing each chunk of data.
FinalizeFunc finalize;
/// Since vector kernels generally are implemented rather differently from
/// scalar/elementwise kernels (and they may not even yield arrays of the same
/// size), so we make the developer opt-in to any memory preallocation rather
/// than having to turn it off.
NullHandling::type null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
MemAllocation::type mem_allocation = MemAllocation::NO_PREALLOCATE;
/// \brief Writing execution results into larger contiguous allocations
/// requires that the kernel be able to write into sliced output ArrayData*,
/// including sliced output validity bitmaps. Some kernel implementations may
/// not be able to do this, so setting this to false disables this
/// functionality.
bool can_write_into_slices = true;
/// Some vector kernels can do chunkwise execution using ExecSpanIterator,
/// in some cases accumulating some state. Other kernels (like Take) need to
/// be passed whole arrays and don't work on ChunkedArray inputs
bool can_execute_chunkwise = true;
/// Some kernels (like unique and value_counts) yield non-chunked output from
/// chunked-array inputs. This option controls how the results are boxed when
/// returned from ExecVectorFunction
///
/// true -> ChunkedArray
/// false -> Array
bool output_chunked = true;
};
// ----------------------------------------------------------------------
// ScalarAggregateKernel (for ScalarAggregateFunction)
using ScalarAggregateConsume = Status (*)(KernelContext*, const ExecSpan&);
using ScalarAggregateMerge = Status (*)(KernelContext*, KernelState&&, KernelState*);
// Finalize returns Datum to permit multiple return values
using ScalarAggregateFinalize = Status (*)(KernelContext*, Datum*);
/// \brief Kernel data structure for implementations of
/// ScalarAggregateFunction. The four necessary components of an aggregation
/// kernel are the init, consume, merge, and finalize functions.
///
/// * init: creates a new KernelState for a kernel.
/// * consume: processes an ExecSpan and updates the KernelState found in the
/// KernelContext.
/// * merge: combines one KernelState with another.
/// * finalize: produces the end result of the aggregation using the
/// KernelState in the KernelContext.
struct ARROW_EXPORT ScalarAggregateKernel : public Kernel {
ScalarAggregateKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
ScalarAggregateConsume consume, ScalarAggregateMerge merge,
ScalarAggregateFinalize finalize, const bool ordered)
: Kernel(std::move(sig), std::move(init)),
consume(consume),
merge(merge),
finalize(finalize),
ordered(ordered) {}
ScalarAggregateKernel(std::vector<InputType> in_types, OutputType out_type,
KernelInit init, ScalarAggregateConsume consume,
ScalarAggregateMerge merge, ScalarAggregateFinalize finalize,
const bool ordered)
: ScalarAggregateKernel(
KernelSignature::Make(std::move(in_types), std::move(out_type)),
std::move(init), consume, merge, finalize, ordered) {}
/// \brief Merge a vector of KernelStates into a single KernelState.
/// The merged state will be returned and will be set on the KernelContext.
static Result<std::unique_ptr<KernelState>> MergeAll(
const ScalarAggregateKernel* kernel, KernelContext* ctx,
std::vector<std::unique_ptr<KernelState>> states);
ScalarAggregateConsume consume;
ScalarAggregateMerge merge;
ScalarAggregateFinalize finalize;
/// \brief Whether this kernel requires ordering
/// Some aggregations, such as, "first", requires some kind of input order. The
/// order can be implicit, e.g., the order of the input data, or explicit, e.g.
/// the ordering specified with a window aggregation.
/// The caller of the aggregate kernel is responsible for passing data in some
/// defined order to the kernel. The flag here is a way for the kernel to tell
/// the caller that data passed to the kernel must be defined in some order.
bool ordered = false;
};
// ----------------------------------------------------------------------
// HashAggregateKernel (for HashAggregateFunction)
using HashAggregateResize = Status (*)(KernelContext*, int64_t);
using HashAggregateConsume = Status (*)(KernelContext*, const ExecSpan&);
using HashAggregateMerge = Status (*)(KernelContext*, KernelState&&, const ArrayData&);
// Finalize returns Datum to permit multiple return values
using HashAggregateFinalize = Status (*)(KernelContext*, Datum*);
/// \brief Kernel data structure for implementations of
/// HashAggregateFunction. The four necessary components of an aggregation
/// kernel are the init, consume, merge, and finalize functions.
///
/// * init: creates a new KernelState for a kernel.
/// * resize: ensure that the KernelState can accommodate the specified number of groups.
/// * consume: processes an ExecSpan (which includes the argument as well
/// as an array of group identifiers) and updates the KernelState found in the
/// KernelContext.
/// * merge: combines one KernelState with another.
/// * finalize: produces the end result of the aggregation using the
/// KernelState in the KernelContext.
struct ARROW_EXPORT HashAggregateKernel : public Kernel {
HashAggregateKernel() = default;
HashAggregateKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
HashAggregateResize resize, HashAggregateConsume consume,
HashAggregateMerge merge, HashAggregateFinalize finalize,
const bool ordered)
: Kernel(std::move(sig), std::move(init)),
resize(resize),
consume(consume),
merge(merge),
finalize(finalize),
ordered(ordered) {}
HashAggregateKernel(std::vector<InputType> in_types, OutputType out_type,
KernelInit init, HashAggregateConsume consume,
HashAggregateResize resize, HashAggregateMerge merge,
HashAggregateFinalize finalize, const bool ordered)
: HashAggregateKernel(
KernelSignature::Make(std::move(in_types), std::move(out_type)),
std::move(init), resize, consume, merge, finalize, ordered) {}
HashAggregateResize resize;
HashAggregateConsume consume;
HashAggregateMerge merge;
HashAggregateFinalize finalize;
/// @brief whether the summarizer requires ordering
/// This is similar to ScalarAggregateKernel. See ScalarAggregateKernel
/// for detailed doc of this variable.
bool ordered = false;
};
} // namespace compute
} // namespace arrow

View File

@@ -0,0 +1,120 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <string>
#include <vector>
#include "arrow/type.h"
#include "arrow/util/compare.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace compute {
enum class SortOrder {
/// Arrange values in increasing order
Ascending,
/// Arrange values in decreasing order
Descending,
};
enum class NullPlacement {
/// Place nulls and NaNs before any non-null values.
/// NaNs will come after nulls.
AtStart,
/// Place nulls and NaNs after any non-null values.
/// NaNs will come before nulls.
AtEnd,
};
/// \brief One sort key for PartitionNthIndices (TODO) and SortIndices
class ARROW_EXPORT SortKey : public util::EqualityComparable<SortKey> {
public:
explicit SortKey(FieldRef target, SortOrder order = SortOrder::Ascending)
: target(std::move(target)), order(order) {}
bool Equals(const SortKey& other) const;
std::string ToString() const;
/// A FieldRef targeting the sort column.
FieldRef target;
/// How to order by this sort key.
SortOrder order;
};
class ARROW_EXPORT Ordering : public util::EqualityComparable<Ordering> {
public:
Ordering(std::vector<SortKey> sort_keys,
NullPlacement null_placement = NullPlacement::AtStart)
: sort_keys_(std::move(sort_keys)), null_placement_(null_placement) {}
/// true if data ordered by other is also ordered by this
///
/// For example, if data is ordered by [a, b, c] then it is also ordered
/// by [a, b] but not by [b, c] or [a, b, c, d].
///
/// [a, b].IsSuborderOf([a, b, c]) - true
/// [a, b, c].IsSuborderOf([a, b, c]) - true
/// [b, c].IsSuborderOf([a, b, c]) - false
/// [a, b, c, d].IsSuborderOf([a, b, c]) - false
///
/// The implicit ordering is not a suborder of any other ordering and
/// no other ordering is a suborder of it. The implicit ordering is not a
/// suborder of itself.
///
/// The unordered ordering is a suborder of all other orderings but no
/// other ordering is a suborder of it. The unordered ordering is a suborder
/// of itself.
///
/// The unordered ordering is a suborder of the implicit ordering.
bool IsSuborderOf(const Ordering& other) const;
bool Equals(const Ordering& other) const;
std::string ToString() const;
bool is_implicit() const { return is_implicit_; }
bool is_unordered() const { return !is_implicit_ && sort_keys_.empty(); }
const std::vector<SortKey>& sort_keys() const { return sort_keys_; }
NullPlacement null_placement() const { return null_placement_; }
static const Ordering& Implicit() {
static const Ordering kImplicit(true);
return kImplicit;
}
static const Ordering& Unordered() {
static const Ordering kUnordered(false);
// It is also possible to get an unordered ordering by passing in an empty vector
// using the normal constructor. This is ok and useful when ordering comes from user
// input.
return kUnordered;
}
private:
explicit Ordering(bool is_implicit)
: null_placement_(NullPlacement::AtStart), is_implicit_(is_implicit) {}
/// Column key(s) to order by and how to order by these sort keys.
std::vector<SortKey> sort_keys_;
/// Whether nulls and NaNs are placed at the start or at the end
NullPlacement null_placement_;
bool is_implicit_ = false;
};
} // namespace compute
} // namespace arrow

View File

@@ -0,0 +1,126 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// NOTE: API is EXPERIMENTAL and will change without going through a
// deprecation cycle
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace compute {
class Function;
class FunctionOptionsType;
/// \brief A mutable central function registry for built-in functions as well
/// as user-defined functions. Functions are implementations of
/// arrow::compute::Function.
///
/// Generally, each function contains kernels which are implementations of a
/// function for a specific argument signature. After looking up a function in
/// the registry, one can either execute it eagerly with Function::Execute or
/// use one of the function's dispatch methods to pick a suitable kernel for
/// lower-level function execution.
class ARROW_EXPORT FunctionRegistry {
public:
~FunctionRegistry();
/// \brief Construct a new registry.
///
/// Most users only need to use the global registry.
static std::unique_ptr<FunctionRegistry> Make();
/// \brief Construct a new nested registry with the given parent.
///
/// Most users only need to use the global registry. The returned registry never changes
/// its parent, even when an operation allows overwriting.
static std::unique_ptr<FunctionRegistry> Make(FunctionRegistry* parent);
/// \brief Check whether a new function can be added to the registry.
///
/// \returns Status::KeyError if a function with the same name is already registered.
Status CanAddFunction(std::shared_ptr<Function> function, bool allow_overwrite = false);
/// \brief Add a new function to the registry.
///
/// \returns Status::KeyError if a function with the same name is already registered.
Status AddFunction(std::shared_ptr<Function> function, bool allow_overwrite = false);
/// \brief Check whether an alias can be added for the given function name.
///
/// \returns Status::KeyError if the function with the given name is not registered.
Status CanAddAlias(const std::string& target_name, const std::string& source_name);
/// \brief Add alias for the given function name.
///
/// \returns Status::KeyError if the function with the given name is not registered.
Status AddAlias(const std::string& target_name, const std::string& source_name);
/// \brief Check whether a new function options type can be added to the registry.
///
/// \return Status::KeyError if a function options type with the same name is already
/// registered.
Status CanAddFunctionOptionsType(const FunctionOptionsType* options_type,
bool allow_overwrite = false);
/// \brief Add a new function options type to the registry.
///
/// \returns Status::KeyError if a function options type with the same name is already
/// registered.
Status AddFunctionOptionsType(const FunctionOptionsType* options_type,
bool allow_overwrite = false);
/// \brief Retrieve a function by name from the registry.
Result<std::shared_ptr<Function>> GetFunction(const std::string& name) const;
/// \brief Return vector of all entry names in the registry.
///
/// Helpful for displaying a manifest of available functions.
std::vector<std::string> GetFunctionNames() const;
/// \brief Retrieve a function options type by name from the registry.
Result<const FunctionOptionsType*> GetFunctionOptionsType(
const std::string& name) const;
/// \brief The number of currently registered functions.
int num_functions() const;
/// \brief The cast function object registered in AddFunction.
///
/// Helpful for get cast function as needed.
const Function* cast_function() const;
private:
FunctionRegistry();
// Use PIMPL pattern to not have std::unordered_map here
class FunctionRegistryImpl;
std::unique_ptr<FunctionRegistryImpl> impl_;
explicit FunctionRegistry(FunctionRegistryImpl* impl);
};
} // namespace compute
} // namespace arrow

View File

@@ -0,0 +1,198 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <vector>
#include "arrow/compute/kernel.h"
#include "arrow/compute/visibility.h"
#include "arrow/datum.h"
#include "arrow/result.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace compute {
/// \brief A segment
/// A segment group is a chunk of continuous rows that have the same segment key. (For
/// example, in ordered time series processing, segment key can be "date", and a segment
/// group can be all the rows that belong to the same date.) A segment group can span
/// across multiple exec batches. A segment is a chunk of continuous rows that has the
/// same segment key within a given batch. When a segment group span cross batches, it
/// will have multiple segments. A segment never spans cross batches. The segment data
/// structure only makes sense when used along with a exec batch.
struct ARROW_COMPUTE_EXPORT Segment {
/// \brief the offset into the batch where the segment starts
int64_t offset;
/// \brief the length of the segment
int64_t length;
/// \brief whether the segment may be extended by a next one
bool is_open;
/// \brief whether the segment extends a preceeding one
bool extends;
};
inline bool operator==(const Segment& segment1, const Segment& segment2) {
return segment1.offset == segment2.offset && segment1.length == segment2.length &&
segment1.is_open == segment2.is_open && segment1.extends == segment2.extends;
}
inline bool operator!=(const Segment& segment1, const Segment& segment2) {
return !(segment1 == segment2);
}
/// \brief a helper class to divide a batch into segments of equal values
///
/// For example, given a batch with two columns specifed as segment keys:
///
/// A A [other columns]...
/// A A ...
/// A B ...
/// A B ...
/// A A ...
///
/// Then the batch could be divided into 3 segments. The first would be rows 0 & 1,
/// the second would be rows 2 & 3, and the third would be row 4.
///
/// Further, a segmenter keeps track of the last value seen. This allows it to calculate
/// segments which span batches. In our above example the last batch we emit would set
/// the "open" flag, which indicates whether the segment may extend into the next batch.
///
/// If the next call to the segmenter starts with `A A` then that segment would set the
/// "extends" flag, which indicates whether the segment continues the last open batch.
class ARROW_COMPUTE_EXPORT RowSegmenter {
public:
virtual ~RowSegmenter() = default;
/// \brief Construct a Segmenter which segments on the specified key types
///
/// \param[in] key_types the specified key types
/// \param[in] nullable_keys whether values of the specified keys may be null
/// \param[in] ctx the execution context to use
static Result<std::unique_ptr<RowSegmenter>> Make(
const std::vector<TypeHolder>& key_types, bool nullable_keys, ExecContext* ctx);
/// \brief Return the key types of this segmenter
virtual const std::vector<TypeHolder>& key_types() const = 0;
/// \brief Reset this segmenter
///
/// A segmenter normally extends (see `Segment`) a segment from one batch to the next.
/// If segment-extension is undesirable, for example when each batch is processed
/// independently, then `Reset` should be invoked before processing the next batch.
virtual Status Reset() = 0;
/// \brief Get all segments for the given batch
virtual Result<std::vector<Segment>> GetSegments(const ExecSpan& batch) = 0;
};
/// Consumes batches of keys and yields batches of the group ids.
class ARROW_COMPUTE_EXPORT Grouper {
public:
virtual ~Grouper() = default;
/// Construct a Grouper which receives the specified key types
static Result<std::unique_ptr<Grouper>> Make(const std::vector<TypeHolder>& key_types,
ExecContext* ctx = default_exec_context());
/// Reset all intermediate state, make the grouper logically as just `Make`ed.
/// The underlying buffers, if any, may or may not be released though.
virtual Status Reset() = 0;
/// Consume a batch of keys, producing the corresponding group ids as an integer array,
/// over a slice defined by an offset and length, which defaults to the batch length.
/// Currently only uint32 indices will be produced, eventually the bit width will only
/// be as wide as necessary.
virtual Result<Datum> Consume(const ExecSpan& batch, int64_t offset = 0,
int64_t length = -1) = 0;
/// Like Consume, but groups not already encountered emit null instead of
/// generating a new group id.
virtual Result<Datum> Lookup(const ExecSpan& batch, int64_t offset = 0,
int64_t length = -1) = 0;
/// Like Consume, but only populates the Grouper without returning the group ids.
virtual Status Populate(const ExecSpan& batch, int64_t offset = 0,
int64_t length = -1) = 0;
/// Get current unique keys. May be called multiple times.
virtual Result<ExecBatch> GetUniques() = 0;
/// Get the current number of groups.
virtual uint32_t num_groups() const = 0;
/// \brief Assemble lists of indices of identical elements.
///
/// \param[in] ids An unsigned, all-valid integral array which will be
/// used as grouping criteria.
/// \param[in] num_groups An upper bound for the elements of ids
/// \param[in] ctx Execution context to use during the operation
/// \return A num_groups-long ListArray where the slot at i contains a
/// list of indices where i appears in ids.
///
/// MakeGroupings([
/// 2,
/// 2,
/// 5,
/// 5,
/// 2,
/// 3
/// ], 8) == [
/// [],
/// [],
/// [0, 1, 4],
/// [5],
/// [],
/// [2, 3],
/// [],
/// []
/// ]
static Result<std::shared_ptr<ListArray>> MakeGroupings(
const UInt32Array& ids, uint32_t num_groups,
ExecContext* ctx = default_exec_context());
/// \brief Produce a ListArray whose slots are selections of `array` which correspond to
/// the provided groupings.
///
/// For example,
/// ApplyGroupings([
/// [],
/// [],
/// [0, 1, 4],
/// [5],
/// [],
/// [2, 3],
/// [],
/// []
/// ], [2, 2, 5, 5, 2, 3]) == [
/// [],
/// [],
/// [2, 2, 2],
/// [3],
/// [],
/// [5, 5],
/// [],
/// []
/// ]
static Result<std::shared_ptr<ListArray>> ApplyGroupings(
const ListArray& groupings, const Array& array,
ExecContext* ctx = default_exec_context());
};
} // namespace compute
} // namespace arrow

View File

@@ -0,0 +1,59 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "arrow/util/visibility.h"
namespace arrow {
struct Datum;
struct TypeHolder;
namespace compute {
class Function;
class ScalarAggregateFunction;
class FunctionExecutor;
class FunctionOptions;
class FunctionRegistry;
/// \brief Return the process-global function registry.
// Defined in registry.cc
ARROW_EXPORT FunctionRegistry* GetFunctionRegistry();
class CastOptions;
struct ExecBatch;
class ExecContext;
struct ExecValue;
class KernelContext;
struct Kernel;
struct ScalarKernel;
struct ScalarAggregateKernel;
struct VectorKernel;
struct KernelState;
class Expression;
ARROW_EXPORT ExecContext* default_exec_context();
ARROW_EXPORT ExecContext* threaded_exec_context();
} // namespace compute
} // namespace arrow

View File

@@ -0,0 +1,221 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <atomic>
#include <cstdint>
#include <optional>
#include <thread>
#include <unordered_map>
#include <vector>
#include "arrow/compute/expression.h"
#include "arrow/compute/type_fwd.h"
#include "arrow/compute/visibility.h"
#include "arrow/result.h"
#include "arrow/util/cpu_info.h"
#include "arrow/util/simd.h"
#if defined(__clang__) || defined(__GNUC__)
# define BYTESWAP(x) __builtin_bswap64(x)
# define ROTL(x, n) (((x) << (n)) | ((x) >> ((-n) & 31)))
# define ROTL64(x, n) (((x) << (n)) | ((x) >> ((-n) & 63)))
#elif defined(_MSC_VER)
# include <intrin.h>
# define BYTESWAP(x) _byteswap_uint64(x)
# define ROTL(x, n) _rotl((x), (n))
# define ROTL64(x, n) _rotl64((x), (n))
#endif
namespace arrow {
namespace util {
// Some platforms typedef int64_t as long int instead of long long int,
// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics
// which need long long.
// We use the cast to the type below in these intrinsics to make the code
// compile in all cases.
//
using int64_for_gather_t = const long long int; // NOLINT runtime-int
// All MiniBatch... classes use TempVectorStack for vector allocations and can
// only work with vectors up to 1024 elements.
//
// They should only be allocated on the stack to guarantee the right sequence
// of allocation and deallocation of vectors from TempVectorStack.
//
class MiniBatch {
public:
static constexpr int kLogMiniBatchLength = 10;
static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength;
};
namespace bit_util {
ARROW_COMPUTE_EXPORT void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
const int num_bits, const uint8_t* bits,
int* num_indexes, uint16_t* indexes,
int bit_offset = 0);
ARROW_COMPUTE_EXPORT void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
const int num_bits, const uint8_t* bits,
const uint16_t* input_indexes,
int* num_indexes, uint16_t* indexes,
int bit_offset = 0);
// Input and output indexes may be pointing to the same data (in-place filtering).
ARROW_COMPUTE_EXPORT void bits_split_indexes(int64_t hardware_flags, const int num_bits,
const uint8_t* bits, int* num_indexes_bit0,
uint16_t* indexes_bit0,
uint16_t* indexes_bit1, int bit_offset = 0);
// Bit 1 is replaced with byte 0xFF.
ARROW_COMPUTE_EXPORT void bits_to_bytes(int64_t hardware_flags, const int num_bits,
const uint8_t* bits, uint8_t* bytes,
int bit_offset = 0);
// Return highest bit of each byte.
ARROW_COMPUTE_EXPORT void bytes_to_bits(int64_t hardware_flags, const int num_bits,
const uint8_t* bytes, uint8_t* bits,
int bit_offset = 0);
ARROW_COMPUTE_EXPORT bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
uint32_t num_bytes);
#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
// The functions below use BMI2 instructions, be careful before calling!
namespace avx2 {
ARROW_COMPUTE_EXPORT void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
const uint8_t* bits,
const uint16_t* input_indexes,
int* num_indexes, uint16_t* indexes);
ARROW_COMPUTE_EXPORT void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
const uint8_t* bits, int* num_indexes,
uint16_t* indexes,
uint16_t base_index = 0);
ARROW_COMPUTE_EXPORT void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits,
uint8_t* bytes);
ARROW_COMPUTE_EXPORT void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes,
uint8_t* bits);
ARROW_COMPUTE_EXPORT bool are_all_bytes_zero_avx2(const uint8_t* bytes,
uint32_t num_bytes);
} // namespace avx2
#endif
} // namespace bit_util
} // namespace util
namespace compute {
/// Modify an Expression with pre-order and post-order visitation.
/// `pre` will be invoked on each Expression. `pre` will visit Calls before their
/// arguments, `post_call` will visit Calls (and no other Expressions) after their
/// arguments. Visitors should return the Identical expression to indicate no change; this
/// will prevent unnecessary construction in the common case where a modification is not
/// possible/necessary/...
///
/// If an argument was modified, `post_call` visits a reconstructed Call with the modified
/// arguments but also receives a pointer to the unmodified Expression as a second
/// argument. If no arguments were modified the unmodified Expression* will be nullptr.
template <typename PreVisit, typename PostVisitCall>
Result<Expression> ModifyExpression(Expression expr, const PreVisit& pre,
const PostVisitCall& post_call) {
ARROW_ASSIGN_OR_RAISE(expr, Result<Expression>(pre(std::move(expr))));
auto call = expr.call();
if (!call) return expr;
bool at_least_one_modified = false;
std::vector<Expression> modified_arguments;
for (size_t i = 0; i < call->arguments.size(); ++i) {
ARROW_ASSIGN_OR_RAISE(auto modified_argument,
ModifyExpression(call->arguments[i], pre, post_call));
if (Expression::Identical(modified_argument, call->arguments[i])) {
continue;
}
if (!at_least_one_modified) {
modified_arguments = call->arguments;
at_least_one_modified = true;
}
modified_arguments[i] = std::move(modified_argument);
}
if (at_least_one_modified) {
// reconstruct the call expression with the modified arguments
auto modified_call = *call;
modified_call.arguments = std::move(modified_arguments);
return post_call(Expression(std::move(modified_call)), &expr);
}
return post_call(std::move(expr), NULLPTR);
}
// Helper class to calculate the modified number of rows to process using SIMD.
//
// Some array elements at the end will be skipped in order to avoid buffer
// overrun, when doing memory loads and stores using larger word size than a
// single array element.
//
class TailSkipForSIMD {
public:
static int64_t FixBitAccess(int num_bytes_accessed_together, int64_t num_rows,
int bit_offset) {
int64_t num_bytes = bit_util::BytesForBits(num_rows + bit_offset);
int64_t num_bytes_safe =
std::max(static_cast<int64_t>(0LL), num_bytes - num_bytes_accessed_together + 1);
int64_t num_rows_safe =
std::max(static_cast<int64_t>(0LL), 8 * num_bytes_safe - bit_offset);
return std::min(num_rows_safe, num_rows);
}
static int64_t FixBinaryAccess(int num_bytes_accessed_together, int64_t num_rows,
int64_t length) {
int64_t num_rows_to_skip = bit_util::CeilDiv(length, num_bytes_accessed_together);
int64_t num_rows_safe =
std::max(static_cast<int64_t>(0LL), num_rows - num_rows_to_skip);
return num_rows_safe;
}
static int64_t FixVarBinaryAccess(int num_bytes_accessed_together, int64_t num_rows,
const uint32_t* offsets) {
// Do not process rows that could read past the end of the buffer using N
// byte loads/stores.
//
int64_t num_rows_safe = num_rows;
while (num_rows_safe > 0 &&
offsets[num_rows_safe] + num_bytes_accessed_together > offsets[num_rows]) {
--num_rows_safe;
}
return num_rows_safe;
}
static int FixSelection(int64_t num_rows_safe, int num_selected,
const uint16_t* selection) {
int num_selected_safe = num_selected;
while (num_selected_safe > 0 && selection[num_selected_safe - 1] >= num_rows_safe) {
--num_selected_safe;
}
return num_selected_safe;
}
};
} // namespace compute
} // namespace arrow

View File

@@ -0,0 +1,49 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#if defined(_WIN32) || defined(__CYGWIN__)
# if defined(_MSC_VER)
# pragma warning(push)
# pragma warning(disable : 4251)
# else
# pragma GCC diagnostic ignored "-Wattributes"
# endif
# ifdef ARROW_COMPUTE_STATIC
# define ARROW_COMPUTE_EXPORT
# elif defined(ARROW_COMPUTE_EXPORTING)
# define ARROW_COMPUTE_EXPORT __declspec(dllexport)
# else
# define ARROW_COMPUTE_EXPORT __declspec(dllimport)
# endif
# define ARROW_COMPUTE_NO_EXPORT
# if defined(_MSC_VER)
# pragma warning(pop)
# endif
#else // Not Windows
# ifndef ARROW_COMPUTE_EXPORT
# define ARROW_COMPUTE_EXPORT __attribute__((visibility("default")))
# endif
# ifndef ARROW_COMPUTE_NO_EXPORT
# define ARROW_COMPUTE_NO_EXPORT __attribute__((visibility("hidden")))
# endif
#endif