Initial commit
This commit is contained in:
@@ -0,0 +1,54 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// NOTE: API is EXPERIMENTAL and will change without going through a
|
||||
// deprecation cycle
|
||||
|
||||
#pragma once
|
||||
|
||||
/// \defgroup compute-functions Abstract compute function API
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup compute-concrete-options Concrete option classes for compute functions
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
#include "arrow/compute/api_aggregate.h" // IWYU pragma: export
|
||||
#include "arrow/compute/api_scalar.h" // IWYU pragma: export
|
||||
#include "arrow/compute/api_vector.h" // IWYU pragma: export
|
||||
#include "arrow/compute/cast.h" // IWYU pragma: export
|
||||
#include "arrow/compute/function.h" // IWYU pragma: export
|
||||
#include "arrow/compute/function_options.h" // IWYU pragma: export
|
||||
#include "arrow/compute/initialize.h" // IWYU pragma: export
|
||||
#include "arrow/compute/kernel.h" // IWYU pragma: export
|
||||
#include "arrow/compute/registry.h" // IWYU pragma: export
|
||||
#include "arrow/datum.h" // IWYU pragma: export
|
||||
|
||||
#include "arrow/compute/expression.h" // IWYU pragma: export
|
||||
|
||||
/// \defgroup execnode-row Utilities for working with data in a row-major format
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
#include "arrow/compute/row/grouper.h" // IWYU pragma: export
|
||||
|
||||
/// \defgroup acero-internals Acero internals, useful for those extending Acero
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
#include "arrow/compute/exec.h" // IWYU pragma: export
|
||||
@@ -0,0 +1,596 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Eager evaluation convenience APIs for invoking common functions, including
|
||||
// necessary memory allocations
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/function_options.h"
|
||||
#include "arrow/datum.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Array;
|
||||
|
||||
namespace compute {
|
||||
|
||||
class ExecContext;
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Aggregate functions
|
||||
|
||||
/// \addtogroup compute-concrete-options
|
||||
/// @{
|
||||
|
||||
/// \brief Control general scalar aggregate kernel behavior
|
||||
///
|
||||
/// By default, null values are ignored (skip_nulls = true).
|
||||
class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1);
|
||||
static constexpr const char kTypeName[] = "ScalarAggregateOptions";
|
||||
static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; }
|
||||
|
||||
/// If true (the default), null values are ignored. Otherwise, if any value is null,
|
||||
/// emit null.
|
||||
bool skip_nulls;
|
||||
/// If less than this many non-null values are observed, emit null.
|
||||
uint32_t min_count;
|
||||
};
|
||||
|
||||
/// \brief Control count aggregate kernel behavior.
|
||||
///
|
||||
/// By default, only non-null values are counted.
|
||||
class ARROW_EXPORT CountOptions : public FunctionOptions {
|
||||
public:
|
||||
enum CountMode {
|
||||
/// Count only non-null values.
|
||||
ONLY_VALID = 0,
|
||||
/// Count only null values.
|
||||
ONLY_NULL,
|
||||
/// Count both non-null and null values.
|
||||
ALL,
|
||||
};
|
||||
explicit CountOptions(CountMode mode = CountMode::ONLY_VALID);
|
||||
static constexpr const char kTypeName[] = "CountOptions";
|
||||
static CountOptions Defaults() { return CountOptions{}; }
|
||||
|
||||
CountMode mode;
|
||||
};
|
||||
|
||||
/// \brief Control Mode kernel behavior
|
||||
///
|
||||
/// Returns top-n common values and counts.
|
||||
/// By default, returns the most common value and count.
|
||||
class ARROW_EXPORT ModeOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit ModeOptions(int64_t n = 1, bool skip_nulls = true, uint32_t min_count = 0);
|
||||
static constexpr const char kTypeName[] = "ModeOptions";
|
||||
static ModeOptions Defaults() { return ModeOptions{}; }
|
||||
|
||||
int64_t n = 1;
|
||||
/// If true (the default), null values are ignored. Otherwise, if any value is null,
|
||||
/// emit null.
|
||||
bool skip_nulls;
|
||||
/// If less than this many non-null values are observed, emit null.
|
||||
uint32_t min_count;
|
||||
};
|
||||
|
||||
/// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel
|
||||
///
|
||||
/// The divisor used in calculations is N - ddof, where N is the number of elements.
|
||||
/// By default, ddof is zero, and population variance or stddev is returned.
|
||||
class ARROW_EXPORT VarianceOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit VarianceOptions(int ddof = 0, bool skip_nulls = true, uint32_t min_count = 0);
|
||||
static constexpr const char kTypeName[] = "VarianceOptions";
|
||||
static VarianceOptions Defaults() { return VarianceOptions{}; }
|
||||
|
||||
int ddof = 0;
|
||||
/// If true (the default), null values are ignored. Otherwise, if any value is null,
|
||||
/// emit null.
|
||||
bool skip_nulls;
|
||||
/// If less than this many non-null values are observed, emit null.
|
||||
uint32_t min_count;
|
||||
};
|
||||
|
||||
/// \brief Control Skew and Kurtosis kernel behavior
|
||||
class ARROW_EXPORT SkewOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit SkewOptions(bool skip_nulls = true, bool biased = true,
|
||||
uint32_t min_count = 0);
|
||||
static constexpr const char kTypeName[] = "SkewOptions";
|
||||
static SkewOptions Defaults() { return SkewOptions{}; }
|
||||
|
||||
/// If true (the default), null values are ignored. Otherwise, if any value is null,
|
||||
/// emit null.
|
||||
bool skip_nulls;
|
||||
/// If true (the default), the calculated value is biased. If false, the calculated
|
||||
/// value includes a correction factor to reduce bias, making it more accurate for
|
||||
/// small sample sizes.
|
||||
bool biased;
|
||||
/// If less than this many non-null values are observed, emit null.
|
||||
uint32_t min_count;
|
||||
};
|
||||
|
||||
/// \brief Control Quantile kernel behavior
|
||||
///
|
||||
/// By default, returns the median value.
|
||||
class ARROW_EXPORT QuantileOptions : public FunctionOptions {
|
||||
public:
|
||||
/// Interpolation method to use when quantile lies between two data points
|
||||
enum Interpolation {
|
||||
LINEAR = 0,
|
||||
LOWER,
|
||||
HIGHER,
|
||||
NEAREST,
|
||||
MIDPOINT,
|
||||
};
|
||||
|
||||
explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR,
|
||||
bool skip_nulls = true, uint32_t min_count = 0);
|
||||
|
||||
explicit QuantileOptions(std::vector<double> q,
|
||||
enum Interpolation interpolation = LINEAR,
|
||||
bool skip_nulls = true, uint32_t min_count = 0);
|
||||
|
||||
static constexpr const char kTypeName[] = "QuantileOptions";
|
||||
static QuantileOptions Defaults() { return QuantileOptions{}; }
|
||||
|
||||
/// probability level of quantile must be between 0 and 1 inclusive
|
||||
std::vector<double> q;
|
||||
enum Interpolation interpolation;
|
||||
/// If true (the default), null values are ignored. Otherwise, if any value is null,
|
||||
/// emit null.
|
||||
bool skip_nulls;
|
||||
/// If less than this many non-null values are observed, emit null.
|
||||
uint32_t min_count;
|
||||
};
|
||||
|
||||
/// \brief Control TDigest approximate quantile kernel behavior
|
||||
///
|
||||
/// By default, returns the median value.
|
||||
class ARROW_EXPORT TDigestOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit TDigestOptions(double q = 0.5, uint32_t delta = 100,
|
||||
uint32_t buffer_size = 500, bool skip_nulls = true,
|
||||
uint32_t min_count = 0);
|
||||
explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100,
|
||||
uint32_t buffer_size = 500, bool skip_nulls = true,
|
||||
uint32_t min_count = 0);
|
||||
static constexpr const char kTypeName[] = "TDigestOptions";
|
||||
static TDigestOptions Defaults() { return TDigestOptions{}; }
|
||||
|
||||
/// probability level of quantile must be between 0 and 1 inclusive
|
||||
std::vector<double> q;
|
||||
/// compression parameter, default 100
|
||||
uint32_t delta;
|
||||
/// input buffer size, default 500
|
||||
uint32_t buffer_size;
|
||||
/// If true (the default), null values are ignored. Otherwise, if any value is null,
|
||||
/// emit null.
|
||||
bool skip_nulls;
|
||||
/// If less than this many non-null values are observed, emit null.
|
||||
uint32_t min_count;
|
||||
};
|
||||
|
||||
/// \brief Control Pivot kernel behavior
|
||||
///
|
||||
/// These options apply to the "pivot_wider" and "hash_pivot_wider" functions.
|
||||
///
|
||||
/// Constraints:
|
||||
/// - The corresponding `Aggregate::target` must have two FieldRef elements;
|
||||
/// the first one points to the pivot key column, the second points to the
|
||||
/// pivoted data column.
|
||||
/// - The pivot key column can be string, binary or integer; its values will be
|
||||
/// matched against `key_names` in order to dispatch the pivoted data into
|
||||
/// the output. If the pivot key column is not string-like, the `key_names`
|
||||
/// will be cast to the pivot key type.
|
||||
///
|
||||
/// "pivot_wider" example
|
||||
/// ---------------------
|
||||
///
|
||||
/// Assuming the following two input columns with types utf8 and int16 (respectively):
|
||||
/// ```
|
||||
/// width | 11
|
||||
/// height | 13
|
||||
/// ```
|
||||
/// and the options `PivotWiderOptions(.key_names = {"height", "width"})`
|
||||
///
|
||||
/// then the output will be a scalar with the type
|
||||
/// `struct{"height": int16, "width": int16}`
|
||||
/// and the value `{"height": 13, "width": 11}`.
|
||||
///
|
||||
/// "hash_pivot_wider" example
|
||||
/// --------------------------
|
||||
///
|
||||
/// Assuming the following input with schema
|
||||
/// `{"group": int32, "key": utf8, "value": int16}`:
|
||||
/// ```
|
||||
/// group | key | value
|
||||
/// -----------------------------
|
||||
/// 1 | height | 11
|
||||
/// 1 | width | 12
|
||||
/// 2 | width | 13
|
||||
/// 3 | height | 14
|
||||
/// 3 | depth | 15
|
||||
/// ```
|
||||
/// and the following settings:
|
||||
/// - a hash grouping key "group"
|
||||
/// - Aggregate(
|
||||
/// .function = "hash_pivot_wider",
|
||||
/// .options = PivotWiderOptions(.key_names = {"height", "width"}),
|
||||
/// .target = {"key", "value"},
|
||||
/// .name = {"properties"})
|
||||
///
|
||||
/// then the output will have the schema
|
||||
/// `{"group": int32, "properties": struct{"height": int16, "width": int16}}`
|
||||
/// and the following value:
|
||||
/// ```
|
||||
/// group | properties
|
||||
/// | height | width
|
||||
/// -----------------------------
|
||||
/// 1 | 11 | 12
|
||||
/// 2 | null | 13
|
||||
/// 3 | 14 | null
|
||||
/// ```
|
||||
class ARROW_EXPORT PivotWiderOptions : public FunctionOptions {
|
||||
public:
|
||||
/// Configure the behavior of pivot keys not in `key_names`
|
||||
enum UnexpectedKeyBehavior {
|
||||
/// Unexpected pivot keys are ignored silently
|
||||
kIgnore,
|
||||
/// Unexpected pivot keys return a KeyError
|
||||
kRaise
|
||||
};
|
||||
|
||||
explicit PivotWiderOptions(std::vector<std::string> key_names,
|
||||
UnexpectedKeyBehavior unexpected_key_behavior = kIgnore);
|
||||
// Default constructor for serialization
|
||||
PivotWiderOptions();
|
||||
static constexpr const char kTypeName[] = "PivotWiderOptions";
|
||||
static PivotWiderOptions Defaults() { return PivotWiderOptions{}; }
|
||||
|
||||
/// The values expected in the pivot key column
|
||||
std::vector<std::string> key_names;
|
||||
/// The behavior when pivot keys not in `key_names` are encountered
|
||||
UnexpectedKeyBehavior unexpected_key_behavior = kIgnore;
|
||||
};
|
||||
|
||||
/// \brief Control Index kernel behavior
|
||||
class ARROW_EXPORT IndexOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit IndexOptions(std::shared_ptr<Scalar> value);
|
||||
// Default constructor for serialization
|
||||
IndexOptions();
|
||||
static constexpr const char kTypeName[] = "IndexOptions";
|
||||
|
||||
std::shared_ptr<Scalar> value;
|
||||
};
|
||||
|
||||
/// \brief Configure a grouped aggregation
|
||||
struct ARROW_EXPORT Aggregate {
|
||||
Aggregate() = default;
|
||||
|
||||
Aggregate(std::string function, std::shared_ptr<FunctionOptions> options,
|
||||
std::vector<FieldRef> target, std::string name = "")
|
||||
: function(std::move(function)),
|
||||
options(std::move(options)),
|
||||
target(std::move(target)),
|
||||
name(std::move(name)) {}
|
||||
|
||||
Aggregate(std::string function, std::shared_ptr<FunctionOptions> options,
|
||||
FieldRef target, std::string name = "")
|
||||
: Aggregate(std::move(function), std::move(options),
|
||||
std::vector<FieldRef>{std::move(target)}, std::move(name)) {}
|
||||
|
||||
Aggregate(std::string function, FieldRef target, std::string name)
|
||||
: Aggregate(std::move(function), /*options=*/NULLPTR,
|
||||
std::vector<FieldRef>{std::move(target)}, std::move(name)) {}
|
||||
|
||||
Aggregate(std::string function, std::string name)
|
||||
: Aggregate(std::move(function), /*options=*/NULLPTR,
|
||||
/*target=*/std::vector<FieldRef>{}, std::move(name)) {}
|
||||
|
||||
/// the name of the aggregation function
|
||||
std::string function;
|
||||
|
||||
/// options for the aggregation function
|
||||
std::shared_ptr<FunctionOptions> options;
|
||||
|
||||
/// zero or more fields to which aggregations will be applied
|
||||
std::vector<FieldRef> target;
|
||||
|
||||
/// optional output field name for aggregations
|
||||
std::string name;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
/// \brief Count values in an array.
|
||||
///
|
||||
/// \param[in] options counting options, see CountOptions for more information
|
||||
/// \param[in] datum to count
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return out resulting datum
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Count(const Datum& datum,
|
||||
const CountOptions& options = CountOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Compute the mean of a numeric array.
|
||||
///
|
||||
/// \param[in] value datum to compute the mean, expecting Array
|
||||
/// \param[in] options see ScalarAggregateOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return datum of the computed mean as a DoubleScalar
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Mean(
|
||||
const Datum& value,
|
||||
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Compute the product of values of a numeric array.
|
||||
///
|
||||
/// \param[in] value datum to compute product of, expecting Array or ChunkedArray
|
||||
/// \param[in] options see ScalarAggregateOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return datum of the computed sum as a Scalar
|
||||
///
|
||||
/// \since 6.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Product(
|
||||
const Datum& value,
|
||||
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Sum values of a numeric array.
|
||||
///
|
||||
/// \param[in] value datum to sum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see ScalarAggregateOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return datum of the computed sum as a Scalar
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Sum(
|
||||
const Datum& value,
|
||||
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the first value of an array
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see ScalarAggregateOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return datum of the computed first as Scalar
|
||||
///
|
||||
/// \since 13.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> First(
|
||||
const Datum& value,
|
||||
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the last value of an array
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see ScalarAggregateOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return datum of the computed last as a Scalar
|
||||
///
|
||||
/// \since 13.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Last(
|
||||
const Datum& value,
|
||||
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the min / max of a numeric array
|
||||
///
|
||||
/// This function returns both the min and max as a struct scalar, with type
|
||||
/// struct<min: T, max: T>, where T is the input type
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see ScalarAggregateOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return resulting datum as a struct<min: T, max: T> scalar
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> MinMax(
|
||||
const Datum& value,
|
||||
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Test whether any element in a boolean array evaluates to true.
|
||||
///
|
||||
/// This function returns true if any of the elements in the array evaluates
|
||||
/// to true and false otherwise. Null values are ignored by default.
|
||||
/// If null values are taken into account by setting ScalarAggregateOptions
|
||||
/// parameter skip_nulls = false then Kleene logic is used.
|
||||
/// See KleeneOr for more details on Kleene logic.
|
||||
///
|
||||
/// \param[in] value input datum, expecting a boolean array
|
||||
/// \param[in] options see ScalarAggregateOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return resulting datum as a BooleanScalar
|
||||
///
|
||||
/// \since 3.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Any(
|
||||
const Datum& value,
|
||||
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Test whether all elements in a boolean array evaluate to true.
|
||||
///
|
||||
/// This function returns true if all of the elements in the array evaluate
|
||||
/// to true and false otherwise. Null values are ignored by default.
|
||||
/// If null values are taken into account by setting ScalarAggregateOptions
|
||||
/// parameter skip_nulls = false then Kleene logic is used.
|
||||
/// See KleeneAnd for more details on Kleene logic.
|
||||
///
|
||||
/// \param[in] value input datum, expecting a boolean array
|
||||
/// \param[in] options see ScalarAggregateOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return resulting datum as a BooleanScalar
|
||||
|
||||
/// \since 3.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> All(
|
||||
const Datum& value,
|
||||
const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the modal (most common) value of a numeric array
|
||||
///
|
||||
/// This function returns top-n most common values and number of times they occur as
|
||||
/// an array of `struct<mode: T, count: int64>`, where T is the input type.
|
||||
/// Values with larger counts are returned before smaller ones.
|
||||
/// If there are more than one values with same count, smaller value is returned first.
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see ModeOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return resulting datum as an array of struct<mode: T, count: int64>
|
||||
///
|
||||
/// \since 2.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Mode(const Datum& value,
|
||||
const ModeOptions& options = ModeOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the standard deviation of a numeric array
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see VarianceOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return datum of the computed standard deviation as a DoubleScalar
|
||||
///
|
||||
/// \since 2.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Stddev(const Datum& value,
|
||||
const VarianceOptions& options = VarianceOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the variance of a numeric array
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see VarianceOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return datum of the computed variance as a DoubleScalar
|
||||
///
|
||||
/// \since 2.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Variance(const Datum& value,
|
||||
const VarianceOptions& options = VarianceOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the skewness of a numeric array
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see SkewOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return datum of the computed skewness as a DoubleScalar
|
||||
///
|
||||
/// \since 20.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Skew(const Datum& value,
|
||||
const SkewOptions& options = SkewOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the kurtosis of a numeric array
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see SkewOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return datum of the computed kurtosis as a DoubleScalar
|
||||
///
|
||||
/// \since 20.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Kurtosis(const Datum& value,
|
||||
const SkewOptions& options = SkewOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the quantiles of a numeric array
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see QuantileOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return resulting datum as an array
|
||||
///
|
||||
/// \since 4.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Quantile(const Datum& value,
|
||||
const QuantileOptions& options = QuantileOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Calculate the approximate quantiles of a numeric array with T-Digest algorithm
|
||||
///
|
||||
/// \param[in] value input datum, expecting Array or ChunkedArray
|
||||
/// \param[in] options see TDigestOptions for more information
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return resulting datum as an array
|
||||
///
|
||||
/// \since 4.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> TDigest(const Datum& value,
|
||||
const TDigestOptions& options = TDigestOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Find the first index of a value in an array.
|
||||
///
|
||||
/// \param[in] value The array to search.
|
||||
/// \param[in] options The array to search for. See IndexOptions.
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return out a Scalar containing the index (or -1 if not found).
|
||||
///
|
||||
/// \since 5.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Index(const Datum& value, const IndexOptions& options,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,834 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/compute/function_options.h"
|
||||
#include "arrow/compute/ordering.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace compute {
|
||||
|
||||
class ExecContext;
|
||||
|
||||
/// \addtogroup compute-concrete-options
|
||||
/// @{
|
||||
|
||||
class ARROW_EXPORT FilterOptions : public FunctionOptions {
|
||||
public:
|
||||
/// Configure the action taken when a slot of the selection mask is null
|
||||
enum NullSelectionBehavior {
|
||||
/// The corresponding filtered value will be removed in the output.
|
||||
DROP,
|
||||
/// The corresponding filtered value will be null in the output.
|
||||
EMIT_NULL,
|
||||
};
|
||||
|
||||
explicit FilterOptions(NullSelectionBehavior null_selection = DROP);
|
||||
static constexpr const char kTypeName[] = "FilterOptions";
|
||||
static FilterOptions Defaults() { return FilterOptions(); }
|
||||
|
||||
NullSelectionBehavior null_selection_behavior = DROP;
|
||||
};
|
||||
|
||||
class ARROW_EXPORT TakeOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit TakeOptions(bool boundscheck = true);
|
||||
static constexpr const char kTypeName[] = "TakeOptions";
|
||||
static TakeOptions BoundsCheck() { return TakeOptions(true); }
|
||||
static TakeOptions NoBoundsCheck() { return TakeOptions(false); }
|
||||
static TakeOptions Defaults() { return BoundsCheck(); }
|
||||
|
||||
bool boundscheck = true;
|
||||
};
|
||||
|
||||
/// \brief Options for the dictionary encode function
|
||||
class ARROW_EXPORT DictionaryEncodeOptions : public FunctionOptions {
|
||||
public:
|
||||
/// Configure how null values will be encoded
|
||||
enum NullEncodingBehavior {
|
||||
/// The null value will be added to the dictionary with a proper index.
|
||||
ENCODE,
|
||||
/// The null value will be masked in the indices array.
|
||||
MASK
|
||||
};
|
||||
|
||||
explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK);
|
||||
static constexpr const char kTypeName[] = "DictionaryEncodeOptions";
|
||||
static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); }
|
||||
|
||||
NullEncodingBehavior null_encoding_behavior = MASK;
|
||||
};
|
||||
|
||||
/// \brief Options for the run-end encode function
|
||||
class ARROW_EXPORT RunEndEncodeOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit RunEndEncodeOptions(std::shared_ptr<DataType> run_end_type = int32());
|
||||
static constexpr const char kTypeName[] = "RunEndEncodeOptions";
|
||||
static RunEndEncodeOptions Defaults() { return RunEndEncodeOptions(); }
|
||||
|
||||
std::shared_ptr<DataType> run_end_type;
|
||||
};
|
||||
|
||||
class ARROW_EXPORT ArraySortOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit ArraySortOptions(SortOrder order = SortOrder::Ascending,
|
||||
NullPlacement null_placement = NullPlacement::AtEnd);
|
||||
static constexpr const char kTypeName[] = "ArraySortOptions";
|
||||
static ArraySortOptions Defaults() { return ArraySortOptions(); }
|
||||
|
||||
/// Sorting order
|
||||
SortOrder order;
|
||||
/// Whether nulls and NaNs are placed at the start or at the end
|
||||
NullPlacement null_placement;
|
||||
};
|
||||
|
||||
class ARROW_EXPORT SortOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit SortOptions(std::vector<SortKey> sort_keys = {},
|
||||
NullPlacement null_placement = NullPlacement::AtEnd);
|
||||
explicit SortOptions(const Ordering& ordering);
|
||||
static constexpr const char kTypeName[] = "SortOptions";
|
||||
static SortOptions Defaults() { return SortOptions(); }
|
||||
/// Convenience constructor to create an ordering from SortOptions
|
||||
///
|
||||
/// Note: Both classes contain the exact same information. However,
|
||||
/// sort_options should only be used in a "function options" context while Ordering
|
||||
/// is used more generally.
|
||||
Ordering AsOrdering() && { return Ordering(std::move(sort_keys), null_placement); }
|
||||
Ordering AsOrdering() const& { return Ordering(sort_keys, null_placement); }
|
||||
|
||||
/// Column key(s) to order by and how to order by these sort keys.
|
||||
std::vector<SortKey> sort_keys;
|
||||
/// Whether nulls and NaNs are placed at the start or at the end
|
||||
NullPlacement null_placement;
|
||||
};
|
||||
|
||||
/// \brief SelectK options
|
||||
class ARROW_EXPORT SelectKOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit SelectKOptions(int64_t k = -1, std::vector<SortKey> sort_keys = {});
|
||||
static constexpr const char kTypeName[] = "SelectKOptions";
|
||||
static SelectKOptions Defaults() { return SelectKOptions(); }
|
||||
|
||||
static SelectKOptions TopKDefault(int64_t k, std::vector<std::string> key_names = {}) {
|
||||
std::vector<SortKey> keys;
|
||||
for (const auto& name : key_names) {
|
||||
keys.emplace_back(SortKey(name, SortOrder::Descending));
|
||||
}
|
||||
if (key_names.empty()) {
|
||||
keys.emplace_back(SortKey("not-used", SortOrder::Descending));
|
||||
}
|
||||
return SelectKOptions{k, keys};
|
||||
}
|
||||
static SelectKOptions BottomKDefault(int64_t k,
|
||||
std::vector<std::string> key_names = {}) {
|
||||
std::vector<SortKey> keys;
|
||||
for (const auto& name : key_names) {
|
||||
keys.emplace_back(SortKey(name, SortOrder::Ascending));
|
||||
}
|
||||
if (key_names.empty()) {
|
||||
keys.emplace_back(SortKey("not-used", SortOrder::Ascending));
|
||||
}
|
||||
return SelectKOptions{k, keys};
|
||||
}
|
||||
|
||||
/// The number of `k` elements to keep.
|
||||
int64_t k;
|
||||
/// Column key(s) to order by and how to order by these sort keys.
|
||||
std::vector<SortKey> sort_keys;
|
||||
};
|
||||
|
||||
/// \brief Rank options
|
||||
class ARROW_EXPORT RankOptions : public FunctionOptions {
|
||||
public:
|
||||
/// Configure how ties between equal values are handled
|
||||
enum Tiebreaker {
|
||||
/// Ties get the smallest possible rank in sorted order.
|
||||
Min,
|
||||
/// Ties get the largest possible rank in sorted order.
|
||||
Max,
|
||||
/// Ranks are assigned in order of when ties appear in the input.
|
||||
/// This ensures the ranks are a stable permutation of the input.
|
||||
First,
|
||||
/// The ranks span a dense [1, M] interval where M is the number
|
||||
/// of distinct values in the input.
|
||||
Dense
|
||||
};
|
||||
|
||||
explicit RankOptions(std::vector<SortKey> sort_keys = {},
|
||||
NullPlacement null_placement = NullPlacement::AtEnd,
|
||||
Tiebreaker tiebreaker = RankOptions::First);
|
||||
/// Convenience constructor for array inputs
|
||||
explicit RankOptions(SortOrder order,
|
||||
NullPlacement null_placement = NullPlacement::AtEnd,
|
||||
Tiebreaker tiebreaker = RankOptions::First)
|
||||
: RankOptions({SortKey("", order)}, null_placement, tiebreaker) {}
|
||||
|
||||
static constexpr const char kTypeName[] = "RankOptions";
|
||||
static RankOptions Defaults() { return RankOptions(); }
|
||||
|
||||
/// Column key(s) to order by and how to order by these sort keys.
|
||||
std::vector<SortKey> sort_keys;
|
||||
/// Whether nulls and NaNs are placed at the start or at the end
|
||||
NullPlacement null_placement;
|
||||
/// Tiebreaker for dealing with equal values in ranks
|
||||
Tiebreaker tiebreaker;
|
||||
};
|
||||
|
||||
/// \brief Quantile rank options
|
||||
class ARROW_EXPORT RankQuantileOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit RankQuantileOptions(std::vector<SortKey> sort_keys = {},
|
||||
NullPlacement null_placement = NullPlacement::AtEnd);
|
||||
/// Convenience constructor for array inputs
|
||||
explicit RankQuantileOptions(SortOrder order,
|
||||
NullPlacement null_placement = NullPlacement::AtEnd)
|
||||
: RankQuantileOptions({SortKey("", order)}, null_placement) {}
|
||||
|
||||
static constexpr const char kTypeName[] = "RankQuantileOptions";
|
||||
static RankQuantileOptions Defaults() { return RankQuantileOptions(); }
|
||||
|
||||
/// Column key(s) to order by and how to order by these sort keys.
|
||||
std::vector<SortKey> sort_keys;
|
||||
/// Whether nulls and NaNs are placed at the start or at the end
|
||||
NullPlacement null_placement;
|
||||
};
|
||||
|
||||
/// \brief Partitioning options for NthToIndices
|
||||
class ARROW_EXPORT PartitionNthOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit PartitionNthOptions(int64_t pivot,
|
||||
NullPlacement null_placement = NullPlacement::AtEnd);
|
||||
PartitionNthOptions() : PartitionNthOptions(0) {}
|
||||
static constexpr const char kTypeName[] = "PartitionNthOptions";
|
||||
|
||||
/// The index into the equivalent sorted array of the partition pivot element.
|
||||
int64_t pivot;
|
||||
/// Whether nulls and NaNs are partitioned at the start or at the end
|
||||
NullPlacement null_placement;
|
||||
};
|
||||
|
||||
class ARROW_EXPORT WinsorizeOptions : public FunctionOptions {
|
||||
public:
|
||||
WinsorizeOptions(double lower_limit, double upper_limit);
|
||||
WinsorizeOptions() : WinsorizeOptions(0, 1) {}
|
||||
static constexpr const char kTypeName[] = "WinsorizeOptions";
|
||||
|
||||
/// The quantile below which all values are replaced with the quantile's value.
|
||||
///
|
||||
/// For example, if lower_limit = 0.05, then all values in the lower 5% percentile
|
||||
/// will be replaced with the 5% percentile value.
|
||||
double lower_limit;
|
||||
|
||||
/// The quantile above which all values are replaced with the quantile's value.
|
||||
///
|
||||
/// For example, if upper_limit = 0.95, then all values in the upper 95% percentile
|
||||
/// will be replaced with the 95% percentile value.
|
||||
double upper_limit;
|
||||
};
|
||||
|
||||
/// \brief Options for cumulative functions
|
||||
/// \note Also aliased as CumulativeSumOptions for backward compatibility
|
||||
class ARROW_EXPORT CumulativeOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit CumulativeOptions(bool skip_nulls = false);
|
||||
explicit CumulativeOptions(double start, bool skip_nulls = false);
|
||||
explicit CumulativeOptions(std::shared_ptr<Scalar> start, bool skip_nulls = false);
|
||||
static constexpr const char kTypeName[] = "CumulativeOptions";
|
||||
static CumulativeOptions Defaults() { return CumulativeOptions(); }
|
||||
|
||||
/// Optional starting value for cumulative operation computation, default depends on the
|
||||
/// operation and input type.
|
||||
/// - sum: 0
|
||||
/// - prod: 1
|
||||
/// - min: maximum of the input type
|
||||
/// - max: minimum of the input type
|
||||
/// - mean: start is ignored because it has no meaning for mean
|
||||
std::optional<std::shared_ptr<Scalar>> start;
|
||||
|
||||
/// If true, nulls in the input are ignored and produce a corresponding null output.
|
||||
/// When false, the first null encountered is propagated through the remaining output.
|
||||
bool skip_nulls = false;
|
||||
};
|
||||
using CumulativeSumOptions = CumulativeOptions; // For backward compatibility
|
||||
|
||||
/// \brief Options for pairwise functions
|
||||
class ARROW_EXPORT PairwiseOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit PairwiseOptions(int64_t periods = 1);
|
||||
static constexpr const char kTypeName[] = "PairwiseOptions";
|
||||
static PairwiseOptions Defaults() { return PairwiseOptions(); }
|
||||
|
||||
/// Periods to shift for applying the binary operation, accepts negative values.
|
||||
int64_t periods = 1;
|
||||
};
|
||||
|
||||
/// \brief Options for list_flatten function
|
||||
class ARROW_EXPORT ListFlattenOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit ListFlattenOptions(bool recursive = false);
|
||||
static constexpr const char kTypeName[] = "ListFlattenOptions";
|
||||
static ListFlattenOptions Defaults() { return ListFlattenOptions(); }
|
||||
|
||||
/// \brief If true, the list is flattened recursively until a non-list
|
||||
/// array is formed.
|
||||
bool recursive = false;
|
||||
};
|
||||
|
||||
/// \brief Options for inverse_permutation function
|
||||
class ARROW_EXPORT InversePermutationOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit InversePermutationOptions(int64_t max_index = -1,
|
||||
std::shared_ptr<DataType> output_type = NULLPTR);
|
||||
static constexpr const char kTypeName[] = "InversePermutationOptions";
|
||||
static InversePermutationOptions Defaults() { return InversePermutationOptions(); }
|
||||
|
||||
/// \brief The max value in the input indices to allow. The length of the function's
|
||||
/// output will be this value plus 1. If negative, this value will be set to the length
|
||||
/// of the input indices minus 1 and the length of the function's output will be the
|
||||
/// length of the input indices.
|
||||
int64_t max_index = -1;
|
||||
/// \brief The type of the output inverse permutation. If null, the output will be of
|
||||
/// the same type as the input indices, otherwise must be signed integer type. An
|
||||
/// invalid error will be reported if this type is not able to store the length of the
|
||||
/// input indices.
|
||||
std::shared_ptr<DataType> output_type = NULLPTR;
|
||||
};
|
||||
|
||||
/// \brief Options for scatter function
|
||||
class ARROW_EXPORT ScatterOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit ScatterOptions(int64_t max_index = -1);
|
||||
static constexpr const char kTypeName[] = "ScatterOptions";
|
||||
static ScatterOptions Defaults() { return ScatterOptions(); }
|
||||
|
||||
/// \brief The max value in the input indices to allow. The length of the function's
|
||||
/// output will be this value plus 1. If negative, this value will be set to the length
|
||||
/// of the input indices minus 1 and the length of the function's output will be the
|
||||
/// length of the input indices.
|
||||
int64_t max_index = -1;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
/// \brief Filter with a boolean selection filter
|
||||
///
|
||||
/// The output will be populated with values from the input at positions
|
||||
/// where the selection filter is not 0. Nulls in the filter will be handled
|
||||
/// based on options.null_selection_behavior.
|
||||
///
|
||||
/// For example given values = ["a", "b", "c", null, "e", "f"] and
|
||||
/// filter = [0, 1, 1, 0, null, 1], the output will be
|
||||
/// (null_selection_behavior == DROP) = ["b", "c", "f"]
|
||||
/// (null_selection_behavior == EMIT_NULL) = ["b", "c", null, "f"]
|
||||
///
|
||||
/// \param[in] values array to filter
|
||||
/// \param[in] filter indicates which values should be filtered out
|
||||
/// \param[in] options configures null_selection_behavior
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting datum
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Filter(const Datum& values, const Datum& filter,
|
||||
const FilterOptions& options = FilterOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
namespace internal {
|
||||
|
||||
// These internal functions are implemented in kernels/vector_selection.cc
|
||||
|
||||
/// \brief Return the number of selected indices in the boolean filter
|
||||
///
|
||||
/// \param filter a plain or run-end encoded boolean array with or without nulls
|
||||
/// \param null_selection how to handle nulls in the filter
|
||||
ARROW_EXPORT
|
||||
int64_t GetFilterOutputSize(const ArraySpan& filter,
|
||||
FilterOptions::NullSelectionBehavior null_selection);
|
||||
|
||||
/// \brief Compute uint64 selection indices for use with Take given a boolean
|
||||
/// filter
|
||||
///
|
||||
/// \param filter a plain or run-end encoded boolean array with or without nulls
|
||||
/// \param null_selection how to handle nulls in the filter
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<ArrayData>> GetTakeIndices(
|
||||
const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection,
|
||||
MemoryPool* memory_pool = default_memory_pool());
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \brief ReplaceWithMask replaces each value in the array corresponding
|
||||
/// to a true value in the mask with the next element from `replacements`.
|
||||
///
|
||||
/// \param[in] values Array input to replace
|
||||
/// \param[in] mask Array or Scalar of Boolean mask values
|
||||
/// \param[in] replacements The replacement values to draw from. There must
|
||||
/// be as many replacement values as true values in the mask.
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
///
|
||||
/// \return the resulting datum
|
||||
///
|
||||
/// \since 5.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> ReplaceWithMask(const Datum& values, const Datum& mask,
|
||||
const Datum& replacements, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief FillNullForward fill null values in forward direction
|
||||
///
|
||||
/// The output array will be of the same type as the input values
|
||||
/// array, with replaced null values in forward direction.
|
||||
///
|
||||
/// For example given values = ["a", "b", "c", null, null, "f"],
|
||||
/// the output will be = ["a", "b", "c", "c", "c", "f"]
|
||||
///
|
||||
/// \param[in] values datum from which to take
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting datum
|
||||
ARROW_EXPORT
|
||||
Result<Datum> FillNullForward(const Datum& values, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief FillNullBackward fill null values in backward direction
|
||||
///
|
||||
/// The output array will be of the same type as the input values
|
||||
/// array, with replaced null values in backward direction.
|
||||
///
|
||||
/// For example given values = ["a", "b", "c", null, null, "f"],
|
||||
/// the output will be = ["a", "b", "c", "f", "f", "f"]
|
||||
///
|
||||
/// \param[in] values datum from which to take
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting datum
|
||||
ARROW_EXPORT
|
||||
Result<Datum> FillNullBackward(const Datum& values, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Take from an array of values at indices in another array
|
||||
///
|
||||
/// The output array will be of the same type as the input values
|
||||
/// array, with elements taken from the values array at the given
|
||||
/// indices. If an index is null then the taken element will be null.
|
||||
///
|
||||
/// For example given values = ["a", "b", "c", null, "e", "f"] and
|
||||
/// indices = [2, 1, null, 3], the output will be
|
||||
/// = [values[2], values[1], null, values[3]]
|
||||
/// = ["c", "b", null, null]
|
||||
///
|
||||
/// \param[in] values datum from which to take
|
||||
/// \param[in] indices which values to take
|
||||
/// \param[in] options options
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting datum
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Take(const Datum& values, const Datum& indices,
|
||||
const TakeOptions& options = TakeOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Take with Array inputs and output
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> Take(const Array& values, const Array& indices,
|
||||
const TakeOptions& options = TakeOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Drop Null from an array of values
|
||||
///
|
||||
/// The output array will be of the same type as the input values
|
||||
/// array, with elements taken from the values array without nulls.
|
||||
///
|
||||
/// For example given values = ["a", "b", "c", null, "e", "f"],
|
||||
/// the output will be = ["a", "b", "c", "e", "f"]
|
||||
///
|
||||
/// \param[in] values datum from which to take
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting datum
|
||||
ARROW_EXPORT
|
||||
Result<Datum> DropNull(const Datum& values, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief DropNull with Array inputs and output
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> DropNull(const Array& values, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return indices that partition an array around n-th sorted element.
|
||||
///
|
||||
/// Find index of n-th(0 based) smallest value and perform indirect
|
||||
/// partition of an array around that element. Output indices[0 ~ n-1]
|
||||
/// holds values no greater than n-th element, and indices[n+1 ~ end]
|
||||
/// holds values no less than n-th element. Elements in each partition
|
||||
/// is not sorted. Nulls will be partitioned to the end of the output.
|
||||
/// Output is not guaranteed to be stable.
|
||||
///
|
||||
/// \param[in] values array to be partitioned
|
||||
/// \param[in] n pivot array around sorted n-th element
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return offsets indices that would partition an array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> NthToIndices(const Array& values, int64_t n,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return indices that partition an array around n-th sorted element.
|
||||
///
|
||||
/// This overload takes a PartitionNthOptions specifying the pivot index
|
||||
/// and the null handling.
|
||||
///
|
||||
/// \param[in] values array to be partitioned
|
||||
/// \param[in] options options including pivot index and null handling
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return offsets indices that would partition an array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> NthToIndices(const Array& values,
|
||||
const PartitionNthOptions& options,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return indices that would select the first `k` elements.
|
||||
///
|
||||
/// Perform an indirect sort of the datum, keeping only the first `k` elements. The output
|
||||
/// array will contain indices such that the item indicated by the k-th index will be in
|
||||
/// the position it would be if the datum were sorted by `options.sort_keys`. However,
|
||||
/// indices of null values will not be part of the output. The sort is not guaranteed to
|
||||
/// be stable.
|
||||
///
|
||||
/// \param[in] datum datum to be partitioned
|
||||
/// \param[in] options options
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return a datum with the same schema as the input
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> SelectKUnstable(const Datum& datum,
|
||||
const SelectKOptions& options,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return the indices that would sort an array.
|
||||
///
|
||||
/// Perform an indirect sort of array. The output array will contain
|
||||
/// indices that would sort an array, which would be the same length
|
||||
/// as input. Nulls will be stably partitioned to the end of the output
|
||||
/// regardless of order.
|
||||
///
|
||||
/// For example given array = [null, 1, 3.3, null, 2, 5.3] and order
|
||||
/// = SortOrder::DESCENDING, the output will be [5, 2, 4, 1, 0,
|
||||
/// 3].
|
||||
///
|
||||
/// \param[in] array array to sort
|
||||
/// \param[in] order ascending or descending
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return offsets indices that would sort an array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> SortIndices(const Array& array,
|
||||
SortOrder order = SortOrder::Ascending,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return the indices that would sort an array.
|
||||
///
|
||||
/// This overload takes a ArraySortOptions specifying the sort order
|
||||
/// and the null handling.
|
||||
///
|
||||
/// \param[in] array array to sort
|
||||
/// \param[in] options options including sort order and null handling
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return offsets indices that would sort an array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> SortIndices(const Array& array,
|
||||
const ArraySortOptions& options,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return the indices that would sort a chunked array.
|
||||
///
|
||||
/// Perform an indirect sort of chunked array. The output array will
|
||||
/// contain indices that would sort a chunked array, which would be
|
||||
/// the same length as input. Nulls will be stably partitioned to the
|
||||
/// end of the output regardless of order.
|
||||
///
|
||||
/// For example given chunked_array = [[null, 1], [3.3], [null, 2,
|
||||
/// 5.3]] and order = SortOrder::DESCENDING, the output will be [5, 2,
|
||||
/// 4, 1, 0, 3].
|
||||
///
|
||||
/// \param[in] chunked_array chunked array to sort
|
||||
/// \param[in] order ascending or descending
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return offsets indices that would sort an array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
|
||||
SortOrder order = SortOrder::Ascending,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return the indices that would sort a chunked array.
|
||||
///
|
||||
/// This overload takes a ArraySortOptions specifying the sort order
|
||||
/// and the null handling.
|
||||
///
|
||||
/// \param[in] chunked_array chunked array to sort
|
||||
/// \param[in] options options including sort order and null handling
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return offsets indices that would sort an array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
|
||||
const ArraySortOptions& options,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return the indices that would sort an input in the
|
||||
/// specified order. Input is one of array, chunked array record batch
|
||||
/// or table.
|
||||
///
|
||||
/// Perform an indirect sort of input. The output array will contain
|
||||
/// indices that would sort an input, which would be the same length
|
||||
/// as input. Nulls will be stably partitioned to the start or to the end
|
||||
/// of the output depending on SortOrder::null_placement.
|
||||
///
|
||||
/// For example given input (table) = {
|
||||
/// "column1": [[null, 1], [ 3, null, 2, 1]],
|
||||
/// "column2": [[ 5], [3, null, null, 5, 5]],
|
||||
/// } and options = {
|
||||
/// {"column1", SortOrder::Ascending},
|
||||
/// {"column2", SortOrder::Descending},
|
||||
/// }, the output will be [5, 1, 4, 2, 0, 3].
|
||||
///
|
||||
/// \param[in] datum array, chunked array, record batch or table to sort
|
||||
/// \param[in] options options
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return offsets indices that would sort a table
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> SortIndices(const Datum& datum, const SortOptions& options,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Compute unique elements from an array-like object
|
||||
///
|
||||
/// Note if a null occurs in the input it will NOT be included in the output.
|
||||
///
|
||||
/// \param[in] datum array-like input
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return result as Array
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> Unique(const Datum& datum, ExecContext* ctx = NULLPTR);
|
||||
|
||||
// Constants for accessing the output of ValueCounts
|
||||
ARROW_EXPORT extern const char kValuesFieldName[];
|
||||
ARROW_EXPORT extern const char kCountsFieldName[];
|
||||
ARROW_EXPORT extern const int32_t kValuesFieldIndex;
|
||||
ARROW_EXPORT extern const int32_t kCountsFieldIndex;
|
||||
|
||||
/// \brief Return counts of unique elements from an array-like object.
|
||||
///
|
||||
/// Note that the counts do not include counts for nulls in the array. These can be
|
||||
/// obtained separately from metadata.
|
||||
///
|
||||
/// For floating point arrays there is no attempt to normalize -0.0, 0.0 and NaN values
|
||||
/// which can lead to unexpected results if the input Array has these values.
|
||||
///
|
||||
/// \param[in] value array-like input
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return counts An array of <input type "Values", int64_t "Counts"> structs.
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<StructArray>> ValueCounts(const Datum& value,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Dictionary-encode values in an array-like object
|
||||
///
|
||||
/// Any nulls encountered in the dictionary will be handled according to the
|
||||
/// specified null encoding behavior.
|
||||
///
|
||||
/// For example, given values ["a", "b", null, "a", null] the output will be
|
||||
/// (null_encoding == ENCODE) Indices: [0, 1, 2, 0, 2] / Dict: ["a", "b", null]
|
||||
/// (null_encoding == MASK) Indices: [0, 1, null, 0, null] / Dict: ["a", "b"]
|
||||
///
|
||||
/// If the input is already dictionary encoded this function is a no-op unless
|
||||
/// it needs to modify the null_encoding (TODO)
|
||||
///
|
||||
/// \param[in] data array-like input
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \param[in] options configures null encoding behavior
|
||||
/// \return result with same shape and type as input
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> DictionaryEncode(
|
||||
const Datum& data,
|
||||
const DictionaryEncodeOptions& options = DictionaryEncodeOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Run-end-encode values in an array-like object
|
||||
///
|
||||
/// The returned run-end encoded type uses the same value type of the input and
|
||||
/// run-end type defined in the options.
|
||||
///
|
||||
/// \param[in] value array-like input
|
||||
/// \param[in] options configures encoding behavior
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return result with same shape but run-end encoded
|
||||
///
|
||||
/// \since 12.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> RunEndEncode(
|
||||
const Datum& value,
|
||||
const RunEndEncodeOptions& options = RunEndEncodeOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Decode a Run-End Encoded array to a plain array
|
||||
///
|
||||
/// The output data type is the same as the values array type of run-end encoded
|
||||
/// input.
|
||||
///
|
||||
/// \param[in] value run-end-encoded input
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return plain array resulting from decoding the run-end encoded input
|
||||
///
|
||||
/// \since 12.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> RunEndDecode(const Datum& value, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Compute the cumulative sum of an array-like object
|
||||
///
|
||||
/// \param[in] values array-like input
|
||||
/// \param[in] options configures cumulative sum behavior
|
||||
/// \param[in] check_overflow whether to check for overflow, if true, return Invalid
|
||||
/// status on overflow, otherwise wrap around on overflow
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
ARROW_EXPORT
|
||||
Result<Datum> CumulativeSum(
|
||||
const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
|
||||
bool check_overflow = false, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Compute the cumulative product of an array-like object
|
||||
///
|
||||
/// \param[in] values array-like input
|
||||
/// \param[in] options configures cumulative prod behavior
|
||||
/// \param[in] check_overflow whether to check for overflow, if true, return Invalid
|
||||
/// status on overflow, otherwise wrap around on overflow
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
ARROW_EXPORT
|
||||
Result<Datum> CumulativeProd(
|
||||
const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
|
||||
bool check_overflow = false, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Compute the cumulative max of an array-like object
|
||||
///
|
||||
/// \param[in] values array-like input
|
||||
/// \param[in] options configures cumulative max behavior
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
ARROW_EXPORT
|
||||
Result<Datum> CumulativeMax(
|
||||
const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Compute the cumulative min of an array-like object
|
||||
///
|
||||
/// \param[in] values array-like input
|
||||
/// \param[in] options configures cumulative min behavior
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
ARROW_EXPORT
|
||||
Result<Datum> CumulativeMin(
|
||||
const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Compute the cumulative mean of an array-like object
|
||||
///
|
||||
/// \param[in] values array-like input
|
||||
/// \param[in] options configures cumulative mean behavior, `start` is ignored
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
ARROW_EXPORT
|
||||
Result<Datum> CumulativeMean(
|
||||
const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return the first order difference of an array.
|
||||
///
|
||||
/// Computes the first order difference of an array, i.e.
|
||||
/// output[i] = input[i] - input[i - p] if i >= p
|
||||
/// output[i] = null otherwise
|
||||
/// where p is the period. For example, with p = 1,
|
||||
/// Diff([1, 4, 9, 10, 15]) = [null, 3, 5, 1, 5].
|
||||
/// With p = 2,
|
||||
/// Diff([1, 4, 9, 10, 15]) = [null, null, 8, 6, 6]
|
||||
/// p can also be negative, in which case the diff is computed in
|
||||
/// the opposite direction.
|
||||
/// \param[in] array array input
|
||||
/// \param[in] options options, specifying overflow behavior and period
|
||||
/// \param[in] check_overflow whether to return error on overflow
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return result as array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> PairwiseDiff(const Array& array,
|
||||
const PairwiseOptions& options,
|
||||
bool check_overflow = false,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Return the inverse permutation of the given indices.
|
||||
///
|
||||
/// For indices[i] = x, inverse_permutation[x] = i. And inverse_permutation[x] = null if x
|
||||
/// does not appear in the input indices. Indices must be in the range of [0, max_index],
|
||||
/// or null, which will be ignored. If multiple indices point to the same value, the last
|
||||
/// one is used.
|
||||
///
|
||||
/// For example, with
|
||||
/// indices = [null, 0, null, 2, 4, 1, 1]
|
||||
/// the inverse permutation is
|
||||
/// [1, 6, 3, null, 4, null, null]
|
||||
/// if max_index = 6.
|
||||
///
|
||||
/// \param[in] indices array-like indices
|
||||
/// \param[in] options configures the max index and the output type
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting inverse permutation
|
||||
///
|
||||
/// \since 20.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> InversePermutation(
|
||||
const Datum& indices,
|
||||
const InversePermutationOptions& options = InversePermutationOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Scatter the values into specified positions according to the indices.
|
||||
///
|
||||
/// For indices[i] = x, output[x] = values[i]. And output[x] = null if x does not appear
|
||||
/// in the input indices. Indices must be in the range of [0, max_index], or null, in
|
||||
/// which case the corresponding value will be ignored. If multiple indices point to the
|
||||
/// same value, the last one is used.
|
||||
///
|
||||
/// For example, with
|
||||
/// values = [a, b, c, d, e, f, g]
|
||||
/// indices = [null, 0, null, 2, 4, 1, 1]
|
||||
/// the output is
|
||||
/// [b, g, d, null, e, null, null]
|
||||
/// if max_index = 6.
|
||||
///
|
||||
/// \param[in] values datum to scatter
|
||||
/// \param[in] indices array-like indices
|
||||
/// \param[in] options configures the max index of to scatter
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting datum
|
||||
///
|
||||
/// \since 20.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Scatter(const Datum& values, const Datum& indices,
|
||||
const ScatterOptions& options = ScatterOptions::Defaults(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,134 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/function.h"
|
||||
#include "arrow/compute/function_options.h"
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Array;
|
||||
|
||||
namespace compute {
|
||||
|
||||
class ExecContext;
|
||||
|
||||
/// \addtogroup compute-concrete-options
|
||||
/// @{
|
||||
|
||||
class ARROW_EXPORT CastOptions : public FunctionOptions {
|
||||
public:
|
||||
explicit CastOptions(bool safe = true);
|
||||
|
||||
static constexpr const char kTypeName[] = "CastOptions";
|
||||
static CastOptions Safe(TypeHolder to_type = {}) {
|
||||
CastOptions safe(true);
|
||||
safe.to_type = std::move(to_type);
|
||||
return safe;
|
||||
}
|
||||
|
||||
static CastOptions Unsafe(TypeHolder to_type = {}) {
|
||||
CastOptions unsafe(false);
|
||||
unsafe.to_type = std::move(to_type);
|
||||
return unsafe;
|
||||
}
|
||||
|
||||
// Type being casted to. May be passed separate to eager function
|
||||
// compute::Cast
|
||||
TypeHolder to_type;
|
||||
|
||||
bool allow_int_overflow;
|
||||
bool allow_time_truncate;
|
||||
bool allow_time_overflow;
|
||||
bool allow_decimal_truncate;
|
||||
bool allow_float_truncate;
|
||||
// Indicate if conversions from Binary/FixedSizeBinary to string must
|
||||
// validate the utf8 payload.
|
||||
bool allow_invalid_utf8;
|
||||
|
||||
/// true if the safety options all match CastOptions::Safe
|
||||
///
|
||||
/// Note, if this returns false it does not mean is_unsafe will return true
|
||||
bool is_safe() const;
|
||||
/// true if the safety options all match CastOptions::Unsafe
|
||||
///
|
||||
/// Note, if this returns false it does not mean is_safe will return true
|
||||
bool is_unsafe() const;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
/// \brief Return true if a cast function is defined
|
||||
ARROW_EXPORT
|
||||
bool CanCast(const DataType& from_type, const DataType& to_type);
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Convenience invocation APIs for a number of kernels
|
||||
|
||||
/// \brief Cast from one array type to another
|
||||
/// \param[in] value array to cast
|
||||
/// \param[in] to_type type to cast to
|
||||
/// \param[in] options casting options
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting array
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> Cast(const Array& value, const TypeHolder& to_type,
|
||||
const CastOptions& options = CastOptions::Safe(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Cast from one array type to another
|
||||
/// \param[in] value array to cast
|
||||
/// \param[in] options casting options. The "to_type" field must be populated
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting array
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Cast(const Datum& value, const CastOptions& options,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Cast from one value to another
|
||||
/// \param[in] value datum to cast
|
||||
/// \param[in] to_type type to cast to
|
||||
/// \param[in] options casting options
|
||||
/// \param[in] ctx the function execution context, optional
|
||||
/// \return the resulting datum
|
||||
///
|
||||
/// \since 1.0.0
|
||||
/// \note API not yet finalized
|
||||
ARROW_EXPORT
|
||||
Result<Datum> Cast(const Datum& value, const TypeHolder& to_type,
|
||||
const CastOptions& options = CastOptions::Safe(),
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,489 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// NOTE: API is EXPERIMENTAL and will change without going through a
|
||||
// deprecation cycle
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/compute/expression.h"
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
#include "arrow/datum.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace compute {
|
||||
|
||||
// It seems like 64K might be a good default chunksize to use for execution
|
||||
// based on the experience of other query processing systems. The current
|
||||
// default is not to chunk contiguous arrays, though, but this may change in
|
||||
// the future once parallel execution is implemented
|
||||
static constexpr int64_t kDefaultExecChunksize = UINT16_MAX;
|
||||
|
||||
/// \brief Context for expression-global variables and options used by
|
||||
/// function evaluation
|
||||
class ARROW_EXPORT ExecContext {
|
||||
public:
|
||||
// If no function registry passed, the default is used.
|
||||
explicit ExecContext(MemoryPool* pool = default_memory_pool(),
|
||||
::arrow::internal::Executor* executor = NULLPTR,
|
||||
FunctionRegistry* func_registry = NULLPTR);
|
||||
|
||||
/// \brief The MemoryPool used for allocations, default is
|
||||
/// default_memory_pool().
|
||||
MemoryPool* memory_pool() const { return pool_; }
|
||||
|
||||
const ::arrow::internal::CpuInfo* cpu_info() const;
|
||||
|
||||
/// \brief An Executor which may be used to parallelize execution.
|
||||
::arrow::internal::Executor* executor() const { return executor_; }
|
||||
|
||||
/// \brief The FunctionRegistry for looking up functions by name and
|
||||
/// selecting kernels for execution. Defaults to the library-global function
|
||||
/// registry provided by GetFunctionRegistry.
|
||||
FunctionRegistry* func_registry() const { return func_registry_; }
|
||||
|
||||
// \brief Set maximum length unit of work for kernel execution. Larger
|
||||
// contiguous array inputs will be split into smaller chunks, and, if
|
||||
// possible and enabled, processed in parallel. The default chunksize is
|
||||
// INT64_MAX, so contiguous arrays are not split.
|
||||
void set_exec_chunksize(int64_t chunksize) { exec_chunksize_ = chunksize; }
|
||||
|
||||
// \brief Maximum length for ExecBatch data chunks processed by
|
||||
// kernels. Contiguous array inputs with longer length will be split into
|
||||
// smaller chunks.
|
||||
int64_t exec_chunksize() const { return exec_chunksize_; }
|
||||
|
||||
/// \brief Set whether to use multiple threads for function execution. This
|
||||
/// is not yet used.
|
||||
void set_use_threads(bool use_threads = true) { use_threads_ = use_threads; }
|
||||
|
||||
/// \brief If true, then utilize multiple threads where relevant for function
|
||||
/// execution. This is not yet used.
|
||||
bool use_threads() const { return use_threads_; }
|
||||
|
||||
// Set the preallocation strategy for kernel execution as it relates to
|
||||
// chunked execution. For chunked execution, whether via ChunkedArray inputs
|
||||
// or splitting larger Array arguments into smaller pieces, contiguous
|
||||
// allocation (if permitted by the kernel) will allocate one large array to
|
||||
// write output into yielding it to the caller at the end. If this option is
|
||||
// set to off, then preallocations will be performed independently for each
|
||||
// chunk of execution
|
||||
//
|
||||
// TODO: At some point we might want the limit the size of contiguous
|
||||
// preallocations. For example, even if the exec_chunksize is 64K or less, we
|
||||
// might limit contiguous allocations to 1M records, say.
|
||||
void set_preallocate_contiguous(bool preallocate) {
|
||||
preallocate_contiguous_ = preallocate;
|
||||
}
|
||||
|
||||
/// \brief If contiguous preallocations should be used when doing chunked
|
||||
/// execution as specified by exec_chunksize(). See
|
||||
/// set_preallocate_contiguous() for more information.
|
||||
bool preallocate_contiguous() const { return preallocate_contiguous_; }
|
||||
|
||||
private:
|
||||
MemoryPool* pool_;
|
||||
::arrow::internal::Executor* executor_;
|
||||
FunctionRegistry* func_registry_;
|
||||
int64_t exec_chunksize_ = std::numeric_limits<int64_t>::max();
|
||||
bool preallocate_contiguous_ = true;
|
||||
bool use_threads_ = true;
|
||||
};
|
||||
|
||||
// TODO: Consider standardizing on uint16 selection vectors and only use them
|
||||
// when we can ensure that each value is 64K length or smaller
|
||||
|
||||
/// \brief Container for an array of value selection indices that were
|
||||
/// materialized from a filter.
|
||||
///
|
||||
/// Columnar query engines (see e.g. [1]) have found that rather than
|
||||
/// materializing filtered data, the filter can instead be converted to an
|
||||
/// array of the "on" indices and then "fusing" these indices in operator
|
||||
/// implementations. This is especially relevant for aggregations but also
|
||||
/// applies to scalar operations.
|
||||
///
|
||||
/// We are not yet using this so this is mostly a placeholder for now.
|
||||
///
|
||||
/// [1]: http://cidrdb.org/cidr2005/papers/P19.pdf
|
||||
class ARROW_EXPORT SelectionVector {
|
||||
public:
|
||||
explicit SelectionVector(std::shared_ptr<ArrayData> data);
|
||||
|
||||
explicit SelectionVector(const Array& arr);
|
||||
|
||||
/// \brief Create SelectionVector from boolean mask
|
||||
static Result<std::shared_ptr<SelectionVector>> FromMask(const BooleanArray& arr);
|
||||
|
||||
const int32_t* indices() const { return indices_; }
|
||||
int32_t length() const;
|
||||
|
||||
private:
|
||||
std::shared_ptr<ArrayData> data_;
|
||||
const int32_t* indices_;
|
||||
};
|
||||
|
||||
/// An index to represent that a batch does not belong to an ordered stream
|
||||
constexpr int64_t kUnsequencedIndex = -1;
|
||||
|
||||
/// \brief A unit of work for kernel execution. It contains a collection of
|
||||
/// Array and Scalar values and an optional SelectionVector indicating that
|
||||
/// there is an unmaterialized filter that either must be materialized, or (if
|
||||
/// the kernel supports it) pushed down into the kernel implementation.
|
||||
///
|
||||
/// ExecBatch is semantically similar to RecordBatch in that in a SQL context
|
||||
/// it represents a collection of records, but constant "columns" are
|
||||
/// represented by Scalar values rather than having to be converted into arrays
|
||||
/// with repeated values.
|
||||
///
|
||||
/// TODO: Datum uses arrow/util/variant.h which may be a bit heavier-weight
|
||||
/// than is desirable for this class. Microbenchmarks would help determine for
|
||||
/// sure. See ARROW-8928.
|
||||
|
||||
/// \addtogroup acero-internals
|
||||
/// @{
|
||||
|
||||
struct ARROW_EXPORT ExecBatch {
|
||||
ExecBatch() = default;
|
||||
ExecBatch(std::vector<Datum> values, int64_t length)
|
||||
: values(std::move(values)), length(length) {}
|
||||
|
||||
explicit ExecBatch(const RecordBatch& batch);
|
||||
|
||||
/// \brief Infer the ExecBatch length from values.
|
||||
static Result<int64_t> InferLength(const std::vector<Datum>& values);
|
||||
|
||||
/// Creates an ExecBatch with length-validation.
|
||||
///
|
||||
/// If any value is given, then all values must have a common length. If the given
|
||||
/// length is negative, then the length of the ExecBatch is set to this common length,
|
||||
/// or to 1 if no values are given. Otherwise, the given length must equal the common
|
||||
/// length, if any value is given.
|
||||
static Result<ExecBatch> Make(std::vector<Datum> values, int64_t length = -1);
|
||||
|
||||
Result<std::shared_ptr<RecordBatch>> ToRecordBatch(
|
||||
std::shared_ptr<Schema> schema, MemoryPool* pool = default_memory_pool()) const;
|
||||
|
||||
/// The values representing positional arguments to be passed to a kernel's
|
||||
/// exec function for processing.
|
||||
std::vector<Datum> values;
|
||||
|
||||
/// A deferred filter represented as an array of indices into the values.
|
||||
///
|
||||
/// For example, the filter [true, true, false, true] would be represented as
|
||||
/// the selection vector [0, 1, 3]. When the selection vector is set,
|
||||
/// ExecBatch::length is equal to the length of this array.
|
||||
std::shared_ptr<SelectionVector> selection_vector;
|
||||
|
||||
/// A predicate Expression guaranteed to evaluate to true for all rows in this batch.
|
||||
Expression guarantee = literal(true);
|
||||
|
||||
/// The semantic length of the ExecBatch. When the values are all scalars,
|
||||
/// the length should be set to 1 for non-aggregate kernels, otherwise the
|
||||
/// length is taken from the array values, except when there is a selection
|
||||
/// vector. When there is a selection vector set, the length of the batch is
|
||||
/// the length of the selection. Aggregate kernels can have an ExecBatch
|
||||
/// formed by projecting just the partition columns from a batch in which
|
||||
/// case, it would have scalar rows with length greater than 1.
|
||||
///
|
||||
/// If the array values are of length 0 then the length is 0 regardless of
|
||||
/// whether any values are Scalar.
|
||||
int64_t length = 0;
|
||||
|
||||
/// \brief index of this batch in a sorted stream of batches
|
||||
///
|
||||
/// This index must be strictly monotonic starting at 0 without gaps or
|
||||
/// it can be set to kUnsequencedIndex if there is no meaningful order
|
||||
int64_t index = kUnsequencedIndex;
|
||||
|
||||
/// \brief The sum of bytes in each buffer referenced by the batch
|
||||
///
|
||||
/// Note: Scalars are not counted
|
||||
/// Note: Some values may referenced only part of a buffer, for
|
||||
/// example, an array with an offset. The actual data
|
||||
/// visible to this batch will be smaller than the total
|
||||
/// buffer size in this case.
|
||||
int64_t TotalBufferSize() const;
|
||||
|
||||
/// \brief Return the value at the i-th index
|
||||
template <typename index_type>
|
||||
inline const Datum& operator[](index_type i) const {
|
||||
return values[i];
|
||||
}
|
||||
|
||||
bool Equals(const ExecBatch& other) const;
|
||||
|
||||
/// \brief A convenience for the number of values / arguments.
|
||||
int num_values() const { return static_cast<int>(values.size()); }
|
||||
|
||||
ExecBatch Slice(int64_t offset, int64_t length) const;
|
||||
|
||||
Result<ExecBatch> SelectValues(const std::vector<int>& ids) const;
|
||||
|
||||
/// \brief A convenience for returning the types from the batch.
|
||||
std::vector<TypeHolder> GetTypes() const {
|
||||
std::vector<TypeHolder> result;
|
||||
for (const auto& value : this->values) {
|
||||
result.emplace_back(value.type());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string ToString() const;
|
||||
};
|
||||
|
||||
inline bool operator==(const ExecBatch& l, const ExecBatch& r) { return l.Equals(r); }
|
||||
inline bool operator!=(const ExecBatch& l, const ExecBatch& r) { return !l.Equals(r); }
|
||||
|
||||
ARROW_EXPORT void PrintTo(const ExecBatch&, std::ostream*);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \defgroup compute-internals Utilities for calling functions, useful for those
|
||||
/// extending the function registry
|
||||
///
|
||||
/// @{
|
||||
|
||||
struct ExecValue {
|
||||
ArraySpan array = {};
|
||||
const Scalar* scalar = NULLPTR;
|
||||
|
||||
ExecValue(const Scalar* scalar) // NOLINT implicit conversion
|
||||
: scalar(scalar) {}
|
||||
|
||||
ExecValue(ArraySpan array) // NOLINT implicit conversion
|
||||
: array(std::move(array)) {}
|
||||
|
||||
ExecValue(const ArrayData& array) { // NOLINT implicit conversion
|
||||
this->array.SetMembers(array);
|
||||
}
|
||||
|
||||
ExecValue() = default;
|
||||
ExecValue(const ExecValue& other) = default;
|
||||
ExecValue& operator=(const ExecValue& other) = default;
|
||||
ExecValue(ExecValue&& other) = default;
|
||||
ExecValue& operator=(ExecValue&& other) = default;
|
||||
|
||||
int64_t length() const { return this->is_array() ? this->array.length : 1; }
|
||||
|
||||
bool is_array() const { return this->scalar == NULLPTR; }
|
||||
bool is_scalar() const { return !this->is_array(); }
|
||||
|
||||
void SetArray(const ArrayData& array) {
|
||||
this->array.SetMembers(array);
|
||||
this->scalar = NULLPTR;
|
||||
}
|
||||
|
||||
void SetScalar(const Scalar* scalar) { this->scalar = scalar; }
|
||||
|
||||
template <typename ExactType>
|
||||
const ExactType& scalar_as() const {
|
||||
return ::arrow::internal::checked_cast<const ExactType&>(*this->scalar);
|
||||
}
|
||||
|
||||
/// XXX: here temporarily for compatibility with datum, see
|
||||
/// e.g. MakeStructExec in scalar_nested.cc
|
||||
int64_t null_count() const {
|
||||
if (this->is_array()) {
|
||||
return this->array.GetNullCount();
|
||||
} else {
|
||||
return this->scalar->is_valid ? 0 : 1;
|
||||
}
|
||||
}
|
||||
|
||||
const DataType* type() const {
|
||||
if (this->is_array()) {
|
||||
return array.type;
|
||||
} else {
|
||||
return scalar->type.get();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct ARROW_EXPORT ExecResult {
|
||||
// The default value of the variant is ArraySpan
|
||||
std::variant<ArraySpan, std::shared_ptr<ArrayData>> value;
|
||||
|
||||
int64_t length() const {
|
||||
if (this->is_array_span()) {
|
||||
return this->array_span()->length;
|
||||
} else {
|
||||
return this->array_data()->length;
|
||||
}
|
||||
}
|
||||
|
||||
const DataType* type() const {
|
||||
if (this->is_array_span()) {
|
||||
return this->array_span()->type;
|
||||
} else {
|
||||
return this->array_data()->type.get();
|
||||
}
|
||||
}
|
||||
|
||||
const ArraySpan* array_span() const { return &std::get<ArraySpan>(this->value); }
|
||||
ArraySpan* array_span_mutable() { return &std::get<ArraySpan>(this->value); }
|
||||
|
||||
bool is_array_span() const { return this->value.index() == 0; }
|
||||
|
||||
const std::shared_ptr<ArrayData>& array_data() const {
|
||||
return std::get<std::shared_ptr<ArrayData>>(this->value);
|
||||
}
|
||||
ArrayData* array_data_mutable() {
|
||||
return std::get<std::shared_ptr<ArrayData>>(this->value).get();
|
||||
}
|
||||
|
||||
bool is_array_data() const { return this->value.index() == 1; }
|
||||
};
|
||||
|
||||
/// \brief A "lightweight" column batch object which contains no
|
||||
/// std::shared_ptr objects and does not have any memory ownership
|
||||
/// semantics. Can represent a view onto an "owning" ExecBatch.
|
||||
struct ARROW_EXPORT ExecSpan {
|
||||
ExecSpan() = default;
|
||||
ExecSpan(const ExecSpan& other) = default;
|
||||
ExecSpan& operator=(const ExecSpan& other) = default;
|
||||
ExecSpan(ExecSpan&& other) = default;
|
||||
ExecSpan& operator=(ExecSpan&& other) = default;
|
||||
|
||||
explicit ExecSpan(std::vector<ExecValue> values, int64_t length)
|
||||
: length(length), values(std::move(values)) {}
|
||||
|
||||
explicit ExecSpan(const ExecBatch& batch) {
|
||||
this->length = batch.length;
|
||||
this->values.resize(batch.values.size());
|
||||
for (size_t i = 0; i < batch.values.size(); ++i) {
|
||||
const Datum& in_value = batch[i];
|
||||
ExecValue* out_value = &this->values[i];
|
||||
if (in_value.is_array()) {
|
||||
out_value->SetArray(*in_value.array());
|
||||
} else {
|
||||
out_value->SetScalar(in_value.scalar().get());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Return the value at the i-th index
|
||||
template <typename index_type>
|
||||
inline const ExecValue& operator[](index_type i) const {
|
||||
return values[i];
|
||||
}
|
||||
|
||||
/// \brief A convenience for the number of values / arguments.
|
||||
int num_values() const { return static_cast<int>(values.size()); }
|
||||
|
||||
std::vector<TypeHolder> GetTypes() const {
|
||||
std::vector<TypeHolder> result;
|
||||
for (const auto& value : this->values) {
|
||||
result.emplace_back(value.type());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
ExecBatch ToExecBatch() const {
|
||||
ExecBatch result;
|
||||
result.length = this->length;
|
||||
for (const ExecValue& value : this->values) {
|
||||
if (value.is_array()) {
|
||||
result.values.push_back(value.array.ToArrayData());
|
||||
} else {
|
||||
result.values.push_back(value.scalar->GetSharedPtr());
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
int64_t length = 0;
|
||||
std::vector<ExecValue> values;
|
||||
};
|
||||
|
||||
/// \defgroup compute-call-function One-shot calls to compute functions
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief One-shot invoker for all types of functions.
|
||||
///
|
||||
/// Does kernel dispatch, argument checking, iteration of ChunkedArray inputs,
|
||||
/// and wrapping of outputs.
|
||||
ARROW_EXPORT
|
||||
Result<Datum> CallFunction(const std::string& func_name, const std::vector<Datum>& args,
|
||||
const FunctionOptions* options, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Variant of CallFunction which uses a function's default options.
|
||||
///
|
||||
/// NB: Some functions require FunctionOptions be provided.
|
||||
ARROW_EXPORT
|
||||
Result<Datum> CallFunction(const std::string& func_name, const std::vector<Datum>& args,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief One-shot invoker for all types of functions.
|
||||
///
|
||||
/// Does kernel dispatch, argument checking, iteration of ChunkedArray inputs,
|
||||
/// and wrapping of outputs.
|
||||
ARROW_EXPORT
|
||||
Result<Datum> CallFunction(const std::string& func_name, const ExecBatch& batch,
|
||||
const FunctionOptions* options, ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// \brief Variant of CallFunction which uses a function's default options.
|
||||
///
|
||||
/// NB: Some functions require FunctionOptions be provided.
|
||||
ARROW_EXPORT
|
||||
Result<Datum> CallFunction(const std::string& func_name, const ExecBatch& batch,
|
||||
ExecContext* ctx = NULLPTR);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \defgroup compute-function-executor One-shot calls to obtain function executors
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief One-shot executor provider for all types of functions.
|
||||
///
|
||||
/// This function creates and initializes a `FunctionExecutor` appropriate
|
||||
/// for the given function name, input types and function options.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<FunctionExecutor>> GetFunctionExecutor(
|
||||
const std::string& func_name, std::vector<TypeHolder> in_types,
|
||||
const FunctionOptions* options = NULLPTR, FunctionRegistry* func_registry = NULLPTR);
|
||||
|
||||
/// \brief One-shot executor provider for all types of functions.
|
||||
///
|
||||
/// This function creates and initializes a `FunctionExecutor` appropriate
|
||||
/// for the given function name, input types (taken from the Datum arguments)
|
||||
/// and function options.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<FunctionExecutor>> GetFunctionExecutor(
|
||||
const std::string& func_name, const std::vector<Datum>& args,
|
||||
const FunctionOptions* options = NULLPTR, FunctionRegistry* func_registry = NULLPTR);
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,295 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <variant>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
#include "arrow/datum.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/small_vector.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace compute {
|
||||
|
||||
/// \defgroup expression-core Expressions to describe data transformations
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// An unbound expression which maps a single Datum to another Datum.
|
||||
/// An expression is one of
|
||||
/// - A literal Datum.
|
||||
/// - A reference to a single (potentially nested) field of the input Datum.
|
||||
/// - A call to a compute function, with arguments specified by other Expressions.
|
||||
class ARROW_EXPORT Expression {
|
||||
public:
|
||||
struct Call {
|
||||
std::string function_name;
|
||||
std::vector<Expression> arguments;
|
||||
std::shared_ptr<FunctionOptions> options;
|
||||
// Cached hash value
|
||||
size_t hash;
|
||||
|
||||
// post-Bind properties:
|
||||
std::shared_ptr<Function> function;
|
||||
const Kernel* kernel = NULLPTR;
|
||||
std::shared_ptr<KernelState> kernel_state;
|
||||
TypeHolder type;
|
||||
|
||||
void ComputeHash();
|
||||
};
|
||||
|
||||
std::string ToString() const;
|
||||
bool Equals(const Expression& other) const;
|
||||
size_t hash() const;
|
||||
struct Hash {
|
||||
size_t operator()(const Expression& expr) const { return expr.hash(); }
|
||||
};
|
||||
|
||||
/// Bind this expression to the given input type, looking up Kernels and field types.
|
||||
/// Some expression simplification may be performed and implicit casts will be inserted.
|
||||
/// Any state necessary for execution will be initialized and returned.
|
||||
Result<Expression> Bind(const TypeHolder& in, ExecContext* = NULLPTR) const;
|
||||
Result<Expression> Bind(const Schema& in_schema, ExecContext* = NULLPTR) const;
|
||||
|
||||
// XXX someday
|
||||
// Clone all KernelState in this bound expression. If any function referenced by this
|
||||
// expression has mutable KernelState, it is not safe to execute or apply simplification
|
||||
// passes to it (or copies of it!) from multiple threads. Cloning state produces new
|
||||
// KernelStates where necessary to ensure that Expressions may be manipulated safely
|
||||
// on multiple threads.
|
||||
// Result<ExpressionState> CloneState() const;
|
||||
// Status SetState(ExpressionState);
|
||||
|
||||
/// Return true if all an expression's field references have explicit types
|
||||
/// and all of its functions' kernels are looked up.
|
||||
bool IsBound() const;
|
||||
|
||||
/// Return true if this expression is composed only of Scalar literals, field
|
||||
/// references, and calls to ScalarFunctions.
|
||||
bool IsScalarExpression() const;
|
||||
|
||||
/// Return true if this expression is literal and entirely null.
|
||||
bool IsNullLiteral() const;
|
||||
|
||||
/// Return true if this expression could evaluate to true. Will return true for any
|
||||
/// unbound or non-boolean Expressions. IsSatisfiable does not (currently) do any
|
||||
/// canonicalization or simplification of the expression, so even Expressions
|
||||
/// which are unsatisfiable may spuriously return `true` here. This function is
|
||||
/// intended for use in predicate pushdown where a filter expression is simplified
|
||||
/// by a guarantee, so it assumes that trying to simplify again would be redundant.
|
||||
bool IsSatisfiable() const;
|
||||
|
||||
// XXX someday
|
||||
// Result<PipelineGraph> GetPipelines();
|
||||
|
||||
bool is_valid() const { return impl_ != NULLPTR; }
|
||||
|
||||
/// Access a Call or return nullptr if this expression is not a call
|
||||
const Call* call() const;
|
||||
/// Access a Datum or return nullptr if this expression is not a literal
|
||||
const Datum* literal() const;
|
||||
/// Access a FieldRef or return nullptr if this expression is not a field_ref
|
||||
const FieldRef* field_ref() const;
|
||||
|
||||
/// The type to which this expression will evaluate
|
||||
const DataType* type() const;
|
||||
// XXX someday
|
||||
// NullGeneralization::type nullable() const;
|
||||
|
||||
struct Parameter {
|
||||
FieldRef ref;
|
||||
|
||||
// post-bind properties
|
||||
TypeHolder type;
|
||||
::arrow::internal::SmallVector<int, 2> indices;
|
||||
};
|
||||
const Parameter* parameter() const;
|
||||
|
||||
Expression() = default;
|
||||
explicit Expression(Call call);
|
||||
explicit Expression(Datum literal);
|
||||
explicit Expression(Parameter parameter);
|
||||
|
||||
static bool Identical(const Expression& l, const Expression& r);
|
||||
|
||||
private:
|
||||
using Impl = std::variant<Datum, Parameter, Call>;
|
||||
std::shared_ptr<Impl> impl_;
|
||||
};
|
||||
|
||||
inline bool operator==(const Expression& l, const Expression& r) { return l.Equals(r); }
|
||||
inline bool operator!=(const Expression& l, const Expression& r) { return !l.Equals(r); }
|
||||
|
||||
ARROW_EXPORT void PrintTo(const Expression&, std::ostream*);
|
||||
|
||||
// Factories
|
||||
|
||||
ARROW_EXPORT
|
||||
Expression literal(Datum lit);
|
||||
|
||||
template <typename Arg>
|
||||
Expression literal(Arg&& arg) {
|
||||
return literal(Datum(std::forward<Arg>(arg)));
|
||||
}
|
||||
|
||||
ARROW_EXPORT
|
||||
Expression field_ref(FieldRef ref);
|
||||
|
||||
ARROW_EXPORT
|
||||
Expression call(std::string function, std::vector<Expression> arguments,
|
||||
std::shared_ptr<FunctionOptions> options = NULLPTR);
|
||||
|
||||
template <typename Options, typename = typename std::enable_if<
|
||||
std::is_base_of<FunctionOptions, Options>::value>::type>
|
||||
Expression call(std::string function, std::vector<Expression> arguments,
|
||||
Options options) {
|
||||
return call(std::move(function), std::move(arguments),
|
||||
std::make_shared<Options>(std::move(options)));
|
||||
}
|
||||
|
||||
/// Assemble a list of all fields referenced by an Expression at any depth.
|
||||
ARROW_EXPORT
|
||||
std::vector<FieldRef> FieldsInExpression(const Expression&);
|
||||
|
||||
/// Check if the expression references any fields.
|
||||
ARROW_EXPORT
|
||||
bool ExpressionHasFieldRefs(const Expression&);
|
||||
|
||||
struct ARROW_EXPORT KnownFieldValues;
|
||||
|
||||
/// Assemble a mapping from field references to known values. This derives known values
|
||||
/// from "equal" and "is_null" Expressions referencing a field and a literal.
|
||||
ARROW_EXPORT
|
||||
Result<KnownFieldValues> ExtractKnownFieldValues(
|
||||
const Expression& guaranteed_true_predicate);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \defgroup expression-passes Functions for modification of Expressions
|
||||
///
|
||||
/// @{
|
||||
///
|
||||
/// These transform bound expressions. Some transforms utilize a guarantee, which is
|
||||
/// provided as an Expression which is guaranteed to evaluate to true. The
|
||||
/// guaranteed_true_predicate need not be bound, but canonicalization is currently
|
||||
/// deferred to producers of guarantees. For example in order to be recognized as a
|
||||
/// guarantee on a field value, an Expression must be a call to "equal" with field_ref LHS
|
||||
/// and literal RHS. Flipping the arguments, "is_in" with a one-long value_set, ... or
|
||||
/// other semantically identical Expressions will not be recognized.
|
||||
|
||||
/// Weak canonicalization which establishes guarantees for subsequent passes. Even
|
||||
/// equivalent Expressions may result in different canonicalized expressions.
|
||||
/// TODO this could be a strong canonicalization
|
||||
ARROW_EXPORT
|
||||
Result<Expression> Canonicalize(Expression, ExecContext* = NULLPTR);
|
||||
|
||||
/// Simplify Expressions based on literal arguments (for example, add(null, x) will always
|
||||
/// be null so replace the call with a null literal). Includes early evaluation of all
|
||||
/// calls whose arguments are entirely literal.
|
||||
ARROW_EXPORT
|
||||
Result<Expression> FoldConstants(Expression);
|
||||
|
||||
/// Simplify Expressions by replacing with known values of the fields which it references.
|
||||
ARROW_EXPORT
|
||||
Result<Expression> ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values,
|
||||
Expression);
|
||||
|
||||
/// Simplify an expression by replacing subexpressions based on a guarantee:
|
||||
/// a boolean expression which is guaranteed to evaluate to `true`. For example, this is
|
||||
/// used to remove redundant function calls from a filter expression or to replace a
|
||||
/// reference to a constant-value field with a literal.
|
||||
ARROW_EXPORT
|
||||
Result<Expression> SimplifyWithGuarantee(Expression,
|
||||
const Expression& guaranteed_true_predicate);
|
||||
|
||||
/// Replace all named field refs (e.g. "x" or "x.y") with field paths (e.g. [0] or [1,3])
|
||||
///
|
||||
/// This isn't usually needed and does not offer any simplification by itself. However,
|
||||
/// it can be useful to normalize an expression to paths to make it simpler to work with.
|
||||
ARROW_EXPORT Result<Expression> RemoveNamedRefs(Expression expression);
|
||||
|
||||
/// @}
|
||||
|
||||
// Execution
|
||||
|
||||
/// Create an ExecBatch suitable for passing to ExecuteScalarExpression() from a
|
||||
/// RecordBatch which may have missing or incorrectly ordered columns.
|
||||
/// Missing fields will be replaced with null scalars.
|
||||
ARROW_EXPORT Result<ExecBatch> MakeExecBatch(const Schema& full_schema,
|
||||
const Datum& partial,
|
||||
Expression guarantee = literal(true));
|
||||
|
||||
/// Execute a scalar expression against the provided state and input ExecBatch. This
|
||||
/// expression must be bound.
|
||||
ARROW_EXPORT
|
||||
Result<Datum> ExecuteScalarExpression(const Expression&, const ExecBatch& input,
|
||||
ExecContext* = NULLPTR);
|
||||
|
||||
/// Convenience function for invoking against a RecordBatch
|
||||
ARROW_EXPORT
|
||||
Result<Datum> ExecuteScalarExpression(const Expression&, const Schema& full_schema,
|
||||
const Datum& partial_input, ExecContext* = NULLPTR);
|
||||
|
||||
// Serialization
|
||||
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> Serialize(const Expression&);
|
||||
|
||||
ARROW_EXPORT
|
||||
Result<Expression> Deserialize(std::shared_ptr<Buffer>);
|
||||
|
||||
/// \defgroup expression-convenience Helpers for convenient expression creation
|
||||
///
|
||||
/// @{
|
||||
|
||||
ARROW_EXPORT Expression project(std::vector<Expression> values,
|
||||
std::vector<std::string> names);
|
||||
|
||||
ARROW_EXPORT Expression equal(Expression lhs, Expression rhs);
|
||||
|
||||
ARROW_EXPORT Expression not_equal(Expression lhs, Expression rhs);
|
||||
|
||||
ARROW_EXPORT Expression less(Expression lhs, Expression rhs);
|
||||
|
||||
ARROW_EXPORT Expression less_equal(Expression lhs, Expression rhs);
|
||||
|
||||
ARROW_EXPORT Expression greater(Expression lhs, Expression rhs);
|
||||
|
||||
ARROW_EXPORT Expression greater_equal(Expression lhs, Expression rhs);
|
||||
|
||||
ARROW_EXPORT Expression is_null(Expression lhs, bool nan_is_null = false);
|
||||
|
||||
ARROW_EXPORT Expression is_valid(Expression lhs);
|
||||
|
||||
ARROW_EXPORT Expression and_(Expression lhs, Expression rhs);
|
||||
ARROW_EXPORT Expression and_(const std::vector<Expression>&);
|
||||
ARROW_EXPORT Expression or_(Expression lhs, Expression rhs);
|
||||
ARROW_EXPORT Expression or_(const std::vector<Expression>&);
|
||||
ARROW_EXPORT Expression not_(Expression operand);
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,410 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// NOTE: API is EXPERIMENTAL and will change without going through a
|
||||
// deprecation cycle.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/kernel.h"
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
#include "arrow/datum.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/compare.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace compute {
|
||||
|
||||
/// \addtogroup compute-functions
|
||||
/// @{
|
||||
|
||||
/// \brief Contains the number of required arguments for the function.
|
||||
///
|
||||
/// Naming conventions taken from https://en.wikipedia.org/wiki/Arity.
|
||||
struct ARROW_EXPORT Arity {
|
||||
/// \brief A function taking no arguments
|
||||
static Arity Nullary() { return Arity(0, false); }
|
||||
|
||||
/// \brief A function taking 1 argument
|
||||
static Arity Unary() { return Arity(1, false); }
|
||||
|
||||
/// \brief A function taking 2 arguments
|
||||
static Arity Binary() { return Arity(2, false); }
|
||||
|
||||
/// \brief A function taking 3 arguments
|
||||
static Arity Ternary() { return Arity(3, false); }
|
||||
|
||||
/// \brief A function taking a variable number of arguments
|
||||
///
|
||||
/// \param[in] min_args the minimum number of arguments required when
|
||||
/// invoking the function
|
||||
static Arity VarArgs(int min_args = 0) { return Arity(min_args, true); }
|
||||
|
||||
// NOTE: the 0-argument form (default constructor) is required for Cython
|
||||
explicit Arity(int num_args = 0, bool is_varargs = false)
|
||||
: num_args(num_args), is_varargs(is_varargs) {}
|
||||
|
||||
/// The number of required arguments (or the minimum number for varargs
|
||||
/// functions).
|
||||
int num_args;
|
||||
|
||||
/// If true, then the num_args is the minimum number of required arguments.
|
||||
bool is_varargs = false;
|
||||
};
|
||||
|
||||
struct ARROW_EXPORT FunctionDoc {
|
||||
/// \brief A one-line summary of the function, using a verb.
|
||||
///
|
||||
/// For example, "Add two numeric arrays or scalars".
|
||||
std::string summary;
|
||||
|
||||
/// \brief A detailed description of the function, meant to follow the summary.
|
||||
std::string description;
|
||||
|
||||
/// \brief Symbolic names (identifiers) for the function arguments.
|
||||
///
|
||||
/// Some bindings may use this to generate nicer function signatures.
|
||||
std::vector<std::string> arg_names;
|
||||
|
||||
// TODO add argument descriptions?
|
||||
|
||||
/// \brief Name of the options class, if any.
|
||||
std::string options_class;
|
||||
|
||||
/// \brief Whether options are required for function execution
|
||||
///
|
||||
/// If false, then either the function does not have an options class
|
||||
/// or there is a usable default options value.
|
||||
bool options_required;
|
||||
|
||||
FunctionDoc() = default;
|
||||
|
||||
FunctionDoc(std::string summary, std::string description,
|
||||
std::vector<std::string> arg_names, std::string options_class = "",
|
||||
bool options_required = false)
|
||||
: summary(std::move(summary)),
|
||||
description(std::move(description)),
|
||||
arg_names(std::move(arg_names)),
|
||||
options_class(std::move(options_class)),
|
||||
options_required(options_required) {}
|
||||
|
||||
static const FunctionDoc& Empty();
|
||||
};
|
||||
|
||||
/// \brief An executor of a function with a preconfigured kernel
|
||||
class ARROW_EXPORT FunctionExecutor {
|
||||
public:
|
||||
virtual ~FunctionExecutor() = default;
|
||||
/// \brief Initialize or re-initialize the preconfigured kernel
|
||||
///
|
||||
/// This method may be called zero or more times. Depending on how
|
||||
/// the FunctionExecutor was obtained, it may already have been initialized.
|
||||
virtual Status Init(const FunctionOptions* options = NULLPTR,
|
||||
ExecContext* exec_ctx = NULLPTR) = 0;
|
||||
/// \brief Execute the preconfigured kernel with arguments that must fit it
|
||||
///
|
||||
/// The method requires the arguments be castable to the preconfigured types.
|
||||
///
|
||||
/// \param[in] args Arguments to execute the function on
|
||||
/// \param[in] length Length of arguments batch or -1 to default it. If the
|
||||
/// function has no parameters, this determines the batch length, defaulting
|
||||
/// to 0. Otherwise, if the function is scalar, this must equal the argument
|
||||
/// batch's inferred length or be -1 to default to it. This is ignored for
|
||||
/// vector functions.
|
||||
virtual Result<Datum> Execute(const std::vector<Datum>& args, int64_t length = -1) = 0;
|
||||
};
|
||||
|
||||
/// \brief Base class for compute functions. Function implementations contain a
|
||||
/// collection of "kernels" which are implementations of the function for
|
||||
/// specific argument types. Selecting a viable kernel for executing a function
|
||||
/// is referred to as "dispatching".
|
||||
class ARROW_EXPORT Function {
|
||||
public:
|
||||
/// \brief The kind of function, which indicates in what contexts it is
|
||||
/// valid for use.
|
||||
enum Kind {
|
||||
/// A function that performs scalar data operations on whole arrays of
|
||||
/// data. Can generally process Array or Scalar values. The size of the
|
||||
/// output will be the same as the size (or broadcasted size, in the case
|
||||
/// of mixing Array and Scalar inputs) of the input.
|
||||
SCALAR,
|
||||
|
||||
/// A function with array input and output whose behavior depends on the
|
||||
/// values of the entire arrays passed, rather than the value of each scalar
|
||||
/// value.
|
||||
VECTOR,
|
||||
|
||||
/// A function that computes scalar summary statistics from array input.
|
||||
SCALAR_AGGREGATE,
|
||||
|
||||
/// A function that computes grouped summary statistics from array input
|
||||
/// and an array of group identifiers.
|
||||
HASH_AGGREGATE,
|
||||
|
||||
/// A function that dispatches to other functions and does not contain its
|
||||
/// own kernels.
|
||||
META
|
||||
};
|
||||
|
||||
virtual ~Function() = default;
|
||||
|
||||
/// \brief The name of the kernel. The registry enforces uniqueness of names.
|
||||
const std::string& name() const { return name_; }
|
||||
|
||||
/// \brief The kind of kernel, which indicates in what contexts it is valid
|
||||
/// for use.
|
||||
Function::Kind kind() const { return kind_; }
|
||||
|
||||
/// \brief Contains the number of arguments the function requires, or if the
|
||||
/// function accepts variable numbers of arguments.
|
||||
const Arity& arity() const { return arity_; }
|
||||
|
||||
/// \brief Return the function documentation
|
||||
const FunctionDoc& doc() const { return doc_; }
|
||||
|
||||
/// \brief Returns the number of registered kernels for this function.
|
||||
virtual int num_kernels() const = 0;
|
||||
|
||||
/// \brief Return a kernel that can execute the function given the exact
|
||||
/// argument types (without implicit type casts).
|
||||
///
|
||||
/// NB: This function is overridden in CastFunction.
|
||||
virtual Result<const Kernel*> DispatchExact(const std::vector<TypeHolder>& types) const;
|
||||
|
||||
/// \brief Return a best-match kernel that can execute the function given the argument
|
||||
/// types, after implicit casts are applied.
|
||||
///
|
||||
/// \param[in,out] values Argument types. An element may be modified to
|
||||
/// indicate that the returned kernel only approximately matches the input
|
||||
/// value descriptors; callers are responsible for casting inputs to the type
|
||||
/// required by the kernel.
|
||||
virtual Result<const Kernel*> DispatchBest(std::vector<TypeHolder>* values) const;
|
||||
|
||||
/// \brief Get a function executor with a best-matching kernel
|
||||
///
|
||||
/// The returned executor will by default work with the default FunctionOptions
|
||||
/// and KernelContext. If you want to change that, call `FunctionExecutor::Init`.
|
||||
virtual Result<std::shared_ptr<FunctionExecutor>> GetBestExecutor(
|
||||
std::vector<TypeHolder> inputs) const;
|
||||
|
||||
/// \brief Execute the function eagerly with the passed input arguments with
|
||||
/// kernel dispatch, batch iteration, and memory allocation details taken
|
||||
/// care of.
|
||||
///
|
||||
/// If the `options` pointer is null, then `default_options()` will be used.
|
||||
///
|
||||
/// This function can be overridden in subclasses.
|
||||
virtual Result<Datum> Execute(const std::vector<Datum>& args,
|
||||
const FunctionOptions* options, ExecContext* ctx) const;
|
||||
|
||||
virtual Result<Datum> Execute(const ExecBatch& batch, const FunctionOptions* options,
|
||||
ExecContext* ctx) const;
|
||||
|
||||
/// \brief Returns the default options for this function.
|
||||
///
|
||||
/// Whatever option semantics a Function has, implementations must guarantee
|
||||
/// that default_options() is valid to pass to Execute as options.
|
||||
const FunctionOptions* default_options() const { return default_options_; }
|
||||
|
||||
virtual Status Validate() const;
|
||||
|
||||
/// \brief Returns the pure property for this function.
|
||||
///
|
||||
/// Impure functions are those that may return different results for the same
|
||||
/// input arguments. For example, a function that returns a random number is
|
||||
/// not pure. An expression containing only pure functions can be simplified by
|
||||
/// pre-evaluating any sub-expressions that have constant arguments.
|
||||
virtual bool is_pure() const { return true; }
|
||||
|
||||
protected:
|
||||
Function(std::string name, Function::Kind kind, const Arity& arity, FunctionDoc doc,
|
||||
const FunctionOptions* default_options)
|
||||
: name_(std::move(name)),
|
||||
kind_(kind),
|
||||
arity_(arity),
|
||||
doc_(std::move(doc)),
|
||||
default_options_(default_options) {}
|
||||
|
||||
Status CheckArity(size_t num_args) const;
|
||||
|
||||
std::string name_;
|
||||
Function::Kind kind_;
|
||||
Arity arity_;
|
||||
const FunctionDoc doc_;
|
||||
const FunctionOptions* default_options_ = NULLPTR;
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
template <typename KernelType>
|
||||
class FunctionImpl : public Function {
|
||||
public:
|
||||
/// \brief Return pointers to current-available kernels for inspection
|
||||
std::vector<const KernelType*> kernels() const {
|
||||
std::vector<const KernelType*> result;
|
||||
for (const auto& kernel : kernels_) {
|
||||
result.push_back(&kernel);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
int num_kernels() const override { return static_cast<int>(kernels_.size()); }
|
||||
|
||||
protected:
|
||||
FunctionImpl(std::string name, Function::Kind kind, const Arity& arity, FunctionDoc doc,
|
||||
const FunctionOptions* default_options)
|
||||
: Function(std::move(name), kind, arity, std::move(doc), default_options) {}
|
||||
|
||||
std::vector<KernelType> kernels_;
|
||||
};
|
||||
|
||||
/// \brief Look up a kernel in a function. If no Kernel is found, nullptr is returned.
|
||||
ARROW_EXPORT
|
||||
const Kernel* DispatchExactImpl(const Function* func, const std::vector<TypeHolder>&);
|
||||
|
||||
/// \brief Return an error message if no Kernel is found.
|
||||
ARROW_EXPORT
|
||||
Status NoMatchingKernel(const Function* func, const std::vector<TypeHolder>&);
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// \brief A function that executes elementwise operations on arrays or
|
||||
/// scalars, and therefore whose results generally do not depend on the order
|
||||
/// of the values in the arguments. Accepts and returns arrays that are all of
|
||||
/// the same size. These functions roughly correspond to the functions used in
|
||||
/// SQL expressions.
|
||||
class ARROW_EXPORT ScalarFunction : public detail::FunctionImpl<ScalarKernel> {
|
||||
public:
|
||||
using KernelType = ScalarKernel;
|
||||
|
||||
ScalarFunction(std::string name, const Arity& arity, FunctionDoc doc,
|
||||
const FunctionOptions* default_options = NULLPTR, bool is_pure = true)
|
||||
: detail::FunctionImpl<ScalarKernel>(std::move(name), Function::SCALAR, arity,
|
||||
std::move(doc), default_options),
|
||||
is_pure_(is_pure) {}
|
||||
|
||||
/// \brief Add a kernel with given input/output types, no required state
|
||||
/// initialization, preallocation for fixed-width types, and default null
|
||||
/// handling (intersect validity bitmaps of inputs).
|
||||
Status AddKernel(std::vector<InputType> in_types, OutputType out_type,
|
||||
ArrayKernelExec exec, KernelInit init = NULLPTR,
|
||||
std::shared_ptr<MatchConstraint> constraint = NULLPTR);
|
||||
|
||||
/// \brief Add a kernel (function implementation). Returns error if the
|
||||
/// kernel's signature does not match the function's arity.
|
||||
Status AddKernel(ScalarKernel kernel);
|
||||
|
||||
/// \brief Returns the pure property for this function.
|
||||
bool is_pure() const override { return is_pure_; }
|
||||
|
||||
private:
|
||||
const bool is_pure_;
|
||||
};
|
||||
|
||||
/// \brief A function that executes general array operations that may yield
|
||||
/// outputs of different sizes or have results that depend on the whole array
|
||||
/// contents. These functions roughly correspond to the functions found in
|
||||
/// non-SQL array languages like APL and its derivatives.
|
||||
class ARROW_EXPORT VectorFunction : public detail::FunctionImpl<VectorKernel> {
|
||||
public:
|
||||
using KernelType = VectorKernel;
|
||||
|
||||
VectorFunction(std::string name, const Arity& arity, FunctionDoc doc,
|
||||
const FunctionOptions* default_options = NULLPTR)
|
||||
: detail::FunctionImpl<VectorKernel>(std::move(name), Function::VECTOR, arity,
|
||||
std::move(doc), default_options) {}
|
||||
|
||||
/// \brief Add a simple kernel with given input/output types, no required
|
||||
/// state initialization, no data preallocation, and no preallocation of the
|
||||
/// validity bitmap.
|
||||
Status AddKernel(std::vector<InputType> in_types, OutputType out_type,
|
||||
ArrayKernelExec exec, KernelInit init = NULLPTR);
|
||||
|
||||
/// \brief Add a kernel (function implementation). Returns error if the
|
||||
/// kernel's signature does not match the function's arity.
|
||||
Status AddKernel(VectorKernel kernel);
|
||||
};
|
||||
|
||||
class ARROW_EXPORT ScalarAggregateFunction
|
||||
: public detail::FunctionImpl<ScalarAggregateKernel> {
|
||||
public:
|
||||
using KernelType = ScalarAggregateKernel;
|
||||
|
||||
ScalarAggregateFunction(std::string name, const Arity& arity, FunctionDoc doc,
|
||||
const FunctionOptions* default_options = NULLPTR)
|
||||
: detail::FunctionImpl<ScalarAggregateKernel>(std::move(name),
|
||||
Function::SCALAR_AGGREGATE, arity,
|
||||
std::move(doc), default_options) {}
|
||||
|
||||
/// \brief Add a kernel (function implementation). Returns error if the
|
||||
/// kernel's signature does not match the function's arity.
|
||||
Status AddKernel(ScalarAggregateKernel kernel);
|
||||
};
|
||||
|
||||
class ARROW_EXPORT HashAggregateFunction
|
||||
: public detail::FunctionImpl<HashAggregateKernel> {
|
||||
public:
|
||||
using KernelType = HashAggregateKernel;
|
||||
|
||||
HashAggregateFunction(std::string name, const Arity& arity, FunctionDoc doc,
|
||||
const FunctionOptions* default_options = NULLPTR)
|
||||
: detail::FunctionImpl<HashAggregateKernel>(std::move(name),
|
||||
Function::HASH_AGGREGATE, arity,
|
||||
std::move(doc), default_options) {}
|
||||
|
||||
/// \brief Add a kernel (function implementation). Returns error if the
|
||||
/// kernel's signature does not match the function's arity.
|
||||
Status AddKernel(HashAggregateKernel kernel);
|
||||
};
|
||||
|
||||
/// \brief A function that dispatches to other functions. Must implement
|
||||
/// MetaFunction::ExecuteImpl.
|
||||
///
|
||||
/// For Array, ChunkedArray, and Scalar Datum kinds, may rely on the execution
|
||||
/// of concrete Function types, but must handle other Datum kinds on its own.
|
||||
class ARROW_EXPORT MetaFunction : public Function {
|
||||
public:
|
||||
int num_kernels() const override { return 0; }
|
||||
|
||||
Result<Datum> Execute(const std::vector<Datum>& args, const FunctionOptions* options,
|
||||
ExecContext* ctx) const override;
|
||||
|
||||
Result<Datum> Execute(const ExecBatch& batch, const FunctionOptions* options,
|
||||
ExecContext* ctx) const override;
|
||||
|
||||
protected:
|
||||
virtual Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
|
||||
const FunctionOptions* options,
|
||||
ExecContext* ctx) const = 0;
|
||||
|
||||
MetaFunction(std::string name, const Arity& arity, FunctionDoc doc,
|
||||
const FunctionOptions* default_options = NULLPTR)
|
||||
: Function(std::move(name), Function::META, arity, std::move(doc),
|
||||
default_options) {}
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,81 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// NOTE: API is EXPERIMENTAL and will change without going through a
|
||||
// deprecation cycle.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace compute {
|
||||
|
||||
/// \addtogroup compute-functions
|
||||
/// @{
|
||||
|
||||
/// \brief Extension point for defining options outside libarrow (but
|
||||
/// still within this project).
|
||||
class ARROW_EXPORT FunctionOptionsType {
|
||||
public:
|
||||
virtual ~FunctionOptionsType() = default;
|
||||
|
||||
virtual const char* type_name() const = 0;
|
||||
virtual std::string Stringify(const FunctionOptions&) const = 0;
|
||||
virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0;
|
||||
virtual Result<std::shared_ptr<Buffer>> Serialize(const FunctionOptions&) const;
|
||||
virtual Result<std::unique_ptr<FunctionOptions>> Deserialize(
|
||||
const Buffer& buffer) const;
|
||||
virtual std::unique_ptr<FunctionOptions> Copy(const FunctionOptions&) const = 0;
|
||||
};
|
||||
|
||||
/// \brief Base class for specifying options configuring a function's behavior,
|
||||
/// such as error handling.
|
||||
class ARROW_EXPORT FunctionOptions : public util::EqualityComparable<FunctionOptions> {
|
||||
public:
|
||||
virtual ~FunctionOptions() = default;
|
||||
|
||||
const FunctionOptionsType* options_type() const { return options_type_; }
|
||||
const char* type_name() const { return options_type()->type_name(); }
|
||||
|
||||
bool Equals(const FunctionOptions& other) const;
|
||||
std::string ToString() const;
|
||||
std::unique_ptr<FunctionOptions> Copy() const;
|
||||
/// \brief Serialize an options struct to a buffer.
|
||||
Result<std::shared_ptr<Buffer>> Serialize() const;
|
||||
/// \brief Deserialize an options struct from a buffer.
|
||||
/// Note: this will only look for `type_name` in the default FunctionRegistry;
|
||||
/// to use a custom FunctionRegistry, look up the FunctionOptionsType, then
|
||||
/// call FunctionOptionsType::Deserialize().
|
||||
static Result<std::unique_ptr<FunctionOptions>> Deserialize(
|
||||
const std::string& type_name, const Buffer& buffer);
|
||||
|
||||
protected:
|
||||
explicit FunctionOptions(const FunctionOptionsType* type) : options_type_(type) {}
|
||||
const FunctionOptionsType* options_type_;
|
||||
};
|
||||
|
||||
ARROW_EXPORT void PrintTo(const FunctionOptions&, std::ostream*);
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,32 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/compute/visibility.h"
|
||||
#include "arrow/status.h"
|
||||
|
||||
namespace arrow::compute {
|
||||
|
||||
/// \brief Initialize the compute module.
|
||||
///
|
||||
/// Register the compute kernel functions to be available on the
|
||||
/// global FunctionRegistry.
|
||||
/// This function will only be available if ARROW_COMPUTE is enabled.
|
||||
ARROW_COMPUTE_EXPORT Status Initialize();
|
||||
|
||||
} // namespace arrow::compute
|
||||
@@ -0,0 +1,772 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// NOTE: API is EXPERIMENTAL and will change without going through a
|
||||
// deprecation cycle
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/compute/exec.h"
|
||||
#include "arrow/datum.h"
|
||||
#include "arrow/device_allocation_type_set.h"
|
||||
#include "arrow/memory_pool.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
// macOS defines PREALLOCATE as a preprocessor macro in the header sys/vnode.h.
|
||||
// No other BSD seems to do so. The name is used as an identifier in MemAllocation enum.
|
||||
#if defined(__APPLE__) && defined(PREALLOCATE)
|
||||
# undef PREALLOCATE
|
||||
#endif
|
||||
|
||||
namespace arrow {
|
||||
namespace compute {
|
||||
|
||||
class FunctionOptions;
|
||||
|
||||
/// \brief Base class for opaque kernel-specific state. For example, if there
|
||||
/// is some kind of initialization required.
|
||||
struct ARROW_EXPORT KernelState {
|
||||
virtual ~KernelState() = default;
|
||||
};
|
||||
|
||||
/// \brief Context/state for the execution of a particular kernel.
|
||||
class ARROW_EXPORT KernelContext {
|
||||
public:
|
||||
// Can pass optional backreference; not used consistently for the
|
||||
// moment but will be made so in the future
|
||||
explicit KernelContext(ExecContext* exec_ctx, const Kernel* kernel = NULLPTR)
|
||||
: exec_ctx_(exec_ctx), kernel_(kernel) {}
|
||||
|
||||
/// \brief Allocate buffer from the context's memory pool. The contents are
|
||||
/// not initialized.
|
||||
Result<std::shared_ptr<ResizableBuffer>> Allocate(int64_t nbytes);
|
||||
|
||||
/// \brief Allocate buffer for bitmap from the context's memory pool. Like
|
||||
/// Allocate, the contents of the buffer are not initialized but the last
|
||||
/// byte is preemptively zeroed to help avoid ASAN or valgrind issues.
|
||||
Result<std::shared_ptr<ResizableBuffer>> AllocateBitmap(int64_t num_bits);
|
||||
|
||||
/// \brief Assign the active KernelState to be utilized for each stage of
|
||||
/// kernel execution. Ownership and memory lifetime of the KernelState must
|
||||
/// be minded separately.
|
||||
void SetState(KernelState* state) { state_ = state; }
|
||||
|
||||
// Set kernel that is being invoked since some kernel
|
||||
// implementations will examine the kernel state.
|
||||
void SetKernel(const Kernel* kernel) { kernel_ = kernel; }
|
||||
|
||||
KernelState* state() { return state_; }
|
||||
|
||||
/// \brief Configuration related to function execution that is to be shared
|
||||
/// across multiple kernels.
|
||||
ExecContext* exec_context() { return exec_ctx_; }
|
||||
|
||||
/// \brief The memory pool to use for allocations. For now, it uses the
|
||||
/// MemoryPool contained in the ExecContext used to create the KernelContext.
|
||||
MemoryPool* memory_pool() { return exec_ctx_->memory_pool(); }
|
||||
|
||||
const Kernel* kernel() const { return kernel_; }
|
||||
|
||||
private:
|
||||
ExecContext* exec_ctx_;
|
||||
KernelState* state_ = NULLPTR;
|
||||
const Kernel* kernel_ = NULLPTR;
|
||||
};
|
||||
|
||||
/// \brief An type-checking interface to permit customizable validation rules
|
||||
/// for use with InputType and KernelSignature. This is for scenarios where the
|
||||
/// acceptance is not an exact type instance, such as a TIMESTAMP type for a
|
||||
/// specific TimeUnit, but permitting any time zone.
|
||||
struct ARROW_EXPORT TypeMatcher {
|
||||
virtual ~TypeMatcher() = default;
|
||||
|
||||
/// \brief Return true if this matcher accepts the data type.
|
||||
virtual bool Matches(const DataType& type) const = 0;
|
||||
|
||||
/// \brief A human-interpretable string representation of what the type
|
||||
/// matcher checks for, usable when printing KernelSignature or formatting
|
||||
/// error messages.
|
||||
virtual std::string ToString() const = 0;
|
||||
|
||||
/// \brief Return true if this TypeMatcher contains the same matching rule as
|
||||
/// the other. Currently depends on RTTI.
|
||||
virtual bool Equals(const TypeMatcher& other) const = 0;
|
||||
};
|
||||
|
||||
namespace match {
|
||||
|
||||
/// \brief Match any DataType instance having the same DataType::id.
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> SameTypeId(Type::type type_id);
|
||||
|
||||
/// \brief Match any TimestampType instance having the same unit, but the time
|
||||
/// zones can be different.
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> TimestampTypeUnit(TimeUnit::type unit);
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> Time32TypeUnit(TimeUnit::type unit);
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> Time64TypeUnit(TimeUnit::type unit);
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> DurationTypeUnit(TimeUnit::type unit);
|
||||
|
||||
// \brief Match any integer type
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> Integer();
|
||||
|
||||
// Match types using 32-bit varbinary representation
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> BinaryLike();
|
||||
|
||||
// Match types using 64-bit varbinary representation
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> LargeBinaryLike();
|
||||
|
||||
// Match any fixed binary type
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> FixedSizeBinaryLike();
|
||||
|
||||
// \brief Match any primitive type (boolean or any type representable as a C
|
||||
// Type)
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> Primitive();
|
||||
|
||||
// \brief Match any integer type that can be used as run-end in run-end encoded
|
||||
// arrays
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndInteger();
|
||||
|
||||
/// \brief Match run-end encoded types that use any valid run-end type and
|
||||
/// encode specific value types
|
||||
///
|
||||
/// @param[in] value_type_matcher a matcher that is applied to the values field
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(
|
||||
std::shared_ptr<TypeMatcher> value_type_matcher);
|
||||
|
||||
/// \brief Match run-end encoded types that use any valid run-end type and
|
||||
/// encode specific value types
|
||||
///
|
||||
/// @param[in] value_type_id a type id that the type of the values field should match
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(Type::type value_type_id);
|
||||
|
||||
/// \brief Match run-end encoded types that encode specific run-end and value types
|
||||
///
|
||||
/// @param[in] run_end_type_matcher a matcher that is applied to the run_ends field
|
||||
/// @param[in] value_type_matcher a matcher that is applied to the values field
|
||||
ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(
|
||||
std::shared_ptr<TypeMatcher> run_end_type_matcher,
|
||||
std::shared_ptr<TypeMatcher> value_type_matcher);
|
||||
|
||||
} // namespace match
|
||||
|
||||
/// \brief An object used for type-checking arguments to be passed to a kernel
|
||||
/// and stored in a KernelSignature. The type-checking rule can be supplied
|
||||
/// either with an exact DataType instance or a custom TypeMatcher.
|
||||
class ARROW_EXPORT InputType {
|
||||
public:
|
||||
/// \brief The kind of type-checking rule that the InputType contains.
|
||||
enum Kind {
|
||||
/// \brief Accept any value type.
|
||||
ANY_TYPE,
|
||||
|
||||
/// \brief A fixed arrow::DataType and will only exact match having this
|
||||
/// exact type (e.g. same TimestampType unit, same decimal scale and
|
||||
/// precision, or same nested child types).
|
||||
EXACT_TYPE,
|
||||
|
||||
/// \brief Uses a TypeMatcher implementation to check the type.
|
||||
USE_TYPE_MATCHER
|
||||
};
|
||||
|
||||
/// \brief Accept any value type
|
||||
InputType() : kind_(ANY_TYPE) {}
|
||||
|
||||
/// \brief Accept an exact value type.
|
||||
InputType(std::shared_ptr<DataType> type) // NOLINT implicit construction
|
||||
: kind_(EXACT_TYPE), type_(std::move(type)) {}
|
||||
|
||||
/// \brief Use the passed TypeMatcher to type check.
|
||||
InputType(std::shared_ptr<TypeMatcher> type_matcher) // NOLINT implicit construction
|
||||
: kind_(USE_TYPE_MATCHER), type_matcher_(std::move(type_matcher)) {}
|
||||
|
||||
/// \brief Match any type with the given Type::type. Uses a TypeMatcher for
|
||||
/// its implementation.
|
||||
InputType(Type::type type_id) // NOLINT implicit construction
|
||||
: InputType(match::SameTypeId(type_id)) {}
|
||||
|
||||
InputType(const InputType& other) { CopyInto(other); }
|
||||
|
||||
void operator=(const InputType& other) { CopyInto(other); }
|
||||
|
||||
InputType(InputType&& other) { MoveInto(std::forward<InputType>(other)); }
|
||||
|
||||
void operator=(InputType&& other) { MoveInto(std::forward<InputType>(other)); }
|
||||
|
||||
// \brief Match any input (array, scalar of any type)
|
||||
static InputType Any() { return InputType(); }
|
||||
|
||||
/// \brief Return true if this input type matches the same type cases as the
|
||||
/// other.
|
||||
bool Equals(const InputType& other) const;
|
||||
|
||||
bool operator==(const InputType& other) const { return this->Equals(other); }
|
||||
|
||||
bool operator!=(const InputType& other) const { return !(*this == other); }
|
||||
|
||||
/// \brief Return hash code.
|
||||
size_t Hash() const;
|
||||
|
||||
/// \brief Render a human-readable string representation.
|
||||
std::string ToString() const;
|
||||
|
||||
/// \brief Return true if the Datum matches this argument kind in
|
||||
/// type (and only allows scalar or array-like Datums).
|
||||
bool Matches(const Datum& value) const;
|
||||
|
||||
/// \brief Return true if the type matches this InputType
|
||||
bool Matches(const DataType& type) const;
|
||||
|
||||
/// \brief The type matching rule that this InputType uses.
|
||||
Kind kind() const { return kind_; }
|
||||
|
||||
/// \brief For InputType::EXACT_TYPE kind, the exact type that this InputType
|
||||
/// must match. Otherwise this function should not be used and will assert in
|
||||
/// debug builds.
|
||||
const std::shared_ptr<DataType>& type() const;
|
||||
|
||||
/// \brief For InputType::USE_TYPE_MATCHER, the TypeMatcher to be used for
|
||||
/// checking the type of a value. Otherwise this function should not be used
|
||||
/// and will assert in debug builds.
|
||||
const TypeMatcher& type_matcher() const;
|
||||
|
||||
private:
|
||||
void CopyInto(const InputType& other) {
|
||||
this->kind_ = other.kind_;
|
||||
this->type_ = other.type_;
|
||||
this->type_matcher_ = other.type_matcher_;
|
||||
}
|
||||
|
||||
void MoveInto(InputType&& other) {
|
||||
this->kind_ = other.kind_;
|
||||
this->type_ = std::move(other.type_);
|
||||
this->type_matcher_ = std::move(other.type_matcher_);
|
||||
}
|
||||
|
||||
Kind kind_;
|
||||
|
||||
// For EXACT_TYPE Kind
|
||||
std::shared_ptr<DataType> type_;
|
||||
|
||||
// For USE_TYPE_MATCHER Kind
|
||||
std::shared_ptr<TypeMatcher> type_matcher_;
|
||||
};
|
||||
|
||||
/// \brief Container to capture both exact and input-dependent output types.
|
||||
class ARROW_EXPORT OutputType {
|
||||
public:
|
||||
/// \brief An enum indicating whether the value type is an invariant fixed
|
||||
/// value or one that's computed by a kernel-defined resolver function.
|
||||
enum ResolveKind { FIXED, COMPUTED };
|
||||
|
||||
/// Type resolution function. Given input types, return output type. This
|
||||
/// function MAY may use the kernel state to decide the output type based on
|
||||
/// the FunctionOptions.
|
||||
///
|
||||
/// This function SHOULD _not_ be used to check for arity, that is to be
|
||||
/// performed one or more layers above.
|
||||
using Resolver =
|
||||
std::function<Result<TypeHolder>(KernelContext*, const std::vector<TypeHolder>&)>;
|
||||
|
||||
/// \brief Output an exact type
|
||||
OutputType(std::shared_ptr<DataType> type) // NOLINT implicit construction
|
||||
: kind_(FIXED), type_(std::move(type)) {}
|
||||
|
||||
/// \brief Output a computed type depending on actual input types
|
||||
template <typename Fn>
|
||||
OutputType(Fn resolver) // NOLINT implicit construction
|
||||
: kind_(COMPUTED), resolver_(std::move(resolver)) {}
|
||||
|
||||
OutputType(const OutputType& other) {
|
||||
this->kind_ = other.kind_;
|
||||
this->type_ = other.type_;
|
||||
this->resolver_ = other.resolver_;
|
||||
}
|
||||
|
||||
OutputType(OutputType&& other) {
|
||||
this->kind_ = other.kind_;
|
||||
this->type_ = std::move(other.type_);
|
||||
this->resolver_ = other.resolver_;
|
||||
}
|
||||
|
||||
OutputType& operator=(const OutputType&) = default;
|
||||
OutputType& operator=(OutputType&&) = default;
|
||||
|
||||
/// \brief Return the type of the expected output value of the kernel given
|
||||
/// the input argument types. The resolver may make use of state information
|
||||
/// kept in the KernelContext.
|
||||
Result<TypeHolder> Resolve(KernelContext* ctx,
|
||||
const std::vector<TypeHolder>& args) const;
|
||||
|
||||
/// \brief The exact output value type for the FIXED kind.
|
||||
const std::shared_ptr<DataType>& type() const;
|
||||
|
||||
/// \brief For use with COMPUTED resolution strategy. It may be more
|
||||
/// convenient to invoke this with OutputType::Resolve returned from this
|
||||
/// method.
|
||||
const Resolver& resolver() const;
|
||||
|
||||
/// \brief Render a human-readable string representation.
|
||||
std::string ToString() const;
|
||||
|
||||
/// \brief Return the kind of type resolution of this output type, whether
|
||||
/// fixed/invariant or computed by a resolver.
|
||||
ResolveKind kind() const { return kind_; }
|
||||
|
||||
private:
|
||||
ResolveKind kind_;
|
||||
|
||||
// For FIXED resolution
|
||||
std::shared_ptr<DataType> type_;
|
||||
|
||||
// For COMPUTED resolution
|
||||
Resolver resolver_ = NULLPTR;
|
||||
};
|
||||
|
||||
/// \brief Additional constraints to apply to the input types of a kernel when matching a
|
||||
/// specific kernel signature.
|
||||
class ARROW_EXPORT MatchConstraint {
|
||||
public:
|
||||
virtual ~MatchConstraint() = default;
|
||||
|
||||
/// \brief Return true if the input types satisfy the constraint.
|
||||
virtual bool Matches(const std::vector<TypeHolder>& types) const = 0;
|
||||
|
||||
/// \brief Convenience function to create a MatchConstraint from a match function.
|
||||
static std::shared_ptr<MatchConstraint> Make(
|
||||
std::function<bool(const std::vector<TypeHolder>&)> matches);
|
||||
};
|
||||
|
||||
/// \brief Constraint that all input types are decimal types and have the same scale.
|
||||
ARROW_EXPORT std::shared_ptr<MatchConstraint> DecimalsHaveSameScale();
|
||||
|
||||
/// \brief Holds the input types, optional match constraint and output type of the kernel.
|
||||
///
|
||||
/// VarArgs functions with minimum N arguments should pass up to N input types to be
|
||||
/// used to validate the input types of a function invocation. The first N-1 types
|
||||
/// will be matched against the first N-1 arguments, and the last type will be
|
||||
/// matched against the remaining arguments.
|
||||
class ARROW_EXPORT KernelSignature {
|
||||
public:
|
||||
KernelSignature(std::vector<InputType> in_types, OutputType out_type,
|
||||
bool is_varargs = false,
|
||||
std::shared_ptr<MatchConstraint> constraint = NULLPTR);
|
||||
|
||||
/// \brief Convenience ctor since make_shared can be awkward
|
||||
static std::shared_ptr<KernelSignature> Make(
|
||||
std::vector<InputType> in_types, OutputType out_type, bool is_varargs = false,
|
||||
std::shared_ptr<MatchConstraint> constraint = NULLPTR);
|
||||
|
||||
/// \brief Return true if the signature is compatible with the list of input
|
||||
/// value descriptors and satisfies the match constraint, if any.
|
||||
bool MatchesInputs(const std::vector<TypeHolder>& types) const;
|
||||
|
||||
/// \brief Returns true if the input types of each signature are
|
||||
/// equal. Well-formed functions should have a deterministic output type
|
||||
/// given input types, but currently it is the responsibility of the
|
||||
/// developer to ensure this.
|
||||
bool Equals(const KernelSignature& other) const;
|
||||
|
||||
bool operator==(const KernelSignature& other) const { return this->Equals(other); }
|
||||
|
||||
bool operator!=(const KernelSignature& other) const { return !(*this == other); }
|
||||
|
||||
/// \brief Compute a hash code for the signature
|
||||
size_t Hash() const;
|
||||
|
||||
/// \brief The input types for the kernel. For VarArgs functions, this should
|
||||
/// generally contain a single validator to use for validating all of the
|
||||
/// function arguments.
|
||||
const std::vector<InputType>& in_types() const { return in_types_; }
|
||||
|
||||
/// \brief The output type for the kernel. Use Resolve to return the
|
||||
/// exact output given input argument types, since many kernels'
|
||||
/// output types depend on their input types (or their type
|
||||
/// metadata).
|
||||
const OutputType& out_type() const { return out_type_; }
|
||||
|
||||
/// \brief Render a human-readable string representation
|
||||
std::string ToString() const;
|
||||
|
||||
bool is_varargs() const { return is_varargs_; }
|
||||
|
||||
private:
|
||||
std::vector<InputType> in_types_;
|
||||
OutputType out_type_;
|
||||
bool is_varargs_;
|
||||
std::shared_ptr<MatchConstraint> constraint_;
|
||||
|
||||
// For caching the hash code after it's computed the first time
|
||||
mutable uint64_t hash_code_;
|
||||
};
|
||||
|
||||
/// \brief A function may contain multiple variants of a kernel for a given
|
||||
/// type combination for different SIMD levels. Based on the active system's
|
||||
/// CPU info or the user's preferences, we can elect to use one over the other.
|
||||
struct SimdLevel {
|
||||
enum type { NONE = 0, SSE4_2, AVX, AVX2, AVX512, NEON, MAX };
|
||||
};
|
||||
|
||||
/// \brief The strategy to use for propagating or otherwise populating the
|
||||
/// validity bitmap of a kernel output.
|
||||
struct NullHandling {
|
||||
enum type {
|
||||
/// Compute the output validity bitmap by intersecting the validity bitmaps
|
||||
/// of the arguments using bitwise-and operations. This means that values
|
||||
/// in the output are valid/non-null only if the corresponding values in
|
||||
/// all input arguments were valid/non-null. Kernel generally need not
|
||||
/// touch the bitmap thereafter, but a kernel's exec function is permitted
|
||||
/// to alter the bitmap after the null intersection is computed if it needs
|
||||
/// to.
|
||||
INTERSECTION,
|
||||
|
||||
/// Kernel expects a pre-allocated buffer to write the result bitmap
|
||||
/// into. The preallocated memory is not zeroed (except for the last byte),
|
||||
/// so the kernel should ensure to completely populate the bitmap.
|
||||
COMPUTED_PREALLOCATE,
|
||||
|
||||
/// Kernel allocates and sets the validity bitmap of the output.
|
||||
COMPUTED_NO_PREALLOCATE,
|
||||
|
||||
/// Kernel output is never null and a validity bitmap does not need to be
|
||||
/// allocated.
|
||||
OUTPUT_NOT_NULL
|
||||
};
|
||||
};
|
||||
|
||||
/// \brief The preference for memory preallocation of fixed-width type outputs
|
||||
/// in kernel execution.
|
||||
struct MemAllocation {
|
||||
enum type {
|
||||
// For data types that support pre-allocation (i.e. fixed-width), the
|
||||
// kernel expects to be provided a pre-allocated data buffer to write
|
||||
// into. Non-fixed-width types must always allocate their own data
|
||||
// buffers. The allocation made for the same length as the execution batch,
|
||||
// so vector kernels yielding differently sized output should not use this.
|
||||
//
|
||||
// It is valid for the data to not be preallocated but the validity bitmap
|
||||
// is (or is computed using the intersection/bitwise-and method).
|
||||
//
|
||||
// For variable-size output types like BinaryType or StringType, or for
|
||||
// nested types, this option has no effect.
|
||||
PREALLOCATE,
|
||||
|
||||
// The kernel is responsible for allocating its own data buffer for
|
||||
// fixed-width type outputs.
|
||||
NO_PREALLOCATE
|
||||
};
|
||||
};
|
||||
|
||||
struct Kernel;
|
||||
|
||||
/// \brief Arguments to pass to an KernelInit function. A struct is used to help
|
||||
/// avoid API breakage should the arguments passed need to be expanded.
|
||||
struct KernelInitArgs {
|
||||
/// \brief A pointer to the kernel being initialized. The init function may
|
||||
/// depend on the kernel's KernelSignature or other data contained there.
|
||||
const Kernel* kernel;
|
||||
|
||||
/// \brief The types of the input arguments that the kernel is
|
||||
/// about to be executed against.
|
||||
const std::vector<TypeHolder>& inputs;
|
||||
|
||||
/// \brief Opaque options specific to this kernel. May be nullptr for functions
|
||||
/// that do not require options.
|
||||
const FunctionOptions* options;
|
||||
};
|
||||
|
||||
/// \brief Common initializer function for all kernel types.
|
||||
using KernelInit = std::function<Result<std::unique_ptr<KernelState>>(
|
||||
KernelContext*, const KernelInitArgs&)>;
|
||||
|
||||
/// \brief Base type for kernels. Contains the function signature and
|
||||
/// optionally the state initialization function, along with some common
|
||||
/// attributes
|
||||
struct ARROW_EXPORT Kernel {
|
||||
Kernel() = default;
|
||||
|
||||
Kernel(std::shared_ptr<KernelSignature> sig, KernelInit init)
|
||||
: signature(std::move(sig)), init(std::move(init)) {}
|
||||
|
||||
Kernel(std::vector<InputType> in_types, OutputType out_type, KernelInit init)
|
||||
: Kernel(KernelSignature::Make(std::move(in_types), std::move(out_type)),
|
||||
std::move(init)) {}
|
||||
|
||||
/// \brief The "signature" of the kernel containing the InputType input
|
||||
/// argument validators and OutputType output type resolver.
|
||||
std::shared_ptr<KernelSignature> signature;
|
||||
|
||||
/// \brief Create a new KernelState for invocations of this kernel, e.g. to
|
||||
/// set up any options or state relevant for execution.
|
||||
KernelInit init;
|
||||
|
||||
/// \brief Create a vector of new KernelState for invocations of this kernel.
|
||||
static Status InitAll(KernelContext*, const KernelInitArgs&,
|
||||
std::vector<std::unique_ptr<KernelState>>*);
|
||||
|
||||
/// \brief Indicates whether execution can benefit from parallelization
|
||||
/// (splitting large chunks into smaller chunks and using multiple
|
||||
/// threads). Some kernels may not support parallel execution at
|
||||
/// all. Synchronization and concurrency-related issues are currently the
|
||||
/// responsibility of the Kernel's implementation.
|
||||
bool parallelizable = true;
|
||||
|
||||
/// \brief Indicates the level of SIMD instruction support in the host CPU is
|
||||
/// required to use the function. The intention is for functions to be able to
|
||||
/// contain multiple kernels with the same signature but different levels of SIMD,
|
||||
/// so that the most optimized kernel supported on a host's processor can be chosen.
|
||||
SimdLevel::type simd_level = SimdLevel::NONE;
|
||||
|
||||
// Additional kernel-specific data
|
||||
std::shared_ptr<KernelState> data;
|
||||
};
|
||||
|
||||
/// \brief The scalar kernel execution API that must be implemented for SCALAR
|
||||
/// kernel types. This includes both stateless and stateful kernels. Kernels
|
||||
/// depending on some execution state access that state via subclasses of
|
||||
/// KernelState set on the KernelContext object. Implementations should
|
||||
/// endeavor to write into pre-allocated memory if they are able, though for
|
||||
/// some kernels (e.g. in cases when a builder like StringBuilder) must be
|
||||
/// employed this may not be possible.
|
||||
using ArrayKernelExec = Status (*)(KernelContext*, const ExecSpan&, ExecResult*);
|
||||
|
||||
/// \brief Kernel data structure for implementations of ScalarFunction. In
|
||||
/// addition to the members found in Kernel, contains the null handling
|
||||
/// and memory pre-allocation preferences.
|
||||
struct ARROW_EXPORT ScalarKernel : public Kernel {
|
||||
ScalarKernel() = default;
|
||||
|
||||
ScalarKernel(std::shared_ptr<KernelSignature> sig, ArrayKernelExec exec,
|
||||
KernelInit init = NULLPTR)
|
||||
: Kernel(std::move(sig), init), exec(exec) {}
|
||||
|
||||
ScalarKernel(std::vector<InputType> in_types, OutputType out_type, ArrayKernelExec exec,
|
||||
KernelInit init = NULLPTR)
|
||||
: Kernel(std::move(in_types), std::move(out_type), std::move(init)), exec(exec) {}
|
||||
|
||||
/// \brief Perform a single invocation of this kernel. Depending on the
|
||||
/// implementation, it may only write into preallocated memory, while in some
|
||||
/// cases it will allocate its own memory. Any required state is managed
|
||||
/// through the KernelContext.
|
||||
ArrayKernelExec exec;
|
||||
|
||||
/// \brief Writing execution results into larger contiguous allocations
|
||||
/// requires that the kernel be able to write into sliced output ArrayData*,
|
||||
/// including sliced output validity bitmaps. Some kernel implementations may
|
||||
/// not be able to do this, so setting this to false disables this
|
||||
/// functionality.
|
||||
bool can_write_into_slices = true;
|
||||
|
||||
// For scalar functions preallocated data and intersecting arg validity
|
||||
// bitmaps is a reasonable default
|
||||
NullHandling::type null_handling = NullHandling::INTERSECTION;
|
||||
MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// VectorKernel (for VectorFunction)
|
||||
|
||||
/// \brief Kernel data structure for implementations of VectorFunction. In
|
||||
/// contains an optional finalizer function, the null handling and memory
|
||||
/// pre-allocation preferences (which have different defaults from
|
||||
/// ScalarKernel), and some other execution-related options.
|
||||
struct ARROW_EXPORT VectorKernel : public Kernel {
|
||||
/// \brief See VectorKernel::finalize member for usage
|
||||
using FinalizeFunc = std::function<Status(KernelContext*, std::vector<Datum>*)>;
|
||||
|
||||
/// \brief Function for executing a stateful VectorKernel against a
|
||||
/// ChunkedArray input. Does not need to be defined for all VectorKernels
|
||||
using ChunkedExec = Status (*)(KernelContext*, const ExecBatch&, Datum* out);
|
||||
|
||||
VectorKernel() = default;
|
||||
|
||||
VectorKernel(std::vector<InputType> in_types, OutputType out_type, ArrayKernelExec exec,
|
||||
KernelInit init = NULLPTR, FinalizeFunc finalize = NULLPTR)
|
||||
: Kernel(std::move(in_types), std::move(out_type), std::move(init)),
|
||||
exec(exec),
|
||||
finalize(std::move(finalize)) {}
|
||||
|
||||
VectorKernel(std::shared_ptr<KernelSignature> sig, ArrayKernelExec exec,
|
||||
KernelInit init = NULLPTR, FinalizeFunc finalize = NULLPTR)
|
||||
: Kernel(std::move(sig), std::move(init)),
|
||||
exec(exec),
|
||||
finalize(std::move(finalize)) {}
|
||||
|
||||
/// \brief Perform a single invocation of this kernel. Any required state is
|
||||
/// managed through the KernelContext.
|
||||
ArrayKernelExec exec;
|
||||
|
||||
/// \brief Execute the kernel on a ChunkedArray. Does not need to be defined
|
||||
ChunkedExec exec_chunked = NULLPTR;
|
||||
|
||||
/// \brief For VectorKernel, convert intermediate results into finalized
|
||||
/// results. Mutates input argument. Some kernels may accumulate state
|
||||
/// (example: hashing-related functions) through processing chunked inputs, and
|
||||
/// then need to attach some accumulated state to each of the outputs of
|
||||
/// processing each chunk of data.
|
||||
FinalizeFunc finalize;
|
||||
|
||||
/// Since vector kernels generally are implemented rather differently from
|
||||
/// scalar/elementwise kernels (and they may not even yield arrays of the same
|
||||
/// size), so we make the developer opt-in to any memory preallocation rather
|
||||
/// than having to turn it off.
|
||||
NullHandling::type null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
|
||||
MemAllocation::type mem_allocation = MemAllocation::NO_PREALLOCATE;
|
||||
|
||||
/// \brief Writing execution results into larger contiguous allocations
|
||||
/// requires that the kernel be able to write into sliced output ArrayData*,
|
||||
/// including sliced output validity bitmaps. Some kernel implementations may
|
||||
/// not be able to do this, so setting this to false disables this
|
||||
/// functionality.
|
||||
bool can_write_into_slices = true;
|
||||
|
||||
/// Some vector kernels can do chunkwise execution using ExecSpanIterator,
|
||||
/// in some cases accumulating some state. Other kernels (like Take) need to
|
||||
/// be passed whole arrays and don't work on ChunkedArray inputs
|
||||
bool can_execute_chunkwise = true;
|
||||
|
||||
/// Some kernels (like unique and value_counts) yield non-chunked output from
|
||||
/// chunked-array inputs. This option controls how the results are boxed when
|
||||
/// returned from ExecVectorFunction
|
||||
///
|
||||
/// true -> ChunkedArray
|
||||
/// false -> Array
|
||||
bool output_chunked = true;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// ScalarAggregateKernel (for ScalarAggregateFunction)
|
||||
|
||||
using ScalarAggregateConsume = Status (*)(KernelContext*, const ExecSpan&);
|
||||
using ScalarAggregateMerge = Status (*)(KernelContext*, KernelState&&, KernelState*);
|
||||
// Finalize returns Datum to permit multiple return values
|
||||
using ScalarAggregateFinalize = Status (*)(KernelContext*, Datum*);
|
||||
|
||||
/// \brief Kernel data structure for implementations of
|
||||
/// ScalarAggregateFunction. The four necessary components of an aggregation
|
||||
/// kernel are the init, consume, merge, and finalize functions.
|
||||
///
|
||||
/// * init: creates a new KernelState for a kernel.
|
||||
/// * consume: processes an ExecSpan and updates the KernelState found in the
|
||||
/// KernelContext.
|
||||
/// * merge: combines one KernelState with another.
|
||||
/// * finalize: produces the end result of the aggregation using the
|
||||
/// KernelState in the KernelContext.
|
||||
struct ARROW_EXPORT ScalarAggregateKernel : public Kernel {
|
||||
ScalarAggregateKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
|
||||
ScalarAggregateConsume consume, ScalarAggregateMerge merge,
|
||||
ScalarAggregateFinalize finalize, const bool ordered)
|
||||
: Kernel(std::move(sig), std::move(init)),
|
||||
consume(consume),
|
||||
merge(merge),
|
||||
finalize(finalize),
|
||||
ordered(ordered) {}
|
||||
|
||||
ScalarAggregateKernel(std::vector<InputType> in_types, OutputType out_type,
|
||||
KernelInit init, ScalarAggregateConsume consume,
|
||||
ScalarAggregateMerge merge, ScalarAggregateFinalize finalize,
|
||||
const bool ordered)
|
||||
: ScalarAggregateKernel(
|
||||
KernelSignature::Make(std::move(in_types), std::move(out_type)),
|
||||
std::move(init), consume, merge, finalize, ordered) {}
|
||||
|
||||
/// \brief Merge a vector of KernelStates into a single KernelState.
|
||||
/// The merged state will be returned and will be set on the KernelContext.
|
||||
static Result<std::unique_ptr<KernelState>> MergeAll(
|
||||
const ScalarAggregateKernel* kernel, KernelContext* ctx,
|
||||
std::vector<std::unique_ptr<KernelState>> states);
|
||||
|
||||
ScalarAggregateConsume consume;
|
||||
ScalarAggregateMerge merge;
|
||||
ScalarAggregateFinalize finalize;
|
||||
/// \brief Whether this kernel requires ordering
|
||||
/// Some aggregations, such as, "first", requires some kind of input order. The
|
||||
/// order can be implicit, e.g., the order of the input data, or explicit, e.g.
|
||||
/// the ordering specified with a window aggregation.
|
||||
/// The caller of the aggregate kernel is responsible for passing data in some
|
||||
/// defined order to the kernel. The flag here is a way for the kernel to tell
|
||||
/// the caller that data passed to the kernel must be defined in some order.
|
||||
bool ordered = false;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// HashAggregateKernel (for HashAggregateFunction)
|
||||
|
||||
using HashAggregateResize = Status (*)(KernelContext*, int64_t);
|
||||
using HashAggregateConsume = Status (*)(KernelContext*, const ExecSpan&);
|
||||
using HashAggregateMerge = Status (*)(KernelContext*, KernelState&&, const ArrayData&);
|
||||
|
||||
// Finalize returns Datum to permit multiple return values
|
||||
using HashAggregateFinalize = Status (*)(KernelContext*, Datum*);
|
||||
|
||||
/// \brief Kernel data structure for implementations of
|
||||
/// HashAggregateFunction. The four necessary components of an aggregation
|
||||
/// kernel are the init, consume, merge, and finalize functions.
|
||||
///
|
||||
/// * init: creates a new KernelState for a kernel.
|
||||
/// * resize: ensure that the KernelState can accommodate the specified number of groups.
|
||||
/// * consume: processes an ExecSpan (which includes the argument as well
|
||||
/// as an array of group identifiers) and updates the KernelState found in the
|
||||
/// KernelContext.
|
||||
/// * merge: combines one KernelState with another.
|
||||
/// * finalize: produces the end result of the aggregation using the
|
||||
/// KernelState in the KernelContext.
|
||||
struct ARROW_EXPORT HashAggregateKernel : public Kernel {
|
||||
HashAggregateKernel() = default;
|
||||
|
||||
HashAggregateKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
|
||||
HashAggregateResize resize, HashAggregateConsume consume,
|
||||
HashAggregateMerge merge, HashAggregateFinalize finalize,
|
||||
const bool ordered)
|
||||
: Kernel(std::move(sig), std::move(init)),
|
||||
resize(resize),
|
||||
consume(consume),
|
||||
merge(merge),
|
||||
finalize(finalize),
|
||||
ordered(ordered) {}
|
||||
|
||||
HashAggregateKernel(std::vector<InputType> in_types, OutputType out_type,
|
||||
KernelInit init, HashAggregateConsume consume,
|
||||
HashAggregateResize resize, HashAggregateMerge merge,
|
||||
HashAggregateFinalize finalize, const bool ordered)
|
||||
: HashAggregateKernel(
|
||||
KernelSignature::Make(std::move(in_types), std::move(out_type)),
|
||||
std::move(init), resize, consume, merge, finalize, ordered) {}
|
||||
|
||||
HashAggregateResize resize;
|
||||
HashAggregateConsume consume;
|
||||
HashAggregateMerge merge;
|
||||
HashAggregateFinalize finalize;
|
||||
/// @brief whether the summarizer requires ordering
|
||||
/// This is similar to ScalarAggregateKernel. See ScalarAggregateKernel
|
||||
/// for detailed doc of this variable.
|
||||
bool ordered = false;
|
||||
};
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,120 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/compare.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace compute {
|
||||
|
||||
enum class SortOrder {
|
||||
/// Arrange values in increasing order
|
||||
Ascending,
|
||||
/// Arrange values in decreasing order
|
||||
Descending,
|
||||
};
|
||||
|
||||
enum class NullPlacement {
|
||||
/// Place nulls and NaNs before any non-null values.
|
||||
/// NaNs will come after nulls.
|
||||
AtStart,
|
||||
/// Place nulls and NaNs after any non-null values.
|
||||
/// NaNs will come before nulls.
|
||||
AtEnd,
|
||||
};
|
||||
|
||||
/// \brief One sort key for PartitionNthIndices (TODO) and SortIndices
|
||||
class ARROW_EXPORT SortKey : public util::EqualityComparable<SortKey> {
|
||||
public:
|
||||
explicit SortKey(FieldRef target, SortOrder order = SortOrder::Ascending)
|
||||
: target(std::move(target)), order(order) {}
|
||||
|
||||
bool Equals(const SortKey& other) const;
|
||||
std::string ToString() const;
|
||||
|
||||
/// A FieldRef targeting the sort column.
|
||||
FieldRef target;
|
||||
/// How to order by this sort key.
|
||||
SortOrder order;
|
||||
};
|
||||
|
||||
class ARROW_EXPORT Ordering : public util::EqualityComparable<Ordering> {
|
||||
public:
|
||||
Ordering(std::vector<SortKey> sort_keys,
|
||||
NullPlacement null_placement = NullPlacement::AtStart)
|
||||
: sort_keys_(std::move(sort_keys)), null_placement_(null_placement) {}
|
||||
/// true if data ordered by other is also ordered by this
|
||||
///
|
||||
/// For example, if data is ordered by [a, b, c] then it is also ordered
|
||||
/// by [a, b] but not by [b, c] or [a, b, c, d].
|
||||
///
|
||||
/// [a, b].IsSuborderOf([a, b, c]) - true
|
||||
/// [a, b, c].IsSuborderOf([a, b, c]) - true
|
||||
/// [b, c].IsSuborderOf([a, b, c]) - false
|
||||
/// [a, b, c, d].IsSuborderOf([a, b, c]) - false
|
||||
///
|
||||
/// The implicit ordering is not a suborder of any other ordering and
|
||||
/// no other ordering is a suborder of it. The implicit ordering is not a
|
||||
/// suborder of itself.
|
||||
///
|
||||
/// The unordered ordering is a suborder of all other orderings but no
|
||||
/// other ordering is a suborder of it. The unordered ordering is a suborder
|
||||
/// of itself.
|
||||
///
|
||||
/// The unordered ordering is a suborder of the implicit ordering.
|
||||
bool IsSuborderOf(const Ordering& other) const;
|
||||
|
||||
bool Equals(const Ordering& other) const;
|
||||
std::string ToString() const;
|
||||
|
||||
bool is_implicit() const { return is_implicit_; }
|
||||
bool is_unordered() const { return !is_implicit_ && sort_keys_.empty(); }
|
||||
|
||||
const std::vector<SortKey>& sort_keys() const { return sort_keys_; }
|
||||
NullPlacement null_placement() const { return null_placement_; }
|
||||
|
||||
static const Ordering& Implicit() {
|
||||
static const Ordering kImplicit(true);
|
||||
return kImplicit;
|
||||
}
|
||||
|
||||
static const Ordering& Unordered() {
|
||||
static const Ordering kUnordered(false);
|
||||
// It is also possible to get an unordered ordering by passing in an empty vector
|
||||
// using the normal constructor. This is ok and useful when ordering comes from user
|
||||
// input.
|
||||
return kUnordered;
|
||||
}
|
||||
|
||||
private:
|
||||
explicit Ordering(bool is_implicit)
|
||||
: null_placement_(NullPlacement::AtStart), is_implicit_(is_implicit) {}
|
||||
/// Column key(s) to order by and how to order by these sort keys.
|
||||
std::vector<SortKey> sort_keys_;
|
||||
/// Whether nulls and NaNs are placed at the start or at the end
|
||||
NullPlacement null_placement_;
|
||||
bool is_implicit_ = false;
|
||||
};
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,126 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// NOTE: API is EXPERIMENTAL and will change without going through a
|
||||
// deprecation cycle
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace compute {
|
||||
|
||||
class Function;
|
||||
class FunctionOptionsType;
|
||||
|
||||
/// \brief A mutable central function registry for built-in functions as well
|
||||
/// as user-defined functions. Functions are implementations of
|
||||
/// arrow::compute::Function.
|
||||
///
|
||||
/// Generally, each function contains kernels which are implementations of a
|
||||
/// function for a specific argument signature. After looking up a function in
|
||||
/// the registry, one can either execute it eagerly with Function::Execute or
|
||||
/// use one of the function's dispatch methods to pick a suitable kernel for
|
||||
/// lower-level function execution.
|
||||
class ARROW_EXPORT FunctionRegistry {
|
||||
public:
|
||||
~FunctionRegistry();
|
||||
|
||||
/// \brief Construct a new registry.
|
||||
///
|
||||
/// Most users only need to use the global registry.
|
||||
static std::unique_ptr<FunctionRegistry> Make();
|
||||
|
||||
/// \brief Construct a new nested registry with the given parent.
|
||||
///
|
||||
/// Most users only need to use the global registry. The returned registry never changes
|
||||
/// its parent, even when an operation allows overwriting.
|
||||
static std::unique_ptr<FunctionRegistry> Make(FunctionRegistry* parent);
|
||||
|
||||
/// \brief Check whether a new function can be added to the registry.
|
||||
///
|
||||
/// \returns Status::KeyError if a function with the same name is already registered.
|
||||
Status CanAddFunction(std::shared_ptr<Function> function, bool allow_overwrite = false);
|
||||
|
||||
/// \brief Add a new function to the registry.
|
||||
///
|
||||
/// \returns Status::KeyError if a function with the same name is already registered.
|
||||
Status AddFunction(std::shared_ptr<Function> function, bool allow_overwrite = false);
|
||||
|
||||
/// \brief Check whether an alias can be added for the given function name.
|
||||
///
|
||||
/// \returns Status::KeyError if the function with the given name is not registered.
|
||||
Status CanAddAlias(const std::string& target_name, const std::string& source_name);
|
||||
|
||||
/// \brief Add alias for the given function name.
|
||||
///
|
||||
/// \returns Status::KeyError if the function with the given name is not registered.
|
||||
Status AddAlias(const std::string& target_name, const std::string& source_name);
|
||||
|
||||
/// \brief Check whether a new function options type can be added to the registry.
|
||||
///
|
||||
/// \return Status::KeyError if a function options type with the same name is already
|
||||
/// registered.
|
||||
Status CanAddFunctionOptionsType(const FunctionOptionsType* options_type,
|
||||
bool allow_overwrite = false);
|
||||
|
||||
/// \brief Add a new function options type to the registry.
|
||||
///
|
||||
/// \returns Status::KeyError if a function options type with the same name is already
|
||||
/// registered.
|
||||
Status AddFunctionOptionsType(const FunctionOptionsType* options_type,
|
||||
bool allow_overwrite = false);
|
||||
|
||||
/// \brief Retrieve a function by name from the registry.
|
||||
Result<std::shared_ptr<Function>> GetFunction(const std::string& name) const;
|
||||
|
||||
/// \brief Return vector of all entry names in the registry.
|
||||
///
|
||||
/// Helpful for displaying a manifest of available functions.
|
||||
std::vector<std::string> GetFunctionNames() const;
|
||||
|
||||
/// \brief Retrieve a function options type by name from the registry.
|
||||
Result<const FunctionOptionsType*> GetFunctionOptionsType(
|
||||
const std::string& name) const;
|
||||
|
||||
/// \brief The number of currently registered functions.
|
||||
int num_functions() const;
|
||||
|
||||
/// \brief The cast function object registered in AddFunction.
|
||||
///
|
||||
/// Helpful for get cast function as needed.
|
||||
const Function* cast_function() const;
|
||||
|
||||
private:
|
||||
FunctionRegistry();
|
||||
|
||||
// Use PIMPL pattern to not have std::unordered_map here
|
||||
class FunctionRegistryImpl;
|
||||
std::unique_ptr<FunctionRegistryImpl> impl_;
|
||||
|
||||
explicit FunctionRegistry(FunctionRegistryImpl* impl);
|
||||
};
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,198 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/kernel.h"
|
||||
#include "arrow/compute/visibility.h"
|
||||
#include "arrow/datum.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace compute {
|
||||
|
||||
/// \brief A segment
|
||||
/// A segment group is a chunk of continuous rows that have the same segment key. (For
|
||||
/// example, in ordered time series processing, segment key can be "date", and a segment
|
||||
/// group can be all the rows that belong to the same date.) A segment group can span
|
||||
/// across multiple exec batches. A segment is a chunk of continuous rows that has the
|
||||
/// same segment key within a given batch. When a segment group span cross batches, it
|
||||
/// will have multiple segments. A segment never spans cross batches. The segment data
|
||||
/// structure only makes sense when used along with a exec batch.
|
||||
struct ARROW_COMPUTE_EXPORT Segment {
|
||||
/// \brief the offset into the batch where the segment starts
|
||||
int64_t offset;
|
||||
/// \brief the length of the segment
|
||||
int64_t length;
|
||||
/// \brief whether the segment may be extended by a next one
|
||||
bool is_open;
|
||||
/// \brief whether the segment extends a preceeding one
|
||||
bool extends;
|
||||
};
|
||||
|
||||
inline bool operator==(const Segment& segment1, const Segment& segment2) {
|
||||
return segment1.offset == segment2.offset && segment1.length == segment2.length &&
|
||||
segment1.is_open == segment2.is_open && segment1.extends == segment2.extends;
|
||||
}
|
||||
inline bool operator!=(const Segment& segment1, const Segment& segment2) {
|
||||
return !(segment1 == segment2);
|
||||
}
|
||||
|
||||
/// \brief a helper class to divide a batch into segments of equal values
|
||||
///
|
||||
/// For example, given a batch with two columns specifed as segment keys:
|
||||
///
|
||||
/// A A [other columns]...
|
||||
/// A A ...
|
||||
/// A B ...
|
||||
/// A B ...
|
||||
/// A A ...
|
||||
///
|
||||
/// Then the batch could be divided into 3 segments. The first would be rows 0 & 1,
|
||||
/// the second would be rows 2 & 3, and the third would be row 4.
|
||||
///
|
||||
/// Further, a segmenter keeps track of the last value seen. This allows it to calculate
|
||||
/// segments which span batches. In our above example the last batch we emit would set
|
||||
/// the "open" flag, which indicates whether the segment may extend into the next batch.
|
||||
///
|
||||
/// If the next call to the segmenter starts with `A A` then that segment would set the
|
||||
/// "extends" flag, which indicates whether the segment continues the last open batch.
|
||||
class ARROW_COMPUTE_EXPORT RowSegmenter {
|
||||
public:
|
||||
virtual ~RowSegmenter() = default;
|
||||
|
||||
/// \brief Construct a Segmenter which segments on the specified key types
|
||||
///
|
||||
/// \param[in] key_types the specified key types
|
||||
/// \param[in] nullable_keys whether values of the specified keys may be null
|
||||
/// \param[in] ctx the execution context to use
|
||||
static Result<std::unique_ptr<RowSegmenter>> Make(
|
||||
const std::vector<TypeHolder>& key_types, bool nullable_keys, ExecContext* ctx);
|
||||
|
||||
/// \brief Return the key types of this segmenter
|
||||
virtual const std::vector<TypeHolder>& key_types() const = 0;
|
||||
|
||||
/// \brief Reset this segmenter
|
||||
///
|
||||
/// A segmenter normally extends (see `Segment`) a segment from one batch to the next.
|
||||
/// If segment-extension is undesirable, for example when each batch is processed
|
||||
/// independently, then `Reset` should be invoked before processing the next batch.
|
||||
virtual Status Reset() = 0;
|
||||
|
||||
/// \brief Get all segments for the given batch
|
||||
virtual Result<std::vector<Segment>> GetSegments(const ExecSpan& batch) = 0;
|
||||
};
|
||||
|
||||
/// Consumes batches of keys and yields batches of the group ids.
|
||||
class ARROW_COMPUTE_EXPORT Grouper {
|
||||
public:
|
||||
virtual ~Grouper() = default;
|
||||
|
||||
/// Construct a Grouper which receives the specified key types
|
||||
static Result<std::unique_ptr<Grouper>> Make(const std::vector<TypeHolder>& key_types,
|
||||
ExecContext* ctx = default_exec_context());
|
||||
|
||||
/// Reset all intermediate state, make the grouper logically as just `Make`ed.
|
||||
/// The underlying buffers, if any, may or may not be released though.
|
||||
virtual Status Reset() = 0;
|
||||
|
||||
/// Consume a batch of keys, producing the corresponding group ids as an integer array,
|
||||
/// over a slice defined by an offset and length, which defaults to the batch length.
|
||||
/// Currently only uint32 indices will be produced, eventually the bit width will only
|
||||
/// be as wide as necessary.
|
||||
virtual Result<Datum> Consume(const ExecSpan& batch, int64_t offset = 0,
|
||||
int64_t length = -1) = 0;
|
||||
|
||||
/// Like Consume, but groups not already encountered emit null instead of
|
||||
/// generating a new group id.
|
||||
virtual Result<Datum> Lookup(const ExecSpan& batch, int64_t offset = 0,
|
||||
int64_t length = -1) = 0;
|
||||
|
||||
/// Like Consume, but only populates the Grouper without returning the group ids.
|
||||
virtual Status Populate(const ExecSpan& batch, int64_t offset = 0,
|
||||
int64_t length = -1) = 0;
|
||||
|
||||
/// Get current unique keys. May be called multiple times.
|
||||
virtual Result<ExecBatch> GetUniques() = 0;
|
||||
|
||||
/// Get the current number of groups.
|
||||
virtual uint32_t num_groups() const = 0;
|
||||
|
||||
/// \brief Assemble lists of indices of identical elements.
|
||||
///
|
||||
/// \param[in] ids An unsigned, all-valid integral array which will be
|
||||
/// used as grouping criteria.
|
||||
/// \param[in] num_groups An upper bound for the elements of ids
|
||||
/// \param[in] ctx Execution context to use during the operation
|
||||
/// \return A num_groups-long ListArray where the slot at i contains a
|
||||
/// list of indices where i appears in ids.
|
||||
///
|
||||
/// MakeGroupings([
|
||||
/// 2,
|
||||
/// 2,
|
||||
/// 5,
|
||||
/// 5,
|
||||
/// 2,
|
||||
/// 3
|
||||
/// ], 8) == [
|
||||
/// [],
|
||||
/// [],
|
||||
/// [0, 1, 4],
|
||||
/// [5],
|
||||
/// [],
|
||||
/// [2, 3],
|
||||
/// [],
|
||||
/// []
|
||||
/// ]
|
||||
static Result<std::shared_ptr<ListArray>> MakeGroupings(
|
||||
const UInt32Array& ids, uint32_t num_groups,
|
||||
ExecContext* ctx = default_exec_context());
|
||||
|
||||
/// \brief Produce a ListArray whose slots are selections of `array` which correspond to
|
||||
/// the provided groupings.
|
||||
///
|
||||
/// For example,
|
||||
/// ApplyGroupings([
|
||||
/// [],
|
||||
/// [],
|
||||
/// [0, 1, 4],
|
||||
/// [5],
|
||||
/// [],
|
||||
/// [2, 3],
|
||||
/// [],
|
||||
/// []
|
||||
/// ], [2, 2, 5, 5, 2, 3]) == [
|
||||
/// [],
|
||||
/// [],
|
||||
/// [2, 2, 2],
|
||||
/// [3],
|
||||
/// [],
|
||||
/// [5, 5],
|
||||
/// [],
|
||||
/// []
|
||||
/// ]
|
||||
static Result<std::shared_ptr<ListArray>> ApplyGroupings(
|
||||
const ListArray& groupings, const Array& array,
|
||||
ExecContext* ctx = default_exec_context());
|
||||
};
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,59 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
struct Datum;
|
||||
struct TypeHolder;
|
||||
|
||||
namespace compute {
|
||||
|
||||
class Function;
|
||||
class ScalarAggregateFunction;
|
||||
class FunctionExecutor;
|
||||
class FunctionOptions;
|
||||
class FunctionRegistry;
|
||||
|
||||
/// \brief Return the process-global function registry.
|
||||
// Defined in registry.cc
|
||||
ARROW_EXPORT FunctionRegistry* GetFunctionRegistry();
|
||||
|
||||
class CastOptions;
|
||||
|
||||
struct ExecBatch;
|
||||
class ExecContext;
|
||||
struct ExecValue;
|
||||
class KernelContext;
|
||||
|
||||
struct Kernel;
|
||||
struct ScalarKernel;
|
||||
struct ScalarAggregateKernel;
|
||||
struct VectorKernel;
|
||||
|
||||
struct KernelState;
|
||||
|
||||
class Expression;
|
||||
|
||||
ARROW_EXPORT ExecContext* default_exec_context();
|
||||
ARROW_EXPORT ExecContext* threaded_exec_context();
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,221 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/expression.h"
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
#include "arrow/compute/visibility.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/util/cpu_info.h"
|
||||
#include "arrow/util/simd.h"
|
||||
|
||||
#if defined(__clang__) || defined(__GNUC__)
|
||||
# define BYTESWAP(x) __builtin_bswap64(x)
|
||||
# define ROTL(x, n) (((x) << (n)) | ((x) >> ((-n) & 31)))
|
||||
# define ROTL64(x, n) (((x) << (n)) | ((x) >> ((-n) & 63)))
|
||||
#elif defined(_MSC_VER)
|
||||
# include <intrin.h>
|
||||
# define BYTESWAP(x) _byteswap_uint64(x)
|
||||
# define ROTL(x, n) _rotl((x), (n))
|
||||
# define ROTL64(x, n) _rotl64((x), (n))
|
||||
#endif
|
||||
|
||||
namespace arrow {
|
||||
namespace util {
|
||||
|
||||
// Some platforms typedef int64_t as long int instead of long long int,
|
||||
// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics
|
||||
// which need long long.
|
||||
// We use the cast to the type below in these intrinsics to make the code
|
||||
// compile in all cases.
|
||||
//
|
||||
using int64_for_gather_t = const long long int; // NOLINT runtime-int
|
||||
|
||||
// All MiniBatch... classes use TempVectorStack for vector allocations and can
|
||||
// only work with vectors up to 1024 elements.
|
||||
//
|
||||
// They should only be allocated on the stack to guarantee the right sequence
|
||||
// of allocation and deallocation of vectors from TempVectorStack.
|
||||
//
|
||||
class MiniBatch {
|
||||
public:
|
||||
static constexpr int kLogMiniBatchLength = 10;
|
||||
static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength;
|
||||
};
|
||||
|
||||
namespace bit_util {
|
||||
|
||||
ARROW_COMPUTE_EXPORT void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
|
||||
const int num_bits, const uint8_t* bits,
|
||||
int* num_indexes, uint16_t* indexes,
|
||||
int bit_offset = 0);
|
||||
|
||||
ARROW_COMPUTE_EXPORT void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
|
||||
const int num_bits, const uint8_t* bits,
|
||||
const uint16_t* input_indexes,
|
||||
int* num_indexes, uint16_t* indexes,
|
||||
int bit_offset = 0);
|
||||
|
||||
// Input and output indexes may be pointing to the same data (in-place filtering).
|
||||
ARROW_COMPUTE_EXPORT void bits_split_indexes(int64_t hardware_flags, const int num_bits,
|
||||
const uint8_t* bits, int* num_indexes_bit0,
|
||||
uint16_t* indexes_bit0,
|
||||
uint16_t* indexes_bit1, int bit_offset = 0);
|
||||
|
||||
// Bit 1 is replaced with byte 0xFF.
|
||||
ARROW_COMPUTE_EXPORT void bits_to_bytes(int64_t hardware_flags, const int num_bits,
|
||||
const uint8_t* bits, uint8_t* bytes,
|
||||
int bit_offset = 0);
|
||||
|
||||
// Return highest bit of each byte.
|
||||
ARROW_COMPUTE_EXPORT void bytes_to_bits(int64_t hardware_flags, const int num_bits,
|
||||
const uint8_t* bytes, uint8_t* bits,
|
||||
int bit_offset = 0);
|
||||
|
||||
ARROW_COMPUTE_EXPORT bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
|
||||
uint32_t num_bytes);
|
||||
|
||||
#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
|
||||
// The functions below use BMI2 instructions, be careful before calling!
|
||||
|
||||
namespace avx2 {
|
||||
ARROW_COMPUTE_EXPORT void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
|
||||
const uint8_t* bits,
|
||||
const uint16_t* input_indexes,
|
||||
int* num_indexes, uint16_t* indexes);
|
||||
ARROW_COMPUTE_EXPORT void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
|
||||
const uint8_t* bits, int* num_indexes,
|
||||
uint16_t* indexes,
|
||||
uint16_t base_index = 0);
|
||||
ARROW_COMPUTE_EXPORT void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits,
|
||||
uint8_t* bytes);
|
||||
ARROW_COMPUTE_EXPORT void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes,
|
||||
uint8_t* bits);
|
||||
ARROW_COMPUTE_EXPORT bool are_all_bytes_zero_avx2(const uint8_t* bytes,
|
||||
uint32_t num_bytes);
|
||||
} // namespace avx2
|
||||
|
||||
#endif
|
||||
|
||||
} // namespace bit_util
|
||||
} // namespace util
|
||||
|
||||
namespace compute {
|
||||
|
||||
/// Modify an Expression with pre-order and post-order visitation.
|
||||
/// `pre` will be invoked on each Expression. `pre` will visit Calls before their
|
||||
/// arguments, `post_call` will visit Calls (and no other Expressions) after their
|
||||
/// arguments. Visitors should return the Identical expression to indicate no change; this
|
||||
/// will prevent unnecessary construction in the common case where a modification is not
|
||||
/// possible/necessary/...
|
||||
///
|
||||
/// If an argument was modified, `post_call` visits a reconstructed Call with the modified
|
||||
/// arguments but also receives a pointer to the unmodified Expression as a second
|
||||
/// argument. If no arguments were modified the unmodified Expression* will be nullptr.
|
||||
template <typename PreVisit, typename PostVisitCall>
|
||||
Result<Expression> ModifyExpression(Expression expr, const PreVisit& pre,
|
||||
const PostVisitCall& post_call) {
|
||||
ARROW_ASSIGN_OR_RAISE(expr, Result<Expression>(pre(std::move(expr))));
|
||||
|
||||
auto call = expr.call();
|
||||
if (!call) return expr;
|
||||
|
||||
bool at_least_one_modified = false;
|
||||
std::vector<Expression> modified_arguments;
|
||||
|
||||
for (size_t i = 0; i < call->arguments.size(); ++i) {
|
||||
ARROW_ASSIGN_OR_RAISE(auto modified_argument,
|
||||
ModifyExpression(call->arguments[i], pre, post_call));
|
||||
|
||||
if (Expression::Identical(modified_argument, call->arguments[i])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!at_least_one_modified) {
|
||||
modified_arguments = call->arguments;
|
||||
at_least_one_modified = true;
|
||||
}
|
||||
|
||||
modified_arguments[i] = std::move(modified_argument);
|
||||
}
|
||||
|
||||
if (at_least_one_modified) {
|
||||
// reconstruct the call expression with the modified arguments
|
||||
auto modified_call = *call;
|
||||
modified_call.arguments = std::move(modified_arguments);
|
||||
return post_call(Expression(std::move(modified_call)), &expr);
|
||||
}
|
||||
|
||||
return post_call(std::move(expr), NULLPTR);
|
||||
}
|
||||
|
||||
// Helper class to calculate the modified number of rows to process using SIMD.
|
||||
//
|
||||
// Some array elements at the end will be skipped in order to avoid buffer
|
||||
// overrun, when doing memory loads and stores using larger word size than a
|
||||
// single array element.
|
||||
//
|
||||
class TailSkipForSIMD {
|
||||
public:
|
||||
static int64_t FixBitAccess(int num_bytes_accessed_together, int64_t num_rows,
|
||||
int bit_offset) {
|
||||
int64_t num_bytes = bit_util::BytesForBits(num_rows + bit_offset);
|
||||
int64_t num_bytes_safe =
|
||||
std::max(static_cast<int64_t>(0LL), num_bytes - num_bytes_accessed_together + 1);
|
||||
int64_t num_rows_safe =
|
||||
std::max(static_cast<int64_t>(0LL), 8 * num_bytes_safe - bit_offset);
|
||||
return std::min(num_rows_safe, num_rows);
|
||||
}
|
||||
static int64_t FixBinaryAccess(int num_bytes_accessed_together, int64_t num_rows,
|
||||
int64_t length) {
|
||||
int64_t num_rows_to_skip = bit_util::CeilDiv(length, num_bytes_accessed_together);
|
||||
int64_t num_rows_safe =
|
||||
std::max(static_cast<int64_t>(0LL), num_rows - num_rows_to_skip);
|
||||
return num_rows_safe;
|
||||
}
|
||||
static int64_t FixVarBinaryAccess(int num_bytes_accessed_together, int64_t num_rows,
|
||||
const uint32_t* offsets) {
|
||||
// Do not process rows that could read past the end of the buffer using N
|
||||
// byte loads/stores.
|
||||
//
|
||||
int64_t num_rows_safe = num_rows;
|
||||
while (num_rows_safe > 0 &&
|
||||
offsets[num_rows_safe] + num_bytes_accessed_together > offsets[num_rows]) {
|
||||
--num_rows_safe;
|
||||
}
|
||||
return num_rows_safe;
|
||||
}
|
||||
static int FixSelection(int64_t num_rows_safe, int num_selected,
|
||||
const uint16_t* selection) {
|
||||
int num_selected_safe = num_selected;
|
||||
while (num_selected_safe > 0 && selection[num_selected_safe - 1] >= num_rows_safe) {
|
||||
--num_selected_safe;
|
||||
}
|
||||
return num_selected_safe;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace compute
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,49 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#if defined(_WIN32) || defined(__CYGWIN__)
|
||||
# if defined(_MSC_VER)
|
||||
# pragma warning(push)
|
||||
# pragma warning(disable : 4251)
|
||||
# else
|
||||
# pragma GCC diagnostic ignored "-Wattributes"
|
||||
# endif
|
||||
|
||||
# ifdef ARROW_COMPUTE_STATIC
|
||||
# define ARROW_COMPUTE_EXPORT
|
||||
# elif defined(ARROW_COMPUTE_EXPORTING)
|
||||
# define ARROW_COMPUTE_EXPORT __declspec(dllexport)
|
||||
# else
|
||||
# define ARROW_COMPUTE_EXPORT __declspec(dllimport)
|
||||
# endif
|
||||
|
||||
# define ARROW_COMPUTE_NO_EXPORT
|
||||
|
||||
# if defined(_MSC_VER)
|
||||
# pragma warning(pop)
|
||||
# endif
|
||||
|
||||
#else // Not Windows
|
||||
# ifndef ARROW_COMPUTE_EXPORT
|
||||
# define ARROW_COMPUTE_EXPORT __attribute__((visibility("default")))
|
||||
# endif
|
||||
# ifndef ARROW_COMPUTE_NO_EXPORT
|
||||
# define ARROW_COMPUTE_NO_EXPORT __attribute__((visibility("hidden")))
|
||||
# endif
|
||||
#endif
|
||||
Reference in New Issue
Block a user