Initial commit
This commit is contained in:
@@ -0,0 +1,19 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
# NOTE(review): arrow_install_all_headers() is defined elsewhere in the build;
# presumably it installs every header found under arrow/python — confirm.
arrow_install_all_headers("arrow/python")
|
||||
# Process the CMakeLists.txt found in the vendored/ subdirectory.
add_subdirectory(vendored)
|
||||
@@ -0,0 +1,29 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/python/arrow_to_pandas.h"
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/python/datetime.h"
|
||||
#include "arrow/python/helpers.h"
|
||||
#include "arrow/python/inference.h"
|
||||
#include "arrow/python/io.h"
|
||||
#include "arrow/python/numpy_convert.h"
|
||||
#include "arrow/python/numpy_to_arrow.h"
|
||||
#include "arrow/python/python_to_arrow.h"
|
||||
#include "arrow/python/util.h"
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,146 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Functions for converting between pandas's NumPy-based data representation
|
||||
// and Arrow data structures
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/python/platform.h"
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_set>
|
||||
|
||||
#include "arrow/memory_pool.h"
|
||||
#include "arrow/python/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Array;
|
||||
class ChunkedArray;
|
||||
class Column;
|
||||
class DataType;
|
||||
class MemoryPool;
|
||||
class Status;
|
||||
class Table;
|
||||
|
||||
namespace py {
|
||||
|
||||
/// \brief Strategy used when converting Arrow Map arrays for pandas.
enum class MapConversionType {
  /// Convert Arrow maps to association lists (lists of key-value tuples) in pandas.
  DEFAULT = 0,
  /// Report warnings when lossiness is encountered due to duplicate keys.
  LOSSY = 1,
  /// Raise a Python exception when lossiness is encountered due to duplicate keys.
  STRICT_ = 2,
};
|
||||
|
||||
struct PandasOptions {
|
||||
/// arrow::MemoryPool to use for memory allocations
|
||||
MemoryPool* pool = default_memory_pool();
|
||||
|
||||
/// If true, we will convert all string columns to categoricals
|
||||
bool strings_to_categorical = false;
|
||||
bool zero_copy_only = false;
|
||||
bool integer_object_nulls = false;
|
||||
bool date_as_object = false;
|
||||
bool timestamp_as_object = false;
|
||||
bool use_threads = false;
|
||||
|
||||
/// Coerce all date and timestamp to datetime64[ns]
|
||||
bool coerce_temporal_nanoseconds = false;
|
||||
|
||||
/// Used to maintain backwards compatibility for
|
||||
/// timezone bugs (see ARROW-9528). Should be removed
|
||||
/// after Arrow 2.0 release.
|
||||
bool ignore_timezone = false;
|
||||
|
||||
/// \brief If true, do not create duplicate PyObject versions of equal
|
||||
/// objects. This only applies to immutable objects like strings or datetime
|
||||
/// objects
|
||||
bool deduplicate_objects = false;
|
||||
|
||||
/// \brief For certain data types, a cast is needed in order to store the
|
||||
/// data in a pandas DataFrame or Series (e.g. timestamps are always stored
|
||||
/// as nanoseconds in pandas). This option controls whether it is a safe
|
||||
/// cast or not.
|
||||
bool safe_cast = true;
|
||||
|
||||
/// \brief If true, create one block per column rather than consolidated
|
||||
/// blocks (1 per data type). Do zero-copy wrapping when there are no
|
||||
/// nulls. pandas currently will consolidate the blocks on its own, causing
|
||||
/// increased memory use, so keep this in mind if you are working on a
|
||||
/// memory-constrained situation.
|
||||
bool split_blocks = false;
|
||||
|
||||
/// \brief If true, allow non-writable zero-copy views to be created for
|
||||
/// single column blocks. This option is also used to provide zero copy for
|
||||
/// Series data
|
||||
bool allow_zero_copy_blocks = false;
|
||||
|
||||
/// \brief If true, attempt to deallocate buffers in passed Arrow object if
|
||||
/// it is the only remaining shared_ptr copy of it. See ARROW-3789 for
|
||||
/// original context for this feature. Only currently implemented for Table
|
||||
/// conversions
|
||||
bool self_destruct = false;
|
||||
|
||||
/// \brief The default behavior (DEFAULT), is to convert Arrow Map arrays to
|
||||
/// Python association lists (list-of-tuples) in the same order as the Arrow
|
||||
/// Map, as in [(key1, value1), (key2, value2), ...]
|
||||
/// If LOSSY or STRICT, convert Arrow Map arrays to native Python dicts.
|
||||
/// This can change the ordering of (key, value) pairs, and will deduplicate
|
||||
/// multiple keys, resulting in a possible loss of data.
|
||||
/// If 'lossy', this key deduplication results in a warning printed
|
||||
/// when detected. If 'strict', this instead results in an exception
|
||||
/// being raised when detected.
|
||||
MapConversionType maps_as_pydicts = MapConversionType::DEFAULT;
|
||||
|
||||
// Used internally for nested arrays.
|
||||
bool decode_dictionaries = false;
|
||||
|
||||
// Columns that should be casted to categorical
|
||||
std::unordered_set<std::string> categorical_columns;
|
||||
|
||||
// Columns that should be passed through to be converted to
|
||||
// ExtensionArray/Block
|
||||
std::unordered_set<std::string> extension_columns;
|
||||
|
||||
// Used internally to decipher between to_numpy() and to_pandas() when
|
||||
// the expected output differs
|
||||
bool to_numpy = false;
|
||||
};
|
||||
|
||||
// Convert a single Array for pandas according to `options`.
//
// The outcome is reported through the returned Status; the converted Python
// object is handed back through `out`.
// NOTE(review): the role of `py_ref` is not visible from this header —
// presumably the Python object that keeps the Arrow data alive for zero-copy
// results; confirm against the implementation.
ARROW_PYTHON_EXPORT
|
||||
Status ConvertArrayToPandas(const PandasOptions& options, std::shared_ptr<Array> arr,
|
||||
PyObject* py_ref, PyObject** out);
|
||||
|
||||
// Convert a ChunkedArray for pandas according to `options`.
//
// The outcome is reported through the returned Status; the converted Python
// object is handed back through `out`.
// NOTE(review): the role of `py_ref` is not visible from this header —
// presumably the Python object that keeps the Arrow data alive for zero-copy
// results; confirm against the implementation.
ARROW_PYTHON_EXPORT
|
||||
Status ConvertChunkedArrayToPandas(const PandasOptions& options,
|
||||
std::shared_ptr<ChunkedArray> col, PyObject* py_ref,
|
||||
PyObject** out);
|
||||
|
||||
// Convert a whole table as efficiently as possible to a pandas.DataFrame.
|
||||
//
|
||||
// The returned Python object is a list of tuples consisting of the exact 2D
|
||||
// BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x.
|
||||
//
|
||||
// tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2])
|
||||
//
// The outcome is reported through the returned Status; the Python object
// described above is handed back through `out`.
// NOTE(review): PandasOptions documents `self_destruct` as applying to Table
// conversions, which is presumably why `table` is taken by value — confirm.
ARROW_PYTHON_EXPORT
|
||||
Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr<Table> table,
|
||||
PyObject** out);
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,49 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/array.h"
|
||||
#include "arrow/python/platform.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
namespace internal {
|
||||
// TODO(ARROW-12976): See if we can refactor Pandas ObjectWriter logic
|
||||
// to the .cc file and move this there as well if we can.
|
||||
|
||||
// Converts array to a sequency of python objects.
|
||||
template <typename ArrayType, typename WriteValue, typename Assigner>
|
||||
inline Status WriteArrayObjects(const ArrayType& arr, WriteValue&& write_func,
|
||||
Assigner out_values) {
|
||||
// TODO(ARROW-12976): Use visitor here?
|
||||
const bool has_nulls = arr.null_count() > 0;
|
||||
for (int64_t i = 0; i < arr.length(); ++i) {
|
||||
if (has_nulls && arr.IsNull(i)) {
|
||||
Py_INCREF(Py_None);
|
||||
*out_values = Py_None;
|
||||
} else {
|
||||
RETURN_NOT_OK(write_func(arr.GetView(i), out_values));
|
||||
}
|
||||
++out_values;
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,60 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/future.h"
|
||||
|
||||
namespace arrow::py {
|
||||
|
||||
/// \brief Bind a Python callback to an arrow::Future.
|
||||
///
|
||||
/// If the Future finishes successfully, py_wrapper is called with its
|
||||
/// result value and should return a PyObject*. If py_wrapper is successful,
|
||||
/// py_cb is called with its return value.
|
||||
///
|
||||
/// If either the Future or py_wrapper fails, py_cb is called with the
|
||||
/// associated Python exception.
|
||||
///
|
||||
/// \param future The future to bind to.
|
||||
/// \param py_cb The Python callback function. Will be passed the result of
|
||||
/// py_wrapper, or a Python exception if the future failed or one was
|
||||
/// raised by py_wrapper.
|
||||
/// \param py_wrapper A function (likely defined in Cython) to convert the C++
|
||||
/// result of the future to a Python object.
|
||||
template <typename T, typename PyWrapper = PyObject* (*)(T)>
|
||||
void BindFuture(Future<T> future, PyObject* py_cb, PyWrapper py_wrapper) {
|
||||
Py_INCREF(py_cb);
|
||||
OwnedRefNoGIL cb_ref(py_cb);
|
||||
|
||||
auto future_cb = [cb_ref = std::move(cb_ref),
|
||||
py_wrapper = std::move(py_wrapper)](Result<T> result) {
|
||||
SafeCallIntoPythonVoid([&]() {
|
||||
OwnedRef py_value_or_exc{WrapResult(std::move(result), std::move(py_wrapper))};
|
||||
Py_XDECREF(
|
||||
PyObject_CallFunctionObjArgs(cb_ref.obj(), py_value_or_exc.obj(), NULLPTR));
|
||||
ARROW_WARN_NOT_OK(CheckPyError(), "Internal error in async call");
|
||||
});
|
||||
};
|
||||
future.AddCallback(std::move(future_cb));
|
||||
}
|
||||
|
||||
} // namespace arrow::py
|
||||
@@ -0,0 +1,38 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "arrow/python/benchmark.h"
|
||||
#include "arrow/python/helpers.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
namespace benchmark {
|
||||
|
||||
// Calls internal::PandasObjectIsNull() on each element of `list`, discarding
// the results. If `list` is not exactly a Python list, a TypeError is set and
// nothing is executed.
void Benchmark_PandasObjectIsNull(PyObject* list) {
  if (PyList_CheckExact(list)) {
    const Py_ssize_t num_items = PyList_GET_SIZE(list);
    for (Py_ssize_t idx = 0; idx < num_items; ++idx) {
      internal::PandasObjectIsNull(PyList_GET_ITEM(list, idx));
    }
    return;
  }
  PyErr_SetString(PyExc_TypeError, "expected a list");
}
|
||||
|
||||
} // namespace benchmark
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,36 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/python/platform.h"
|
||||
|
||||
#include "arrow/python/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
namespace benchmark {
|
||||
|
||||
// Micro-benchmark routines for use from ASV
|
||||
|
||||
// Run PandasObjectIsNull() once over every object in *list*
|
||||
//
// The individual results are discarded. If *list* is not exactly a Python
// list, a TypeError is set and nothing is run.
ARROW_PYTHON_EXPORT
|
||||
void Benchmark_PandasObjectIsNull(PyObject* list);
|
||||
|
||||
} // namespace benchmark
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,246 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "arrow/python/common.h"
|
||||
|
||||
#include <cstdlib>
|
||||
#include <mutex>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include "arrow/memory_pool.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/logging.h"
|
||||
|
||||
#include "arrow/python/helpers.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using internal::checked_cast;
|
||||
|
||||
namespace py {
|
||||
|
||||
static std::mutex memory_pool_mutex;
|
||||
static MemoryPool* default_python_pool = nullptr;
|
||||
|
||||
void set_default_memory_pool(MemoryPool* pool) {
|
||||
std::lock_guard<std::mutex> guard(memory_pool_mutex);
|
||||
default_python_pool = pool;
|
||||
}
|
||||
|
||||
// Returns the pool installed with set_default_memory_pool(), or
// default_memory_pool() when none (or nullptr) has been installed.
MemoryPool* get_memory_pool() {
  const std::lock_guard<std::mutex> lock(memory_pool_mutex);
  return default_python_pool != nullptr ? default_python_pool : default_memory_pool();
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// PythonErrorDetail
|
||||
|
||||
namespace {
|
||||
|
||||
const char kErrorDetailTypeId[] = "arrow::py::PythonErrorDetail";
|
||||
|
||||
// Try to match the Python exception type with an appropriate Status code
|
||||
StatusCode MapPyError(PyObject* exc_type) {
|
||||
StatusCode code;
|
||||
|
||||
if (PyErr_GivenExceptionMatches(exc_type, PyExc_MemoryError)) {
|
||||
code = StatusCode::OutOfMemory;
|
||||
} else if (PyErr_GivenExceptionMatches(exc_type, PyExc_IndexError)) {
|
||||
code = StatusCode::IndexError;
|
||||
} else if (PyErr_GivenExceptionMatches(exc_type, PyExc_KeyError)) {
|
||||
code = StatusCode::KeyError;
|
||||
} else if (PyErr_GivenExceptionMatches(exc_type, PyExc_TypeError)) {
|
||||
code = StatusCode::TypeError;
|
||||
} else if (PyErr_GivenExceptionMatches(exc_type, PyExc_ValueError) ||
|
||||
PyErr_GivenExceptionMatches(exc_type, PyExc_OverflowError)) {
|
||||
code = StatusCode::Invalid;
|
||||
} else if (PyErr_GivenExceptionMatches(exc_type, PyExc_EnvironmentError)) {
|
||||
code = StatusCode::IOError;
|
||||
} else if (PyErr_GivenExceptionMatches(exc_type, PyExc_NotImplementedError)) {
|
||||
code = StatusCode::NotImplemented;
|
||||
} else {
|
||||
code = StatusCode::UnknownError;
|
||||
}
|
||||
return code;
|
||||
}
|
||||
|
||||
// PythonErrorDetail indicates a Python exception was raised.
|
||||
class PythonErrorDetail : public StatusDetail {
|
||||
public:
|
||||
const char* type_id() const override { return kErrorDetailTypeId; }
|
||||
|
||||
std::string ToString() const override {
|
||||
// This is simple enough not to need the GIL
|
||||
Result<std::string> result = FormatImpl();
|
||||
|
||||
if (result.ok()) {
|
||||
return result.ValueOrDie();
|
||||
} else {
|
||||
// Fallback to just the exception type
|
||||
const auto ty = reinterpret_cast<const PyTypeObject*>(exc_type_.obj());
|
||||
return std::string("Python exception: ") + ty->tp_name;
|
||||
}
|
||||
}
|
||||
|
||||
void RestorePyError() const {
|
||||
Py_INCREF(exc_type_.obj());
|
||||
Py_INCREF(exc_value_.obj());
|
||||
Py_INCREF(exc_traceback_.obj());
|
||||
PyErr_Restore(exc_type_.obj(), exc_value_.obj(), exc_traceback_.obj());
|
||||
}
|
||||
|
||||
PyObject* exc_type() const { return exc_type_.obj(); }
|
||||
|
||||
PyObject* exc_value() const { return exc_value_.obj(); }
|
||||
|
||||
static std::shared_ptr<PythonErrorDetail> FromPyError() {
|
||||
PyObject* exc_type = nullptr;
|
||||
PyObject* exc_value = nullptr;
|
||||
PyObject* exc_traceback = nullptr;
|
||||
|
||||
PyErr_Fetch(&exc_type, &exc_value, &exc_traceback);
|
||||
PyErr_NormalizeException(&exc_type, &exc_value, &exc_traceback);
|
||||
ARROW_CHECK(exc_type)
|
||||
<< "PythonErrorDetail::FromPyError called without a Python error set";
|
||||
ARROW_DCHECK(PyType_Check(exc_type));
|
||||
ARROW_DCHECK(exc_value); // Ensured by PyErr_NormalizeException, double-check
|
||||
if (exc_traceback == nullptr) {
|
||||
// Needed by PyErr_Restore()
|
||||
Py_INCREF(Py_None);
|
||||
exc_traceback = Py_None;
|
||||
}
|
||||
|
||||
std::shared_ptr<PythonErrorDetail> detail(new PythonErrorDetail);
|
||||
detail->exc_type_.reset(exc_type);
|
||||
detail->exc_value_.reset(exc_value);
|
||||
detail->exc_traceback_.reset(exc_traceback);
|
||||
return detail;
|
||||
}
|
||||
|
||||
protected:
|
||||
Result<std::string> FormatImpl() const {
|
||||
PyAcquireGIL lock;
|
||||
|
||||
// Use traceback.format_exception()
|
||||
OwnedRef traceback_module;
|
||||
RETURN_NOT_OK(internal::ImportModule("traceback", &traceback_module));
|
||||
|
||||
OwnedRef fmt_exception;
|
||||
RETURN_NOT_OK(internal::ImportFromModule(traceback_module.obj(), "format_exception",
|
||||
&fmt_exception));
|
||||
|
||||
OwnedRef formatted;
|
||||
formatted.reset(PyObject_CallFunctionObjArgs(fmt_exception.obj(), exc_type_.obj(),
|
||||
exc_value_.obj(), exc_traceback_.obj(),
|
||||
NULL));
|
||||
RETURN_IF_PYERROR();
|
||||
|
||||
std::stringstream ss;
|
||||
ss << "Python exception: ";
|
||||
Py_ssize_t num_lines = PySequence_Length(formatted.obj());
|
||||
RETURN_IF_PYERROR();
|
||||
|
||||
for (Py_ssize_t i = 0; i < num_lines; ++i) {
|
||||
Py_ssize_t line_size;
|
||||
|
||||
PyObject* line = PySequence_GetItem(formatted.obj(), i);
|
||||
RETURN_IF_PYERROR();
|
||||
|
||||
const char* data = PyUnicode_AsUTF8AndSize(line, &line_size);
|
||||
RETURN_IF_PYERROR();
|
||||
|
||||
ss << std::string_view(data, line_size);
|
||||
}
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
PythonErrorDetail() = default;
|
||||
|
||||
OwnedRefNoGIL exc_type_, exc_value_, exc_traceback_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Python exception <-> Status
|
||||
|
||||
// Turns the pending Python error into a Status carrying a PythonErrorDetail.
//
// If `code` is StatusCode::UnknownError, the code is derived from the
// exception type with MapPyError(); otherwise `code` is used as given. The
// Status message is the string form of the exception value.
Status ConvertPyError(StatusCode code) {
  auto detail = PythonErrorDetail::FromPyError();
  const StatusCode resolved_code =
      (code == StatusCode::UnknownError) ? MapPyError(detail->exc_type()) : code;

  std::string message;
  RETURN_NOT_OK(internal::PyObject_StdStringStr(detail->exc_value(), &message));
  return Status(resolved_code, message, detail);
}
|
||||
|
||||
bool IsPyError(const Status& status) {
|
||||
if (status.ok()) {
|
||||
return false;
|
||||
}
|
||||
auto detail = status.detail();
|
||||
bool result = detail != nullptr && detail->type_id() == kErrorDetailTypeId;
|
||||
return result;
|
||||
}
|
||||
|
||||
void RestorePyError(const Status& status) {
|
||||
ARROW_CHECK(IsPyError(status));
|
||||
const auto& detail = checked_cast<const PythonErrorDetail&>(*status.detail());
|
||||
detail.RestorePyError();
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// PyBuffer
|
||||
|
||||
PyBuffer::PyBuffer() : Buffer(nullptr, 0) {}
|
||||
|
||||
Status PyBuffer::Init(PyObject* obj) {
|
||||
if (!PyObject_GetBuffer(obj, &py_buf_, PyBUF_ANY_CONTIGUOUS)) {
|
||||
data_ = reinterpret_cast<const uint8_t*>(py_buf_.buf);
|
||||
ARROW_CHECK_NE(data_, nullptr) << "Null pointer in Py_buffer";
|
||||
size_ = py_buf_.len;
|
||||
capacity_ = py_buf_.len;
|
||||
is_mutable_ = !py_buf_.readonly;
|
||||
return Status::OK();
|
||||
} else {
|
||||
return ConvertPyError(StatusCode::Invalid);
|
||||
}
|
||||
}
|
||||
|
||||
// Creates a PyBuffer and initialises it from `obj` with Init().
// Returns the Init() error if that fails.
Result<std::shared_ptr<Buffer>> PyBuffer::FromPyObject(PyObject* obj) {
  PyBuffer* raw_buffer = new PyBuffer();
  // Hand ownership to the shared_ptr straight away so the buffer is destroyed
  // if Init() fails.
  std::shared_ptr<Buffer> owner(raw_buffer);
  RETURN_NOT_OK(raw_buffer->Init(obj));
  return owner;
}
|
||||
|
||||
// Releases the Python buffer view, taking the GIL to do so.
// Nothing is released while data_ is still null.
PyBuffer::~PyBuffer() {
  if (data_ == nullptr) {
    return;
  }
  PyAcquireGIL lock;
  PyBuffer_Release(&py_buf_);
}
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,457 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/python/pyarrow.h"
|
||||
#include "arrow/python/visibility.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class MemoryPool;
|
||||
template <class T>
|
||||
class Result;
|
||||
|
||||
namespace py {
|
||||
|
||||
// Convert current Python error to a Status. The Python error state is cleared
|
||||
// and can be restored with RestorePyError().
|
||||
//
// A Python error must be pending when this is called. When `code` is
// StatusCode::UnknownError, the Status code is derived from the Python
// exception type; otherwise `code` is used as given.
ARROW_PYTHON_EXPORT Status ConvertPyError(StatusCode code = StatusCode::UnknownError);
|
||||
// Query whether the given Status is a Python error (as wrapped by ConvertPyError()).
|
||||
// Always false for an OK Status.
ARROW_PYTHON_EXPORT bool IsPyError(const Status& status);
|
||||
// Restore a Python error wrapped in a Status.
|
||||
// `status` must satisfy IsPyError(status).
ARROW_PYTHON_EXPORT void RestorePyError(const Status& status);
|
||||
|
||||
// Catch a pending Python exception and return the corresponding Status.
|
||||
// If no exception is pending, Status::OK() is returned.
|
||||
inline Status CheckPyError(StatusCode code = StatusCode::UnknownError) {
|
||||
if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) {
|
||||
return Status::OK();
|
||||
} else {
|
||||
return ConvertPyError(code);
|
||||
}
|
||||
}
|
||||
|
||||
#define RETURN_IF_PYERROR() ARROW_RETURN_NOT_OK(CheckPyError())
|
||||
|
||||
#define PY_RETURN_IF_ERROR(CODE) ARROW_RETURN_NOT_OK(CheckPyError(CODE))
|
||||
|
||||
// For Cython, as you can't define template C++ functions in Cython, only use them.
|
||||
// This function can set a Python exception. It assumes that T has a (cheap)
|
||||
// default constructor.
|
||||
template <class T>
|
||||
T GetResultValue(Result<T> result) {
|
||||
if (ARROW_PREDICT_TRUE(result.ok())) {
|
||||
return *std::move(result);
|
||||
} else {
|
||||
int r = internal::check_status(result.status()); // takes the GIL
|
||||
assert(r == -1); // should have errored out
|
||||
ARROW_UNUSED(r);
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Wrap a Result and return the corresponding Python object.
|
||||
///
|
||||
/// If the Result is successful, py_wrapper is called with its result value
|
||||
/// and should return a PyObject*. If py_wrapper is successful (returns
|
||||
/// a non-NULL value), its return value is returned.
|
||||
///
|
||||
/// If either the Result or py_wrapper fails, the associated Python exception
|
||||
/// is raised and NULL is returned.
|
||||
//
|
||||
/// \param result The Result whose value to wrap in a Python object.
|
||||
/// \param py_wrapper A function (likely defined in Cython) to convert the C++
|
||||
/// value of the Result to a Python object.
|
||||
/// \return A new Python reference, or NULL if an exception occurred
|
||||
template <typename T, typename PyWrapper = PyObject* (*)(T)>
|
||||
PyObject* WrapResult(Result<T> result, PyWrapper&& py_wrapper) {
|
||||
static_assert(std::is_same_v<PyObject*, decltype(py_wrapper(std::declval<T>()))>,
|
||||
"PyWrapper argument to WrapResult should return a PyObject* "
|
||||
"when called with a T*");
|
||||
Status st = result.status();
|
||||
if (st.ok()) {
|
||||
PyObject* py_value = py_wrapper(result.MoveValueUnsafe());
|
||||
st = CheckPyError();
|
||||
if (st.ok()) {
|
||||
return py_value;
|
||||
}
|
||||
Py_XDECREF(py_value); // should be null, but who knows
|
||||
}
|
||||
// Status is an error, convert it to an exception.
|
||||
return internal::convert_status(st);
|
||||
}
|
||||
|
||||
// A RAII-style helper that ensures the GIL is acquired inside a lexical block.
|
||||
class ARROW_PYTHON_EXPORT PyAcquireGIL {
|
||||
public:
|
||||
PyAcquireGIL() : acquired_gil_(false) { acquire(); }
|
||||
|
||||
~PyAcquireGIL() { release(); }
|
||||
|
||||
void acquire() {
|
||||
if (!acquired_gil_) {
|
||||
state_ = PyGILState_Ensure();
|
||||
acquired_gil_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
// idempotent
|
||||
void release() {
|
||||
if (acquired_gil_) {
|
||||
PyGILState_Release(state_);
|
||||
acquired_gil_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
bool acquired_gil_;
|
||||
PyGILState_STATE state_;
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(PyAcquireGIL);
|
||||
};
|
||||
|
||||
// A RAII-style helper that releases the GIL until the end of a lexical block
|
||||
class ARROW_PYTHON_EXPORT PyReleaseGIL {
|
||||
public:
|
||||
PyReleaseGIL() : ptr_(PyEval_SaveThread(), &unique_ptr_deleter) {}
|
||||
|
||||
private:
|
||||
static void unique_ptr_deleter(PyThreadState* state) {
|
||||
if (state) {
|
||||
PyEval_RestoreThread(state);
|
||||
}
|
||||
}
|
||||
std::unique_ptr<PyThreadState, decltype(&unique_ptr_deleter)> ptr_;
|
||||
};
|
||||
|
||||
// A helper to call safely into the Python interpreter from arbitrary C++ code.
|
||||
// The GIL is acquired, and the current thread's error status is preserved.
|
||||
template <typename Function>
|
||||
auto SafeCallIntoPython(Function&& func) -> decltype(func()) {
|
||||
PyAcquireGIL lock;
|
||||
PyObject* exc_type;
|
||||
PyObject* exc_value;
|
||||
PyObject* exc_traceback;
|
||||
PyErr_Fetch(&exc_type, &exc_value, &exc_traceback);
|
||||
auto maybe_status = std::forward<Function>(func)();
|
||||
// If the return Status is a "Python error", the current Python error status
|
||||
// describes the error and shouldn't be clobbered.
|
||||
if (!IsPyError(::arrow::ToStatus(maybe_status)) && exc_type != NULLPTR) {
|
||||
PyErr_Restore(exc_type, exc_value, exc_traceback);
|
||||
}
|
||||
return maybe_status;
|
||||
}
|
||||
|
||||
template <typename Function>
|
||||
auto SafeCallIntoPythonVoid(Function&& func) -> decltype(func()) {
|
||||
PyAcquireGIL lock;
|
||||
PyObject* exc_type;
|
||||
PyObject* exc_value;
|
||||
PyObject* exc_traceback;
|
||||
PyErr_Fetch(&exc_type, &exc_value, &exc_traceback);
|
||||
func();
|
||||
if (exc_type != NULLPTR) {
|
||||
PyErr_Restore(exc_type, exc_value, exc_traceback);
|
||||
}
|
||||
}
|
||||
|
||||
// A RAII primitive that DECREFs the underlying PyObject* when it
|
||||
// goes out of scope.
|
||||
class ARROW_PYTHON_EXPORT OwnedRef {
|
||||
public:
|
||||
OwnedRef() : obj_(NULLPTR) {}
|
||||
OwnedRef(OwnedRef&& other) : OwnedRef(other.detach()) {}
|
||||
explicit OwnedRef(PyObject* obj) : obj_(obj) {}
|
||||
|
||||
OwnedRef& operator=(OwnedRef&& other) {
|
||||
obj_ = other.detach();
|
||||
return *this;
|
||||
}
|
||||
|
||||
~OwnedRef() {
|
||||
// GH-38626: destructor may be called after the Python interpreter is finalized.
|
||||
if (Py_IsInitialized()) {
|
||||
reset();
|
||||
}
|
||||
}
|
||||
|
||||
void reset(PyObject* obj) {
|
||||
Py_XDECREF(obj_);
|
||||
obj_ = obj;
|
||||
}
|
||||
|
||||
void reset() { reset(NULLPTR); }
|
||||
|
||||
PyObject* detach() {
|
||||
PyObject* result = obj_;
|
||||
obj_ = NULLPTR;
|
||||
return result;
|
||||
}
|
||||
|
||||
PyObject* obj() const { return obj_; }
|
||||
|
||||
PyObject** ref() { return &obj_; }
|
||||
|
||||
operator bool() const { return obj_ != NULLPTR; }
|
||||
|
||||
private:
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(OwnedRef);
|
||||
|
||||
PyObject* obj_;
|
||||
};
|
||||
|
||||
// Same as OwnedRef, but ensures the GIL is taken when it goes out of scope.
|
||||
// This is for situations where the GIL is not always known to be held
|
||||
// (e.g. if it is released in the middle of a function for performance reasons)
|
||||
class ARROW_PYTHON_EXPORT OwnedRefNoGIL : public OwnedRef {
|
||||
public:
|
||||
OwnedRefNoGIL() : OwnedRef() {}
|
||||
OwnedRefNoGIL(OwnedRefNoGIL&& other) : OwnedRef(other.detach()) {}
|
||||
explicit OwnedRefNoGIL(PyObject* obj) : OwnedRef(obj) {}
|
||||
|
||||
~OwnedRefNoGIL() {
|
||||
// GH-38626: destructor may be called after the Python interpreter is finalized.
|
||||
if (Py_IsInitialized() && obj() != NULLPTR) {
|
||||
PyAcquireGIL lock;
|
||||
reset();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <template <typename...> typename SmartPtr, typename... Ts>
|
||||
class SmartPtrNoGIL : public SmartPtr<Ts...> {
|
||||
using Base = SmartPtr<Ts...>;
|
||||
|
||||
public:
|
||||
template <typename... Args>
|
||||
SmartPtrNoGIL(Args&&... args) : Base(std::forward<Args>(args)...) {}
|
||||
|
||||
~SmartPtrNoGIL() { reset(); }
|
||||
|
||||
template <typename... Args>
|
||||
void reset(Args&&... args) {
|
||||
auto release_guard = optional_gil_release();
|
||||
Base::reset(std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
template <typename V>
|
||||
SmartPtrNoGIL& operator=(V&& v) {
|
||||
auto release_guard = optional_gil_release();
|
||||
Base::operator=(std::forward<V>(v));
|
||||
return *this;
|
||||
}
|
||||
|
||||
private:
|
||||
// Only release the GIL if we own an object *and* the Python runtime is
|
||||
// valid *and* the GIL is held.
|
||||
std::optional<PyReleaseGIL> optional_gil_release() const {
|
||||
if (this->get() != nullptr && Py_IsInitialized() && PyGILState_Check()) {
|
||||
return PyReleaseGIL();
|
||||
}
|
||||
return {};
|
||||
}
|
||||
};
|
||||
|
||||
/// \brief A std::shared_ptr<T, ...> subclass that releases the GIL when destroying T
|
||||
template <typename... Ts>
|
||||
using SharedPtrNoGIL = SmartPtrNoGIL<std::shared_ptr, Ts...>;
|
||||
|
||||
/// \brief A std::unique_ptr<T, ...> subclass that releases the GIL when destroying T
|
||||
template <typename... Ts>
|
||||
using UniquePtrNoGIL = SmartPtrNoGIL<std::unique_ptr, Ts...>;
|
||||
|
||||
template <typename Fn>
|
||||
struct BoundFunction;
|
||||
|
||||
template <typename... Args>
|
||||
struct BoundFunction<void(PyObject*, Args...)> {
|
||||
// We bind `cdef void fn(object, ...)` to get a `Status(...)`
|
||||
// where the Status contains any Python error raised by `fn`
|
||||
using Unbound = void(PyObject*, Args...);
|
||||
using Bound = Status(Args...);
|
||||
|
||||
BoundFunction(Unbound* unbound, PyObject* bound_arg)
|
||||
: unbound_(unbound), bound_arg_(bound_arg) {}
|
||||
|
||||
Status Invoke(Args... args) const {
|
||||
PyAcquireGIL lock;
|
||||
unbound_(bound_arg_.obj(), std::forward<Args>(args)...);
|
||||
RETURN_IF_PYERROR();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Unbound* unbound_;
|
||||
OwnedRefNoGIL bound_arg_;
|
||||
};
|
||||
|
||||
template <typename Return, typename... Args>
|
||||
struct BoundFunction<Return(PyObject*, Args...)> {
|
||||
// We bind `cdef Return fn(object, ...)` to get a `Result<Return>(...)`
|
||||
// where the Result contains any Python error raised by `fn` or the
|
||||
// return value from `fn`.
|
||||
using Unbound = Return(PyObject*, Args...);
|
||||
using Bound = Result<Return>(Args...);
|
||||
|
||||
BoundFunction(Unbound* unbound, PyObject* bound_arg)
|
||||
: unbound_(unbound), bound_arg_(bound_arg) {}
|
||||
|
||||
Result<Return> Invoke(Args... args) const {
|
||||
PyAcquireGIL lock;
|
||||
Return ret = unbound_(bound_arg_.obj(), std::forward<Args>(args)...);
|
||||
RETURN_IF_PYERROR();
|
||||
return ret;
|
||||
}
|
||||
|
||||
Unbound* unbound_;
|
||||
OwnedRefNoGIL bound_arg_;
|
||||
};
|
||||
|
||||
template <typename OutFn, typename Return, typename... Args>
|
||||
std::function<OutFn> BindFunction(Return (*unbound)(PyObject*, Args...),
|
||||
PyObject* bound_arg) {
|
||||
using Fn = BoundFunction<Return(PyObject*, Args...)>;
|
||||
|
||||
static_assert(std::is_same<typename Fn::Bound, OutFn>::value,
|
||||
"requested bound function of unsupported type");
|
||||
|
||||
Py_XINCREF(bound_arg);
|
||||
auto bound_fn = std::make_shared<Fn>(unbound, bound_arg);
|
||||
return
|
||||
[bound_fn](Args... args) { return bound_fn->Invoke(std::forward<Args>(args)...); };
|
||||
}
|
||||
|
||||
// A temporary conversion of a Python object to a bytes area.
|
||||
struct PyBytesView {
|
||||
const char* bytes;
|
||||
Py_ssize_t size;
|
||||
bool is_utf8;
|
||||
|
||||
static Result<PyBytesView> FromString(PyObject* obj, bool check_utf8 = false) {
|
||||
PyBytesView self;
|
||||
ARROW_RETURN_NOT_OK(self.ParseString(obj, check_utf8));
|
||||
return std::move(self);
|
||||
}
|
||||
|
||||
static Result<PyBytesView> FromUnicode(PyObject* obj) {
|
||||
PyBytesView self;
|
||||
ARROW_RETURN_NOT_OK(self.ParseUnicode(obj));
|
||||
return std::move(self);
|
||||
}
|
||||
|
||||
static Result<PyBytesView> FromBinary(PyObject* obj) {
|
||||
PyBytesView self;
|
||||
ARROW_RETURN_NOT_OK(self.ParseBinary(obj));
|
||||
return std::move(self);
|
||||
}
|
||||
|
||||
// View the given Python object as string-like, i.e. str or (utf8) bytes
|
||||
Status ParseString(PyObject* obj, bool check_utf8 = false) {
|
||||
if (PyUnicode_Check(obj)) {
|
||||
return ParseUnicode(obj);
|
||||
} else {
|
||||
ARROW_RETURN_NOT_OK(ParseBinary(obj));
|
||||
if (check_utf8) {
|
||||
// Check the bytes are utf8 utf-8
|
||||
OwnedRef decoded(PyUnicode_FromStringAndSize(bytes, size));
|
||||
if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) {
|
||||
is_utf8 = true;
|
||||
} else {
|
||||
PyErr_Clear();
|
||||
is_utf8 = false;
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
}
|
||||
|
||||
// View the given Python object as unicode string
|
||||
Status ParseUnicode(PyObject* obj) {
|
||||
// The utf-8 representation is cached on the unicode object
|
||||
bytes = PyUnicode_AsUTF8AndSize(obj, &size);
|
||||
RETURN_IF_PYERROR();
|
||||
is_utf8 = true;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// View the given Python object as binary-like, i.e. bytes
|
||||
Status ParseBinary(PyObject* obj) {
|
||||
if (PyBytes_Check(obj)) {
|
||||
bytes = PyBytes_AS_STRING(obj);
|
||||
size = PyBytes_GET_SIZE(obj);
|
||||
is_utf8 = false;
|
||||
} else if (PyByteArray_Check(obj)) {
|
||||
bytes = PyByteArray_AS_STRING(obj);
|
||||
size = PyByteArray_GET_SIZE(obj);
|
||||
is_utf8 = false;
|
||||
} else if (PyMemoryView_Check(obj)) {
|
||||
PyObject* ref = PyMemoryView_GetContiguous(obj, PyBUF_READ, 'C');
|
||||
RETURN_IF_PYERROR();
|
||||
Py_buffer* buffer = PyMemoryView_GET_BUFFER(ref);
|
||||
bytes = reinterpret_cast<const char*>(buffer->buf);
|
||||
size = buffer->len;
|
||||
is_utf8 = false;
|
||||
} else {
|
||||
return Status::TypeError("Expected bytes, got a '", Py_TYPE(obj)->tp_name,
|
||||
"' object");
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
protected:
|
||||
OwnedRef ref;
|
||||
};
|
||||
|
||||
/// \brief A Buffer backed by a Python object.
///
/// While memoryview objects support multi-dimensional buffers, PyBuffer only supports
/// one-dimensional byte buffers.
class ARROW_PYTHON_EXPORT PyBuffer : public Buffer {
 public:
  ~PyBuffer();

  /// \brief Create a Buffer from a Python object.
  static Result<std::shared_ptr<Buffer>> FromPyObject(PyObject* obj);

 private:
  // Instances are only created through FromPyObject().
  PyBuffer();
  Status Init(PyObject*);

  Py_buffer py_buf_;
};
|
||||
|
||||
// Set and get the common PyArrow memory pool
|
||||
ARROW_PYTHON_EXPORT void set_default_memory_pool(MemoryPool* pool);
|
||||
ARROW_PYTHON_EXPORT MemoryPool* get_memory_pool();
|
||||
|
||||
// This is annoying: because C++11 does not allow implicit conversion of string
|
||||
// literals to non-const char*, we need to go through some gymnastics to use
|
||||
// PyObject_CallMethod without a lot of pain (its arguments are non-const
|
||||
// char*)
|
||||
template <typename... ArgTypes>
|
||||
static inline PyObject* cpp_PyObject_CallMethod(PyObject* obj, const char* method_name,
|
||||
const char* argspec, ArgTypes... args) {
|
||||
return PyObject_CallMethod(obj, const_cast<char*>(method_name),
|
||||
const_cast<char*>(argspec), args...);
|
||||
}
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,35 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "arrow/python/config.h"
|
||||
#include "arrow/python/config_internal.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
|
||||
namespace {
|
||||
|
||||
// Build information singleton, filled in from the PYARROW_BUILD_TYPE macro.
const BuildInfo kBuildInfo{
    /*build_type=*/PYARROW_BUILD_TYPE,
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Returns a reference to the file-local kBuildInfo constant.
const BuildInfo& GetBuildInfo() {
  const BuildInfo& info = kBuildInfo;
  return info;
}
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,38 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "arrow/python/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
|
||||
/// \brief Build-time information about PyArrow.
struct BuildInfo {
  /// Upper-case name of the build type, e.g. "DEBUG" or "RELEASE".
  std::string build_type;
};
|
||||
|
||||
/// \brief Get build info for PyArrow.
|
||||
///
|
||||
ARROW_PYTHON_EXPORT
|
||||
const BuildInfo& GetBuildInfo();
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,18 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#define PYARROW_BUILD_TYPE "@UPPERCASE_PYBUILD_TYPE@"
|
||||
@@ -0,0 +1,62 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "csv.h"
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/python/common.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using csv::InvalidRow;
|
||||
using csv::InvalidRowHandler;
|
||||
using csv::InvalidRowResult;
|
||||
|
||||
namespace py {
|
||||
namespace csv {
|
||||
|
||||
// Adapt a callback taking (PyObject*, const InvalidRow&) into a CSV
// InvalidRowHandler bound to `py_handler`.
//
// Returns an empty handler when `cb` is null. Otherwise a new reference to
// `py_handler` is taken and shared by all copies of the returned handler.
InvalidRowHandler MakeInvalidRowHandler(PyInvalidRowCallback cb, PyObject* py_handler) {
  if (cb == nullptr) {
    return InvalidRowHandler{};
  }

  // OwnedRefNoGIL adopts this reference and takes the GIL to drop it.
  Py_INCREF(py_handler);
  auto handler_ref = std::make_shared<OwnedRefNoGIL>(py_handler);

  return [cb, handler_ref](const InvalidRow& invalid_row) -> InvalidRowResult {
    InvalidRowResult outcome;
    auto st = SafeCallIntoPython([&]() -> Status {
      outcome = cb(handler_ref->obj(), invalid_row);
      // A Python error cannot be propagated from here; report it as
      // unraisable against the handler object.
      if (PyErr_Occurred()) {
        PyErr_WriteUnraisable(handler_ref->obj());
      }
      return Status::OK();
    });
    ARROW_UNUSED(st);
    return outcome;
  };
}
|
||||
|
||||
} // namespace csv
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,42 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/csv/options.h"
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
namespace csv {
|
||||
|
||||
using PyInvalidRowCallback = std::function<::arrow::csv::InvalidRowResult(
|
||||
PyObject*, const ::arrow::csv::InvalidRow&)>;
|
||||
|
||||
ARROW_PYTHON_EXPORT
|
||||
::arrow::csv::InvalidRowHandler MakeInvalidRowHandler(PyInvalidRowCallback,
|
||||
PyObject* handler);
|
||||
|
||||
} // namespace csv
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,665 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
#include "datetime.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <iomanip>
|
||||
#include <regex>
|
||||
#include <string_view>
|
||||
|
||||
#include "arrow/array.h"
|
||||
#include "arrow/python/arrow_to_python_internal.h"
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/python/helpers.h"
|
||||
#include "arrow/python/platform.h"
|
||||
#include "arrow/scalar.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/logging.h"
|
||||
#include "arrow/util/regex.h"
|
||||
#include "arrow/util/value_parsing.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using internal::RegexMatch;
|
||||
|
||||
namespace py {
|
||||
namespace internal {
|
||||
|
||||
namespace {
|
||||
|
||||
bool MatchFixedOffset(const std::string& tz, std::string_view* sign,
|
||||
std::string_view* hour, std::string_view* minute) {
|
||||
static const std::regex regex("^([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$");
|
||||
if (tz.size() < 5) {
|
||||
return false;
|
||||
}
|
||||
return RegexMatch(regex, tz, {sign, hour, minute});
|
||||
}
|
||||
|
||||
// Hack for Python versions < 3.7, where the string members of the
// PyStructSequence structs were non-const (C++ doesn't like assigning string
// literals to these types). Strips the const qualifier from `st`.
constexpr char* NonConst(const char* st) { return const_cast<char*>(st); }
|
||||
|
||||
static PyTypeObject MonthDayNanoTupleType = {};
|
||||
|
||||
static PyStructSequence_Field MonthDayNanoField[] = {
|
||||
{NonConst("months"), NonConst("The number of months in the interval")},
|
||||
{NonConst("days"), NonConst("The number days in the interval")},
|
||||
{NonConst("nanoseconds"), NonConst("The number of nanoseconds in the interval")},
|
||||
{nullptr, nullptr}};
|
||||
|
||||
static PyStructSequence_Desc MonthDayNanoTupleDesc = {
|
||||
NonConst("MonthDayNano"),
|
||||
NonConst("A calendar interval consisting of months, days and nanoseconds."),
|
||||
MonthDayNanoField,
|
||||
/*n_in_sequence=*/3};
|
||||
|
||||
} // namespace
|
||||
|
||||
#ifndef PYPY_VERSION
|
||||
PyDateTime_CAPI* datetime_api = nullptr;
|
||||
|
||||
void InitDatetime() {
|
||||
PyAcquireGIL lock;
|
||||
datetime_api =
|
||||
reinterpret_cast<PyDateTime_CAPI*>(PyCapsule_Import(PyDateTime_CAPSULE_NAME, 0));
|
||||
if (datetime_api == nullptr) {
|
||||
Py_FatalError("Could not import datetime C API");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// The following code is adapted from
|
||||
// https://github.com/numpy/numpy/blob/main/numpy/core/src/multiarray/datetime.c
|
||||
|
||||
// Days per month, regular year and leap year
|
||||
static int64_t _days_per_month_table[2][12] = {
|
||||
{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
|
||||
{31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}};
|
||||
|
||||
// Gregorian leap-year rule: divisible by 4, and not by 100 unless also
// divisible by 400.
static bool is_leapyear(int64_t year) {
  // year % 4 != 0
  if ((year & 0x3) != 0) {
    return false;
  }
  const bool is_century = (year % 100) == 0;
  return !is_century || (year % 400) == 0;
}
|
||||
|
||||
// Calculates the days offset from the 1970 epoch.
|
||||
static int64_t get_days_from_date(int64_t date_year, int64_t date_month,
|
||||
int64_t date_day) {
|
||||
int64_t i, month;
|
||||
int64_t year, days = 0;
|
||||
int64_t* month_lengths;
|
||||
|
||||
year = date_year - 1970;
|
||||
days = year * 365;
|
||||
|
||||
// Adjust for leap years
|
||||
if (days >= 0) {
|
||||
// 1968 is the closest leap year before 1970.
|
||||
// Exclude the current year, so add 1.
|
||||
year += 1;
|
||||
// Add one day for each 4 years
|
||||
days += year / 4;
|
||||
// 1900 is the closest previous year divisible by 100
|
||||
year += 68;
|
||||
// Subtract one day for each 100 years
|
||||
days -= year / 100;
|
||||
// 1600 is the closest previous year divisible by 400
|
||||
year += 300;
|
||||
// Add one day for each 400 years
|
||||
days += year / 400;
|
||||
} else {
|
||||
// 1972 is the closest later year after 1970.
|
||||
// Include the current year, so subtract 2.
|
||||
year -= 2;
|
||||
// Subtract one day for each 4 years
|
||||
days += year / 4;
|
||||
// 2000 is the closest later year divisible by 100
|
||||
year -= 28;
|
||||
// Add one day for each 100 years
|
||||
days -= year / 100;
|
||||
// 2000 is also the closest later year divisible by 400
|
||||
// Subtract one day for each 400 years
|
||||
days += year / 400;
|
||||
}
|
||||
|
||||
month_lengths = _days_per_month_table[is_leapyear(date_year)];
|
||||
month = date_month - 1;
|
||||
|
||||
// Add the months
|
||||
for (i = 0; i < month; ++i) {
|
||||
days += month_lengths[i];
|
||||
}
|
||||
|
||||
// Add the days
|
||||
days += date_day - 1;
|
||||
|
||||
return days;
|
||||
}
|
||||
|
||||
// Modifies '*days_' (days since the 1970 epoch on input) to be the day
// offset within the year, and returns the year.
static int64_t days_to_yearsdays(int64_t* days_) {
  const int64_t kDaysPer400Years = 400 * 365 + 100 - 4 + 1;
  const int64_t kDaysPer100Years = 100 * 365 + 25 - 1;
  const int64_t kDaysPer4Years = 4 * 365 + 1;
  const int64_t kDaysFrom1970To2000 = 365 * 30 + 7;

  // Adjust so it's relative to the year 2000 (divisible by 400)
  int64_t remaining = (*days_) - kDaysFrom1970To2000;

  // Break down the 400 year cycle (floor division, so that `remaining`
  // ends up in [0, kDaysPer400Years) for negative inputs as well)
  int64_t cycles = remaining / kDaysPer400Years;
  remaining %= kDaysPer400Years;
  if (remaining < 0) {
    remaining += kDaysPer400Years;
    --cycles;
  }
  int64_t year = 400 * cycles;

  // Work out the year/day within the 400 year cycle
  if (remaining >= 366) {
    const int64_t shifted_100 = remaining - 1;
    year += 100 * (shifted_100 / kDaysPer100Years);
    remaining = shifted_100 % kDaysPer100Years;
    if (remaining >= 365) {
      const int64_t shifted_4 = remaining + 1;
      year += 4 * (shifted_4 / kDaysPer4Years);
      remaining = shifted_4 % kDaysPer4Years;
      if (remaining >= 366) {
        const int64_t shifted_1 = remaining - 1;
        year += shifted_1 / 365;
        remaining = shifted_1 % 365;
      }
    }
  }

  *days_ = remaining;
  return year + 2000;
}
|
||||
|
||||
// Extracts the month and year and day number from a number of days
|
||||
static void get_date_from_days(int64_t days, int64_t* date_year, int64_t* date_month,
|
||||
int64_t* date_day) {
|
||||
int64_t *month_lengths, i;
|
||||
|
||||
*date_year = days_to_yearsdays(&days);
|
||||
month_lengths = _days_per_month_table[is_leapyear(*date_year)];
|
||||
|
||||
for (i = 0; i < 12; ++i) {
|
||||
if (days < month_lengths[i]) {
|
||||
*date_month = i + 1;
|
||||
*date_day = days + 1;
|
||||
return;
|
||||
} else {
|
||||
days -= month_lengths[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Should never get here
|
||||
return;
|
||||
}
|
||||
|
||||
// Splits a time quantity into a coarser unit plus a remainder, for example
// total seconds into minutes and remaining seconds. After
//   int64_t remaining = split_time(total, quotient, &next)
// we have
//   total = next * quotient + remaining.
// Negative values are propagated: if total is negative, next will be
// negative and remaining will always be non-negative.
static inline int64_t split_time(int64_t total, int64_t quotient, int64_t* next) {
  int64_t whole = total / quotient;
  int64_t remainder = total % quotient;
  if (remainder < 0) {
    // Turn the truncated division into a floored one.
    --whole;
    remainder += quotient;
  }
  *next = whole;
  return remainder;
}
|
||||
|
||||
// Decompose a time value expressed in `unit` into hour / minute / second /
// microsecond components. `*microsecond` is only written for sub-second
// units. Nanosecond values must be a whole number of microseconds, otherwise
// Invalid is returned. Unknown units leave all outputs untouched.
static inline Status PyTime_convert_int(int64_t val, const TimeUnit::type unit,
                                        int64_t* hour, int64_t* minute, int64_t* second,
                                        int64_t* microsecond) {
  if (unit == TimeUnit::NANO) {
    if (val % 1000 != 0) {
      return Status::Invalid("Value ", val, " has non-zero nanoseconds");
    }
    // From here on the value is handled as microseconds.
    val /= 1000;
  }

  // Peel off the sub-second part, leaving whole seconds in `val`.
  if (unit == TimeUnit::NANO || unit == TimeUnit::MICRO) {
    *microsecond = split_time(val, 1000000LL, &val);
  } else if (unit == TimeUnit::MILLI) {
    *microsecond = split_time(val, 1000, &val) * 1000;
  } else if (unit != TimeUnit::SECOND) {
    return Status::OK();
  }

  *second = split_time(val, 60, &val);
  *minute = split_time(val, 60, hour);
  return Status::OK();
}
|
||||
|
||||
// Decompose a date value expressed in `unit` (days or milliseconds since the
// 1970 epoch) into year / month / day. Unknown units leave the outputs
// untouched. Always returns OK.
static inline Status PyDate_convert_int(int64_t val, const DateUnit unit, int64_t* year,
                                        int64_t* month, int64_t* day) {
  if (unit == DateUnit::MILLI) {
    // Convert milliseconds to whole days with floor semantics. Plain integer
    // division truncates toward zero, which would move a negative value that
    // is not day-aligned (e.g. -1 ms) onto the following day.
    constexpr int64_t kMillisPerDay = 86400000LL;
    int64_t whole_days = val / kMillisPerDay;
    if (val % kMillisPerDay < 0) {
      --whole_days;
    }
    val = whole_days;
  } else if (unit != DateUnit::DAY) {
    return Status::OK();
  }
  get_date_from_days(val, year, month, day);
  return Status::OK();
}
|
||||
|
||||
// Return a new reference to the MonthDayNano struct-sequence type,
// initializing it on first use (detected through a null tp_name). Aborts the
// process through Py_FatalError() if initialization fails.
PyObject* NewMonthDayNanoTupleType() {
  const bool needs_init = (MonthDayNanoTupleType.tp_name == nullptr);
  if (needs_init &&
      PyStructSequence_InitType2(&MonthDayNanoTupleType, &MonthDayNanoTupleDesc) != 0) {
    Py_FatalError("Could not initialize MonthDayNanoTuple");
  }
  PyObject* type_object = reinterpret_cast<PyObject*>(&MonthDayNanoTupleType);
  Py_INCREF(type_object);
  return type_object;
}
|
||||
|
||||
// Convert a time value expressed in `unit` into a Python time object stored
// in `*out`. Returns the error from PyTime_convert_int() if the value cannot
// be decomposed.
Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) {
  int64_t hour = 0, minute = 0, second = 0, microsecond = 0;
  RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, &microsecond));
  *out = PyTime_FromTime(static_cast<int32_t>(hour), static_cast<int32_t>(minute),
                         static_cast<int32_t>(second), static_cast<int32_t>(microsecond));
  return Status::OK();
}
|
||||
|
||||
// Convert a date value expressed in `unit` into a Python date object stored
// in `*out`.
Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out) {
  int64_t year = 0;
  int64_t month = 0;
  int64_t day = 0;
  RETURN_NOT_OK(PyDate_convert_int(val, unit, &year, &month, &day));

  const auto py_year = static_cast<int32_t>(year);
  const auto py_month = static_cast<int32_t>(month);
  const auto py_day = static_cast<int32_t>(day);
  *out = PyDate_FromDate(py_year, py_month, py_day);
  return Status::OK();
}
|
||||
|
||||
// Convert a timestamp expressed in `unit` (relative to the 1970 epoch) into a
// Python datetime object stored in `*out`. Returns the error from
// PyTime_convert_int() if the value cannot be decomposed.
Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) {
  int64_t hour = 0, minute = 0, second = 0, microsecond = 0;
  RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, &microsecond));
  // `hour` holds the total number of hours at this point; split it into whole
  // days and the hour of the day.
  int64_t total_days = 0;
  hour = split_time(hour, 24, &total_days);
  int64_t year = 0, month = 0, day = 0;
  get_date_from_days(total_days, &year, &month, &day);
  *out = PyDateTime_FromDateAndTime(
      static_cast<int32_t>(year), static_cast<int32_t>(month), static_cast<int32_t>(day),
      static_cast<int32_t>(hour), static_cast<int32_t>(minute),
      static_cast<int32_t>(second), static_cast<int32_t>(microsecond));
  return Status::OK();
}
|
||||
|
||||
// Number of days between the 1970 epoch and the given Python date.
int64_t PyDate_to_days(PyDateTime_Date* pydate) {
  const int64_t year = PyDateTime_GET_YEAR(pydate);
  const int64_t month = PyDateTime_GET_MONTH(pydate);
  const int64_t day = PyDateTime_GET_DAY(pydate);
  return get_days_from_date(year, month, day);
}
|
||||
|
||||
// Calculate the offset from UTC in seconds by calling `obj.utcoffset()`.
// Supports only PyDateTime_DateTime and PyDateTime_Time objects.
// Returns 0 when utcoffset() yields None (or no object).
Result<int64_t> PyDateTime_utcoffset_s(PyObject* obj) {
  OwnedRef pyoffset(PyObject_CallMethod(obj, "utcoffset", NULL));
  RETURN_IF_PYERROR();

  PyObject* offset_obj = pyoffset.obj();
  if (offset_obj == nullptr || offset_obj == Py_None) {
    return 0;
  }
  auto delta = reinterpret_cast<PyDateTime_Delta*>(offset_obj);
  return internal::PyDelta_to_s(delta);
}
|
||||
|
||||
// Attempt to convert a tzinfo object's UTC offset to "+/-{hh}:{mm}" format.
//
// Calls `pytzinfo.utcoffset(None)`. Returns Invalid if the result is not a
// datetime.timedelta or if the offset is not a whole number of minutes.
Result<std::string> PyTZInfo_utcoffset_hhmm(PyObject* pytzinfo) {
  OwnedRef pydelta_object(PyObject_CallMethod(pytzinfo, "utcoffset", "O", Py_None));
  RETURN_IF_PYERROR();

  if (!PyDelta_Check(pydelta_object.obj())) {
    return Status::Invalid(
        "Object returned by tzinfo.utcoffset(None) is not an instance of "
        "datetime.timedelta");
  }
  auto pydelta = reinterpret_cast<PyDateTime_Delta*>(pydelta_object.obj());

  // retrieve the offset as seconds
  auto total_seconds = internal::PyDelta_to_s(pydelta);

  // determine whether the offset is positive or negative, and keep only the
  // magnitude. The value is negated explicitly instead of calling an
  // unqualified abs(), which may resolve to the `int` overload and truncate.
  const char* sign = "+";
  if (total_seconds < 0) {
    sign = "-";
    total_seconds = -total_seconds;
  }

  // calculate offset components
  int64_t hours = 0;
  int64_t minutes = 0;
  const int64_t seconds = split_time(total_seconds, 60, &minutes);
  minutes = split_time(minutes, 60, &hours);
  if (seconds > 0) {
    // check there are no remaining seconds
    return Status::Invalid("Offset must represent whole number of minutes");
  }

  // construct the timezone string
  std::stringstream stream;
  stream << sign << std::setfill('0') << std::setw(2) << hours << ":" << std::setfill('0')
         << std::setw(2) << minutes;
  return stream.str();
}
|
||||
|
||||
// Converted from python. See https://github.com/apache/arrow/pull/7604
|
||||
// for details.
|
||||
// \brief Combine the substrings captured by MatchFixedOffset into a signed offset.
//
// \param[in] tz the full time zone string, used only in the error message
// \param[in] sign_str "+" selects a positive offset; anything else is negative
// \param[in] hour_str digits of the hour component
// \param[in] minute_str digits of the minute component
// \return the offset in minutes, or Status::Invalid when a component cannot be
// parsed as an unsigned integer
static Result<int> FixedOffsetToMinutes(const std::string& tz, std::string_view sign_str,
                                        std::string_view hour_str,
                                        std::string_view minute_str) {
  uint32_t minutes, hours;
  if (!::arrow::internal::ParseUnsigned(hour_str.data(), hour_str.size(), &hours) ||
      !::arrow::internal::ParseUnsigned(minute_str.data(), minute_str.size(),
                                        &minutes)) {
    return Status::Invalid("Invalid timezone: ", tz);
  }
  const int sign = (sign_str == "+") ? 1 : -1;
  return sign * ((static_cast<int>(hours) * 60) + static_cast<int>(minutes));
}

// \brief Build a Python tzinfo object from a time zone string.
//
// Resolution order:
//   1. pytz is importable: fixed offsets become pytz.FixedOffset(minutes), any
//      other string is passed to pytz.timezone().
//   2. pytz is missing and the string is a fixed offset:
//      datetime.timezone(datetime.timedelta(minutes=...)).
//   3. pytz is missing and zoneinfo is importable: zoneinfo.ZoneInfo(tz).
//
// \param[in] tz the time zone string
// \return a new reference to the tzinfo object, or an error Status when the
// string is invalid, a Python call raises, or no usable module is available
Result<PyObject*> StringToTzinfo(const std::string& tz) {
  std::string_view sign_str, hour_str, minute_str;
  OwnedRef pytz;
  OwnedRef zoneinfo;
  OwnedRef datetime;

  if (internal::ImportModule("pytz", &pytz).ok()) {
    if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) {
      OwnedRef fixed_offset;
      RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "FixedOffset", &fixed_offset));
      ARROW_ASSIGN_OR_RAISE(int offset_minutes,
                            FixedOffsetToMinutes(tz, sign_str, hour_str, minute_str));
      OwnedRef total_minutes(PyLong_FromLong(offset_minutes));
      RETURN_IF_PYERROR();
      auto tzinfo =
          PyObject_CallFunctionObjArgs(fixed_offset.obj(), total_minutes.obj(), NULL);
      RETURN_IF_PYERROR();
      return tzinfo;
    }

    OwnedRef timezone;
    RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "timezone", &timezone));
    OwnedRef py_tz_string(
        PyUnicode_FromStringAndSize(tz.c_str(), static_cast<Py_ssize_t>(tz.size())));
    // A NULL argument would silently terminate the vararg list below, so bail
    // out if the string could not be converted.
    RETURN_IF_PYERROR();
    auto tzinfo = PyObject_CallFunctionObjArgs(timezone.obj(), py_tz_string.obj(), NULL);
    RETURN_IF_PYERROR();
    return tzinfo;
  }

  // catch fixed offset if pytz is not present
  if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) {
    RETURN_NOT_OK(internal::ImportModule("datetime", &datetime));

    // import timezone and timedelta classes to create a tzinfo object
    OwnedRef class_timezone;
    OwnedRef class_timedelta;
    RETURN_NOT_OK(
        internal::ImportFromModule(datetime.obj(), "timezone", &class_timezone));
    RETURN_NOT_OK(
        internal::ImportFromModule(datetime.obj(), "timedelta", &class_timedelta));

    // check input and compute the signed offset
    ARROW_ASSIGN_OR_RAISE(int offset_minutes,
                          FixedOffsetToMinutes(tz, sign_str, hour_str, minute_str));
    OwnedRef total_minutes(PyLong_FromLong(offset_minutes));
    RETURN_IF_PYERROR();
    // zero fills the positional arguments of datetime.timedelta that come
    // before `minutes` (days, seconds, microseconds, milliseconds)
    OwnedRef zero(PyLong_FromLong(0));
    RETURN_IF_PYERROR();

    // The timedelta is only needed while constructing the timezone; holding it
    // in an OwnedRef releases our reference on every exit path.
    OwnedRef offset(
        PyObject_CallFunctionObjArgs(class_timedelta.obj(), zero.obj(), zero.obj(),
                                     zero.obj(), zero.obj(), total_minutes.obj(), NULL));
    RETURN_IF_PYERROR();
    // call datetime.timezone
    auto tzinfo = PyObject_CallFunctionObjArgs(class_timezone.obj(), offset.obj(), NULL);
    RETURN_IF_PYERROR();
    return tzinfo;
  }

  // fallback on zoneinfo if tz is string and pytz is not present
  if (internal::ImportModule("zoneinfo", &zoneinfo).ok()) {
    OwnedRef class_zoneinfo;
    RETURN_NOT_OK(
        internal::ImportFromModule(zoneinfo.obj(), "ZoneInfo", &class_zoneinfo));
    OwnedRef py_tz_string(
        PyUnicode_FromStringAndSize(tz.c_str(), static_cast<Py_ssize_t>(tz.size())));
    RETURN_IF_PYERROR();
    auto tzinfo =
        PyObject_CallFunctionObjArgs(class_zoneinfo.obj(), py_tz_string.obj(), NULL);
    RETURN_IF_PYERROR();
    return tzinfo;
  }

  return Status::Invalid(
      "Pytz package or Python>=3.8 for zoneinfo module must be installed.");
}
|
||||
|
||||
// \brief Derive a time zone string from a Python tzinfo object.
//
// The checks run in this order and the first match wins:
//   - datetime.timezone: "UTC" when tzname(None) is "UTC", otherwise the
//     result of PyTZInfo_utcoffset_hhmm
//   - pytz._FixedOffset (if pytz is importable): PyTZInfo_utcoffset_hhmm
//   - pytz.BaseTzInfo (if pytz is importable): the `zone` attribute
//   - zoneinfo.ZoneInfo (if zoneinfo is importable): the `key` attribute
//   - dateutil.tz.tzfile (if dateutil is importable): the `_filename`
//     attribute, with everything up to and including "zoneinfo/" removed
//   - anything else: tzname(None) when it is a str, otherwise
//     PyTZInfo_utcoffset_hhmm
//
// \param[in] tzinfo the object to inspect
// \return the time zone string, TypeError if `tzinfo` fails PyTZInfo_Check, or
// the converted Python error when a Python call fails
Result<std::string> TzinfoToString(PyObject* tzinfo) {
  OwnedRef module_pytz;        // import pytz
  OwnedRef module_datetime;    // import datetime
  OwnedRef module_zoneinfo;    // import zoneinfo
  OwnedRef module_dateutil;    // import dateutil.tz
  OwnedRef class_timezone;     // from datetime import timezone
  OwnedRef class_fixedoffset;  // from pytz import _FixedOffset
  OwnedRef class_basetzinfo;   // from pytz import BaseTzInfo
  OwnedRef class_zoneinfo;     // from zoneinfo import ZoneInfo
  OwnedRef class_tzfile;       // from dateutil.tz import tzfile

  // datetime is always needed
  RETURN_NOT_OK(internal::ImportModule("datetime", &module_datetime));
  RETURN_NOT_OK(
      internal::ImportFromModule(module_datetime.obj(), "timezone", &class_timezone));

  // check that it's a valid tzinfo object
  if (!PyTZInfo_Check(tzinfo)) {
    return Status::TypeError("Not an instance of datetime.tzinfo");
  }

  // PyObject_IsInstance reports failure as -1 with an exception set; the
  // RETURN_IF_PYERROR after each call keeps a failure from being read as a match.
  const int is_timezone = PyObject_IsInstance(tzinfo, class_timezone.obj());
  RETURN_IF_PYERROR();
  if (is_timezone) {
    // still recognize datetime.timezone.utc as UTC (instead of +00:00)
    OwnedRef tzname_object(PyObject_CallMethod(tzinfo, "tzname", "O", Py_None));
    RETURN_IF_PYERROR();
    if (PyUnicode_Check(tzname_object.obj())) {
      std::string result;
      RETURN_NOT_OK(internal::PyUnicode_AsStdString(tzname_object.obj(), &result));
      if (result == "UTC") {
        return result;
      }
    }
    return PyTZInfo_utcoffset_hhmm(tzinfo);
  }

  // pytz is optional
  if (internal::ImportModule("pytz", &module_pytz).ok()) {
    RETURN_NOT_OK(internal::ImportFromModule(module_pytz.obj(), "_FixedOffset",
                                             &class_fixedoffset));
    RETURN_NOT_OK(
        internal::ImportFromModule(module_pytz.obj(), "BaseTzInfo", &class_basetzinfo));

    // pytz._FixedOffset is rendered as an HH:MM offset string
    const int is_fixed_offset = PyObject_IsInstance(tzinfo, class_fixedoffset.obj());
    RETURN_IF_PYERROR();
    if (is_fixed_offset) {
      return PyTZInfo_utcoffset_hhmm(tzinfo);
    }

    // any other pytz.BaseTzInfo carries its name in the `zone` attribute
    const int is_base_tzinfo = PyObject_IsInstance(tzinfo, class_basetzinfo.obj());
    RETURN_IF_PYERROR();
    if (is_base_tzinfo) {
      OwnedRef zone(PyObject_GetAttrString(tzinfo, "zone"));
      RETURN_IF_PYERROR();
      std::string result;
      RETURN_NOT_OK(internal::PyUnicode_AsStdString(zone.obj(), &result));
      return result;
    }
  }

  // zoneinfo is optional
  if (internal::ImportModule("zoneinfo", &module_zoneinfo).ok()) {
    RETURN_NOT_OK(
        internal::ImportFromModule(module_zoneinfo.obj(), "ZoneInfo", &class_zoneinfo));

    const int is_zoneinfo = PyObject_IsInstance(tzinfo, class_zoneinfo.obj());
    RETURN_IF_PYERROR();
    if (is_zoneinfo) {
      OwnedRef key(PyObject_GetAttrString(tzinfo, "key"));
      RETURN_IF_PYERROR();
      std::string result;
      RETURN_NOT_OK(internal::PyUnicode_AsStdString(key.obj(), &result));
      return result;
    }
  }

  // dateutil is optional
  if (internal::ImportModule("dateutil.tz", &module_dateutil).ok()) {
    RETURN_NOT_OK(
        internal::ImportFromModule(module_dateutil.obj(), "tzfile", &class_tzfile));

    const int is_tzfile = PyObject_IsInstance(tzinfo, class_tzfile.obj());
    RETURN_IF_PYERROR();
    if (is_tzfile) {
      OwnedRef _filename(PyObject_GetAttrString(tzinfo, "_filename"));
      RETURN_IF_PYERROR();
      std::string result;
      RETURN_NOT_OK(internal::PyUnicode_AsStdString(_filename.obj(), &result));
      // _filename returns a full path in general ('/usr/share/zoneinfo/Europe/Paris')
      // or POSIX name on Windows ('Europe/Paris') - we need a substring in first case
      static constexpr char kZoneinfoDir[] = "zoneinfo/";
      const std::size_t pos = result.find(kZoneinfoDir);
      if (pos != std::string::npos) {
        return result.substr(pos + (sizeof(kZoneinfoDir) - 1));
      }
      return result;
    }
  }

  // attempt to call tzinfo.tzname(None)
  OwnedRef tzname_object(PyObject_CallMethod(tzinfo, "tzname", "O", Py_None));
  RETURN_IF_PYERROR();
  if (PyUnicode_Check(tzname_object.obj())) {
    std::string result;
    RETURN_NOT_OK(internal::PyUnicode_AsStdString(tzname_object.obj(), &result));
    return result;
  }

  // fall back to HH:MM offset string representation based on tzinfo.utcoffset(None)
  return PyTZInfo_utcoffset_hhmm(tzinfo);
}
|
||||
|
||||
// \brief Build a MonthDayNano struct sequence from an Arrow interval value.
//
// Fields are filled in the order months, days, nanoseconds.
//
// \param[in] interval the interval whose fields are copied
// \return a new reference to the struct sequence, or nullptr if the sequence
// or any of its integer fields could not be created
PyObject* MonthDayNanoIntervalToNamedTuple(
    const MonthDayNanoIntervalType::MonthDayNanos& interval) {
  OwnedRef tuple(PyStructSequence_New(&MonthDayNanoTupleType));
  if (ARROW_PREDICT_FALSE(tuple.obj() == nullptr)) {
    return nullptr;
  }
  // Create every field before storing any of them, so a failed allocation is
  // reported as nullptr instead of handing back a sequence with empty slots.
  OwnedRef months(PyLong_FromLong(interval.months));
  OwnedRef days(PyLong_FromLong(interval.days));
  OwnedRef nanoseconds(PyLong_FromLongLong(interval.nanoseconds));
  if (ARROW_PREDICT_FALSE(months.obj() == nullptr || days.obj() == nullptr ||
                          nanoseconds.obj() == nullptr)) {
    return nullptr;
  }
  // PyStructSequence_SetItem takes over the reference it is given, hence detach().
  PyStructSequence_SetItem(tuple.obj(), /*pos=*/0, months.detach());
  PyStructSequence_SetItem(tuple.obj(), /*pos=*/1, days.detach());
  PyStructSequence_SetItem(tuple.obj(), /*pos=*/2, nanoseconds.detach());
  return tuple.detach();
}
|
||||
|
||||
namespace {
|
||||
|
||||
// Wrapper around a Python list object that mimics dereference and assignment
|
||||
// operations.
|
||||
struct PyListAssigner {
|
||||
public:
|
||||
explicit PyListAssigner(PyObject* list) : list_(list) {
|
||||
ARROW_DCHECK(PyList_Check(list_));
|
||||
}
|
||||
|
||||
PyListAssigner& operator*() { return *this; }
|
||||
|
||||
void operator=(PyObject* obj) {
|
||||
if (ARROW_PREDICT_FALSE(PyList_SetItem(list_, current_index_, obj) == -1)) {
|
||||
Py_FatalError("list did not have the correct preallocated size.");
|
||||
}
|
||||
}
|
||||
|
||||
PyListAssigner& operator++() {
|
||||
current_index_++;
|
||||
return *this;
|
||||
}
|
||||
|
||||
PyListAssigner& operator+=(int64_t offset) {
|
||||
current_index_ += offset;
|
||||
return *this;
|
||||
}
|
||||
|
||||
private:
|
||||
PyObject* list_;
|
||||
int64_t current_index_ = 0;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// \brief Convert an interval array into a Python list of MonthDayNano tuples.
//
// A list of array.length() slots is allocated up front; filling it is
// delegated to internal::WriteArrayObjects, which receives the per-element
// callback defined below and a PyListAssigner over the list.
//
// \return a new reference to the list, or the first error encountered
Result<PyObject*> MonthDayNanoIntervalArrayToPyList(
    const MonthDayNanoIntervalArray& array) {
  OwnedRef result_list(PyList_New(array.length()));
  RETURN_IF_PYERROR();
  PyListAssigner list_writer(result_list.obj());

  // Turns one interval into a tuple and stores it through the assigner.
  auto write_interval = [&](const MonthDayNanoIntervalType::MonthDayNanos& interval,
                            PyListAssigner& out) -> Status {
    PyObject* tuple = internal::MonthDayNanoIntervalToNamedTuple(interval);
    if (ARROW_PREDICT_FALSE(tuple == nullptr)) {
      RETURN_IF_PYERROR();
    }

    *out = tuple;
    return Status::OK();
  };

  RETURN_NOT_OK(internal::WriteArrayObjects(array, write_interval, list_writer));
  return result_list.detach();
}
|
||||
|
||||
// \brief Convert an interval scalar into a MonthDayNano tuple.
//
// \return a new reference to None when the scalar is not valid, otherwise the
// result of MonthDayNanoIntervalToNamedTuple for the scalar's value
Result<PyObject*> MonthDayNanoIntervalScalarToPyObject(
    const MonthDayNanoIntervalScalar& scalar) {
  if (!scalar.is_valid) {
    Py_INCREF(Py_None);
    return Py_None;
  }
  return internal::MonthDayNanoIntervalToNamedTuple(scalar.value);
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,231 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
|
||||
#include "arrow/python/platform.h"
|
||||
#include "arrow/python/visibility.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/int_util_overflow.h"
|
||||
#include "arrow/util/logging.h"
|
||||
|
||||
// By default, PyDateTimeAPI is a *static* variable. This forces
|
||||
// PyDateTime_IMPORT to be called in every C/C++ module using the
|
||||
// C datetime API. This is error-prone and potentially costly.
|
||||
// Instead, we redefine PyDateTimeAPI to point to a global variable,
|
||||
// which is initialized once by calling InitDatetime().
|
||||
#ifdef PYPY_VERSION
|
||||
# include "datetime.h"
|
||||
#else
|
||||
# define PyDateTimeAPI ::arrow::py::internal::datetime_api
|
||||
#endif
|
||||
|
||||
namespace arrow {
|
||||
using internal::AddWithOverflow;
|
||||
using internal::MultiplyWithOverflow;
|
||||
namespace py {
|
||||
namespace internal {
|
||||
|
||||
#ifndef PYPY_VERSION
|
||||
extern PyDateTime_CAPI* datetime_api;
|
||||
|
||||
ARROW_PYTHON_EXPORT
|
||||
void InitDatetime();
|
||||
#endif
|
||||
|
||||
// Returns the MonthDayNano namedtuple type (increments the reference count).
|
||||
ARROW_PYTHON_EXPORT
|
||||
PyObject* NewMonthDayNanoTupleType();
|
||||
|
||||
/// \brief Sum the hour, minute, second and microsecond fields of a Python
/// time object, expressed in microseconds.
ARROW_PYTHON_EXPORT
inline int64_t PyTime_to_us(PyObject* pytime) {
  const int64_t hours_us = PyDateTime_TIME_GET_HOUR(pytime) * 3600000000LL;
  const int64_t minutes_us = PyDateTime_TIME_GET_MINUTE(pytime) * 60000000LL;
  const int64_t seconds_us = PyDateTime_TIME_GET_SECOND(pytime) * 1000000LL;
  return hours_us + minutes_us + seconds_us + PyDateTime_TIME_GET_MICROSECOND(pytime);
}
|
||||
|
||||
/// \brief PyTime_to_us() scaled down to whole seconds (integer division).
ARROW_PYTHON_EXPORT
inline int64_t PyTime_to_s(PyObject* pytime) {
  const int64_t micros = PyTime_to_us(pytime);
  return micros / 1000000;
}
|
||||
|
||||
/// \brief PyTime_to_us() scaled down to whole milliseconds (integer division).
ARROW_PYTHON_EXPORT
inline int64_t PyTime_to_ms(PyObject* pytime) {
  const int64_t micros = PyTime_to_us(pytime);
  return micros / 1000;
}
|
||||
|
||||
/// \brief PyTime_to_us() scaled up to nanoseconds.
ARROW_PYTHON_EXPORT
inline int64_t PyTime_to_ns(PyObject* pytime) {
  const int64_t micros = PyTime_to_us(pytime);
  return micros * 1000;
}
|
||||
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out);
|
||||
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out);
|
||||
|
||||
// WARNING: This function returns a naive datetime.
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out);
|
||||
|
||||
// This declaration must be the same as in filesystem/filesystem.h
|
||||
using TimePoint =
|
||||
std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
|
||||
|
||||
ARROW_PYTHON_EXPORT
|
||||
int64_t PyDate_to_days(PyDateTime_Date* pydate);
|
||||
|
||||
/// \brief PyDate_to_days() converted to seconds (86400 per day).
ARROW_PYTHON_EXPORT
inline int64_t PyDate_to_s(PyDateTime_Date* pydate) {
  const int64_t days = PyDate_to_days(pydate);
  return days * 86400LL;
}
|
||||
|
||||
/// \brief PyDate_to_days() converted to milliseconds (86400000 per day).
ARROW_PYTHON_EXPORT
inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) {
  const int64_t days = PyDate_to_days(pydate);
  return days * 86400000LL;
}
|
||||
|
||||
/// \brief Seconds for a Python datetime: PyDate_to_s() of its date part plus
/// the hour, minute and second fields. The microsecond field is not included.
ARROW_PYTHON_EXPORT
inline int64_t PyDateTime_to_s(PyDateTime_DateTime* pydatetime) {
  const int64_t date_s = PyDate_to_s(reinterpret_cast<PyDateTime_Date*>(pydatetime));
  const int64_t hour_s = PyDateTime_DATE_GET_HOUR(pydatetime) * 3600LL;
  const int64_t minute_s = PyDateTime_DATE_GET_MINUTE(pydatetime) * 60LL;
  return date_s + hour_s + minute_s + PyDateTime_DATE_GET_SECOND(pydatetime);
}
|
||||
|
||||
/// \brief PyDateTime_to_s() in milliseconds plus the microsecond field
/// truncated to whole milliseconds.
ARROW_PYTHON_EXPORT
inline int64_t PyDateTime_to_ms(PyDateTime_DateTime* pydatetime) {
  const int64_t whole_seconds_ms = PyDateTime_to_s(pydatetime) * 1000LL;
  return whole_seconds_ms + PyDateTime_DATE_GET_MICROSECOND(pydatetime) / 1000;
}
|
||||
|
||||
/// \brief PyDateTime_to_s() in microseconds plus the microsecond field.
ARROW_PYTHON_EXPORT
inline int64_t PyDateTime_to_us(PyDateTime_DateTime* pydatetime) {
  const int64_t whole_seconds_us = PyDateTime_to_s(pydatetime) * 1000000LL;
  return whole_seconds_us + PyDateTime_DATE_GET_MICROSECOND(pydatetime);
}
|
||||
|
||||
/// \brief PyDateTime_to_us() scaled up to nanoseconds. The multiplication is
/// not checked for overflow.
ARROW_PYTHON_EXPORT
inline int64_t PyDateTime_to_ns(PyDateTime_DateTime* pydatetime) {
  const int64_t micros = PyDateTime_to_us(pydatetime);
  return micros * 1000LL;
}
|
||||
|
||||
/// \brief Wrap PyDateTime_to_ns() in a TimePoint.
ARROW_PYTHON_EXPORT
inline TimePoint PyDateTime_to_TimePoint(PyDateTime_DateTime* pydatetime) {
  const TimePoint::duration since_epoch(PyDateTime_to_ns(pydatetime));
  return TimePoint(since_epoch);
}
|
||||
|
||||
/// \brief Tick count of the time point's time_since_epoch(); TimePoint ticks
/// are nanoseconds.
ARROW_PYTHON_EXPORT
inline int64_t TimePoint_to_ns(TimePoint val) {
  const auto since_epoch = val.time_since_epoch();
  return since_epoch.count();
}
|
||||
|
||||
/// \brief Build a TimePoint from fractional seconds; the value is multiplied
/// by 1e9 and converted to an integer tick count.
ARROW_PYTHON_EXPORT
inline TimePoint TimePoint_from_s(double val) {
  const auto ticks = static_cast<int64_t>(1e9 * val);
  return TimePoint(TimePoint::duration(ticks));
}
|
||||
|
||||
/// \brief Build a TimePoint whose duration holds `val` ticks.
ARROW_PYTHON_EXPORT
inline TimePoint TimePoint_from_ns(int64_t val) {
  const TimePoint::duration since_epoch(val);
  return TimePoint(since_epoch);
}
|
||||
|
||||
/// \brief Seconds held in a timedelta's days and seconds fields. The
/// microseconds field is not included.
ARROW_PYTHON_EXPORT
inline int64_t PyDelta_to_s(PyDateTime_Delta* pytimedelta) {
  const int64_t days_s = PyDateTime_DELTA_GET_DAYS(pytimedelta) * 86400LL;
  return days_s + PyDateTime_DELTA_GET_SECONDS(pytimedelta);
}
|
||||
|
||||
/// \brief PyDelta_to_s() in milliseconds plus the microseconds field
/// truncated to whole milliseconds.
ARROW_PYTHON_EXPORT
inline int64_t PyDelta_to_ms(PyDateTime_Delta* pytimedelta) {
  const int64_t whole_seconds_ms = PyDelta_to_s(pytimedelta) * 1000LL;
  return whole_seconds_ms + PyDateTime_DELTA_GET_MICROSECONDS(pytimedelta) / 1000;
}
|
||||
|
||||
/// \brief Convert a timedelta to microseconds with overflow checking.
///
/// \return PyDelta_to_s() * 1000000 plus the microseconds field, or
/// Status::Invalid if either step overflows a 64-bit integer
ARROW_PYTHON_EXPORT
inline Result<int64_t> PyDelta_to_us(PyDateTime_Delta* pytimedelta) {
  int64_t total = PyDelta_to_s(pytimedelta);
  // Scale first, then add the sub-second part; the second check only runs
  // when the first one did not overflow.
  if (MultiplyWithOverflow(total, 1000000LL, &total) ||
      AddWithOverflow(total, PyDateTime_DELTA_GET_MICROSECONDS(pytimedelta), &total)) {
    return Status::Invalid("Timedelta too large to fit in 64-bit integer");
  }
  return total;
}
|
||||
|
||||
/// \brief Convert a timedelta to nanoseconds with overflow checking.
///
/// \return PyDelta_to_us() * 1000, any error from PyDelta_to_us(), or
/// Status::Invalid if the multiplication overflows a 64-bit integer
ARROW_PYTHON_EXPORT
inline Result<int64_t> PyDelta_to_ns(PyDateTime_Delta* pytimedelta) {
  ARROW_ASSIGN_OR_RAISE(int64_t total, PyDelta_to_us(pytimedelta));
  const bool overflowed = MultiplyWithOverflow(total, 1000LL, &total);
  if (overflowed) {
    return Status::Invalid("Timedelta too large to fit in 64-bit integer");
  }
  return total;
}
|
||||
|
||||
ARROW_PYTHON_EXPORT
|
||||
Result<int64_t> PyDateTime_utcoffset_s(PyObject* pydatetime);
|
||||
|
||||
/// \brief Convert a time zone name into a time zone object.
|
||||
///
|
||||
/// Supported input strings are:
|
||||
/// * As used in the Olson time zone database (the "tz database" or
|
||||
/// "tzdata"), such as "America/New_York"
|
||||
/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
|
||||
/// GIL must be held when calling this method.
|
||||
ARROW_PYTHON_EXPORT
|
||||
Result<PyObject*> StringToTzinfo(const std::string& tz);
|
||||
|
||||
/// \brief Convert a time zone object to a string representation.
|
||||
///
|
||||
/// The output strings are:
|
||||
/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
|
||||
/// if the input object is either an instance of pytz._FixedOffset or
|
||||
/// datetime.timezone
|
||||
/// * The timezone's name if the input object's tzname() method returns with a
|
||||
/// non-empty timezone name such as "UTC" or "America/New_York"
|
||||
///
|
||||
/// GIL must be held when calling this method.
|
||||
ARROW_PYTHON_EXPORT
|
||||
Result<std::string> TzinfoToString(PyObject* pytzinfo);
|
||||
|
||||
/// \brief Convert MonthDayNano to a python namedtuple.
|
||||
///
|
||||
/// Return a named tuple (pyarrow.MonthDayNano) containing attributes
|
||||
/// "months", "days", "nanoseconds" in the given order
|
||||
/// with values extracted from the fields on interval.
|
||||
///
|
||||
/// GIL must be held when calling this method.
|
||||
ARROW_PYTHON_EXPORT
|
||||
PyObject* MonthDayNanoIntervalToNamedTuple(
|
||||
const MonthDayNanoIntervalType::MonthDayNanos& interval);
|
||||
|
||||
/// \brief Convert the given Array to a PyList object containing
|
||||
/// pyarrow.MonthDayNano objects.
|
||||
ARROW_PYTHON_EXPORT
|
||||
Result<PyObject*> MonthDayNanoIntervalArrayToPyList(
|
||||
const MonthDayNanoIntervalArray& array);
|
||||
|
||||
/// \brief Convert the Scalar object to a pyarrow.MonthDayNano (or None if
|
||||
/// it isn't valid).
|
||||
ARROW_PYTHON_EXPORT
|
||||
Result<PyObject*> MonthDayNanoIntervalScalarToPyObject(
|
||||
const MonthDayNanoIntervalScalar& scalar);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,265 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/python/decimal.h"
|
||||
#include "arrow/python/helpers.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/decimal.h"
|
||||
#include "arrow/util/logging.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
namespace internal {
|
||||
|
||||
// \brief Import the `decimal` module and store its `Decimal` attribute in
// *decimal_type.
Status ImportDecimalType(OwnedRef* decimal_type) {
  OwnedRef module;
  RETURN_NOT_OK(ImportModule("decimal", &module));
  return ImportFromModule(module.obj(), "Decimal", decimal_type);
}
|
||||
|
||||
// \brief Write the string form of python_decimal into *out by forwarding to
// PyObject_StdStringStr.
Status PythonDecimalToString(PyObject* python_decimal, std::string* out) {
  Status status = PyObject_StdStringStr(python_decimal, out);
  return status;
}
|
||||
|
||||
// \brief Infer the precision and scale of a Python decimal.Decimal instance
|
||||
// \param python_decimal[in] An instance of decimal.Decimal
|
||||
// \param precision[out] The value of the inferred precision
|
||||
// \param scale[out] The value of the inferred scale
|
||||
// \return The status of the operation
|
||||
static Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int32_t* precision,
|
||||
int32_t* scale) {
|
||||
ARROW_DCHECK_NE(python_decimal, NULLPTR);
|
||||
ARROW_DCHECK_NE(precision, NULLPTR);
|
||||
ARROW_DCHECK_NE(scale, NULLPTR);
|
||||
|
||||
// TODO(phillipc): Make sure we perform PyDecimal_Check(python_decimal) as a
|
||||
// ARROW_DCHECK
|
||||
OwnedRef as_tuple(PyObject_CallMethod(python_decimal, const_cast<char*>("as_tuple"),
|
||||
const_cast<char*>("")));
|
||||
RETURN_IF_PYERROR();
|
||||
ARROW_DCHECK(PyTuple_Check(as_tuple.obj()));
|
||||
|
||||
OwnedRef digits(PyObject_GetAttrString(as_tuple.obj(), "digits"));
|
||||
RETURN_IF_PYERROR();
|
||||
ARROW_DCHECK(PyTuple_Check(digits.obj()));
|
||||
|
||||
const auto num_digits = static_cast<int32_t>(PyTuple_Size(digits.obj()));
|
||||
RETURN_IF_PYERROR();
|
||||
|
||||
OwnedRef py_exponent(PyObject_GetAttrString(as_tuple.obj(), "exponent"));
|
||||
RETURN_IF_PYERROR();
|
||||
ARROW_DCHECK(IsPyInteger(py_exponent.obj()));
|
||||
|
||||
const auto exponent = static_cast<int32_t>(PyLong_AsLong(py_exponent.obj()));
|
||||
RETURN_IF_PYERROR();
|
||||
|
||||
if (exponent < 0) {
|
||||
// If exponent > num_digits, we have a number with leading zeros
|
||||
// such as 0.01234. Ensure we have enough precision for leading zeros
|
||||
// (which are not included in num_digits).
|
||||
*precision = std::max(num_digits, -exponent);
|
||||
*scale = -exponent;
|
||||
} else {
|
||||
// Trailing zeros are not included in num_digits, need to add to precision.
|
||||
// Note we don't generate negative scales as they are poorly supported
|
||||
// in non-Arrow systems.
|
||||
*precision = num_digits + exponent;
|
||||
*scale = 0;
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// \brief Call decimal_constructor with decimal_string as its only argument.
//
// The string is handed over as a (pointer, length) pair via the "s#" format.
// \return whatever PyObject_CallFunction returns
PyObject* DecimalFromString(PyObject* decimal_constructor,
                            const std::string& decimal_string) {
  ARROW_DCHECK_NE(decimal_constructor, nullptr);

  const auto length = decimal_string.size();
  ARROW_DCHECK_GT(length, 0);

  const auto data = decimal_string.c_str();
  ARROW_DCHECK_NE(data, nullptr);

  char* format = const_cast<char*>("s#");
  return PyObject_CallFunction(decimal_constructor, format, data,
                               static_cast<Py_ssize_t>(length));
}
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename ArrowDecimal>
|
||||
Status DecimalFromStdString(const std::string& decimal_string,
|
||||
const DecimalType& arrow_type, ArrowDecimal* out) {
|
||||
int32_t inferred_precision;
|
||||
int32_t inferred_scale;
|
||||
|
||||
RETURN_NOT_OK(ArrowDecimal::FromString(decimal_string, out, &inferred_precision,
|
||||
&inferred_scale));
|
||||
|
||||
const int32_t precision = arrow_type.precision();
|
||||
const int32_t scale = arrow_type.scale();
|
||||
|
||||
if (scale != inferred_scale) {
|
||||
ARROW_DCHECK_NE(out, NULLPTR);
|
||||
ARROW_ASSIGN_OR_RAISE(*out, out->Rescale(inferred_scale, scale));
|
||||
}
|
||||
|
||||
auto inferred_scale_delta = inferred_scale - scale;
|
||||
if (ARROW_PREDICT_FALSE((inferred_precision - inferred_scale_delta) > precision)) {
|
||||
return Status::Invalid(
|
||||
"Decimal type with precision ", inferred_precision,
|
||||
" does not fit into precision inferred from first array element: ", precision);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename ArrowDecimal>
|
||||
Status InternalDecimalFromPythonDecimal(PyObject* python_decimal,
|
||||
const DecimalType& arrow_type,
|
||||
ArrowDecimal* out) {
|
||||
ARROW_DCHECK_NE(python_decimal, NULLPTR);
|
||||
ARROW_DCHECK_NE(out, NULLPTR);
|
||||
|
||||
std::string string;
|
||||
RETURN_NOT_OK(PythonDecimalToString(python_decimal, &string));
|
||||
return DecimalFromStdString(string, arrow_type, out);
|
||||
}
|
||||
|
||||
template <typename ArrowDecimal>
|
||||
Status InternalDecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type,
|
||||
ArrowDecimal* out) {
|
||||
ARROW_DCHECK_NE(obj, NULLPTR);
|
||||
ARROW_DCHECK_NE(out, NULLPTR);
|
||||
|
||||
if (IsPyInteger(obj)) {
|
||||
// TODO: add a fast path for small-ish ints
|
||||
std::string string;
|
||||
RETURN_NOT_OK(PyObject_StdStringStr(obj, &string));
|
||||
return DecimalFromStdString(string, arrow_type, out);
|
||||
} else if (PyDecimal_Check(obj)) {
|
||||
return InternalDecimalFromPythonDecimal<ArrowDecimal>(obj, arrow_type, out);
|
||||
} else {
|
||||
return Status::TypeError("int or Decimal object expected, got ",
|
||||
Py_TYPE(obj)->tp_name);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Decimal32 overload; forwards to the shared template implementation.
Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
                                Decimal32* out) {
  return InternalDecimalFromPythonDecimal<Decimal32>(python_decimal, arrow_type, out);
}
|
||||
|
||||
// Decimal32 overload; forwards to the shared template implementation.
Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal32* out) {
  return InternalDecimalFromPyObject<Decimal32>(obj, arrow_type, out);
}
|
||||
|
||||
// Decimal64 overload; forwards to the shared template implementation.
Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
                                Decimal64* out) {
  return InternalDecimalFromPythonDecimal<Decimal64>(python_decimal, arrow_type, out);
}
|
||||
|
||||
// Decimal64 overload; forwards to the shared template implementation.
Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal64* out) {
  return InternalDecimalFromPyObject<Decimal64>(obj, arrow_type, out);
}
|
||||
|
||||
// Decimal128 overload; forwards to the shared template implementation.
Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
                                Decimal128* out) {
  return InternalDecimalFromPythonDecimal<Decimal128>(python_decimal, arrow_type, out);
}
|
||||
|
||||
// Decimal128 overload; forwards to the shared template implementation.
Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type,
                           Decimal128* out) {
  return InternalDecimalFromPyObject<Decimal128>(obj, arrow_type, out);
}
|
||||
|
||||
// Decimal256 overload; forwards to the shared template implementation.
Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
                                Decimal256* out) {
  return InternalDecimalFromPythonDecimal<Decimal256>(python_decimal, arrow_type, out);
}
|
||||
|
||||
// Decimal256 overload; forwards to the shared template implementation.
Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type,
                           Decimal256* out) {
  return InternalDecimalFromPyObject<Decimal256>(obj, arrow_type, out);
}
|
||||
|
||||
// \brief Report whether obj's type is decimal.Decimal or a subtype of it.
//
// The Decimal type is imported on the first call and kept in a function-local
// static for later calls.
bool PyDecimal_Check(PyObject* obj) {
  static OwnedRef decimal_type;
  if (decimal_type.obj() == nullptr) {
    ARROW_CHECK_OK(ImportDecimalType(&decimal_type));
    ARROW_DCHECK(PyType_Check(decimal_type.obj()));
  }
  // PyObject_IsInstance() is slower as it has to check for virtual subclasses
  auto* decimal_type_object = reinterpret_cast<PyTypeObject*>(decimal_type.obj());
  const int result = PyType_IsSubtype(Py_TYPE(obj), decimal_type_object);
  ARROW_CHECK_NE(result, -1) << " error during PyType_IsSubtype check";
  return result == 1;
}
|
||||
|
||||
// \brief Report whether a decimal.Decimal instance is NaN.
//
// Calls obj.is_nan() and returns true only when the result is truthy. If the
// call itself fails, false is returned and the Python exception is left
// pending for the caller to handle.
bool PyDecimal_ISNAN(PyObject* obj) {
  ARROW_DCHECK(PyDecimal_Check(obj)) << "obj is not an instance of decimal.Decimal";
  OwnedRef is_nan(
      PyObject_CallMethod(obj, const_cast<char*>("is_nan"), const_cast<char*>("")));
  // PyObject_IsTrue must not be given a null pointer.
  if (ARROW_PREDICT_FALSE(is_nan.obj() == nullptr)) {
    return false;
  }
  return PyObject_IsTrue(is_nan.obj()) == 1;
}
|
||||
|
||||
// Default state: precision and scale both start at the smallest int32_t
// value, which Update() treats as "no precision recorded yet".
DecimalMetadata::DecimalMetadata()
    : precision_(std::numeric_limits<int32_t>::min()),
      scale_(std::numeric_limits<int32_t>::min()) {}
|
||||
|
||||
DecimalMetadata::DecimalMetadata(int32_t precision, int32_t scale)
|
||||
: precision_(precision), scale_(scale) {}
|
||||
|
||||
// \brief Widen the stored precision and scale to also fit the suggested pair.
//
// The scale becomes the larger of the two scales. The first update takes the
// suggested precision as is; later updates keep enough digits before the
// decimal point for both values and never shrink the stored precision.
Status DecimalMetadata::Update(int32_t suggested_precision, int32_t suggested_scale) {
  const int32_t previous_scale = scale_;
  const int32_t previous_precision = precision_;

  scale_ = std::max(previous_scale, suggested_scale);

  if (previous_precision == std::numeric_limits<int32_t>::min()) {
    precision_ = suggested_precision;
    return Status::OK();
  }

  const int32_t integer_digits = std::max(previous_precision - previous_scale,
                                          suggested_precision - suggested_scale);
  precision_ = std::max(integer_digits + scale_, previous_precision);
  return Status::OK();
}
|
||||
|
||||
// \brief Fold the precision and scale of a Python object into this metadata.
//
// Objects that are not decimal.Decimal instances, and NaN decimals, leave the
// metadata unchanged and return OK.
Status DecimalMetadata::Update(PyObject* object) {
  if (ARROW_PREDICT_FALSE(!PyDecimal_Check(object))) {
    return Status::OK();
  }
  if (ARROW_PREDICT_FALSE(PyDecimal_ISNAN(object))) {
    return Status::OK();
  }

  int32_t inferred_precision = 0;
  int32_t inferred_scale = 0;
  RETURN_NOT_OK(
      InferDecimalPrecisionAndScale(object, &inferred_precision, &inferred_scale));
  return Update(inferred_precision, inferred_scale);
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,162 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "arrow/python/visibility.h"
|
||||
#include "arrow/type.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Decimal128;
|
||||
class Decimal256;
|
||||
|
||||
namespace py {
|
||||
|
||||
class OwnedRef;
|
||||
|
||||
//
|
||||
// Python Decimal support
|
||||
//
|
||||
|
||||
namespace internal {
|
||||
|
||||
// \brief Import the Python Decimal type
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status ImportDecimalType(OwnedRef* decimal_type);
|
||||
|
||||
// \brief Convert a Python Decimal object to a C++ string
|
||||
// \param[in] python_decimal A Python decimal.Decimal instance
|
||||
// \param[out] out The string representation of the Python Decimal instance
|
||||
// \return The status of the operation
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status PythonDecimalToString(PyObject* python_decimal, std::string* out);
|
||||
|
||||
// \brief Convert a C++ std::string to a Python Decimal instance
|
||||
// \param[in] decimal_constructor The decimal type object
|
||||
// \param[in] decimal_string A decimal string
|
||||
// \return An instance of decimal.Decimal
|
||||
ARROW_PYTHON_EXPORT
|
||||
PyObject* DecimalFromString(PyObject* decimal_constructor,
|
||||
const std::string& decimal_string);
|
||||
|
||||
// \brief Convert a Python decimal to an Arrow Decimal32 object
|
||||
// \param[in] python_decimal A Python decimal.Decimal instance
|
||||
// \param[in] arrow_type An instance of arrow::DecimalType
|
||||
// \param[out] out A pointer to a Decimal32
|
||||
// \return The status of the operation
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
|
||||
Decimal32* out);
|
||||
|
||||
// \brief Convert a Python object to an Arrow Decimal32 object
|
||||
// \param[in] obj A Python int or decimal.Decimal instance
|
||||
// \param[in] arrow_type An instance of arrow::DecimalType
|
||||
// \param[out] out A pointer to a Decimal32
|
||||
// \return The status of the operation
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal32* out);
|
||||
|
||||
// \brief Convert a Python decimal to an Arrow Decimal64 object
|
||||
// \param[in] python_decimal A Python decimal.Decimal instance
|
||||
// \param[in] arrow_type An instance of arrow::DecimalType
|
||||
// \param[out] out A pointer to a Decimal64
|
||||
// \return The status of the operation
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
|
||||
Decimal64* out);
|
||||
|
||||
// \brief Convert a Python object to an Arrow Decimal64 object
|
||||
// \param[in] obj A Python int or decimal.Decimal instance
|
||||
// \param[in] arrow_type An instance of arrow::DecimalType
|
||||
// \param[out] out A pointer to a Decimal64
|
||||
// \return The status of the operation
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal64* out);
|
||||
|
||||
// \brief Convert a Python decimal to an Arrow Decimal128 object
|
||||
// \param[in] python_decimal A Python decimal.Decimal instance
|
||||
// \param[in] arrow_type An instance of arrow::DecimalType
|
||||
// \param[out] out A pointer to a Decimal128
|
||||
// \return The status of the operation
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
|
||||
Decimal128* out);
|
||||
|
||||
// \brief Convert a Python object to an Arrow Decimal128 object
|
||||
// \param[in] obj A Python int or decimal.Decimal instance
|
||||
// \param[in] arrow_type An instance of arrow::DecimalType
|
||||
// \param[out] out A pointer to a Decimal128
|
||||
// \return The status of the operation
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal128* out);
|
||||
|
||||
// \brief Convert a Python decimal to an Arrow Decimal256 object
|
||||
// \param[in] python_decimal A Python decimal.Decimal instance
|
||||
// \param[in] arrow_type An instance of arrow::DecimalType
|
||||
// \param[out] out A pointer to a Decimal256
|
||||
// \return The status of the operation
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
|
||||
Decimal256* out);
|
||||
|
||||
// \brief Convert a Python object to an Arrow Decimal256 object
|
||||
// \param[in] obj A Python int or decimal.Decimal instance
|
||||
// \param[in] arrow_type An instance of arrow::DecimalType
|
||||
// \param[out] out A pointer to a Decimal256
|
||||
// \return The status of the operation
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal256* out);
|
||||
|
||||
// \brief Check whether obj is an instance of Decimal
|
||||
ARROW_PYTHON_EXPORT
|
||||
bool PyDecimal_Check(PyObject* obj);
|
||||
|
||||
// \brief Check whether obj is nan. This function will abort the program if the argument
|
||||
// is not a Decimal instance
|
||||
ARROW_PYTHON_EXPORT
|
||||
bool PyDecimal_ISNAN(PyObject* obj);
|
||||
|
||||
// \brief Helper class to track and update the precision and scale of a decimal
|
||||
class ARROW_PYTHON_EXPORT DecimalMetadata {
|
||||
public:
|
||||
DecimalMetadata();
|
||||
DecimalMetadata(int32_t precision, int32_t scale);
|
||||
|
||||
// \brief Adjust the precision and scale of a decimal type given a new precision and a
|
||||
// new scale \param[in] suggested_precision A candidate precision \param[in]
|
||||
// suggested_scale A candidate scale \return The status of the operation
|
||||
Status Update(int32_t suggested_precision, int32_t suggested_scale);
|
||||
|
||||
// \brief A convenient interface for updating the precision and scale based on a Python
|
||||
// Decimal object \param object A Python Decimal object \return The status of the
|
||||
// operation
|
||||
Status Update(PyObject* object);
|
||||
|
||||
int32_t precision() const { return precision_; }
|
||||
int32_t scale() const { return scale_; }
|
||||
|
||||
private:
|
||||
int32_t precision_;
|
||||
int32_t scale_;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,217 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/python/extension_type.h"
|
||||
#include "arrow/python/helpers.h"
|
||||
#include "arrow/python/pyarrow.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/logging.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using internal::checked_cast;
|
||||
|
||||
namespace py {
|
||||
|
||||
namespace {
|
||||
|
||||
// Serialize a Python ExtensionType instance
|
||||
Status SerializeExtInstance(PyObject* type_instance, std::string* out) {
|
||||
OwnedRef res(
|
||||
cpp_PyObject_CallMethod(type_instance, "__arrow_ext_serialize__", nullptr));
|
||||
if (!res) {
|
||||
return ConvertPyError();
|
||||
}
|
||||
if (!PyBytes_Check(res.obj())) {
|
||||
return Status::TypeError(
|
||||
"__arrow_ext_serialize__ should return bytes object, "
|
||||
"got ",
|
||||
internal::PyObject_StdStringRepr(res.obj()));
|
||||
}
|
||||
*out = internal::PyBytes_AsStdString(res.obj());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Deserialize a Python ExtensionType instance
|
||||
PyObject* DeserializeExtInstance(PyObject* type_class,
|
||||
std::shared_ptr<DataType> storage_type,
|
||||
const std::string& serialized_data) {
|
||||
OwnedRef storage_ref(wrap_data_type(storage_type));
|
||||
if (!storage_ref) {
|
||||
return nullptr;
|
||||
}
|
||||
OwnedRef data_ref(PyBytes_FromStringAndSize(
|
||||
serialized_data.data(), static_cast<Py_ssize_t>(serialized_data.size())));
|
||||
if (!data_ref) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return cpp_PyObject_CallMethod(type_class, "__arrow_ext_deserialize__", "OO",
|
||||
storage_ref.obj(), data_ref.obj());
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Extension name used when a PyExtensionType is created without an explicit
// name.  `constexpr` makes the pointer itself immutable as well as the text.
static constexpr const char* kExtensionName = "arrow.py_extension_type";
|
||||
|
||||
std::string PyExtensionType::ToString(bool show_metadata) const {
|
||||
PyAcquireGIL lock;
|
||||
|
||||
std::stringstream ss;
|
||||
OwnedRef instance(GetInstance());
|
||||
ss << "extension<" << this->extension_name() << "<" << Py_TYPE(instance.obj())->tp_name
|
||||
<< ">>";
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
// Construct a PyExtensionType registered under the default extension name.
//
// Delegates to the constructor taking an explicit name so the member
// initialization lives in one place; `storage_type` is moved rather than
// copied to avoid an extra reference-count round trip.
PyExtensionType::PyExtensionType(std::shared_ptr<DataType> storage_type, PyObject* typ,
                                 PyObject* inst)
    : PyExtensionType(std::move(storage_type), kExtensionName, typ, inst) {}
|
||||
|
||||
// Construct a PyExtensionType with an explicit extension name.
//
// `typ` and `inst` are stored in `type_class_` and `type_instance_`
// respectively; `storage_type` is moved into the ExtensionType base instead
// of being copied.
PyExtensionType::PyExtensionType(std::shared_ptr<DataType> storage_type,
                                 std::string extension_name, PyObject* typ,
                                 PyObject* inst)
    : ExtensionType(std::move(storage_type)),
      extension_name_(std::move(extension_name)),
      type_class_(typ),
      type_instance_(inst) {}
|
||||
|
||||
bool PyExtensionType::ExtensionEquals(const ExtensionType& other) const {
|
||||
PyAcquireGIL lock;
|
||||
|
||||
if (other.extension_name() != extension_name()) {
|
||||
return false;
|
||||
}
|
||||
const auto& other_ext = checked_cast<const PyExtensionType&>(other);
|
||||
int res = -1;
|
||||
if (!type_instance_) {
|
||||
if (other_ext.type_instance_) {
|
||||
return false;
|
||||
}
|
||||
// Compare Python types
|
||||
res = PyObject_RichCompareBool(type_class_.obj(), other_ext.type_class_.obj(), Py_EQ);
|
||||
} else {
|
||||
if (!other_ext.type_instance_) {
|
||||
return false;
|
||||
}
|
||||
// Compare Python instances
|
||||
OwnedRef left(GetInstance());
|
||||
OwnedRef right(other_ext.GetInstance());
|
||||
if (!left || !right) {
|
||||
goto error;
|
||||
}
|
||||
res = PyObject_RichCompareBool(left.obj(), right.obj(), Py_EQ);
|
||||
}
|
||||
if (res == -1) {
|
||||
goto error;
|
||||
}
|
||||
return res == 1;
|
||||
|
||||
error:
|
||||
// Cannot propagate error
|
||||
PyErr_WriteUnraisable(nullptr);
|
||||
return false;
|
||||
}
|
||||
|
||||
std::shared_ptr<Array> PyExtensionType::MakeArray(std::shared_ptr<ArrayData> data) const {
|
||||
ARROW_DCHECK_EQ(data->type->id(), Type::EXTENSION);
|
||||
return std::make_shared<ExtensionArray>(data);
|
||||
}
|
||||
|
||||
std::string PyExtensionType::Serialize() const {
|
||||
ARROW_DCHECK(type_instance_);
|
||||
return serialized_;
|
||||
}
|
||||
|
||||
// Rebuild an extension DataType from its storage type and serialized form.
//
// Holds the GIL, makes sure pyarrow is imported, asks the Python class to
// deserialize the instance and unwraps the resulting Python object.  Python
// errors raised along the way are converted into the returned error.
Result<std::shared_ptr<DataType>> PyExtensionType::Deserialize(
    std::shared_ptr<DataType> storage_type, const std::string& serialized_data) const {
  PyAcquireGIL lock;

  if (import_pyarrow() != 0) {
    return ConvertPyError();
  }

  OwnedRef py_type(
      DeserializeExtInstance(type_class_.obj(), storage_type, serialized_data));
  if (py_type) {
    return unwrap_data_type(py_type.obj());
  }
  return ConvertPyError();
}
|
||||
|
||||
// Return a new reference to the Python extension type instance.
//
// Sets a Python TypeError and returns nullptr when no instance has been
// attached.  When the weakly referenced instance is gone, a fresh one is
// rebuilt from the serialized form (and is not cached again).
PyObject* PyExtensionType::GetInstance() const {
  if (!type_instance_) {
    PyErr_SetString(PyExc_TypeError, "Not an instance");
    return nullptr;
  }
  ARROW_DCHECK(PyWeakref_CheckRef(type_instance_.obj()));

  PyObject* referent = PyWeakref_GET_OBJECT(type_instance_.obj());
  if (referent == Py_None) {
    // Must reconstruct from serialized form
    // XXX cache again?
    return DeserializeExtInstance(type_class_.obj(), storage_type_, serialized_);
  }

  // Cached instance still alive
  Py_INCREF(referent);
  return referent;
}
|
||||
|
||||
// Attach a Python extension type instance to this C++ type.
//
// `inst` must be an instance of exactly the registered class, otherwise a
// TypeError is returned.  On success the object keeps a weak reference to
// `inst` together with its serialized form.
//
// The serialized form and the weak reference are both produced before either
// member is modified, so a failure in `__arrow_ext_serialize__` or in weakref
// creation leaves the previously stored instance and serialized data
// untouched and consistent with each other.
Status PyExtensionType::SetInstance(PyObject* inst) const {
  // Check we have the right type
  PyObject* typ = reinterpret_cast<PyObject*>(Py_TYPE(inst));
  if (typ != type_class_.obj()) {
    return Status::TypeError("Unexpected Python ExtensionType class ",
                             internal::PyObject_StdStringRepr(typ), " expected ",
                             internal::PyObject_StdStringRepr(type_class_.obj()));
  }

  std::string serialized;
  RETURN_NOT_OK(SerializeExtInstance(inst, &serialized));

  PyObject* wr = PyWeakref_NewRef(inst, nullptr);
  if (wr == nullptr) {
    return ConvertPyError();
  }

  // Commit both pieces of state together.
  type_instance_.reset(wr);
  serialized_ = std::move(serialized);
  return Status::OK();
}
|
||||
|
||||
// Create a PyExtensionType for the Python class `typ` and store it in `*out`.
//
// `typ` is treated as a borrowed reference: it is INCREF'ed here before being
// handed to the new object.
//
// The by-value parameters are non-const in this definition (top-level const
// on a by-value parameter is not part of the function's type, so this still
// matches the declaration).  That lets std::move actually move; moving from a
// const object silently falls back to a copy.
Status PyExtensionType::FromClass(std::shared_ptr<DataType> storage_type,
                                  std::string extension_name, PyObject* typ,
                                  std::shared_ptr<ExtensionType>* out) {
  Py_INCREF(typ);
  // The constructor is not public, so std::make_shared cannot be used here.
  out->reset(
      new PyExtensionType(std::move(storage_type), std::move(extension_name), typ));
  return Status::OK();
}
|
||||
|
||||
// Register `type` with the extension type registry.
//
// `type` is expected to be an extension type.  Debug builds assert on the
// type id; in addition, a failed downcast is reported as a TypeError instead
// of passing a null pointer on to RegisterExtensionType().
Status RegisterPyExtensionType(const std::shared_ptr<DataType>& type) {
  ARROW_DCHECK_EQ(type->id(), Type::EXTENSION);
  auto ext_type = std::dynamic_pointer_cast<ExtensionType>(type);
  if (ext_type == nullptr) {
    return Status::TypeError("Expected an extension type, got ", type->ToString());
  }
  return RegisterExtensionType(ext_type);
}
|
||||
|
||||
// Remove the extension type registered under `type_name`; the status of
// UnregisterExtensionType() is returned unchanged.
Status UnregisterPyExtensionType(const std::string& type_name) {
  Status outcome = UnregisterExtensionType(type_name);
  return outcome;
}
|
||||
|
||||
// Return the default extension name used by PyExtensionType.
std::string PyExtensionName() {
  return std::string(kExtensionName);
}
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,85 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "arrow/extension_type.h"
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/python/visibility.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
|
||||
// An Arrow ExtensionType whose behaviour is delegated to a Python class and,
// optionally, to an instance of that class.
class ARROW_PYTHON_EXPORT PyExtensionType : public ExtensionType {
 public:
  // Implement the ExtensionType API
  std::string extension_name() const override { return extension_name_; }

  std::string ToString(bool show_metadata = false) const override;

  bool ExtensionEquals(const ExtensionType& other) const override;

  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;

  Result<std::shared_ptr<DataType>> Deserialize(
      std::shared_ptr<DataType> storage_type,
      const std::string& serialized) const override;

  std::string Serialize() const override;

  // For use from Cython
  // Assumes that `typ` is borrowed
  //
  // The by-value parameters carry no top-level const: it has no effect in a
  // declaration and does not change the function's type.
  static Status FromClass(std::shared_ptr<DataType> storage_type,
                          std::string extension_name, PyObject* typ,
                          std::shared_ptr<ExtensionType>* out);

  // Return new ref
  PyObject* GetInstance() const;
  // Attach a Python instance to this type (second step of the two-step
  // initialization; see the mutable members below).
  Status SetInstance(PyObject*) const;

 protected:
  PyExtensionType(std::shared_ptr<DataType> storage_type, PyObject* typ,
                  PyObject* inst = NULLPTR);
  PyExtensionType(std::shared_ptr<DataType> storage_type, std::string extension_name,
                  PyObject* typ, PyObject* inst = NULLPTR);

  std::string extension_name_;

  // These fields are mutable because of two-step initialization.
  mutable OwnedRefNoGIL type_class_;
  // A weakref or null.  Storing a strong reference to the Python extension type
  // instance would create an unreclaimable reference cycle between Python and C++
  // (the Python instance has to keep a strong reference to the C++ ExtensionType
  // in other direction).  Instead, we store a weakref to the instance.
  // If the weakref is dead, we reconstruct the instance from its serialized form.
  mutable OwnedRefNoGIL type_instance_;
  // Empty if type_instance_ is null
  mutable std::string serialized_;
};
|
||||
|
||||
ARROW_PYTHON_EXPORT std::string PyExtensionName();
|
||||
|
||||
ARROW_PYTHON_EXPORT Status RegisterPyExtensionType(const std::shared_ptr<DataType>&);
|
||||
|
||||
ARROW_PYTHON_EXPORT Status UnregisterPyExtensionType(const std::string& type_name);
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,206 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "arrow/python/filesystem.h"
|
||||
#include "arrow/util/logging.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using fs::FileInfo;
|
||||
using fs::FileSelector;
|
||||
|
||||
namespace py {
|
||||
namespace fs {
|
||||
|
||||
// Build a filesystem that forwards every operation to `handler` through the
// callbacks in `vtable`.  The handler's reference count is incremented, so the
// caller keeps its own reference.
PyFileSystem::PyFileSystem(PyObject* handler, PyFileSystemVtable vtable)
    : vtable_(std::move(vtable)) {
  Py_INCREF(handler);
  handler_.reset(handler);
}
|
||||
|
||||
// Nothing to release by hand: the members clean up after themselves.
PyFileSystem::~PyFileSystem() = default;
|
||||
|
||||
// Convenience factory returning a shared PyFileSystem built from `handler`
// and `vtable`.
std::shared_ptr<PyFileSystem> PyFileSystem::Make(PyObject* handler,
                                                 PyFileSystemVtable vtable) {
  auto filesystem = std::make_shared<PyFileSystem>(handler, std::move(vtable));
  return filesystem;
}
|
||||
|
||||
std::string PyFileSystem::type_name() const {
|
||||
std::string result;
|
||||
auto st = SafeCallIntoPython([&]() -> Status {
|
||||
vtable_.get_type_name(handler_.obj(), &result);
|
||||
if (PyErr_Occurred()) {
|
||||
PyErr_WriteUnraisable(handler_.obj());
|
||||
}
|
||||
return Status::OK();
|
||||
});
|
||||
ARROW_UNUSED(st);
|
||||
return result;
|
||||
}
|
||||
|
||||
bool PyFileSystem::Equals(const FileSystem& other) const {
|
||||
bool result;
|
||||
auto st = SafeCallIntoPython([&]() -> Status {
|
||||
result = vtable_.equals(handler_.obj(), other);
|
||||
if (PyErr_Occurred()) {
|
||||
PyErr_WriteUnraisable(handler_.obj());
|
||||
}
|
||||
return Status::OK();
|
||||
});
|
||||
ARROW_UNUSED(st);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Fetch the FileInfo for one path from the Python handler.  The status
// produced by CheckPyError() after the callback is returned when it is not OK.
Result<FileInfo> PyFileSystem::GetFileInfo(const std::string& path) {
  FileInfo info;
  auto fetch = [&]() -> Status {
    vtable_.get_file_info(handler_.obj(), path, &info);
    return CheckPyError();
  };

  const Status st = SafeCallIntoPython(fetch);
  RETURN_NOT_OK(st);
  return info;
}
|
||||
|
||||
// Fetch the FileInfo of several paths from the Python handler in one call.
// The status produced by CheckPyError() after the callback is returned when
// it is not OK.
Result<std::vector<FileInfo>> PyFileSystem::GetFileInfo(
    const std::vector<std::string>& paths) {
  std::vector<FileInfo> infos;
  auto fetch = [&]() -> Status {
    vtable_.get_file_info_vector(handler_.obj(), paths, &infos);
    return CheckPyError();
  };

  const Status st = SafeCallIntoPython(fetch);
  RETURN_NOT_OK(st);
  return infos;
}
|
||||
|
||||
// Fetch the FileInfo entries matched by `select` from the Python handler.
// The status produced by CheckPyError() after the callback is returned when
// it is not OK.
Result<std::vector<FileInfo>> PyFileSystem::GetFileInfo(const FileSelector& select) {
  std::vector<FileInfo> infos;
  auto fetch = [&]() -> Status {
    vtable_.get_file_info_selector(handler_.obj(), select, &infos);
    return CheckPyError();
  };

  const Status st = SafeCallIntoPython(fetch);
  RETURN_NOT_OK(st);
  return infos;
}
|
||||
|
||||
// Forward CreateDir to the Python handler and return the status produced by
// CheckPyError() after the callback.
Status PyFileSystem::CreateDir(const std::string& path, bool recursive) {
  auto call_handler = [&]() -> Status {
    vtable_.create_dir(handler_.obj(), path, recursive);
    return CheckPyError();
  };
  return SafeCallIntoPython(call_handler);
}
|
||||
|
||||
// Forward DeleteDir to the Python handler and return the status produced by
// CheckPyError() after the callback.
Status PyFileSystem::DeleteDir(const std::string& path) {
  auto call_handler = [&]() -> Status {
    vtable_.delete_dir(handler_.obj(), path);
    return CheckPyError();
  };
  return SafeCallIntoPython(call_handler);
}
|
||||
|
||||
// Forward DeleteDirContents to the Python handler and return the status
// produced by CheckPyError() after the callback.
Status PyFileSystem::DeleteDirContents(const std::string& path, bool missing_dir_ok) {
  auto call_handler = [&]() -> Status {
    vtable_.delete_dir_contents(handler_.obj(), path, missing_dir_ok);
    return CheckPyError();
  };
  return SafeCallIntoPython(call_handler);
}
|
||||
|
||||
// Forward DeleteRootDirContents to the Python handler and return the status
// produced by CheckPyError() after the callback.
Status PyFileSystem::DeleteRootDirContents() {
  auto call_handler = [&]() -> Status {
    vtable_.delete_root_dir_contents(handler_.obj());
    return CheckPyError();
  };
  return SafeCallIntoPython(call_handler);
}
|
||||
|
||||
// Forward DeleteFile to the Python handler and return the status produced by
// CheckPyError() after the callback.
Status PyFileSystem::DeleteFile(const std::string& path) {
  auto call_handler = [&]() -> Status {
    vtable_.delete_file(handler_.obj(), path);
    return CheckPyError();
  };
  return SafeCallIntoPython(call_handler);
}
|
||||
|
||||
// Forward Move(src, dest) to the Python handler and return the status
// produced by CheckPyError() after the callback.
Status PyFileSystem::Move(const std::string& src, const std::string& dest) {
  auto call_handler = [&]() -> Status {
    vtable_.move(handler_.obj(), src, dest);
    return CheckPyError();
  };
  return SafeCallIntoPython(call_handler);
}
|
||||
|
||||
// Forward CopyFile(src, dest) to the Python handler and return the status
// produced by CheckPyError() after the callback.
Status PyFileSystem::CopyFile(const std::string& src, const std::string& dest) {
  auto call_handler = [&]() -> Status {
    vtable_.copy_file(handler_.obj(), src, dest);
    return CheckPyError();
  };
  return SafeCallIntoPython(call_handler);
}
|
||||
|
||||
// Ask the Python handler to open `path` as an input stream.  The status
// produced by CheckPyError() after the callback is returned when it is not OK.
Result<std::shared_ptr<io::InputStream>> PyFileSystem::OpenInputStream(
    const std::string& path) {
  std::shared_ptr<io::InputStream> stream;
  auto open = [&]() -> Status {
    vtable_.open_input_stream(handler_.obj(), path, &stream);
    return CheckPyError();
  };

  const Status st = SafeCallIntoPython(open);
  RETURN_NOT_OK(st);
  return stream;
}
|
||||
|
||||
// Ask the Python handler to open `path` as a random-access file.  The status
// produced by CheckPyError() after the callback is returned when it is not OK.
Result<std::shared_ptr<io::RandomAccessFile>> PyFileSystem::OpenInputFile(
    const std::string& path) {
  std::shared_ptr<io::RandomAccessFile> stream;
  auto open = [&]() -> Status {
    vtable_.open_input_file(handler_.obj(), path, &stream);
    return CheckPyError();
  };

  const Status st = SafeCallIntoPython(open);
  RETURN_NOT_OK(st);
  return stream;
}
|
||||
|
||||
// Ask the Python handler to open `path` for writing, passing `metadata`
// through.  The status produced by CheckPyError() after the callback is
// returned when it is not OK.
Result<std::shared_ptr<io::OutputStream>> PyFileSystem::OpenOutputStream(
    const std::string& path, const std::shared_ptr<const KeyValueMetadata>& metadata) {
  std::shared_ptr<io::OutputStream> stream;
  auto open = [&]() -> Status {
    vtable_.open_output_stream(handler_.obj(), path, metadata, &stream);
    return CheckPyError();
  };

  const Status st = SafeCallIntoPython(open);
  RETURN_NOT_OK(st);
  return stream;
}
|
||||
|
||||
// Ask the Python handler to open `path` for appending, passing `metadata`
// through.  The status produced by CheckPyError() after the callback is
// returned when it is not OK.
Result<std::shared_ptr<io::OutputStream>> PyFileSystem::OpenAppendStream(
    const std::string& path, const std::shared_ptr<const KeyValueMetadata>& metadata) {
  std::shared_ptr<io::OutputStream> stream;
  auto open = [&]() -> Status {
    vtable_.open_append_stream(handler_.obj(), path, metadata, &stream);
    return CheckPyError();
  };

  const Status st = SafeCallIntoPython(open);
  RETURN_NOT_OK(st);
  return stream;
}
|
||||
|
||||
// Ask the Python handler to normalize `path`.  The status produced by
// CheckPyError() after the callback is returned when it is not OK.
Result<std::string> PyFileSystem::NormalizePath(std::string path) {
  std::string normalized;
  auto normalize = [&]() -> Status {
    vtable_.normalize_path(handler_.obj(), path, &normalized);
    return CheckPyError();
  };

  const Status st = SafeCallIntoPython(normalize);
  RETURN_NOT_OK(st);
  return normalized;
}
|
||||
|
||||
} // namespace fs
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,130 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/filesystem/filesystem.h"
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/python/visibility.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow::py::fs {
|
||||
|
||||
class ARROW_PYTHON_EXPORT PyFileSystemVtable {
|
||||
public:
|
||||
std::function<void(PyObject*, std::string* out)> get_type_name;
|
||||
std::function<bool(PyObject*, const arrow::fs::FileSystem& other)> equals;
|
||||
|
||||
std::function<void(PyObject*, const std::string& path, arrow::fs::FileInfo* out)>
|
||||
get_file_info;
|
||||
std::function<void(PyObject*, const std::vector<std::string>& paths,
|
||||
std::vector<arrow::fs::FileInfo>* out)>
|
||||
get_file_info_vector;
|
||||
std::function<void(PyObject*, const arrow::fs::FileSelector&,
|
||||
std::vector<arrow::fs::FileInfo>* out)>
|
||||
get_file_info_selector;
|
||||
|
||||
std::function<void(PyObject*, const std::string& path, bool)> create_dir;
|
||||
std::function<void(PyObject*, const std::string& path)> delete_dir;
|
||||
std::function<void(PyObject*, const std::string& path, bool)> delete_dir_contents;
|
||||
std::function<void(PyObject*)> delete_root_dir_contents;
|
||||
std::function<void(PyObject*, const std::string& path)> delete_file;
|
||||
std::function<void(PyObject*, const std::string& src, const std::string& dest)> move;
|
||||
std::function<void(PyObject*, const std::string& src, const std::string& dest)>
|
||||
copy_file;
|
||||
|
||||
std::function<void(PyObject*, const std::string& path,
|
||||
std::shared_ptr<io::InputStream>* out)>
|
||||
open_input_stream;
|
||||
std::function<void(PyObject*, const std::string& path,
|
||||
std::shared_ptr<io::RandomAccessFile>* out)>
|
||||
open_input_file;
|
||||
std::function<void(PyObject*, const std::string& path,
|
||||
const std::shared_ptr<const KeyValueMetadata>&,
|
||||
std::shared_ptr<io::OutputStream>* out)>
|
||||
open_output_stream;
|
||||
std::function<void(PyObject*, const std::string& path,
|
||||
const std::shared_ptr<const KeyValueMetadata>&,
|
||||
std::shared_ptr<io::OutputStream>* out)>
|
||||
open_append_stream;
|
||||
|
||||
std::function<void(PyObject*, const std::string& path, std::string* out)>
|
||||
normalize_path;
|
||||
};
|
||||
|
||||
// A FileSystem implementation that forwards each operation to a Python
// handler object through the callbacks of a PyFileSystemVtable.
class ARROW_PYTHON_EXPORT PyFileSystem : public arrow::fs::FileSystem {
 public:
  PyFileSystem(PyObject* handler, PyFileSystemVtable vtable);
  ~PyFileSystem() override;

  // Factory returning a shared instance built from the same arguments as the
  // constructor.
  static std::shared_ptr<PyFileSystem> Make(PyObject* handler, PyFileSystemVtable vtable);

  std::string type_name() const override;

  bool Equals(const FileSystem& other) const override;

  // Keep the base-class overloads visible next to the overrides below.
  /// \cond FALSE
  using FileSystem::CreateDir;
  using FileSystem::DeleteDirContents;
  using FileSystem::GetFileInfo;
  using FileSystem::OpenAppendStream;
  using FileSystem::OpenOutputStream;
  /// \endcond

  Result<arrow::fs::FileInfo> GetFileInfo(const std::string& path) override;
  Result<std::vector<arrow::fs::FileInfo>> GetFileInfo(
      const std::vector<std::string>& paths) override;
  Result<std::vector<arrow::fs::FileInfo>> GetFileInfo(
      const arrow::fs::FileSelector& select) override;

  Status CreateDir(const std::string& path, bool recursive) override;

  Status DeleteDir(const std::string& path) override;
  Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
  Status DeleteRootDirContents() override;

  Status DeleteFile(const std::string& path) override;

  Status Move(const std::string& src, const std::string& dest) override;

  Status CopyFile(const std::string& src, const std::string& dest) override;

  Result<std::shared_ptr<io::InputStream>> OpenInputStream(
      const std::string& path) override;
  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
      const std::string& path) override;
  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
      const std::string& path,
      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
      const std::string& path,
      const std::shared_ptr<const KeyValueMetadata>& metadata) override;

  Result<std::string> NormalizePath(std::string path) override;

  // The Python handler object; the returned pointer is not a new reference.
  PyObject* handler() const { return handler_.obj(); }

 private:
  OwnedRefNoGIL handler_;
  PyFileSystemVtable vtable_;
};
|
||||
|
||||
} // namespace arrow::py::fs
|
||||
@@ -0,0 +1,390 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include <signal.h>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/python/flight.h"
|
||||
#include "arrow/util/io_util.h"
|
||||
#include "arrow/util/logging.h"
|
||||
|
||||
using arrow::flight::FlightPayload;
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
namespace flight {
|
||||
|
||||
// Name associated with the Python server middleware.
// NOTE(review): presumably the key under which that middleware is registered
// and looked up -- confirm against the code that uses this symbol.
const char* kPyServerMiddlewareName = "arrow.py_server_middleware";
|
||||
|
||||
// Keep a copy of the callback table and a counted reference to the Python
// handler object.
PyServerAuthHandler::PyServerAuthHandler(PyObject* handler,
                                         const PyServerAuthHandlerVtable& vtable)
    : vtable_(vtable) {
  // Take our own reference before storing the pointer.
  Py_INCREF(handler);
  handler_.reset(handler);
}
|
||||
|
||||
// Run the Python `authenticate` callback for the server side.
//
// A failure reported by CheckPyError() takes precedence over the status the
// callback returned.  `context` is not forwarded to the callback.
Status PyServerAuthHandler::Authenticate(const arrow::flight::ServerCallContext& context,
                                         arrow::flight::ServerAuthSender* outgoing,
                                         arrow::flight::ServerAuthReader* incoming) {
  auto run_callback = [=] {
    const Status status = vtable_.authenticate(handler_.obj(), outgoing, incoming);
    RETURN_NOT_OK(CheckPyError());
    return status;
  };
  return SafeCallIntoPython(run_callback);
}
|
||||
|
||||
// Asks the Python handler to validate `token`.
//
// The `is_valid` vtable entry receives the wrapped handler object, the token
// and the `peer_identity` output pointer. If CheckPyError() reports a failure
// after the call, that failure is returned in preference to the Status
// produced by the callback.
Status PyServerAuthHandler::IsValid(const std::string& token,
                                    std::string* peer_identity) {
  // `token` is captured by reference: the previous `[=]` copied the whole
  // string on every call (and captured `this` implicitly, deprecated since
  // C++20). The lambda is handed straight to SafeCallIntoPython and not stored
  // by this function, matching the `[&]` wrappers used elsewhere in this file.
  return SafeCallIntoPython([this, &token, peer_identity]() -> Status {
    const Status py_status = vtable_.is_valid(handler_.obj(), token, peer_identity);
    RETURN_NOT_OK(CheckPyError());
    return py_status;
  });
}
|
||||
|
||||
// Wraps a Python client auth handler object together with its callback table.
//
// The vtable is copied. A new reference to `handler` is taken with Py_INCREF
// and then stored in handler_.
// NOTE(review): the caller presumably holds the GIL, since Py_INCREF is
// called directly — confirm.
PyClientAuthHandler::PyClientAuthHandler(PyObject* handler,
                                         const PyClientAuthHandlerVtable& vtable)
    : vtable_(vtable) {
  Py_INCREF(handler);
  handler_.reset(handler);
}
|
||||
|
||||
// Runs the Python-side client authentication handshake.
//
// The `authenticate` vtable entry is invoked through SafeCallIntoPython with
// the wrapped handler object and the two auth streams. If CheckPyError()
// reports a failure after the call, that failure is returned in preference to
// the Status produced by the callback.
Status PyClientAuthHandler::Authenticate(arrow::flight::ClientAuthSender* outgoing,
                                         arrow::flight::ClientAuthReader* incoming) {
  // Explicit captures: `[=]` would capture `this` implicitly, which is
  // deprecated since C++20.
  return SafeCallIntoPython([this, outgoing, incoming]() -> Status {
    const Status py_status = vtable_.authenticate(handler_.obj(), outgoing, incoming);
    RETURN_NOT_OK(CheckPyError());
    return py_status;
  });
}
|
||||
|
||||
// Obtains the auth token from the Python handler.
//
// The `get_token` vtable entry receives the wrapped handler object and the
// `token` output pointer. If CheckPyError() reports a failure after the call,
// that failure is returned in preference to the Status produced by the
// callback.
Status PyClientAuthHandler::GetToken(std::string* token) {
  // Explicit captures: `[=]` would capture `this` implicitly, which is
  // deprecated since C++20.
  return SafeCallIntoPython([this, token]() -> Status {
    const Status py_status = vtable_.get_token(handler_.obj(), token);
    RETURN_NOT_OK(CheckPyError());
    return py_status;
  });
}
|
||||
|
||||
// Wraps a Python server object together with the callback table used to
// dispatch Flight RPCs into it.
//
// The vtable is copied. A new reference to `server` is taken with Py_INCREF
// and then stored in server_.
// NOTE(review): the caller presumably holds the GIL, since Py_INCREF is
// called directly — confirm.
PyFlightServer::PyFlightServer(PyObject* server, const PyFlightServerVtable& vtable)
    : vtable_(vtable) {
  Py_INCREF(server);
  server_.reset(server);
}
|
||||
|
||||
// ListFlights RPC: forwards to the `list_flights` vtable entry.
//
// The call is made through SafeCallIntoPython. A failure reported by
// CheckPyError() afterwards is returned in preference to the callback's own
// Status.
Status PyFlightServer::ListFlights(
    const arrow::flight::ServerCallContext& context,
    const arrow::flight::Criteria* criteria,
    std::unique_ptr<arrow::flight::FlightListing>* listings) {
  return SafeCallIntoPython([&]() -> Status {
    const Status py_status =
        vtable_.list_flights(server_.obj(), context, criteria, listings);
    RETURN_NOT_OK(CheckPyError());
    return py_status;
  });
}
|
||||
|
||||
// GetFlightInfo RPC: forwards to the `get_flight_info` vtable entry.
//
// The call is made through SafeCallIntoPython. A failure reported by
// CheckPyError() afterwards is returned in preference to the callback's own
// Status.
Status PyFlightServer::GetFlightInfo(const arrow::flight::ServerCallContext& context,
                                     const arrow::flight::FlightDescriptor& request,
                                     std::unique_ptr<arrow::flight::FlightInfo>* info) {
  return SafeCallIntoPython([&]() -> Status {
    const Status py_status =
        vtable_.get_flight_info(server_.obj(), context, request, info);
    RETURN_NOT_OK(CheckPyError());
    return py_status;
  });
}
|
||||
|
||||
// GetSchema RPC: forwards to the `get_schema` vtable entry.
//
// The call is made through SafeCallIntoPython. A failure reported by
// CheckPyError() afterwards is returned in preference to the callback's own
// Status.
Status PyFlightServer::GetSchema(const arrow::flight::ServerCallContext& context,
                                 const arrow::flight::FlightDescriptor& request,
                                 std::unique_ptr<arrow::flight::SchemaResult>* result) {
  return SafeCallIntoPython([&]() -> Status {
    const Status py_status =
        vtable_.get_schema(server_.obj(), context, request, result);
    RETURN_NOT_OK(CheckPyError());
    return py_status;
  });
}
|
||||
|
||||
// DoGet RPC: forwards to the `do_get` vtable entry, which fills `*stream`.
//
// The call is made through SafeCallIntoPython. A failure reported by
// CheckPyError() afterwards is returned in preference to the callback's own
// Status.
Status PyFlightServer::DoGet(const arrow::flight::ServerCallContext& context,
                             const arrow::flight::Ticket& request,
                             std::unique_ptr<arrow::flight::FlightDataStream>* stream) {
  return SafeCallIntoPython([&]() -> Status {
    const Status py_status = vtable_.do_get(server_.obj(), context, request, stream);
    RETURN_NOT_OK(CheckPyError());
    return py_status;
  });
}
|
||||
|
||||
// DoPut RPC: forwards to the `do_put` vtable entry.
//
// Ownership of `reader` and `writer` is moved into the callback. The call is
// made through SafeCallIntoPython. A failure reported by CheckPyError()
// afterwards is returned in preference to the callback's own Status.
Status PyFlightServer::DoPut(
    const arrow::flight::ServerCallContext& context,
    std::unique_ptr<arrow::flight::FlightMessageReader> reader,
    std::unique_ptr<arrow::flight::FlightMetadataWriter> writer) {
  return SafeCallIntoPython([&]() -> Status {
    const Status py_status =
        vtable_.do_put(server_.obj(), context, std::move(reader), std::move(writer));
    RETURN_NOT_OK(CheckPyError());
    return py_status;
  });
}
|
||||
|
||||
// DoExchange RPC: forwards to the `do_exchange` vtable entry.
//
// Ownership of `reader` and `writer` is moved into the callback. The call is
// made through SafeCallIntoPython. A failure reported by CheckPyError()
// afterwards is returned in preference to the callback's own Status.
Status PyFlightServer::DoExchange(
    const arrow::flight::ServerCallContext& context,
    std::unique_ptr<arrow::flight::FlightMessageReader> reader,
    std::unique_ptr<arrow::flight::FlightMessageWriter> writer) {
  return SafeCallIntoPython([&]() -> Status {
    const Status py_status = vtable_.do_exchange(server_.obj(), context,
                                                 std::move(reader), std::move(writer));
    RETURN_NOT_OK(CheckPyError());
    return py_status;
  });
}
|
||||
|
||||
// DoAction RPC: forwards to the `do_action` vtable entry, which fills
// `*result`.
//
// The call is made through SafeCallIntoPython. A failure reported by
// CheckPyError() afterwards is returned in preference to the callback's own
// Status.
Status PyFlightServer::DoAction(const arrow::flight::ServerCallContext& context,
                                const arrow::flight::Action& action,
                                std::unique_ptr<arrow::flight::ResultStream>* result) {
  return SafeCallIntoPython([&]() -> Status {
    const Status py_status = vtable_.do_action(server_.obj(), context, action, result);
    RETURN_NOT_OK(CheckPyError());
    return py_status;
  });
}
|
||||
|
||||
// ListActions RPC: forwards to the `list_actions` vtable entry, which fills
// `*actions`.
//
// The call is made through SafeCallIntoPython. A failure reported by
// CheckPyError() afterwards is returned in preference to the callback's own
// Status.
Status PyFlightServer::ListActions(const arrow::flight::ServerCallContext& context,
                                   std::vector<arrow::flight::ActionType>* actions) {
  return SafeCallIntoPython([&]() -> Status {
    const Status py_status = vtable_.list_actions(server_.obj(), context, actions);
    RETURN_NOT_OK(CheckPyError());
    return py_status;
  });
}
|
||||
|
||||
// Like Serve(), but arranges for the server to shut down on SIGINT/SIGTERM
// and re-raises the received signal afterwards. May return with a Python
// exception set.
Status PyFlightServer::ServeWithSignals() {
  // Honour the process's current signal setup: a signal only interrupts the
  // server if a handler other than SIG_DFL / SIG_IGN is installed for it.
  std::vector<int> shutdown_signals;
  for (const int candidate : {SIGINT, SIGTERM}) {
    ARROW_ASSIGN_OR_RAISE(auto installed,
                          ::arrow::internal::GetSignalHandler(candidate));
    const auto callback = installed.callback();
    const bool has_custom_handler = (callback != SIG_DFL) && (callback != SIG_IGN);
    if (has_custom_handler) {
      shutdown_signals.push_back(candidate);
    }
  }
  RETURN_NOT_OK(SetShutdownOnSignals(shutdown_signals));

  // Blocks until shutdown is requested or one of the signals above arrives.
  RETURN_NOT_OK(Serve());

  const int received = GotSignal();
  if (received != 0) {
    // Deliver the signal again now that Python's signal handlers are back in
    // place, then check for pending Python signals under the GIL.
    PyAcquireGIL lock;
    raise(received);
    // XXX Ideally we would loop and serve again if no exception was raised.
    // Unfortunately, gRPC will return immediately if Serve() is called again.
    ARROW_UNUSED(PyErr_CheckSignals());
  }

  return Status::OK();
}
|
||||
|
||||
// Builds a result stream backed by a Python object and a callback that pulls
// the next result from it.
//
// The callback is copied. A new reference to `generator` is taken with
// Py_INCREF and then stored in generator_.
PyFlightResultStream::PyFlightResultStream(PyObject* generator,
                                           PyFlightResultStreamCallback callback)
    : callback_(callback) {
  Py_INCREF(generator);
  generator_.reset(generator);
}
|
||||
|
||||
// Fetches the next action result by invoking the stored callback on the
// wrapped Python object.
//
// Errors are checked in this order: a failure reported by CheckPyError() is
// returned first, then a non-OK Status from the callback. Otherwise the
// pointer filled in by the callback is returned.
arrow::Result<std::unique_ptr<arrow::flight::Result>> PyFlightResultStream::Next() {
  // Explicit capture: `[=]` would capture `this` implicitly, which is
  // deprecated since C++20.
  return SafeCallIntoPython(
      [this]() -> arrow::Result<std::unique_ptr<arrow::flight::Result>> {
        std::unique_ptr<arrow::flight::Result> next_result;
        const Status callback_status = callback_(generator_.obj(), &next_result);
        RETURN_NOT_OK(CheckPyError());
        RETURN_NOT_OK(callback_status);
        return next_result;
      });
}
|
||||
|
||||
// Wraps an existing FlightDataStream and holds a reference to the Python
// object that backs it.
//
// Ownership of `stream` is moved into stream_. A new reference to
// `data_source` is taken with Py_INCREF and then stored in data_source_.
PyFlightDataStream::PyFlightDataStream(
    PyObject* data_source, std::unique_ptr<arrow::flight::FlightDataStream> stream)
    : stream_(std::move(stream)) {
  Py_INCREF(data_source);
  data_source_.reset(data_source);
}
|
||||
|
||||
// Delegates to the wrapped stream's schema().
std::shared_ptr<Schema> PyFlightDataStream::schema() {
  return stream_->schema();
}
|
||||
|
||||
// Delegates to the wrapped stream's GetSchemaPayload().
arrow::Result<FlightPayload> PyFlightDataStream::GetSchemaPayload() {
  return stream_->GetSchemaPayload();
}
|
||||
|
||||
// Delegates to the wrapped stream's Next().
arrow::Result<FlightPayload> PyFlightDataStream::Next() {
  return stream_->Next();
}
|
||||
|
||||
// Builds a data stream whose payloads are produced by calling `callback` on a
// Python object.
//
// The schema pointer, IPC write options and callback are copied, and the
// dictionary field mapper is built from the stored schema. A new reference to
// `generator` is taken with Py_INCREF and then stored in generator_.
PyGeneratorFlightDataStream::PyGeneratorFlightDataStream(
    PyObject* generator, std::shared_ptr<arrow::Schema> schema,
    PyGeneratorFlightDataStreamCallback callback, const ipc::IpcWriteOptions& options)
    : schema_(schema), mapper_(*schema_), options_(options), callback_(callback) {
  Py_INCREF(generator);
  generator_.reset(generator);
}
|
||||
|
||||
// Returns the schema supplied at construction.
std::shared_ptr<Schema> PyGeneratorFlightDataStream::schema() {
  return schema_;
}
|
||||
|
||||
// Produces the schema payload for this stream.
//
// ipc::GetSchemaPayload() writes into the payload's `ipc_message` using the
// stored schema, write options and dictionary field mapper. Its error, if any,
// is returned as is.
arrow::Result<FlightPayload> PyGeneratorFlightDataStream::GetSchemaPayload() {
  FlightPayload schema_payload;
  RETURN_NOT_OK(ipc::GetSchemaPayload(*schema_, options_, mapper_,
                                      &schema_payload.ipc_message));
  return schema_payload;
}
|
||||
|
||||
// Fetches the next payload by invoking the stored callback on the wrapped
// Python object.
//
// Errors are checked in this order: a failure reported by CheckPyError() is
// returned first, then a non-OK Status from the callback. Otherwise the
// payload filled in by the callback is returned.
arrow::Result<FlightPayload> PyGeneratorFlightDataStream::Next() {
  // Explicit capture: `[=]` would capture `this` implicitly, which is
  // deprecated since C++20.
  return SafeCallIntoPython([this]() -> arrow::Result<FlightPayload> {
    FlightPayload next_payload;
    const Status callback_status = callback_(generator_.obj(), &next_payload);
    RETURN_NOT_OK(CheckPyError());
    RETURN_NOT_OK(callback_status);
    return next_payload;
  });
}
|
||||
|
||||
// Flight Server Middleware
|
||||
|
||||
// Wraps a Python middleware factory object and the callback used to start a
// call on it.
//
// The callback is copied. A new reference to `factory` is taken with
// Py_INCREF and then stored in factory_.
PyServerMiddlewareFactory::PyServerMiddlewareFactory(PyObject* factory,
                                                     StartCallCallback start_call)
    : start_call_(start_call) {
  Py_INCREF(factory);
  factory_.reset(factory);
}
|
||||
|
||||
// Asks the Python factory to create a middleware instance for a new call.
//
// The callback receives the call info and the incoming headers taken from
// `context`, and fills `*middleware`. A failure reported by CheckPyError()
// afterwards is returned in preference to the callback's own Status.
Status PyServerMiddlewareFactory::StartCall(
    const arrow::flight::CallInfo& info, const arrow::flight::ServerCallContext& context,
    std::shared_ptr<arrow::flight::ServerMiddleware>* middleware) {
  return SafeCallIntoPython([&]() -> Status {
    const Status py_status =
        start_call_(factory_.obj(), info, context.incoming_headers(), middleware);
    RETURN_NOT_OK(CheckPyError());
    return py_status;
  });
}
|
||||
|
||||
// Wraps a Python server middleware object and its callback table.
//
// The vtable is copied. A new reference to `middleware` is taken with
// Py_INCREF and then stored in middleware_.
PyServerMiddleware::PyServerMiddleware(PyObject* middleware, Vtable vtable)
    : vtable_(vtable) {
  Py_INCREF(middleware);
  middleware_.reset(middleware);
}
|
||||
|
||||
// Lets the Python middleware add headers to the outgoing response.
//
// This hook returns void, so a failure (from CheckPyError() or from the
// callback) is only reported through ARROW_WARN_NOT_OK.
void PyServerMiddleware::SendingHeaders(arrow::flight::AddCallHeaders* outgoing_headers) {
  const Status outcome = SafeCallIntoPython([&]() -> Status {
    const Status py_status =
        vtable_.sending_headers(middleware_.obj(), outgoing_headers);
    RETURN_NOT_OK(CheckPyError());
    return py_status;
  });

  ARROW_WARN_NOT_OK(outcome, "Python server middleware failed in SendingHeaders");
}
|
||||
|
||||
void PyServerMiddleware::CallCompleted(const Status& call_status) {
|
||||
const Status& status = SafeCallIntoPython([&] {
|
||||
const Status status = vtable_.call_completed(middleware_.obj(), call_status);
|
||||
RETURN_NOT_OK(CheckPyError());
|
||||
return status;
|
||||
});
|
||||
|
||||
ARROW_WARN_NOT_OK(status, "Python server middleware failed in CallCompleted");
|
||||
}
|
||||
|
||||
// Every Python-backed server middleware reports the same name,
// kPyServerMiddlewareName.
std::string PyServerMiddleware::name() const {
  return std::string(kPyServerMiddlewareName);
}
|
||||
|
||||
// Returns the wrapped Python middleware object (middleware_.obj()).
// NOTE(review): no reference count change happens here, so this is presumably
// a borrowed reference — confirm.
PyObject* PyServerMiddleware::py_object() const {
  return middleware_.obj();
}
|
||||
|
||||
// Flight Client Middleware
|
||||
|
||||
// Wraps a Python client middleware factory object and the callback used to
// start a call on it.
//
// The callback is copied. A new reference to `factory` is taken with
// Py_INCREF and then stored in factory_.
PyClientMiddlewareFactory::PyClientMiddlewareFactory(PyObject* factory,
                                                     StartCallCallback start_call)
    : start_call_(start_call) {
  Py_INCREF(factory);
  factory_.reset(factory);
}
|
||||
|
||||
void PyClientMiddlewareFactory::StartCall(
|
||||
const arrow::flight::CallInfo& info,
|
||||
std::unique_ptr<arrow::flight::ClientMiddleware>* middleware) {
|
||||
const Status& status = SafeCallIntoPython([&] {
|
||||
const Status status = start_call_(factory_.obj(), info, middleware);
|
||||
RETURN_NOT_OK(CheckPyError());
|
||||
return status;
|
||||
});
|
||||
|
||||
ARROW_WARN_NOT_OK(status, "Python client middleware failed in StartCall");
|
||||
}
|
||||
|
||||
// Wraps a Python client middleware object and its callback table.
//
// The vtable is copied. A new reference to `middleware` is taken with
// Py_INCREF and then stored in middleware_.
PyClientMiddleware::PyClientMiddleware(PyObject* middleware, Vtable vtable)
    : vtable_(vtable) {
  Py_INCREF(middleware);
  middleware_.reset(middleware);
}
|
||||
|
||||
// Lets the Python middleware add headers to the outgoing request.
//
// This hook returns void, so a failure (from CheckPyError() or from the
// callback) is only reported through ARROW_WARN_NOT_OK.
void PyClientMiddleware::SendingHeaders(arrow::flight::AddCallHeaders* outgoing_headers) {
  const Status& status = SafeCallIntoPython([&] {
    const Status status = vtable_.sending_headers(middleware_.obj(), outgoing_headers);
    RETURN_NOT_OK(CheckPyError());
    return status;
  });

  // The warning names this hook; it previously said "StartCall" (copy-paste),
  // which pointed at the wrong callback when diagnosing failures.
  ARROW_WARN_NOT_OK(status, "Python client middleware failed in SendingHeaders");
}
|
||||
|
||||
void PyClientMiddleware::ReceivedHeaders(
|
||||
const arrow::flight::CallHeaders& incoming_headers) {
|
||||
const Status& status = SafeCallIntoPython([&] {
|
||||
const Status status = vtable_.received_headers(middleware_.obj(), incoming_headers);
|
||||
RETURN_NOT_OK(CheckPyError());
|
||||
return status;
|
||||
});
|
||||
|
||||
ARROW_WARN_NOT_OK(status, "Python client middleware failed in StartCall");
|
||||
}
|
||||
|
||||
// Notifies the Python middleware that the call finished with `call_status`.
//
// This hook returns void, so a failure (from CheckPyError() or from the
// callback) is only reported through ARROW_WARN_NOT_OK.
void PyClientMiddleware::CallCompleted(const Status& call_status) {
  const Status& status = SafeCallIntoPython([&] {
    const Status status = vtable_.call_completed(middleware_.obj(), call_status);
    RETURN_NOT_OK(CheckPyError());
    return status;
  });

  // The warning names this hook; it previously said "StartCall" (copy-paste),
  // which pointed at the wrong callback when diagnosing failures.
  ARROW_WARN_NOT_OK(status, "Python client middleware failed in CallCompleted");
}
|
||||
|
||||
// Builds a heap-allocated FlightInfo from its components.
//
// All arguments except `out` are forwarded unchanged to
// arrow::flight::FlightInfo::Make(). If Make() fails, its error is returned and
// `*out` is left untouched; otherwise `*out` takes ownership of the new
// FlightInfo and Status::OK() is returned.
Status CreateFlightInfo(const std::shared_ptr<arrow::Schema>& schema,
                        const arrow::flight::FlightDescriptor& descriptor,
                        const std::vector<arrow::flight::FlightEndpoint>& endpoints,
                        int64_t total_records, int64_t total_bytes, bool ordered,
                        const std::string& app_metadata,
                        std::unique_ptr<arrow::flight::FlightInfo>* out) {
  ARROW_ASSIGN_OR_RAISE(auto result, arrow::flight::FlightInfo::Make(
                                         schema, descriptor, endpoints, total_records,
                                         total_bytes, ordered, app_metadata));
  // make_unique instead of a raw `new` wrapped in unique_ptr.
  *out = std::make_unique<arrow::flight::FlightInfo>(std::move(result));
  return Status::OK();
}
|
||||
|
||||
// Builds a SchemaResult from `schema` and stores it in `*out`.
//
// `schema` is dereferenced unconditionally, so it must not be null. The Status
// comes from moving the result of SchemaResult::Make() into `*out`.
Status CreateSchemaResult(const std::shared_ptr<arrow::Schema>& schema,
                          std::unique_ptr<arrow::flight::SchemaResult>* out) {
  auto maybe_schema_result = arrow::flight::SchemaResult::Make(*schema);
  return std::move(maybe_schema_result).Value(out);
}
|
||||
|
||||
} // namespace flight
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,352 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/flight/api.h"
|
||||
#include "arrow/ipc/dictionary.h"
|
||||
#include "arrow/python/common.h"
|
||||
|
||||
#if defined(_WIN32) || defined(__CYGWIN__) // Windows
|
||||
# if defined(_MSC_VER)
|
||||
# pragma warning(disable : 4251)
|
||||
# else
|
||||
# pragma GCC diagnostic ignored "-Wattributes"
|
||||
# endif
|
||||
|
||||
# ifdef ARROW_PYTHON_STATIC
|
||||
# define ARROW_PYFLIGHT_EXPORT
|
||||
# elif defined(ARROW_PYFLIGHT_EXPORTING)
|
||||
# define ARROW_PYFLIGHT_EXPORT __declspec(dllexport)
|
||||
# else
|
||||
# define ARROW_PYFLIGHT_EXPORT __declspec(dllimport)
|
||||
# endif
|
||||
|
||||
#else // Not Windows
|
||||
# ifndef ARROW_PYFLIGHT_EXPORT
|
||||
# define ARROW_PYFLIGHT_EXPORT __attribute__((visibility("default")))
|
||||
# endif
|
||||
#endif // Non-Windows
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace py {
|
||||
|
||||
namespace flight {
|
||||
|
||||
/// \brief Name string for Python-backed server middleware (declared here,
/// defined out of line).
ARROW_PYFLIGHT_EXPORT
extern const char* kPyServerMiddlewareName;
|
||||
|
||||
/// \brief A table of function pointers for calling from C++ into
|
||||
/// Python.
|
||||
class ARROW_PYFLIGHT_EXPORT PyFlightServerVtable {
|
||||
public:
|
||||
std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
|
||||
const arrow::flight::Criteria*,
|
||||
std::unique_ptr<arrow::flight::FlightListing>*)>
|
||||
list_flights;
|
||||
std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
|
||||
const arrow::flight::FlightDescriptor&,
|
||||
std::unique_ptr<arrow::flight::FlightInfo>*)>
|
||||
get_flight_info;
|
||||
std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
|
||||
const arrow::flight::FlightDescriptor&,
|
||||
std::unique_ptr<arrow::flight::SchemaResult>*)>
|
||||
get_schema;
|
||||
std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
|
||||
const arrow::flight::Ticket&,
|
||||
std::unique_ptr<arrow::flight::FlightDataStream>*)>
|
||||
do_get;
|
||||
std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
|
||||
std::unique_ptr<arrow::flight::FlightMessageReader>,
|
||||
std::unique_ptr<arrow::flight::FlightMetadataWriter>)>
|
||||
do_put;
|
||||
std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
|
||||
std::unique_ptr<arrow::flight::FlightMessageReader>,
|
||||
std::unique_ptr<arrow::flight::FlightMessageWriter>)>
|
||||
do_exchange;
|
||||
std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
|
||||
const arrow::flight::Action&,
|
||||
std::unique_ptr<arrow::flight::ResultStream>*)>
|
||||
do_action;
|
||||
std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
|
||||
std::vector<arrow::flight::ActionType>*)>
|
||||
list_actions;
|
||||
};
|
||||
|
||||
class ARROW_PYFLIGHT_EXPORT PyServerAuthHandlerVtable {
|
||||
public:
|
||||
std::function<Status(PyObject*, arrow::flight::ServerAuthSender*,
|
||||
arrow::flight::ServerAuthReader*)>
|
||||
authenticate;
|
||||
std::function<Status(PyObject*, const std::string&, std::string*)> is_valid;
|
||||
};
|
||||
|
||||
class ARROW_PYFLIGHT_EXPORT PyClientAuthHandlerVtable {
|
||||
public:
|
||||
std::function<Status(PyObject*, arrow::flight::ClientAuthSender*,
|
||||
arrow::flight::ClientAuthReader*)>
|
||||
authenticate;
|
||||
std::function<Status(PyObject*, std::string*)> get_token;
|
||||
};
|
||||
|
||||
/// \brief A helper to implement an auth mechanism in Python.
|
||||
class ARROW_PYFLIGHT_EXPORT PyServerAuthHandler
|
||||
: public arrow::flight::ServerAuthHandler {
|
||||
public:
|
||||
explicit PyServerAuthHandler(PyObject* handler,
|
||||
const PyServerAuthHandlerVtable& vtable);
|
||||
Status Authenticate(const arrow::flight::ServerCallContext& context,
|
||||
arrow::flight::ServerAuthSender* outgoing,
|
||||
arrow::flight::ServerAuthReader* incoming) override;
|
||||
Status IsValid(const std::string& token, std::string* peer_identity) override;
|
||||
|
||||
private:
|
||||
OwnedRefNoGIL handler_;
|
||||
PyServerAuthHandlerVtable vtable_;
|
||||
};
|
||||
|
||||
/// \brief A helper to implement an auth mechanism in Python.
|
||||
class ARROW_PYFLIGHT_EXPORT PyClientAuthHandler
|
||||
: public arrow::flight::ClientAuthHandler {
|
||||
public:
|
||||
explicit PyClientAuthHandler(PyObject* handler,
|
||||
const PyClientAuthHandlerVtable& vtable);
|
||||
Status Authenticate(arrow::flight::ClientAuthSender* outgoing,
|
||||
arrow::flight::ClientAuthReader* incoming) override;
|
||||
Status GetToken(std::string* token) override;
|
||||
|
||||
private:
|
||||
OwnedRefNoGIL handler_;
|
||||
PyClientAuthHandlerVtable vtable_;
|
||||
};
|
||||
|
||||
/// \brief A Flight server whose RPC handlers are implemented in Python.
///
/// Every FlightServerBase override forwards to the matching entry of the
/// PyFlightServerVtable, passing the wrapped Python server object.
class ARROW_PYFLIGHT_EXPORT PyFlightServer : public arrow::flight::FlightServerBase {
 public:
  /// \brief Wrap a Python server object and its callback table.
  explicit PyFlightServer(PyObject* server, const PyFlightServerVtable& vtable);

  // Like Serve(), but set up signals and invoke Python signal handlers
  // if necessary.  This function may return with a Python exception set.
  Status ServeWithSignals();

  Status ListFlights(const arrow::flight::ServerCallContext& context,
                     const arrow::flight::Criteria* criteria,
                     std::unique_ptr<arrow::flight::FlightListing>* listings) override;

  Status GetFlightInfo(const arrow::flight::ServerCallContext& context,
                       const arrow::flight::FlightDescriptor& request,
                       std::unique_ptr<arrow::flight::FlightInfo>* info) override;

  Status GetSchema(const arrow::flight::ServerCallContext& context,
                   const arrow::flight::FlightDescriptor& request,
                   std::unique_ptr<arrow::flight::SchemaResult>* result) override;

  Status DoGet(const arrow::flight::ServerCallContext& context,
               const arrow::flight::Ticket& request,
               std::unique_ptr<arrow::flight::FlightDataStream>* stream) override;

  Status DoPut(const arrow::flight::ServerCallContext& context,
               std::unique_ptr<arrow::flight::FlightMessageReader> reader,
               std::unique_ptr<arrow::flight::FlightMetadataWriter> writer) override;

  Status DoExchange(const arrow::flight::ServerCallContext& context,
                    std::unique_ptr<arrow::flight::FlightMessageReader> reader,
                    std::unique_ptr<arrow::flight::FlightMessageWriter> writer) override;

  Status DoAction(const arrow::flight::ServerCallContext& context,
                  const arrow::flight::Action& action,
                  std::unique_ptr<arrow::flight::ResultStream>* result) override;

  Status ListActions(const arrow::flight::ServerCallContext& context,
                     std::vector<arrow::flight::ActionType>* actions) override;

 private:
  OwnedRefNoGIL server_;
  PyFlightServerVtable vtable_;
};
|
||||
|
||||
/// \brief A callback that obtains the next result from a Flight action.
|
||||
typedef std::function<Status(PyObject*, std::unique_ptr<arrow::flight::Result>*)>
|
||||
PyFlightResultStreamCallback;
|
||||
|
||||
/// \brief A ResultStream built around a Python callback.
|
||||
class ARROW_PYFLIGHT_EXPORT PyFlightResultStream : public arrow::flight::ResultStream {
|
||||
public:
|
||||
/// \brief Construct a FlightResultStream from a Python object and callback.
|
||||
/// Must only be called while holding the GIL.
|
||||
explicit PyFlightResultStream(PyObject* generator,
|
||||
PyFlightResultStreamCallback callback);
|
||||
arrow::Result<std::unique_ptr<arrow::flight::Result>> Next() override;
|
||||
|
||||
private:
|
||||
OwnedRefNoGIL generator_;
|
||||
PyFlightResultStreamCallback callback_;
|
||||
};
|
||||
|
||||
/// \brief A wrapper around a FlightDataStream that keeps alive a
|
||||
/// Python object backing it.
|
||||
class ARROW_PYFLIGHT_EXPORT PyFlightDataStream : public arrow::flight::FlightDataStream {
|
||||
public:
|
||||
/// \brief Construct a FlightDataStream from a Python object and underlying stream.
|
||||
/// Must only be called while holding the GIL.
|
||||
explicit PyFlightDataStream(PyObject* data_source,
|
||||
std::unique_ptr<arrow::flight::FlightDataStream> stream);
|
||||
|
||||
std::shared_ptr<Schema> schema() override;
|
||||
arrow::Result<arrow::flight::FlightPayload> GetSchemaPayload() override;
|
||||
arrow::Result<arrow::flight::FlightPayload> Next() override;
|
||||
|
||||
private:
|
||||
OwnedRefNoGIL data_source_;
|
||||
std::unique_ptr<arrow::flight::FlightDataStream> stream_;
|
||||
};
|
||||
|
||||
class ARROW_PYFLIGHT_EXPORT PyServerMiddlewareFactory
|
||||
: public arrow::flight::ServerMiddlewareFactory {
|
||||
public:
|
||||
/// \brief A callback to create the middleware instance in Python
|
||||
typedef std::function<Status(
|
||||
PyObject*, const arrow::flight::CallInfo& info,
|
||||
const arrow::flight::CallHeaders& incoming_headers,
|
||||
std::shared_ptr<arrow::flight::ServerMiddleware>* middleware)>
|
||||
StartCallCallback;
|
||||
|
||||
/// \brief Must only be called while holding the GIL.
|
||||
explicit PyServerMiddlewareFactory(PyObject* factory, StartCallCallback start_call);
|
||||
|
||||
Status StartCall(const arrow::flight::CallInfo& info,
|
||||
const arrow::flight::ServerCallContext& context,
|
||||
std::shared_ptr<arrow::flight::ServerMiddleware>* middleware) override;
|
||||
|
||||
private:
|
||||
OwnedRefNoGIL factory_;
|
||||
StartCallCallback start_call_;
|
||||
};
|
||||
|
||||
/// \brief A server middleware whose hooks are implemented in Python.
class ARROW_PYFLIGHT_EXPORT PyServerMiddleware : public arrow::flight::ServerMiddleware {
 public:
  using SendingHeadersCallback =
      std::function<Status(PyObject*, arrow::flight::AddCallHeaders* outgoing_headers)>;
  using CallCompletedCallback = std::function<Status(PyObject*, const Status& status)>;

  /// \brief The callbacks invoked by the hooks of the same name.
  struct Vtable {
    SendingHeadersCallback sending_headers;
    CallCompletedCallback call_completed;
  };

  /// \brief Must only be called while holding the GIL.
  explicit PyServerMiddleware(PyObject* middleware, Vtable vtable);

  void SendingHeaders(arrow::flight::AddCallHeaders* outgoing_headers) override;
  void CallCompleted(const Status& status) override;
  std::string name() const override;
  /// \brief Get the underlying Python object.
  PyObject* py_object() const;

 private:
  OwnedRefNoGIL middleware_;
  Vtable vtable_;
};
|
||||
|
||||
class ARROW_PYFLIGHT_EXPORT PyClientMiddlewareFactory
|
||||
: public arrow::flight::ClientMiddlewareFactory {
|
||||
public:
|
||||
/// \brief A callback to create the middleware instance in Python
|
||||
typedef std::function<Status(
|
||||
PyObject*, const arrow::flight::CallInfo& info,
|
||||
std::unique_ptr<arrow::flight::ClientMiddleware>* middleware)>
|
||||
StartCallCallback;
|
||||
|
||||
/// \brief Must only be called while holding the GIL.
|
||||
explicit PyClientMiddlewareFactory(PyObject* factory, StartCallCallback start_call);
|
||||
|
||||
void StartCall(const arrow::flight::CallInfo& info,
|
||||
std::unique_ptr<arrow::flight::ClientMiddleware>* middleware) override;
|
||||
|
||||
private:
|
||||
OwnedRefNoGIL factory_;
|
||||
StartCallCallback start_call_;
|
||||
};
|
||||
|
||||
/// \brief A client middleware whose hooks are implemented in Python.
class ARROW_PYFLIGHT_EXPORT PyClientMiddleware : public arrow::flight::ClientMiddleware {
 public:
  using SendingHeadersCallback =
      std::function<Status(PyObject*, arrow::flight::AddCallHeaders* outgoing_headers)>;
  using ReceivedHeadersCallback = std::function<Status(
      PyObject*, const arrow::flight::CallHeaders& incoming_headers)>;
  using CallCompletedCallback = std::function<Status(PyObject*, const Status& status)>;

  /// \brief The callbacks invoked by the hooks of the same name.
  struct Vtable {
    SendingHeadersCallback sending_headers;
    ReceivedHeadersCallback received_headers;
    CallCompletedCallback call_completed;
  };

  /// \brief Must only be called while holding the GIL.
  ///
  /// \param middleware the Python middleware object to wrap (this parameter
  ///   was previously declared as `factory`, which did not match the
  ///   definition)
  /// \param vtable the callbacks used to call into `middleware`
  explicit PyClientMiddleware(PyObject* middleware, Vtable vtable);

  void SendingHeaders(arrow::flight::AddCallHeaders* outgoing_headers) override;
  void ReceivedHeaders(const arrow::flight::CallHeaders& incoming_headers) override;
  void CallCompleted(const Status& status) override;

 private:
  OwnedRefNoGIL middleware_;
  Vtable vtable_;
};
|
||||
|
||||
/// \brief A callback that obtains the next payload from a Flight result stream.
|
||||
typedef std::function<Status(PyObject*, arrow::flight::FlightPayload*)>
|
||||
PyGeneratorFlightDataStreamCallback;
|
||||
|
||||
/// \brief A FlightDataStream built around a Python callback.
|
||||
class ARROW_PYFLIGHT_EXPORT PyGeneratorFlightDataStream
|
||||
: public arrow::flight::FlightDataStream {
|
||||
public:
|
||||
/// \brief Construct a FlightDataStream from a Python object and underlying stream.
|
||||
/// Must only be called while holding the GIL.
|
||||
explicit PyGeneratorFlightDataStream(PyObject* generator,
|
||||
std::shared_ptr<arrow::Schema> schema,
|
||||
PyGeneratorFlightDataStreamCallback callback,
|
||||
const ipc::IpcWriteOptions& options);
|
||||
std::shared_ptr<Schema> schema() override;
|
||||
arrow::Result<arrow::flight::FlightPayload> GetSchemaPayload() override;
|
||||
arrow::Result<arrow::flight::FlightPayload> Next() override;
|
||||
|
||||
private:
|
||||
OwnedRefNoGIL generator_;
|
||||
std::shared_ptr<arrow::Schema> schema_;
|
||||
ipc::DictionaryFieldMapper mapper_;
|
||||
ipc::IpcWriteOptions options_;
|
||||
PyGeneratorFlightDataStreamCallback callback_;
|
||||
};
|
||||
|
||||
/// \brief Create a FlightInfo from its components and store it in `*out`.
///
/// The definition in this library forwards every argument except `out` to
/// arrow::flight::FlightInfo::Make() and returns its error on failure.
ARROW_PYFLIGHT_EXPORT
Status CreateFlightInfo(const std::shared_ptr<arrow::Schema>& schema,
                        const arrow::flight::FlightDescriptor& descriptor,
                        const std::vector<arrow::flight::FlightEndpoint>& endpoints,
                        int64_t total_records, int64_t total_bytes, bool ordered,
                        const std::string& app_metadata,
                        std::unique_ptr<arrow::flight::FlightInfo>* out);
|
||||
|
||||
/// \brief Create a SchemaResult from schema.
|
||||
ARROW_PYFLIGHT_EXPORT
|
||||
Status CreateSchemaResult(const std::shared_ptr<arrow::Schema>& schema,
|
||||
std::unique_ptr<arrow::flight::SchemaResult>* out);
|
||||
|
||||
} // namespace flight
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,505 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/array.h"
|
||||
#include "arrow/chunked_array.h"
|
||||
#include "arrow/datum.h"
|
||||
#include "arrow/extension/uuid.h"
|
||||
#include "arrow/json/from_string.h"
|
||||
#include "arrow/python/gdb.h"
|
||||
#include "arrow/record_batch.h"
|
||||
#include "arrow/scalar.h"
|
||||
#include "arrow/table.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/debug.h"
|
||||
#include "arrow/util/decimal.h"
|
||||
#include "arrow/util/key_value_metadata.h"
|
||||
#include "arrow/util/logging.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using extension::uuid;
|
||||
using extension::UuidType;
|
||||
using json::ArrayFromJSONString;
|
||||
using json::ChunkedArrayFromJSONString;
|
||||
using json::ScalarFromJSONString;
|
||||
|
||||
namespace gdb {
|
||||
|
||||
// A nested `arrow` namespace (inside arrow::gdb), present only to exercise
// type lookup from GDB when the namespace name is shadowed (ARROW-15652).
namespace arrow {

// Deliberately empty; TestSession() calls it.
void DummyFunction() {
}

}  // namespace arrow
|
||||
|
||||
namespace {
|
||||
|
||||
class CustomStatusDetail : public StatusDetail {
|
||||
public:
|
||||
const char* type_id() const override { return "custom-detail-id"; }
|
||||
std::string ToString() const override { return "This is a detail"; }
|
||||
};
|
||||
|
||||
std::shared_ptr<Array> SliceArrayFromJSON(const std::shared_ptr<DataType>& ty,
|
||||
std::string_view json, int64_t offset = 0,
|
||||
int64_t length = -1) {
|
||||
auto array = *ArrayFromJSONString(ty, json);
|
||||
if (length != -1) {
|
||||
return array->Slice(offset, length);
|
||||
} else {
|
||||
return array->Slice(offset);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Entry point of the debugger test session.
//
// Declares one local variable per kind of value named in the section comments
// below (Status & Result, string views, buffers, KeyValueMetadata, decimals,
// data types, schema, fields, scalars, arrays, ChunkedArray, RecordBatch,
// Table, Datum) and finally calls DebugTrap() so that a debugger can inspect
// this frame. Takes no arguments and returns nothing.
void TestSession() {
|
||||
// We define local variables for all types for which we want to test
|
||||
// pretty-printing.
|
||||
// Then, at the end of this function, we trap to the debugger, so that
|
||||
// test instrumentation can print values from this frame by interacting
|
||||
// with the debugger.
|
||||
// The test instrumentation is in pyarrow/tests/test_gdb.py
|
||||
|
||||
// Silence unused-variable warnings for the declarations that follow; the
// matching diagnostic `pop` is just before the DebugTrap() call.
#ifdef __clang__
|
||||
_Pragma("clang diagnostic push");
|
||||
_Pragma("clang diagnostic ignored \"-Wunused-variable\"");
|
||||
#elif defined(__GNUC__)
|
||||
_Pragma("GCC diagnostic push");
|
||||
_Pragma("GCC diagnostic ignored \"-Wunused-variable\"");
|
||||
#endif
|
||||
|
||||
// Call into the nested `arrow` namespace declared earlier in this file.
arrow::DummyFunction();
|
||||
|
||||
// Status & Result
|
||||
auto ok_status = Status::OK();
|
||||
auto error_status = Status::IOError("This is an error");
|
||||
auto error_detail_status =
|
||||
error_status.WithDetail(std::make_shared<CustomStatusDetail>());
|
||||
auto ok_result = Result<int>(42);
|
||||
auto error_result = Result<int>(error_status);
|
||||
auto error_detail_result = Result<int>(error_detail_status);
|
||||
|
||||
// String views
|
||||
std::string_view string_view_abc{"abc"};
|
||||
std::string special_chars = std::string("foo\"bar") + '\x00' + "\r\n\t\x1f";
|
||||
std::string_view string_view_special_chars(special_chars);
|
||||
|
||||
// Buffers
|
||||
Buffer buffer_null{nullptr, 0};
|
||||
Buffer buffer_abc{string_view_abc};
|
||||
Buffer buffer_special_chars{string_view_special_chars};
|
||||
char mutable_array[3] = {'a', 'b', 'c'};
|
||||
MutableBuffer buffer_mutable{reinterpret_cast<uint8_t*>(mutable_array), 3};
|
||||
auto heap_buffer = std::make_shared<Buffer>(string_view_abc);
|
||||
auto heap_buffer_mutable = *AllocateBuffer(buffer_abc.size());
|
||||
memcpy(heap_buffer_mutable->mutable_data(), buffer_abc.data(), buffer_abc.size());
|
||||
|
||||
// KeyValueMetadata
|
||||
auto empty_metadata = key_value_metadata({}, {});
|
||||
auto metadata = key_value_metadata(
|
||||
{"key_text", "key_binary"}, {"some value", std::string("z") + '\x00' + "\x1f\xff"});
|
||||
|
||||
// Decimals
|
||||
Decimal128 decimal128_zero{};
|
||||
Decimal128 decimal128_pos{"98765432109876543210987654321098765432"};
|
||||
Decimal128 decimal128_neg{"-98765432109876543210987654321098765432"};
|
||||
BasicDecimal128 basic_decimal128_zero{};
|
||||
BasicDecimal128 basic_decimal128_pos{decimal128_pos.native_endian_array()};
|
||||
BasicDecimal128 basic_decimal128_neg{decimal128_neg.native_endian_array()};
|
||||
Decimal256 decimal256_zero{};
|
||||
Decimal256 decimal256_pos{
|
||||
"9876543210987654321098765432109876543210987654321098765432109876543210987654"};
|
||||
Decimal256 decimal256_neg{
|
||||
"-9876543210987654321098765432109876543210987654321098765432109876543210987654"};
|
||||
BasicDecimal256 basic_decimal256_zero{};
|
||||
BasicDecimal256 basic_decimal256_pos{decimal256_pos.native_endian_array()};
|
||||
BasicDecimal256 basic_decimal256_neg{decimal256_neg.native_endian_array()};
|
||||
|
||||
// Data types
|
||||
NullType null_type;
|
||||
auto heap_null_type = null();
|
||||
BooleanType bool_type;
|
||||
auto heap_bool_type = boolean();
|
||||
|
||||
Date32Type date32_type;
|
||||
Date64Type date64_type;
|
||||
Time32Type time_type_s(TimeUnit::SECOND);
|
||||
Time32Type time_type_ms(TimeUnit::MILLI);
|
||||
Time64Type time_type_us(TimeUnit::MICRO);
|
||||
Time64Type time_type_ns(TimeUnit::NANO);
|
||||
auto heap_time_type_ns = time64(TimeUnit::NANO);
|
||||
|
||||
TimestampType timestamp_type_s(TimeUnit::SECOND);
|
||||
TimestampType timestamp_type_ms_timezone(TimeUnit::MILLI, "Europe/Paris");
|
||||
TimestampType timestamp_type_us(TimeUnit::MICRO);
|
||||
TimestampType timestamp_type_ns_timezone(TimeUnit::NANO, "Europe/Paris");
|
||||
auto heap_timestamp_type_ns_timezone = timestamp(TimeUnit::NANO, "Europe/Paris");
|
||||
|
||||
DayTimeIntervalType day_time_interval_type;
|
||||
MonthIntervalType month_interval_type;
|
||||
MonthDayNanoIntervalType month_day_nano_interval_type;
|
||||
|
||||
DurationType duration_type_s(TimeUnit::SECOND);
|
||||
DurationType duration_type_ns(TimeUnit::NANO);
|
||||
|
||||
BinaryType binary_type;
|
||||
StringType string_type;
|
||||
LargeBinaryType large_binary_type;
|
||||
LargeStringType large_string_type;
|
||||
FixedSizeBinaryType fixed_size_binary_type(10);
|
||||
auto heap_fixed_size_binary_type = fixed_size_binary(10);
|
||||
|
||||
Decimal128Type decimal128_type(16, 5);
|
||||
Decimal256Type decimal256_type(42, 12);
|
||||
auto heap_decimal128_type = decimal128(16, 5);
|
||||
|
||||
ListType list_type(uint8());
|
||||
LargeListType large_list_type(large_utf8());
|
||||
auto heap_list_type = list(uint8());
|
||||
auto heap_large_list_type = large_list(large_utf8());
|
||||
|
||||
FixedSizeListType fixed_size_list_type(float64(), 3);
|
||||
auto heap_fixed_size_list_type = fixed_size_list(float64(), 3);
|
||||
|
||||
DictionaryType dict_type_unordered(int16(), utf8());
|
||||
DictionaryType dict_type_ordered(int16(), utf8(), /*ordered=*/true);
|
||||
auto heap_dict_type = dictionary(int16(), utf8());
|
||||
|
||||
MapType map_type_unsorted(utf8(), binary());
|
||||
MapType map_type_sorted(utf8(), binary(), /*keys_sorted=*/true);
|
||||
auto heap_map_type = map(utf8(), binary());
|
||||
|
||||
StructType struct_type_empty({});
|
||||
StructType struct_type(
|
||||
{field("ints", int8()), field("strs", utf8(), /*nullable=*/false)});
|
||||
auto heap_struct_type =
|
||||
struct_({field("ints", int8()), field("strs", utf8(), /*nullable=*/false)});
|
||||
|
||||
std::vector<int8_t> union_type_codes({7, 42});
|
||||
FieldVector union_fields(
|
||||
{field("ints", int8()), field("strs", utf8(), /*nullable=*/false)});
|
||||
SparseUnionType sparse_union_type(union_fields, union_type_codes);
|
||||
DenseUnionType dense_union_type(union_fields, union_type_codes);
|
||||
|
||||
UuidType uuid_type{};
|
||||
std::shared_ptr<DataType> heap_uuid_type = std::make_shared<UuidType>();
|
||||
|
||||
// Schema
|
||||
auto schema_empty = schema({});
|
||||
auto schema_non_empty = schema({field("ints", int8()), field("strs", utf8())});
|
||||
auto schema_with_metadata = schema_non_empty->WithMetadata(
|
||||
key_value_metadata({"key1", "key2"}, {"value1", "value2"}));
|
||||
|
||||
// Fields
|
||||
Field int_field("ints", int64());
|
||||
Field float_field("floats", float32(), /*nullable=*/false);
|
||||
auto heap_int_field = field("ints", int64());
|
||||
|
||||
// Scalars
|
||||
NullScalar null_scalar;
|
||||
auto heap_null_scalar = MakeNullScalar(null());
|
||||
|
||||
BooleanScalar bool_scalar_null{};
|
||||
BooleanScalar bool_scalar{true};
|
||||
auto heap_bool_scalar = *MakeScalar(boolean(), true);
|
||||
|
||||
Int8Scalar int8_scalar_null{};
|
||||
UInt8Scalar uint8_scalar_null{};
|
||||
Int64Scalar int64_scalar_null{};
|
||||
UInt64Scalar uint64_scalar_null{};
|
||||
Int8Scalar int8_scalar{-42};
|
||||
UInt8Scalar uint8_scalar{234};
|
||||
Int64Scalar int64_scalar{-9223372036854775807LL - 1};
|
||||
UInt64Scalar uint64_scalar{18446744073709551615ULL};
|
||||
HalfFloatScalar half_float_scalar{48640}; // -1.5
|
||||
FloatScalar float_scalar{1.25f};
|
||||
DoubleScalar double_scalar{2.5};
|
||||
|
||||
Time32Scalar time_scalar_s{100, TimeUnit::SECOND};
|
||||
Time32Scalar time_scalar_ms{1000, TimeUnit::MILLI};
|
||||
Time64Scalar time_scalar_us{10000, TimeUnit::MICRO};
|
||||
Time64Scalar time_scalar_ns{100000, TimeUnit::NANO};
|
||||
Time64Scalar time_scalar_null{time64(TimeUnit::NANO)};
|
||||
|
||||
DurationScalar duration_scalar_s{-100, TimeUnit::SECOND};
|
||||
DurationScalar duration_scalar_ms{-1000, TimeUnit::MILLI};
|
||||
DurationScalar duration_scalar_us{-10000, TimeUnit::MICRO};
|
||||
DurationScalar duration_scalar_ns{-100000, TimeUnit::NANO};
|
||||
DurationScalar duration_scalar_null{duration(TimeUnit::NANO)};
|
||||
|
||||
TimestampScalar timestamp_scalar_s{12345, timestamp(TimeUnit::SECOND)};
|
||||
TimestampScalar timestamp_scalar_ms{-123456, timestamp(TimeUnit::MILLI)};
|
||||
TimestampScalar timestamp_scalar_us{1234567, timestamp(TimeUnit::MICRO)};
|
||||
TimestampScalar timestamp_scalar_ns{-12345678, timestamp(TimeUnit::NANO)};
|
||||
TimestampScalar timestamp_scalar_null{timestamp(TimeUnit::NANO)};
|
||||
|
||||
TimestampScalar timestamp_scalar_s_tz{12345,
|
||||
timestamp(TimeUnit::SECOND, "Europe/Paris")};
|
||||
TimestampScalar timestamp_scalar_ms_tz{-123456,
|
||||
timestamp(TimeUnit::MILLI, "Europe/Paris")};
|
||||
TimestampScalar timestamp_scalar_us_tz{1234567,
|
||||
timestamp(TimeUnit::MICRO, "Europe/Paris")};
|
||||
TimestampScalar timestamp_scalar_ns_tz{-12345678,
|
||||
timestamp(TimeUnit::NANO, "Europe/Paris")};
|
||||
TimestampScalar timestamp_scalar_null_tz{timestamp(TimeUnit::NANO, "Europe/Paris")};
|
||||
|
||||
MonthIntervalScalar month_interval_scalar{23};
|
||||
MonthIntervalScalar month_interval_scalar_null{};
|
||||
DayTimeIntervalScalar day_time_interval_scalar{{23, -456}};
|
||||
DayTimeIntervalScalar day_time_interval_scalar_null{};
|
||||
MonthDayNanoIntervalScalar month_day_nano_interval_scalar{{1, 23, -456}};
|
||||
MonthDayNanoIntervalScalar month_day_nano_interval_scalar_null{};
|
||||
|
||||
Date32Scalar date32_scalar{23};
|
||||
Date32Scalar date32_scalar_null{};
|
||||
Date64Scalar date64_scalar{45 * 86400000LL};
|
||||
Date64Scalar date64_scalar_null{};
|
||||
|
||||
Decimal128Scalar decimal128_scalar_pos_scale_pos{Decimal128("1234567"),
|
||||
decimal128(10, 4)};
|
||||
Decimal128Scalar decimal128_scalar_pos_scale_neg{Decimal128("-1234567"),
|
||||
decimal128(10, 4)};
|
||||
Decimal128Scalar decimal128_scalar_neg_scale_pos{Decimal128("1234567"),
|
||||
decimal128(10, -4)};
|
||||
Decimal128Scalar decimal128_scalar_neg_scale_neg{Decimal128("-1234567"),
|
||||
decimal128(10, -4)};
|
||||
Decimal128Scalar decimal128_scalar_null{decimal128(10, 4)};
|
||||
auto heap_decimal128_scalar = *MakeScalar(decimal128(10, 4), Decimal128("1234567"));
|
||||
|
||||
Decimal256Scalar decimal256_scalar_pos_scale_pos{
|
||||
Decimal256("1234567890123456789012345678901234567890123456"), decimal256(50, 4)};
|
||||
Decimal256Scalar decimal256_scalar_pos_scale_neg{
|
||||
Decimal256("-1234567890123456789012345678901234567890123456"), decimal256(50, 4)};
|
||||
Decimal256Scalar decimal256_scalar_neg_scale_pos{
|
||||
Decimal256("1234567890123456789012345678901234567890123456"), decimal256(50, -4)};
|
||||
Decimal256Scalar decimal256_scalar_neg_scale_neg{
|
||||
Decimal256("-1234567890123456789012345678901234567890123456"), decimal256(50, -4)};
|
||||
Decimal256Scalar decimal256_scalar_null{decimal256(50, 4)};
|
||||
auto heap_decimal256_scalar = *MakeScalar(
|
||||
decimal256(50, 4), Decimal256("1234567890123456789012345678901234567890123456"));
|
||||
|
||||
BinaryScalar binary_scalar_null{};
|
||||
BinaryScalar binary_scalar_unallocated{std::shared_ptr<Buffer>{nullptr}};
|
||||
BinaryScalar binary_scalar_empty{Buffer::FromString("")};
|
||||
BinaryScalar binary_scalar_abc{Buffer::FromString("abc")};
|
||||
BinaryScalar binary_scalar_bytes{
|
||||
Buffer::FromString(std::string() + '\x00' + "\x1f\xff")};
|
||||
|
||||
StringScalar string_scalar_null{};
|
||||
StringScalar string_scalar_unallocated{std::shared_ptr<Buffer>{nullptr}};
|
||||
StringScalar string_scalar_empty{Buffer::FromString("")};
|
||||
StringScalar string_scalar_hehe{Buffer::FromString("héhé")};
|
||||
StringScalar string_scalar_invalid_chars{
|
||||
Buffer::FromString(std::string("abc") + '\x00' + "def\xffghi")};
|
||||
|
||||
LargeBinaryScalar large_binary_scalar_abc{Buffer::FromString("abc")};
|
||||
LargeStringScalar large_string_scalar_hehe{Buffer::FromString("héhé")};
|
||||
|
||||
FixedSizeBinaryScalar fixed_size_binary_scalar{Buffer::FromString("abc"),
|
||||
fixed_size_binary(3)};
|
||||
FixedSizeBinaryScalar fixed_size_binary_scalar_null{
|
||||
Buffer::FromString(" "), fixed_size_binary(3), /*is_valid=*/false};
|
||||
|
||||
std::shared_ptr<Array> dict_array;
|
||||
dict_array = *ArrayFromJSONString(utf8(), R"(["foo", "bar", "quux"])");
|
||||
DictionaryScalar dict_scalar{{std::make_shared<Int8Scalar>(42), dict_array},
|
||||
dictionary(int8(), utf8())};
|
||||
DictionaryScalar dict_scalar_null{dictionary(int8(), utf8())};
|
||||
|
||||
std::shared_ptr<Array> list_value_array = *ArrayFromJSONString(int32(), R"([4, 5, 6])");
|
||||
std::shared_ptr<Array> list_zero_length = *ArrayFromJSONString(int32(), R"([])");
|
||||
ListScalar list_scalar{list_value_array};
|
||||
ListScalar list_scalar_null{list_zero_length, list(int32()), /*is_valid=*/false};
|
||||
LargeListScalar large_list_scalar{list_value_array};
|
||||
LargeListScalar large_list_scalar_null{list_zero_length, large_list(int32()),
|
||||
/*is_valid=*/false};
|
||||
FixedSizeListScalar fixed_size_list_scalar{list_value_array};
|
||||
FixedSizeListScalar fixed_size_list_scalar_null{
|
||||
list_value_array, fixed_size_list(int32(), 3), /*is_valid=*/false};
|
||||
|
||||
auto struct_scalar_type = struct_({field("ints", int32()), field("strs", utf8())});
|
||||
StructScalar struct_scalar{
|
||||
ScalarVector{MakeScalar(int32_t(42)), MakeScalar("some text")}, struct_scalar_type};
|
||||
StructScalar struct_scalar_null{struct_scalar.value, struct_scalar_type,
|
||||
/*is_valid=*/false};
|
||||
|
||||
auto sparse_union_scalar_type =
|
||||
sparse_union(FieldVector{field("ints", int32()), field("strs", utf8())}, {7, 42});
|
||||
auto dense_union_scalar_type =
|
||||
dense_union(FieldVector{field("ints", int32()), field("strs", utf8())}, {7, 42});
|
||||
std::vector<std::shared_ptr<Scalar>> union_values = {MakeScalar(int32_t(43)),
|
||||
MakeNullScalar(utf8())};
|
||||
SparseUnionScalar sparse_union_scalar{union_values, 7, sparse_union_scalar_type};
|
||||
DenseUnionScalar dense_union_scalar{union_values[0], 7, dense_union_scalar_type};
|
||||
|
||||
union_values[0] = MakeNullScalar(int32());
|
||||
SparseUnionScalar sparse_union_scalar_null{union_values, 7, sparse_union_scalar_type};
|
||||
DenseUnionScalar dense_union_scalar_null{union_values[0], 7, dense_union_scalar_type};
|
||||
|
||||
auto extension_scalar_type = std::make_shared<UuidType>();
|
||||
ExtensionScalar extension_scalar{
|
||||
std::make_shared<FixedSizeBinaryScalar>(Buffer::FromString("0123456789abcdef"),
|
||||
extension_scalar_type->storage_type()),
|
||||
extension_scalar_type};
|
||||
ExtensionScalar extension_scalar_null{extension_scalar.value, extension_scalar_type,
|
||||
/*is_valid=*/false};
|
||||
|
||||
auto heap_map_scalar =
|
||||
*ScalarFromJSONString(map(utf8(), int32()), R"([["a", 5], ["b", 6]])");
|
||||
auto heap_map_scalar_null = MakeNullScalar(heap_map_scalar->type);
|
||||
|
||||
// Array and ArrayData
|
||||
auto heap_null_array = SliceArrayFromJSON(null(), "[null, null]");
|
||||
|
||||
auto heap_int32_array = SliceArrayFromJSON(int32(), "[-5, 6, null, 42]");
|
||||
ArrayData int32_array_data{*heap_int32_array->data()};
|
||||
Int32Array int32_array{heap_int32_array->data()->Copy()};
|
||||
|
||||
auto heap_int32_array_no_nulls = SliceArrayFromJSON(int32(), "[-5, 6, 3, 42]");
|
||||
|
||||
const char* json_int32_array = "[-1, 2, -3, 4, null, -5, 6, -7, 8, null, -9, -10]";
|
||||
auto heap_int32_array_sliced_1_9 = SliceArrayFromJSON(int32(), json_int32_array, 1, 9);
|
||||
auto heap_int32_array_sliced_2_6 = SliceArrayFromJSON(int32(), json_int32_array, 2, 6);
|
||||
auto heap_int32_array_sliced_8_4 = SliceArrayFromJSON(int32(), json_int32_array, 8, 4);
|
||||
auto heap_int32_array_sliced_empty =
|
||||
SliceArrayFromJSON(int32(), json_int32_array, 6, 0);
|
||||
|
||||
const char* json_bool_array =
|
||||
"[false, false, true, true, null, null, false, false, true, true, "
|
||||
"null, null, false, false, true, true, null, null]";
|
||||
auto heap_bool_array = SliceArrayFromJSON(boolean(), json_bool_array);
|
||||
auto heap_bool_array_sliced_1_9 = SliceArrayFromJSON(boolean(), json_bool_array, 1, 9);
|
||||
auto heap_bool_array_sliced_2_6 = SliceArrayFromJSON(boolean(), json_bool_array, 2, 6);
|
||||
auto heap_bool_array_sliced_empty =
|
||||
SliceArrayFromJSON(boolean(), json_bool_array, 6, 0);
|
||||
|
||||
auto heap_list_array = SliceArrayFromJSON(list(int64()), "[[1, 2], null, []]");
|
||||
ListArray list_array{heap_list_array->data()};
|
||||
|
||||
const char* json_double_array = "[-1.5, null]";
|
||||
auto heap_double_array = SliceArrayFromJSON(float64(), json_double_array);
|
||||
|
||||
const char* json_float16_array = "[0, 48640]";
|
||||
auto heap_float16_array =
|
||||
*SliceArrayFromJSON(uint16(), json_float16_array)->View(float16());
|
||||
|
||||
auto heap_date32_array =
|
||||
SliceArrayFromJSON(date32(), "[0, null, 18336, -9004, -719162, -719163]");
|
||||
auto heap_date64_array = SliceArrayFromJSON(
|
||||
date64(), "[1584230400000, -777945600000, -62135596800000, -62135683200000, 123]");
|
||||
|
||||
const char* json_time_array = "[null, -123, 456]";
|
||||
auto heap_time32_array_s =
|
||||
SliceArrayFromJSON(time32(TimeUnit::SECOND), json_time_array);
|
||||
auto heap_time32_array_ms =
|
||||
SliceArrayFromJSON(time32(TimeUnit::MILLI), json_time_array);
|
||||
auto heap_time64_array_us =
|
||||
SliceArrayFromJSON(time64(TimeUnit::MICRO), json_time_array);
|
||||
auto heap_time64_array_ns = SliceArrayFromJSON(time64(TimeUnit::NANO), json_time_array);
|
||||
|
||||
auto heap_month_interval_array =
|
||||
SliceArrayFromJSON(month_interval(), "[123, -456, null]");
|
||||
auto heap_day_time_interval_array =
|
||||
SliceArrayFromJSON(day_time_interval(), "[[1, -600], null]");
|
||||
auto heap_month_day_nano_interval_array =
|
||||
SliceArrayFromJSON(month_day_nano_interval(), "[[1, -600, 5000], null]");
|
||||
|
||||
const char* json_duration_array = "[null, -1234567890123456789]";
|
||||
auto heap_duration_array_s =
|
||||
SliceArrayFromJSON(duration(TimeUnit::SECOND), json_duration_array);
|
||||
auto heap_duration_array_ns =
|
||||
SliceArrayFromJSON(duration(TimeUnit::NANO), json_duration_array);
|
||||
|
||||
auto heap_timestamp_array_s = SliceArrayFromJSON(
|
||||
timestamp(TimeUnit::SECOND),
|
||||
R"([null, "1970-01-01 00:00:00", "1900-02-28 12:34:56", "3989-07-14 00:00:00"])");
|
||||
auto heap_timestamp_array_ms = SliceArrayFromJSON(
|
||||
timestamp(TimeUnit::MILLI),
|
||||
R"([null, "1900-02-28 12:34:56.123", "3989-07-14 00:00:00.789"])");
|
||||
auto heap_timestamp_array_us = SliceArrayFromJSON(
|
||||
timestamp(TimeUnit::MICRO),
|
||||
R"([null, "1900-02-28 12:34:56.654321", "3989-07-14 00:00:00.456789"])");
|
||||
auto heap_timestamp_array_ns = SliceArrayFromJSON(
|
||||
timestamp(TimeUnit::NANO), R"([null, "1900-02-28 12:34:56.987654321"])");
|
||||
|
||||
auto heap_decimal128_array = SliceArrayFromJSON(
|
||||
decimal128(30, 6),
|
||||
R"([null, "-1234567890123456789.012345", "1234567890123456789.012345"])");
|
||||
auto heap_decimal256_array = SliceArrayFromJSON(
|
||||
decimal256(50, 6), R"([null, "-123456789012345678901234567890123456789.012345"])");
|
||||
auto heap_decimal128_array_sliced = heap_decimal128_array->Slice(1, 1);
|
||||
|
||||
auto heap_fixed_size_binary_array =
|
||||
SliceArrayFromJSON(fixed_size_binary(3), "[null, \"abc\", \"\\u0000\\u001f\xff\"]");
|
||||
auto heap_fixed_size_binary_array_zero_width =
|
||||
SliceArrayFromJSON(fixed_size_binary(0), R"([null, ""])");
|
||||
auto heap_fixed_size_binary_array_sliced = heap_fixed_size_binary_array->Slice(1, 1);
|
||||
|
||||
const char* json_binary_array = "[null, \"abcd\", \"\\u0000\\u001f\xff\"]";
|
||||
auto heap_binary_array = SliceArrayFromJSON(binary(), json_binary_array);
|
||||
auto heap_large_binary_array = SliceArrayFromJSON(large_binary(), json_binary_array);
|
||||
const char* json_string_array = "[null, \"héhé\", \"invalid \xff char\"]";
|
||||
auto heap_string_array = SliceArrayFromJSON(utf8(), json_string_array);
|
||||
auto heap_large_string_array = SliceArrayFromJSON(large_utf8(), json_string_array);
|
||||
auto heap_binary_array_sliced = heap_binary_array->Slice(1, 1);
|
||||
|
||||
// ChunkedArray
|
||||
ArrayVector array_chunks(2);
|
||||
array_chunks[0] = *ArrayFromJSONString(int32(), "[1, 2]");
|
||||
array_chunks[1] = *ArrayFromJSONString(int32(), "[3, null, 4]");
|
||||
ChunkedArray chunked_array{array_chunks};
|
||||
|
||||
// RecordBatch
|
||||
auto batch_schema = schema({field("ints", int32()), field("strs", utf8())});
|
||||
ArrayVector batch_columns{2};
|
||||
batch_columns[0] = *ArrayFromJSONString(int32(), "[1, 2, 3]");
|
||||
batch_columns[1] = *ArrayFromJSONString(utf8(), R"(["abc", null, "def"])");
|
||||
auto batch = RecordBatch::Make(batch_schema, /*num_rows=*/3, batch_columns);
|
||||
auto batch_with_metadata = batch->ReplaceSchemaMetadata(
|
||||
key_value_metadata({"key1", "key2", "key3"}, {"value1", "value2", "value3"}));
|
||||
|
||||
// Table
|
||||
auto col1 = ChunkedArrayFromJSONString(int32(), {"[1, 2, 3]", "[4, 5]"});
|
||||
auto col2 = ChunkedArrayFromJSONString(
|
||||
utf8(), {R"(["abc", null])", R"(["def"])", R"(["ghi", "jkl"])"});
|
||||
auto table = Table::Make(batch_schema, {*col1, *col2});
|
||||
|
||||
// Datum
|
||||
Datum empty_datum{};
|
||||
Datum scalar_datum{MakeNullScalar(boolean())};
|
||||
Datum array_datum{heap_int32_array};
|
||||
Datum chunked_array_datum{chunked_array};
|
||||
Datum batch_datum{batch};
|
||||
Datum table_datum{table};
|
||||
|
||||
#ifdef __clang__
|
||||
_Pragma("clang diagnostic pop");
|
||||
#elif defined(__GNUC__)
|
||||
_Pragma("GCC diagnostic pop");
|
||||
#endif
|
||||
|
||||
// Hook into debugger
|
||||
::arrow::internal::DebugTrap();
|
||||
}
|
||||
|
||||
} // namespace gdb
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,29 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/python/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace gdb {
|
||||
|
||||
// Exported entry point of the debugger test session; takes no arguments and
// returns nothing.
ARROW_PYTHON_EXPORT
|
||||
void TestSession();
|
||||
|
||||
} // namespace gdb
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,504 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// helpers.h includes a NumPy header, so we include this first
|
||||
#include "arrow/python/numpy_init.h"
|
||||
#include "arrow/python/numpy_interop.h"
|
||||
|
||||
#include "arrow/python/helpers.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <mutex>
|
||||
#include <sstream>
|
||||
#include <type_traits>
|
||||
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/python/decimal.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/config.h"
|
||||
#include "arrow/util/float16.h"
|
||||
#include "arrow/util/logging.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using internal::checked_cast;
|
||||
|
||||
namespace py {
|
||||
|
||||
// Expands to a `case Type::NAME:` label whose body returns the result of
// calling FACTORY() with no arguments. Only meaningful inside a `switch`.
#define GET_PRIMITIVE_TYPE(NAME, FACTORY) \
|
||||
case Type::NAME: \
|
||||
return FACTORY()
|
||||
|
||||
// Return the DataType produced by the argument-less factory that matches
// `type`. Type ids not listed in the switch yield nullptr.
std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
  switch (type) {
    GET_PRIMITIVE_TYPE(NA, null);
    GET_PRIMITIVE_TYPE(BOOL, boolean);
    // Integers
    GET_PRIMITIVE_TYPE(INT8, int8);
    GET_PRIMITIVE_TYPE(INT16, int16);
    GET_PRIMITIVE_TYPE(INT32, int32);
    GET_PRIMITIVE_TYPE(INT64, int64);
    GET_PRIMITIVE_TYPE(UINT8, uint8);
    GET_PRIMITIVE_TYPE(UINT16, uint16);
    GET_PRIMITIVE_TYPE(UINT32, uint32);
    GET_PRIMITIVE_TYPE(UINT64, uint64);
    // Floating point
    GET_PRIMITIVE_TYPE(HALF_FLOAT, float16);
    GET_PRIMITIVE_TYPE(FLOAT, float32);
    GET_PRIMITIVE_TYPE(DOUBLE, float64);
    // Temporal
    GET_PRIMITIVE_TYPE(DATE32, date32);
    GET_PRIMITIVE_TYPE(DATE64, date64);
    GET_PRIMITIVE_TYPE(INTERVAL_MONTH_DAY_NANO, month_day_nano_interval);
    // Binary-like
    GET_PRIMITIVE_TYPE(BINARY, binary);
    GET_PRIMITIVE_TYPE(STRING, utf8);
    GET_PRIMITIVE_TYPE(LARGE_BINARY, large_binary);
    GET_PRIMITIVE_TYPE(LARGE_STRING, large_utf8);
    GET_PRIMITIVE_TYPE(BINARY_VIEW, binary_view);
    GET_PRIMITIVE_TYPE(STRING_VIEW, utf8_view);
    default:
      break;
  }
  return nullptr;
}
|
||||
|
||||
// Build a Python float from the bit pattern of a half-precision value.
PyObject* PyFloat_FromHalf(uint16_t value) {
  // Reinterpret the bits as a Float16, widen to double, then box it.
  const double widened = arrow::util::Float16::FromBits(value).ToDouble();
  return PyFloat_FromDouble(widened);
}
|
||||
|
||||
// Convert a Python object to the bit pattern of a half-precision float.
//
// Accepts Python `float` objects and, when NumPy is available, NumPy half
// scalars. Anything else produces a TypeError status.
Result<uint16_t> PyFloat_AsHalf(PyObject* obj) {
  if (PyFloat_Check(obj)) {
    const double as_double = PyFloat_AsDouble(obj);
    return arrow::util::Float16::FromDouble(as_double).bits();
  }
  if (has_numpy() && PyArray_IsScalar(obj, Half)) {
    return PyArrayScalar_VAL(obj, Half);
  }
  return Status::TypeError("conversion to float16 expects a `float` or ",
                           "`np.float16` object, got ", Py_TYPE(obj)->tp_name);
}
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Copy the contents of a Python bytes object into a std::string.
// `obj` must be a bytes object (checked in debug builds only).
std::string PyBytes_AsStdString(PyObject* obj) {
  ARROW_DCHECK(PyBytes_Check(obj));
  const char* bytes = PyBytes_AS_STRING(obj);
  const Py_ssize_t num_bytes = PyBytes_GET_SIZE(obj);
  return std::string(bytes, num_bytes);
}
|
||||
|
||||
// Store the UTF-8 encoding of a Python str object into `*out`.
// `obj` must be a str object (checked in debug builds only). A pending
// Python error after the encoding call is returned as a Status.
Status PyUnicode_AsStdString(PyObject* obj, std::string* out) {
  ARROW_DCHECK(PyUnicode_Check(obj));
  Py_ssize_t utf8_length = 0;
  // The utf-8 representation is cached on the unicode object
  const char* utf8_bytes = PyUnicode_AsUTF8AndSize(obj, &utf8_length);
  RETURN_IF_PYERROR();
  out->assign(utf8_bytes, utf8_length);
  return Status::OK();
}
|
||||
|
||||
// Return repr(obj) encoded as UTF-8 with the "backslashreplace" error handler.
//
// Never reports an error: if repr() or the encoding step fails, the Python
// error is cleared and a placeholder naming the object's type is returned.
std::string PyObject_StdStringRepr(PyObject* obj) {
  OwnedRef repr_ref(PyObject_Repr(obj));
  if (repr_ref) {
    OwnedRef encoded_ref(
        PyUnicode_AsEncodedString(repr_ref.obj(), "utf8", "backslashreplace"));
    if (encoded_ref) {
      return PyBytes_AsStdString(encoded_ref.obj());
    }
  }

  // Either step above failed: discard the Python error and fall back.
  PyErr_Clear();
  std::stringstream fallback;
  fallback << "<object of type '" << Py_TYPE(obj)->tp_name << "' repr() failed>";
  return fallback.str();
}
|
||||
|
||||
// Store str(obj), UTF-8 encoded, into `*out`.
// A Python error raised by str() is returned as a Status.
Status PyObject_StdStringStr(PyObject* obj, std::string* out) {
  OwnedRef str_ref(PyObject_Str(obj));
  RETURN_IF_PYERROR();
  PyObject* as_unicode = str_ref.obj();
  return PyUnicode_AsStdString(as_unicode, out);
}
|
||||
|
||||
// Report whether `module_name` is a key of the interpreter's module dict.
//
// Returns true/false on success. Any Python error raised while building the
// lookup key or while querying the dict is returned as a Status.
Result<bool> IsModuleImported(const std::string& module_name) {
  OwnedRef key(PyUnicode_FromString(module_name.c_str()));
  // PyUnicode_FromString returns NULL on failure (e.g. the name is not valid
  // UTF-8); bail out here instead of handing a NULL key to PyDict_Contains.
  RETURN_IF_PYERROR();
  // PyImport_GetModuleDict returns with a borrowed reference
  const int is_imported = PyDict_Contains(PyImport_GetModuleDict(), key.obj());
  // PyDict_Contains returns -1 with an exception set on error.
  RETURN_IF_PYERROR();
  return is_imported != 0;
}
|
||||
|
||||
// Import the module called `module_name` and store the resulting object in
// `*ref`. A Python error raised by the import is returned as a Status and
// leaves `*ref` untouched.
Status ImportModule(const std::string& module_name, OwnedRef* ref) {
  PyObject* imported = PyImport_ImportModule(module_name.c_str());
  RETURN_IF_PYERROR();
  ref->reset(imported);
  return Status::OK();
}
|
||||
|
||||
// Look up attribute `name` on `module` and store the resulting object in
// `*ref`. A Python error raised by the lookup is returned as a Status and
// leaves `*ref` untouched.
Status ImportFromModule(PyObject* module, const std::string& name, OwnedRef* ref) {
  PyObject* attribute = PyObject_GetAttrString(module, name.c_str());
  RETURN_IF_PYERROR();
  ref->reset(attribute);
  return Status::OK();
}
|
||||
|
||||
namespace {
|
||||
|
||||
// Build the Invalid status reported when `obj` does not fit the target C
// integer type. A non-empty `overflow_message` is used verbatim; otherwise
// the message is derived from str(obj). If str(obj) itself fails, that
// failure is returned instead.
Status IntegerOverflowStatus(PyObject* obj, const std::string& overflow_message) {
  if (!overflow_message.empty()) {
    return Status::Invalid(overflow_message);
  }
  std::string value_as_text;
  RETURN_NOT_OK(PyObject_StdStringStr(obj, &value_as_text));
  return Status::Invalid("Value ", value_as_text,
                         " too large to fit in C integer type");
}
|
||||
|
||||
// Convert an arbitrary Python object to a Python int.
//
// PyNumber_Index() is tried first. If it fails, its error is cleared and the
// type's nb_int slot is used when present. Objects offering neither produce
// a TypeError status; an error raised by nb_int is propagated.
Result<OwnedRef> PyObjectToPyInt(PyObject* obj) {
  // Try to call __index__ or __int__ on `obj`
  // (starting from Python 3.10, the latter isn't done anymore by PyLong_AsLong*).
  OwnedRef as_int(PyNumber_Index(obj));
  if (!as_int) {
    PyErr_Clear();
    const PyNumberMethods* number_methods = Py_TYPE(obj)->tp_as_number;
    if (number_methods == nullptr || number_methods->nb_int == nullptr) {
      return Status::TypeError(
          "object of type ",
          PyObject_StdStringRepr(reinterpret_cast<PyObject*>(Py_TYPE(obj))),
          " cannot be converted to int");
    }
    as_int.reset(number_methods->nb_int(obj));
    if (!as_int) {
      RETURN_IF_PYERROR();
    }
    ARROW_DCHECK(as_int);
  }
  return std::move(as_int);
}
|
||||
|
||||
// Extract C signed int from Python object
|
||||
template <typename Int, enable_if_t<std::is_signed<Int>::value, Int> = 0>
|
||||
Status CIntFromPythonImpl(PyObject* obj, Int* out, const std::string& overflow_message) {
|
||||
static_assert(sizeof(Int) <= sizeof(long long), // NOLINT
|
||||
"integer type larger than long long");
|
||||
|
||||
OwnedRef ref;
|
||||
if (!PyLong_Check(obj)) {
|
||||
ARROW_ASSIGN_OR_RAISE(ref, PyObjectToPyInt(obj));
|
||||
obj = ref.obj();
|
||||
}
|
||||
|
||||
if (sizeof(Int) > sizeof(long)) { // NOLINT
|
||||
const auto value = PyLong_AsLongLong(obj);
|
||||
if (ARROW_PREDICT_FALSE(value == -1)) {
|
||||
RETURN_IF_PYERROR();
|
||||
}
|
||||
if (ARROW_PREDICT_FALSE(value < std::numeric_limits<Int>::min() ||
|
||||
value > std::numeric_limits<Int>::max())) {
|
||||
return IntegerOverflowStatus(obj, overflow_message);
|
||||
}
|
||||
*out = static_cast<Int>(value);
|
||||
} else {
|
||||
const auto value = PyLong_AsLong(obj);
|
||||
if (ARROW_PREDICT_FALSE(value == -1)) {
|
||||
RETURN_IF_PYERROR();
|
||||
}
|
||||
if (ARROW_PREDICT_FALSE(value < std::numeric_limits<Int>::min() ||
|
||||
value > std::numeric_limits<Int>::max())) {
|
||||
return IntegerOverflowStatus(obj, overflow_message);
|
||||
}
|
||||
*out = static_cast<Int>(value);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Extract C unsigned int from Python object
|
||||
template <typename Int, enable_if_t<std::is_unsigned<Int>::value, Int> = 0>
|
||||
Status CIntFromPythonImpl(PyObject* obj, Int* out, const std::string& overflow_message) {
|
||||
static_assert(sizeof(Int) <= sizeof(unsigned long long), // NOLINT
|
||||
"integer type larger than unsigned long long");
|
||||
|
||||
OwnedRef ref;
|
||||
if (!PyLong_Check(obj)) {
|
||||
ARROW_ASSIGN_OR_RAISE(ref, PyObjectToPyInt(obj));
|
||||
obj = ref.obj();
|
||||
}
|
||||
|
||||
if (sizeof(Int) > sizeof(unsigned long)) { // NOLINT
|
||||
const auto value = PyLong_AsUnsignedLongLong(obj);
|
||||
if (ARROW_PREDICT_FALSE(value == static_cast<decltype(value)>(-1))) {
|
||||
RETURN_IF_PYERROR();
|
||||
}
|
||||
if (ARROW_PREDICT_FALSE(value > std::numeric_limits<Int>::max())) {
|
||||
return IntegerOverflowStatus(obj, overflow_message);
|
||||
}
|
||||
*out = static_cast<Int>(value);
|
||||
} else {
|
||||
const auto value = PyLong_AsUnsignedLong(obj);
|
||||
if (ARROW_PREDICT_FALSE(value == static_cast<decltype(value)>(-1))) {
|
||||
RETURN_IF_PYERROR();
|
||||
}
|
||||
if (ARROW_PREDICT_FALSE(value > std::numeric_limits<Int>::max())) {
|
||||
return IntegerOverflowStatus(obj, overflow_message);
|
||||
}
|
||||
*out = static_cast<Int>(value);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
template <typename Int>
|
||||
Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message) {
|
||||
if (PyBool_Check(obj)) {
|
||||
return Status::TypeError("Expected integer, got bool");
|
||||
}
|
||||
return CIntFromPythonImpl(obj, out, overflow_message);
|
||||
}
|
||||
|
||||
template Status CIntFromPython(PyObject*, int8_t*, const std::string&);
|
||||
template Status CIntFromPython(PyObject*, int16_t*, const std::string&);
|
||||
template Status CIntFromPython(PyObject*, int32_t*, const std::string&);
|
||||
template Status CIntFromPython(PyObject*, int64_t*, const std::string&);
|
||||
template Status CIntFromPython(PyObject*, uint8_t*, const std::string&);
|
||||
template Status CIntFromPython(PyObject*, uint16_t*, const std::string&);
|
||||
template Status CIntFromPython(PyObject*, uint32_t*, const std::string&);
|
||||
template Status CIntFromPython(PyObject*, uint64_t*, const std::string&);
|
||||
|
||||
// Quick pre-filter: returns false when the type of `obj` carries any of the
// fast-subclass flags below (int, list, tuple, bytes, str, dict, exception,
// type), none of which can hold a NaN value. Returns true otherwise.
inline bool MayHaveNaN(PyObject* obj) {
  const int64_t never_nan_flags =
      Py_TPFLAGS_LONG_SUBCLASS | Py_TPFLAGS_LIST_SUBCLASS | Py_TPFLAGS_TUPLE_SUBCLASS |
      Py_TPFLAGS_BYTES_SUBCLASS | Py_TPFLAGS_UNICODE_SUBCLASS |
      Py_TPFLAGS_DICT_SUBCLASS | Py_TPFLAGS_BASE_EXC_SUBCLASS |
      Py_TPFLAGS_TYPE_SUBCLASS;
  const bool is_never_nan_type = PyType_HasFeature(Py_TYPE(obj), never_nan_flags) != 0;
  return !is_never_nan_type;
}
|
||||
|
||||
// True when `obj` is a Python float whose value is NaN.
bool PyFloat_IsNaN(PyObject* obj) {
  if (!PyFloat_Check(obj)) {
    return false;
  }
  return std::isnan(PyFloat_AsDouble(obj));
}
|
||||
|
||||
namespace {
|
||||
|
||||
// This needs a conditional, because using std::once_flag could introduce
|
||||
// a deadlock when the GIL is enabled. See
|
||||
// https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272 for
|
||||
// more info.
|
||||
#ifdef Py_GIL_DISABLED
|
||||
static std::once_flag pandas_static_initialized;
|
||||
#else
|
||||
static bool pandas_static_initialized = false;
|
||||
#endif
|
||||
|
||||
// Once initialized, these variables hold borrowed references to Pandas static data.
|
||||
// We should not use OwnedRef here because Python destructors would be
|
||||
// called on a finalized interpreter.
|
||||
static PyObject* pandas_NA = nullptr;
|
||||
static PyObject* pandas_NaT = nullptr;
|
||||
static PyObject* pandas_Timedelta = nullptr;
|
||||
static PyObject* pandas_Timestamp = nullptr;
|
||||
static PyTypeObject* pandas_NaTType = nullptr;
|
||||
static PyObject* pandas_DateOffset = nullptr;
|
||||
|
||||
void GetPandasStaticSymbols() {
|
||||
OwnedRef pandas;
|
||||
|
||||
// Import pandas
|
||||
Status s = ImportModule("pandas", &pandas);
|
||||
if (!s.ok()) {
|
||||
return;
|
||||
}
|
||||
|
||||
#ifndef Py_GIL_DISABLED
|
||||
// Since ImportModule can release the GIL, another thread could have
|
||||
// already initialized the static data.
|
||||
if (pandas_static_initialized) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
OwnedRef ref;
|
||||
|
||||
// set NaT sentinel and its type
|
||||
if (ImportFromModule(pandas.obj(), "NaT", &ref).ok()) {
|
||||
pandas_NaT = ref.obj();
|
||||
// PyObject_Type returns a new reference but we trust that pandas.NaT will
|
||||
// outlive our use of this PyObject*
|
||||
pandas_NaTType = Py_TYPE(ref.obj());
|
||||
}
|
||||
|
||||
// retain a reference to Timedelta
|
||||
if (ImportFromModule(pandas.obj(), "Timedelta", &ref).ok()) {
|
||||
pandas_Timedelta = ref.obj();
|
||||
}
|
||||
|
||||
// retain a reference to Timestamp
|
||||
if (ImportFromModule(pandas.obj(), "Timestamp", &ref).ok()) {
|
||||
pandas_Timestamp = ref.obj();
|
||||
}
|
||||
|
||||
// if pandas.NA exists, retain a reference to it
|
||||
if (ImportFromModule(pandas.obj(), "NA", &ref).ok()) {
|
||||
pandas_NA = ref.obj();
|
||||
}
|
||||
|
||||
// Import DateOffset type
|
||||
if (ImportFromModule(pandas.obj(), "DateOffset", &ref).ok()) {
|
||||
pandas_DateOffset = ref.obj();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#ifdef Py_GIL_DISABLED
|
||||
void InitPandasStaticData() {
|
||||
std::call_once(pandas_static_initialized, GetPandasStaticSymbols);
|
||||
}
|
||||
#else
|
||||
void InitPandasStaticData() {
|
||||
// NOTE: This is called with the GIL held. We needn't (and shouldn't,
|
||||
// to avoid deadlocks) use an additional C++ lock (ARROW-10519).
|
||||
if (pandas_static_initialized) {
|
||||
return;
|
||||
}
|
||||
GetPandasStaticSymbols();
|
||||
pandas_static_initialized = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Null test using pandas semantics: None, float NaN, pandas.NA, instances of
// the pandas NaT type and NaN decimals are all treated as null. The pandas
// checks only apply once the corresponding static symbols have been loaded.
bool PandasObjectIsNull(PyObject* obj) {
  // Cheap rejection of core types that can never be null-like.
  if (!MayHaveNaN(obj)) {
    return false;
  }
  if (obj == Py_None) {
    return true;
  }
  if (PyFloat_IsNaN(obj)) {
    return true;
  }
  if (pandas_NA != nullptr && obj == pandas_NA) {
    return true;
  }
  if (pandas_NaTType != nullptr && PyObject_TypeCheck(obj, pandas_NaTType)) {
    return true;
  }
  return internal::PyDecimal_Check(obj) && internal::PyDecimal_ISNAN(obj);
}
|
||||
|
||||
// Return true only when `obj` is an instance of pandas.Timedelta.
//
// Returns false when the pandas symbols have not been loaded, and also when
// the instance check itself fails: PyObject_IsInstance reports errors as -1,
// which must not be mistaken for a positive answer.
bool IsPandasTimedelta(PyObject* obj) {
  if (pandas_Timedelta == nullptr) {
    return false;
  }
  const int is_instance = PyObject_IsInstance(obj, pandas_Timedelta);
  if (is_instance < 0) {
    // A bool return value cannot carry the error, so drop the pending
    // exception rather than leave it set for an unrelated later call.
    PyErr_Clear();
    return false;
  }
  return is_instance == 1;
}
|
||||
|
||||
// Return true only when `obj` is an instance of pandas.Timestamp.
//
// Returns false when the pandas symbols have not been loaded, and also when
// the instance check itself fails: PyObject_IsInstance reports errors as -1,
// which must not be mistaken for a positive answer.
bool IsPandasTimestamp(PyObject* obj) {
  if (pandas_Timestamp == nullptr) {
    return false;
  }
  const int is_instance = PyObject_IsInstance(obj, pandas_Timestamp);
  if (is_instance < 0) {
    // A bool return value cannot carry the error, so drop the pending
    // exception rather than leave it set for an unrelated later call.
    PyErr_Clear();
    return false;
  }
  return is_instance == 1;
}
|
||||
|
||||
// Borrowed pointer to the cached pandas DateOffset symbol; nullptr when it has
// not been loaded.
PyObject* BorrowPandasDataOffsetType() {
  return pandas_DateOffset;
}
|
||||
|
||||
// Build an Invalid status describing `obj` (its repr and type name) followed
// by the explanation `why`.
Status InvalidValue(PyObject* obj, const std::string& why) {
  std::string repr = PyObject_StdStringRepr(obj);
  const char* type_name = Py_TYPE(obj)->tp_name;
  return Status::Invalid("Could not convert ", std::move(repr), " with type ", type_name,
                         ": ", why);
}
|
||||
|
||||
// Build a TypeError status describing `obj` (its repr and type name) followed
// by the explanation `why`.
Status InvalidType(PyObject* obj, const std::string& why) {
  std::string repr = PyObject_StdStringRepr(obj);
  const char* type_name = Py_TYPE(obj)->tp_name;
  return Status::TypeError("Could not convert ", std::move(repr), " with type ",
                           type_name, ": ", why);
}
|
||||
|
||||
// Store an unsigned value that may be up to 64 bits wide into *out, returning
// Invalid instead of silently wrapping when it exceeds the int64 range.
static Status UnsignedToInt64Checked(uint64_t value, int64_t* out) {
  if (value > static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) {
    return Status::Invalid("Integer value ", value, " is too large to fit int64");
  }
  *out = static_cast<int64_t>(value);
  return Status::OK();
}

// Extract the value of a Python int or a NumPy integer scalar as int64_t.
//
// \param[in] obj a Python int or NumPy integer scalar
// \param[out] out receives the value on success
// \return Invalid if the value does not fit in int64_t or if `obj` is not a
// recognized integer scalar type
Status UnboxIntegerAsInt64(PyObject* obj, int64_t* out) {
  if (PyLong_Check(obj)) {
    int overflow = 0;
    *out = PyLong_AsLongLongAndOverflow(obj, &overflow);
    if (overflow) {
      return Status::Invalid("PyLong is too large to fit int64");
    }
  } else if (PyArray_IsScalar(obj, Byte)) {
    *out = reinterpret_cast<PyByteScalarObject*>(obj)->obval;
  } else if (PyArray_IsScalar(obj, UByte)) {
    *out = reinterpret_cast<PyUByteScalarObject*>(obj)->obval;
  } else if (PyArray_IsScalar(obj, Short)) {
    *out = reinterpret_cast<PyShortScalarObject*>(obj)->obval;
  } else if (PyArray_IsScalar(obj, UShort)) {
    *out = reinterpret_cast<PyUShortScalarObject*>(obj)->obval;
  } else if (PyArray_IsScalar(obj, Int)) {
    *out = reinterpret_cast<PyIntScalarObject*>(obj)->obval;
  } else if (PyArray_IsScalar(obj, UInt)) {
    *out = reinterpret_cast<PyUIntScalarObject*>(obj)->obval;
  } else if (PyArray_IsScalar(obj, Long)) {
    *out = reinterpret_cast<PyLongScalarObject*>(obj)->obval;
  } else if (PyArray_IsScalar(obj, ULong)) {
    // unsigned long can be 64 bits wide, so it needs the range check too.
    return UnsignedToInt64Checked(reinterpret_cast<PyULongScalarObject*>(obj)->obval,
                                  out);
  } else if (PyArray_IsScalar(obj, LongLong)) {
    *out = reinterpret_cast<PyLongLongScalarObject*>(obj)->obval;
  } else if (PyArray_IsScalar(obj, Int64)) {
    *out = reinterpret_cast<PyInt64ScalarObject*>(obj)->obval;
  } else if (PyArray_IsScalar(obj, ULongLong)) {
    return UnsignedToInt64Checked(
        reinterpret_cast<PyULongLongScalarObject*>(obj)->obval, out);
  } else if (PyArray_IsScalar(obj, UInt64)) {
    return UnsignedToInt64Checked(reinterpret_cast<PyUInt64ScalarObject*>(obj)->obval,
                                  out);
  } else {
    return Status::Invalid("Integer scalar type not recognized");
  }
  return Status::OK();
}
|
||||
|
||||
// Convert an integer scalar to double, failing when the integer lies outside
// [-2^53, 2^53], the range this function accepts as exactly representable.
Status IntegerScalarToDoubleSafe(PyObject* obj, double* out) {
  int64_t unboxed = 0;
  RETURN_NOT_OK(UnboxIntegerAsInt64(obj, &unboxed));

  constexpr int64_t kExactLimit = 1LL << 53;
  const bool in_exact_range = unboxed >= -kExactLimit && unboxed <= kExactLimit;
  if (!in_exact_range) {
    return Status::Invalid("Integer value ", unboxed, " is outside of the range exactly",
                           " representable by a IEEE 754 double precision value");
  }
  *out = static_cast<double>(unboxed);
  return Status::OK();
}
|
||||
|
||||
// Convert an integer scalar to float, failing when the integer lies outside
// [-2^24, 2^24], the range this function accepts as exactly representable.
Status IntegerScalarToFloat32Safe(PyObject* obj, float* out) {
  int64_t unboxed = 0;
  RETURN_NOT_OK(UnboxIntegerAsInt64(obj, &unboxed));

  constexpr int64_t kExactLimit = 1LL << 24;
  const bool in_exact_range = unboxed >= -kExactLimit && unboxed <= kExactLimit;
  if (!in_exact_range) {
    return Status::Invalid("Integer value ", unboxed, " is outside of the range exactly",
                           " representable by a IEEE 754 single precision value");
  }
  *out = static_cast<float>(unboxed);
  return Status::OK();
}
|
||||
|
||||
// Write the repr() of `obj`, followed by a newline, to Python's stderr.
void DebugPrint(PyObject* obj) {
  const std::string text = PyObject_StdStringRepr(obj);
  PySys_WriteStderr("%s\n", text.c_str());
}
|
||||
|
||||
// Report whether this build was compiled with ARROW_ENABLE_THREADING defined.
bool IsThreadingEnabled() {
#ifdef ARROW_ENABLE_THREADING
  constexpr bool kThreadingEnabled = true;
#else
  constexpr bool kThreadingEnabled = false;
#endif
  return kThreadingEnabled;
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,160 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/python/platform.h"
|
||||
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/python/numpy_interop.h"
|
||||
|
||||
#include "arrow/python/visibility.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace py {
|
||||
|
||||
class OwnedRef;
|
||||
|
||||
// \brief Get an arrow DataType instance from Arrow's Type::type enum
|
||||
// \param[in] type One of the values of Arrow's Type::type enum
|
||||
// \return A shared pointer to DataType
|
||||
ARROW_PYTHON_EXPORT std::shared_ptr<DataType> GetPrimitiveType(Type::type type);
|
||||
|
||||
// \brief Construct a Python float object from a half-float uint16_t value.
|
||||
ARROW_PYTHON_EXPORT PyObject* PyFloat_FromHalf(uint16_t value);
|
||||
|
||||
// \brief Convert a Python object to a half-float uint16_t value.
|
||||
ARROW_PYTHON_EXPORT Result<uint16_t> PyFloat_AsHalf(PyObject* obj);
|
||||
|
||||
namespace internal {
|
||||
|
||||
// \brief Check that a Python module has been already imported
|
||||
// \param[in] module_name The name of the module
|
||||
Result<bool> IsModuleImported(const std::string& module_name);
|
||||
|
||||
// \brief Import a Python module
|
||||
// \param[in] module_name The name of the module
|
||||
// \param[out] ref The OwnedRef containing the module PyObject*
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status ImportModule(const std::string& module_name, OwnedRef* ref);
|
||||
|
||||
// \brief Import an object from a Python module
|
||||
// \param[in] module A Python module
|
||||
// \param[in] name The name of the object to import
|
||||
// \param[out] ref The OwnedRef containing the \c name attribute of the Python module \c
|
||||
// module
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status ImportFromModule(PyObject* module, const std::string& name, OwnedRef* ref);
|
||||
|
||||
// \brief Check whether obj is an integer, independent of Python versions.
|
||||
inline bool IsPyInteger(PyObject* obj) { return PyLong_Check(obj); }
|
||||
|
||||
// \brief Import symbols from pandas that we need for various type-checking,
|
||||
// like pandas.NaT or pandas.NA
|
||||
void InitPandasStaticData();
|
||||
|
||||
// \brief Use pandas missing value semantics to check if a value is null
|
||||
ARROW_PYTHON_EXPORT
|
||||
bool PandasObjectIsNull(PyObject* obj);
|
||||
|
||||
// \brief Check that obj is a pandas.Timedelta instance
|
||||
ARROW_PYTHON_EXPORT
|
||||
bool IsPandasTimedelta(PyObject* obj);
|
||||
|
||||
// \brief Check that obj is a pandas.Timestamp instance
|
||||
bool IsPandasTimestamp(PyObject* obj);
|
||||
|
||||
// \brief Return a borrowed reference to the pandas.tseries.offsets.DateOffset type
|
||||
PyObject* BorrowPandasDataOffsetType();
|
||||
|
||||
// \brief Check whether obj is a floating-point NaN
|
||||
ARROW_PYTHON_EXPORT
|
||||
bool PyFloat_IsNaN(PyObject* obj);
|
||||
|
||||
// \brief Check whether obj is a bytes, bytearray or memoryview object
inline bool IsPyBinary(PyObject* obj) {
  if (PyBytes_Check(obj)) {
    return true;
  }
  if (PyByteArray_Check(obj)) {
    return true;
  }
  return PyMemoryView_Check(obj) != 0;
}
|
||||
|
||||
// \brief Convert a Python integer into a C integer
|
||||
// \param[in] obj A Python integer
|
||||
// \param[out] out A pointer to a C integer to hold the result of the conversion
|
||||
// \return The status of the operation
|
||||
template <typename Int>
|
||||
Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message = "");
|
||||
|
||||
// \brief Convert a Python unicode string to a std::string
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status PyUnicode_AsStdString(PyObject* obj, std::string* out);
|
||||
|
||||
// \brief Convert a Python bytes object to a std::string
|
||||
ARROW_PYTHON_EXPORT
|
||||
std::string PyBytes_AsStdString(PyObject* obj);
|
||||
|
||||
// \brief Call str() on the given object and return the result as a std::string
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status PyObject_StdStringStr(PyObject* obj, std::string* out);
|
||||
|
||||
// \brief Return the repr() of the given object (always succeeds)
|
||||
ARROW_PYTHON_EXPORT
|
||||
std::string PyObject_StdStringRepr(PyObject* obj);
|
||||
|
||||
// \brief Cast the given size to int32_t, with error checking
|
||||
inline Status CastSize(Py_ssize_t size, int32_t* out,
|
||||
const char* error_msg = "Maximum size exceeded (2GB)") {
|
||||
// size is assumed to be positive
|
||||
if (size > std::numeric_limits<int32_t>::max()) {
|
||||
return Status::Invalid(error_msg);
|
||||
}
|
||||
*out = static_cast<int32_t>(size);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// \brief Cast the given size to int64_t. The conversion always succeeds, so
// error_msg is unused; size is assumed to be non-negative.
inline Status CastSize(Py_ssize_t size, int64_t* out, const char* error_msg = NULLPTR) {
  const int64_t widened = static_cast<int64_t>(size);
  *out = widened;
  return Status::OK();
}
|
||||
|
||||
// \brief Build an Invalid status whose message contains the Python object's
// repr(), its type name and the passed error message
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status InvalidValue(PyObject* obj, const std::string& why);
|
||||
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status InvalidType(PyObject* obj, const std::string& why);
|
||||
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status IntegerScalarToDoubleSafe(PyObject* obj, double* result);
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status IntegerScalarToFloat32Safe(PyObject* obj, float* result);
|
||||
|
||||
// \brief Print Python object __repr__
|
||||
void DebugPrint(PyObject* obj);
|
||||
|
||||
ARROW_PYTHON_EXPORT
|
||||
bool IsThreadingEnabled();
|
||||
|
||||
} // namespace internal
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,745 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "arrow/python/inference.h"
|
||||
#include "arrow/python/numpy_interop.h"
|
||||
|
||||
#include <datetime.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/scalar.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/decimal.h"
|
||||
#include "arrow/util/logging.h"
|
||||
|
||||
#include "arrow/python/datetime.h"
|
||||
#include "arrow/python/decimal.h"
|
||||
#include "arrow/python/helpers.h"
|
||||
#include "arrow/python/iterators.h"
|
||||
#include "arrow/python/numpy_convert.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
namespace {
|
||||
// Assigns a tuple to interval_types_tuple containing the nametuple for
|
||||
// MonthDayNanoIntervalType and if present dateutil's relativedelta and
|
||||
// pandas DateOffset.
|
||||
Status ImportPresentIntervalTypes(OwnedRefNoGIL* interval_types_tuple) {
|
||||
OwnedRef relative_delta_module;
|
||||
// These are Optional imports so swallow errors.
|
||||
OwnedRef relative_delta_type;
|
||||
// Try to import pandas to get types.
|
||||
internal::InitPandasStaticData();
|
||||
if (internal::ImportModule("dateutil.relativedelta", &relative_delta_module).ok()) {
|
||||
RETURN_NOT_OK(internal::ImportFromModule(relative_delta_module.obj(), "relativedelta",
|
||||
&relative_delta_type));
|
||||
}
|
||||
|
||||
PyObject* date_offset_type = internal::BorrowPandasDataOffsetType();
|
||||
interval_types_tuple->reset(
|
||||
PyTuple_New(1 + (date_offset_type != nullptr ? 1 : 0) +
|
||||
(relative_delta_type.obj() != nullptr ? 1 : 0)));
|
||||
RETURN_IF_PYERROR();
|
||||
int index = 0;
|
||||
PyTuple_SetItem(interval_types_tuple->obj(), index++,
|
||||
internal::NewMonthDayNanoTupleType());
|
||||
RETURN_IF_PYERROR();
|
||||
if (date_offset_type != nullptr) {
|
||||
Py_XINCREF(date_offset_type);
|
||||
PyTuple_SetItem(interval_types_tuple->obj(), index++, date_offset_type);
|
||||
RETURN_IF_PYERROR();
|
||||
}
|
||||
if (relative_delta_type.obj() != nullptr) {
|
||||
PyTuple_SetItem(interval_types_tuple->obj(), index++, relative_delta_type.detach());
|
||||
RETURN_IF_PYERROR();
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#define _NUMPY_UNIFY_NOOP(DTYPE) \
|
||||
case NPY_##DTYPE: \
|
||||
return OK;
|
||||
|
||||
#define _NUMPY_UNIFY_PROMOTE(DTYPE) \
|
||||
case NPY_##DTYPE: \
|
||||
current_type_num_ = dtype; \
|
||||
current_dtype_ = descr; \
|
||||
return OK;
|
||||
|
||||
#define _NUMPY_UNIFY_PROMOTE_TO(DTYPE, NEW_TYPE) \
|
||||
case NPY_##DTYPE: \
|
||||
current_type_num_ = NPY_##NEW_TYPE; \
|
||||
current_dtype_ = PyArray_DescrFromType(current_type_num_); \
|
||||
return OK;
|
||||
|
||||
// Form a consensus NumPy dtype to use for Arrow conversion for a
|
||||
// collection of dtype objects observed one at a time
|
||||
class NumPyDtypeUnifier {
|
||||
public:
|
||||
enum Action { OK, INVALID };
|
||||
|
||||
NumPyDtypeUnifier() : current_type_num_(-1), current_dtype_(nullptr) {}
|
||||
|
||||
Status InvalidMix(int new_dtype) {
|
||||
return Status::Invalid("Cannot mix NumPy dtypes ",
|
||||
GetNumPyTypeName(current_type_num_), " and ",
|
||||
GetNumPyTypeName(new_dtype));
|
||||
}
|
||||
|
||||
int Observe_BOOL(PyArray_Descr* descr, int dtype) { return INVALID; }
|
||||
|
||||
int Observe_INT8(PyArray_Descr* descr, int dtype) {
|
||||
switch (dtype) {
|
||||
_NUMPY_UNIFY_PROMOTE(INT16);
|
||||
_NUMPY_UNIFY_PROMOTE(INT32);
|
||||
_NUMPY_UNIFY_PROMOTE(INT64);
|
||||
_NUMPY_UNIFY_PROMOTE(FLOAT32);
|
||||
_NUMPY_UNIFY_PROMOTE(FLOAT64);
|
||||
default:
|
||||
return INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
int Observe_INT16(PyArray_Descr* descr, int dtype) {
|
||||
switch (dtype) {
|
||||
_NUMPY_UNIFY_NOOP(INT8);
|
||||
_NUMPY_UNIFY_PROMOTE(INT32);
|
||||
_NUMPY_UNIFY_PROMOTE(INT64);
|
||||
_NUMPY_UNIFY_NOOP(UINT8);
|
||||
_NUMPY_UNIFY_PROMOTE(FLOAT32);
|
||||
_NUMPY_UNIFY_PROMOTE(FLOAT64);
|
||||
default:
|
||||
return INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
int Observe_INT32(PyArray_Descr* descr, int dtype) {
|
||||
switch (dtype) {
|
||||
_NUMPY_UNIFY_NOOP(INT8);
|
||||
_NUMPY_UNIFY_NOOP(INT16);
|
||||
_NUMPY_UNIFY_PROMOTE(INT32);
|
||||
_NUMPY_UNIFY_PROMOTE(INT64);
|
||||
_NUMPY_UNIFY_NOOP(UINT8);
|
||||
_NUMPY_UNIFY_NOOP(UINT16);
|
||||
_NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
|
||||
_NUMPY_UNIFY_PROMOTE(FLOAT64);
|
||||
default:
|
||||
return INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
int Observe_INT64(PyArray_Descr* descr, int dtype) {
|
||||
switch (dtype) {
|
||||
_NUMPY_UNIFY_NOOP(INT8);
|
||||
_NUMPY_UNIFY_NOOP(INT16);
|
||||
_NUMPY_UNIFY_NOOP(INT32);
|
||||
_NUMPY_UNIFY_NOOP(INT64);
|
||||
_NUMPY_UNIFY_NOOP(UINT8);
|
||||
_NUMPY_UNIFY_NOOP(UINT16);
|
||||
_NUMPY_UNIFY_NOOP(UINT32);
|
||||
_NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
|
||||
_NUMPY_UNIFY_PROMOTE(FLOAT64);
|
||||
default:
|
||||
return INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
int Observe_UINT8(PyArray_Descr* descr, int dtype) {
|
||||
switch (dtype) {
|
||||
_NUMPY_UNIFY_PROMOTE(UINT16);
|
||||
_NUMPY_UNIFY_PROMOTE(UINT32);
|
||||
_NUMPY_UNIFY_PROMOTE(UINT64);
|
||||
_NUMPY_UNIFY_PROMOTE(FLOAT32);
|
||||
_NUMPY_UNIFY_PROMOTE(FLOAT64);
|
||||
default:
|
||||
return INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
int Observe_UINT16(PyArray_Descr* descr, int dtype) {
|
||||
switch (dtype) {
|
||||
_NUMPY_UNIFY_NOOP(UINT8);
|
||||
_NUMPY_UNIFY_PROMOTE(UINT32);
|
||||
_NUMPY_UNIFY_PROMOTE(UINT64);
|
||||
_NUMPY_UNIFY_PROMOTE(FLOAT32);
|
||||
_NUMPY_UNIFY_PROMOTE(FLOAT64);
|
||||
default:
|
||||
return INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
int Observe_UINT32(PyArray_Descr* descr, int dtype) {
|
||||
switch (dtype) {
|
||||
_NUMPY_UNIFY_NOOP(UINT8);
|
||||
_NUMPY_UNIFY_NOOP(UINT16);
|
||||
_NUMPY_UNIFY_PROMOTE(UINT64);
|
||||
_NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
|
||||
_NUMPY_UNIFY_PROMOTE(FLOAT64);
|
||||
default:
|
||||
return INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
int Observe_UINT64(PyArray_Descr* descr, int dtype) {
|
||||
switch (dtype) {
|
||||
_NUMPY_UNIFY_NOOP(UINT8);
|
||||
_NUMPY_UNIFY_NOOP(UINT16);
|
||||
_NUMPY_UNIFY_NOOP(UINT32);
|
||||
_NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
|
||||
_NUMPY_UNIFY_PROMOTE(FLOAT64);
|
||||
default:
|
||||
return INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
int Observe_FLOAT16(PyArray_Descr* descr, int dtype) {
|
||||
switch (dtype) {
|
||||
_NUMPY_UNIFY_PROMOTE(FLOAT32);
|
||||
_NUMPY_UNIFY_PROMOTE(FLOAT64);
|
||||
default:
|
||||
return INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
int Observe_FLOAT32(PyArray_Descr* descr, int dtype) {
|
||||
switch (dtype) {
|
||||
_NUMPY_UNIFY_NOOP(INT8);
|
||||
_NUMPY_UNIFY_NOOP(INT16);
|
||||
_NUMPY_UNIFY_NOOP(INT32);
|
||||
_NUMPY_UNIFY_NOOP(INT64);
|
||||
_NUMPY_UNIFY_NOOP(UINT8);
|
||||
_NUMPY_UNIFY_NOOP(UINT16);
|
||||
_NUMPY_UNIFY_NOOP(UINT32);
|
||||
_NUMPY_UNIFY_NOOP(UINT64);
|
||||
_NUMPY_UNIFY_PROMOTE(FLOAT64);
|
||||
default:
|
||||
return INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
int Observe_FLOAT64(PyArray_Descr* descr, int dtype) {
|
||||
switch (dtype) {
|
||||
_NUMPY_UNIFY_NOOP(INT8);
|
||||
_NUMPY_UNIFY_NOOP(INT16);
|
||||
_NUMPY_UNIFY_NOOP(INT32);
|
||||
_NUMPY_UNIFY_NOOP(INT64);
|
||||
_NUMPY_UNIFY_NOOP(UINT8);
|
||||
_NUMPY_UNIFY_NOOP(UINT16);
|
||||
_NUMPY_UNIFY_NOOP(UINT32);
|
||||
_NUMPY_UNIFY_NOOP(UINT64);
|
||||
default:
|
||||
return INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
int Observe_DATETIME(PyArray_Descr* dtype_obj) {
|
||||
// TODO: check that units are all the same
|
||||
return OK;
|
||||
}
|
||||
|
||||
Status Observe(PyArray_Descr* descr) {
|
||||
int dtype = fix_numpy_type_num(descr->type_num);
|
||||
|
||||
if (current_type_num_ == -1) {
|
||||
current_dtype_ = descr;
|
||||
current_type_num_ = dtype;
|
||||
return Status::OK();
|
||||
} else if (current_type_num_ == dtype) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
#define OBSERVE_CASE(DTYPE) \
|
||||
case NPY_##DTYPE: \
|
||||
action = Observe_##DTYPE(descr, dtype); \
|
||||
break;
|
||||
|
||||
int action = OK;
|
||||
switch (current_type_num_) {
|
||||
OBSERVE_CASE(BOOL);
|
||||
OBSERVE_CASE(INT8);
|
||||
OBSERVE_CASE(INT16);
|
||||
OBSERVE_CASE(INT32);
|
||||
OBSERVE_CASE(INT64);
|
||||
OBSERVE_CASE(UINT8);
|
||||
OBSERVE_CASE(UINT16);
|
||||
OBSERVE_CASE(UINT32);
|
||||
OBSERVE_CASE(UINT64);
|
||||
OBSERVE_CASE(FLOAT16);
|
||||
OBSERVE_CASE(FLOAT32);
|
||||
OBSERVE_CASE(FLOAT64);
|
||||
case NPY_DATETIME:
|
||||
action = Observe_DATETIME(descr);
|
||||
break;
|
||||
default:
|
||||
return Status::NotImplemented("Unsupported numpy type ", GetNumPyTypeName(dtype));
|
||||
}
|
||||
|
||||
if (action == INVALID) {
|
||||
return InvalidMix(dtype);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
bool dtype_was_observed() const { return current_type_num_ != -1; }
|
||||
|
||||
PyArray_Descr* current_dtype() const { return current_dtype_; }
|
||||
|
||||
int current_type_num() const { return current_type_num_; }
|
||||
|
||||
private:
|
||||
int current_type_num_;
|
||||
PyArray_Descr* current_dtype_;
|
||||
};
|
||||
|
||||
class TypeInferrer {
|
||||
// A type inference visitor for Python values
|
||||
public:
|
||||
// \param validate_interval the number of elements to observe before checking
|
||||
// whether the data is mixed type or has other problems. This helps avoid
|
||||
// excess computation for each element while also making sure we "bail out"
|
||||
// early with long sequences that may have problems up front
|
||||
// \param make_unions permit mixed-type data by creating union types (not yet
|
||||
// implemented)
|
||||
explicit TypeInferrer(bool pandas_null_sentinels = false,
|
||||
int64_t validate_interval = 100, bool make_unions = false)
|
||||
: pandas_null_sentinels_(pandas_null_sentinels),
|
||||
validate_interval_(validate_interval),
|
||||
make_unions_(make_unions),
|
||||
total_count_(0),
|
||||
none_count_(0),
|
||||
bool_count_(0),
|
||||
int_count_(0),
|
||||
date_count_(0),
|
||||
time_count_(0),
|
||||
timestamp_micro_count_(0),
|
||||
duration_count_(0),
|
||||
float_count_(0),
|
||||
binary_count_(0),
|
||||
unicode_count_(0),
|
||||
decimal_count_(0),
|
||||
list_count_(0),
|
||||
struct_count_(0),
|
||||
arrow_scalar_count_(0),
|
||||
numpy_dtype_count_(0),
|
||||
interval_count_(0),
|
||||
max_decimal_metadata_(std::numeric_limits<int32_t>::min(),
|
||||
std::numeric_limits<int32_t>::min()),
|
||||
decimal_type_() {
|
||||
ARROW_CHECK_OK(internal::ImportDecimalType(&decimal_type_));
|
||||
ARROW_CHECK_OK(ImportPresentIntervalTypes(&interval_types_));
|
||||
}
|
||||
|
||||
/// \param[in] obj a Python object in the sequence
|
||||
/// \param[out] keep_going if sufficient information has been gathered to
|
||||
/// attempt to begin converting the sequence, *keep_going will be set to true
|
||||
/// to signal to the calling visitor loop to terminate
|
||||
Status Visit(PyObject* obj, bool* keep_going) {
|
||||
++total_count_;
|
||||
|
||||
if (obj == Py_None || (pandas_null_sentinels_ && internal::PandasObjectIsNull(obj))) {
|
||||
++none_count_;
|
||||
} else if (PyBool_Check(obj)) {
|
||||
++bool_count_;
|
||||
*keep_going = make_unions_;
|
||||
} else if (PyFloat_Check(obj)) {
|
||||
++float_count_;
|
||||
*keep_going = make_unions_;
|
||||
} else if (internal::IsPyInteger(obj)) {
|
||||
++int_count_;
|
||||
} else if (PyDateTime_Check(obj)) {
|
||||
// infer timezone from the first encountered datetime object
|
||||
if (!timestamp_micro_count_) {
|
||||
OwnedRef tzinfo(PyObject_GetAttrString(obj, "tzinfo"));
|
||||
if (tzinfo.obj() != nullptr && tzinfo.obj() != Py_None) {
|
||||
ARROW_ASSIGN_OR_RAISE(timezone_, internal::TzinfoToString(tzinfo.obj()));
|
||||
}
|
||||
}
|
||||
++timestamp_micro_count_;
|
||||
*keep_going = make_unions_;
|
||||
} else if (PyDelta_Check(obj)) {
|
||||
++duration_count_;
|
||||
*keep_going = make_unions_;
|
||||
} else if (PyDate_Check(obj)) {
|
||||
++date_count_;
|
||||
*keep_going = make_unions_;
|
||||
} else if (PyTime_Check(obj)) {
|
||||
++time_count_;
|
||||
*keep_going = make_unions_;
|
||||
} else if (internal::IsPyBinary(obj)) {
|
||||
++binary_count_;
|
||||
*keep_going = make_unions_;
|
||||
} else if (PyUnicode_Check(obj)) {
|
||||
++unicode_count_;
|
||||
*keep_going = make_unions_;
|
||||
} else if (arrow::py::is_scalar(obj)) {
|
||||
RETURN_NOT_OK(VisitArrowScalar(obj, keep_going));
|
||||
} else if (has_numpy() && PyArray_CheckAnyScalarExact(obj)) {
|
||||
RETURN_NOT_OK(VisitDType(PyArray_DescrFromScalar(obj), keep_going));
|
||||
} else if (PySet_Check(obj) || (Py_TYPE(obj) == &PyDictValues_Type)) {
|
||||
RETURN_NOT_OK(VisitSet(obj, keep_going));
|
||||
} else if (has_numpy() && PyArray_Check(obj)) {
|
||||
RETURN_NOT_OK(VisitNdarray(obj, keep_going));
|
||||
} else if (PyDict_Check(obj)) {
|
||||
RETURN_NOT_OK(VisitDict(obj));
|
||||
} else if (PyList_Check(obj) ||
|
||||
(PyTuple_Check(obj) &&
|
||||
!PyObject_IsInstance(obj, PyTuple_GetItem(interval_types_.obj(), 0)))) {
|
||||
RETURN_NOT_OK(VisitList(obj, keep_going));
|
||||
} else if (PyObject_IsInstance(obj, decimal_type_.obj())) {
|
||||
RETURN_NOT_OK(max_decimal_metadata_.Update(obj));
|
||||
++decimal_count_;
|
||||
} else if (PyObject_IsInstance(obj, interval_types_.obj())) {
|
||||
++interval_count_;
|
||||
} else {
|
||||
return internal::InvalidValue(obj,
|
||||
"did not recognize Python value type when inferring "
|
||||
"an Arrow data type");
|
||||
}
|
||||
|
||||
if (total_count_ % validate_interval_ == 0) {
|
||||
RETURN_NOT_OK(Validate());
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Infer value type from a sequence of values
|
||||
Status VisitSequence(PyObject* obj, PyObject* mask = nullptr) {
|
||||
if (mask == nullptr || mask == Py_None) {
|
||||
return internal::VisitSequence(
|
||||
obj, /*offset=*/0,
|
||||
[this](PyObject* value, bool* keep_going) { return Visit(value, keep_going); });
|
||||
} else {
|
||||
return internal::VisitSequenceMasked(
|
||||
obj, mask, /*offset=*/0,
|
||||
[this](PyObject* value, uint8_t masked, bool* keep_going) {
|
||||
if (!masked) {
|
||||
return Visit(value, keep_going);
|
||||
} else {
|
||||
return Status::OK();
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Infer value type from a sequence of values
|
||||
Status VisitIterable(PyObject* obj) {
|
||||
return internal::VisitIterable(obj, [this](PyObject* value, bool* keep_going) {
|
||||
return Visit(value, keep_going);
|
||||
});
|
||||
}
|
||||
|
||||
// Resolve the counters accumulated while visiting into a single Arrow type.
//
// Fails with NotImplemented when make_unions_ is set, with whatever
// Validate() reports, and with Invalid when pyarrow scalars or
// numpy.datetime64 scalars are mixed with other kinds of values. When several
// counters are non-zero, the first match in the chain at the end wins; when
// none is set the result is null().
Status GetType(std::shared_ptr<DataType>* out) {
  // TODO(wesm): handling forming unions
  if (make_unions_) {
    return Status::NotImplemented("Creating union types not yet supported");
  }

  RETURN_NOT_OK(Validate());

  const bool only_arrow_scalars_and_nulls =
      (arrow_scalar_count_ + none_count_ == total_count_);
  if (arrow_scalar_count_ > 0 && !only_arrow_scalars_and_nulls) {
    return Status::Invalid(
        "pyarrow scalars cannot be mixed "
        "with other Python scalar values currently");
  }

  if (numpy_dtype_count_ > 0) {
    // All NumPy scalars and Nones/nulls
    const bool only_numpy_and_nulls =
        (numpy_dtype_count_ + none_count_ == total_count_);
    if (only_numpy_and_nulls) {
      return NumPyDtypeToArrow(numpy_unifier_.current_dtype()).Value(out);
    }

    // The "bad path": data contains a mix of NumPy scalars and
    // other kinds of scalars. Note this can happen innocuously
    // because numpy.nan is not a NumPy scalar (it's a built-in
    // PyFloat)

    // TODO(ARROW-5564): Merge together type unification so this
    // hack is not necessary
    switch (numpy_unifier_.current_type_num()) {
      case NPY_BOOL:
        bool_count_ += numpy_dtype_count_;
        break;
      case NPY_INT8:
      case NPY_INT16:
      case NPY_INT32:
      case NPY_INT64:
      case NPY_UINT8:
      case NPY_UINT16:
      case NPY_UINT32:
      case NPY_UINT64:
        int_count_ += numpy_dtype_count_;
        break;
      case NPY_FLOAT32:
      case NPY_FLOAT64:
        float_count_ += numpy_dtype_count_;
        break;
      case NPY_DATETIME:
        return Status::Invalid(
            "numpy.datetime64 scalars cannot be mixed "
            "with other Python scalar values currently");
      default:
        // Any other type number is folded into no counter.
        break;
    }
  }

  // Nested and decimal types need extra work, so they are resolved first.
  if (list_count_) {
    std::shared_ptr<DataType> value_type;
    RETURN_NOT_OK(list_inferrer_->GetType(&value_type));
    *out = list(value_type);
    return Status::OK();
  }
  if (struct_count_) {
    RETURN_NOT_OK(GetStructType(out));
    return Status::OK();
  }
  if (decimal_count_) {
    const auto precision = max_decimal_metadata_.precision();
    const auto scale = max_decimal_metadata_.scale();
    // the default constructor does not validate the precision and scale,
    // hence the checked Make() factories
    if (precision > Decimal128Type::kMaxPrecision) {
      ARROW_ASSIGN_OR_RAISE(*out, Decimal256Type::Make(precision, scale));
    } else {
      ARROW_ASSIGN_OR_RAISE(*out, Decimal128Type::Make(precision, scale));
    }
    return Status::OK();
  }

  // Remaining scalar kinds, in priority order.
  if (float_count_) {
    // Prioritize floats before integers
    *out = float64();
  } else if (int_count_) {
    *out = int64();
  } else if (date_count_) {
    *out = date32();
  } else if (time_count_) {
    *out = time64(TimeUnit::MICRO);
  } else if (timestamp_micro_count_) {
    *out = timestamp(TimeUnit::MICRO, timezone_);
  } else if (duration_count_) {
    *out = duration(TimeUnit::MICRO);
  } else if (bool_count_) {
    *out = boolean();
  } else if (binary_count_) {
    *out = binary();
  } else if (unicode_count_) {
    *out = utf8();
  } else if (interval_count_) {
    *out = month_day_nano_interval();
  } else if (arrow_scalar_count_) {
    *out = scalar_type_;
  } else {
    *out = null();
  }
  return Status::OK();
}
|
||||
|
||||
// Read-only access to the total_count_ counter.
int64_t total_count() const {
  return total_count_;
}
|
||||
|
||||
protected:
|
||||
// Check that nested values were not mixed with other non-null values.
//
// Lists may only coexist with nulls, and likewise structs; the check then
// recurses into the child inferrer(s). Returns Invalid on a mix.
Status Validate() const {
  if (list_count_ > 0) {
    const bool only_lists_and_nulls = (list_count_ + none_count_ == total_count_);
    if (!only_lists_and_nulls) {
      return Status::Invalid("cannot mix list and non-list, non-null values");
    }
    RETURN_NOT_OK(list_inferrer_->Validate());
    return Status::OK();
  }

  if (struct_count_ > 0) {
    const bool only_structs_and_nulls = (struct_count_ + none_count_ == total_count_);
    if (!only_structs_and_nulls) {
      return Status::Invalid("cannot mix struct and non-struct, non-null values");
    }
    for (const auto& entry : struct_inferrers_) {
      RETURN_NOT_OK(entry.second.Validate());
    }
  }
  return Status::OK();
}
|
||||
|
||||
// Record one pyarrow scalar; every such scalar must share the same type.
//
// Returns an invalid-value error when the scalar's type differs from the one
// recorded from earlier scalars, or the unwrap error if obj cannot be
// unwrapped.
Status VisitArrowScalar(PyObject* obj, bool* keep_going /* unused */) {
  ARROW_ASSIGN_OR_RAISE(auto unwrapped, arrow::py::unwrap_scalar(obj));
  // scalar_type_ is only meaningful once a first scalar has been recorded,
  // so the comparison is skipped for the first one.
  if (arrow_scalar_count_ > 0) {
    if (*unwrapped->type != *scalar_type_) {
      return internal::InvalidValue(obj, "cannot mix scalars with different types");
    }
  }
  ++arrow_scalar_count_;
  scalar_type_ = unwrapped->type;
  return Status::OK();
}
|
||||
|
||||
// Count one NumPy-typed value and feed its dtype to the dtype unifier.
Status VisitDType(PyArray_Descr* dtype, bool* keep_going) {
  // Continue visiting dtypes for now.
  // TODO(wesm): devise approach for unions
  *keep_going = true;
  ++numpy_dtype_count_;
  return numpy_unifier_.Observe(dtype);
}
|
||||
|
||||
// Count obj as a list value and infer its element type from its items,
// treating obj as a sequence.
Status VisitList(PyObject* obj, bool* keep_going /* unused */) {
  // The child inferrer is created lazily and inherits this inferrer's options.
  if (list_inferrer_ == nullptr) {
    list_inferrer_ = std::make_unique<TypeInferrer>(pandas_null_sentinels_,
                                                    validate_interval_, make_unions_);
  }
  ++list_count_;
  return list_inferrer_->VisitSequence(obj);
}
|
||||
|
||||
// Count obj as a list value and infer its element type from its items,
// walking obj as a generic iterable rather than a sequence.
Status VisitSet(PyObject* obj, bool* keep_going /* unused */) {
  // The child inferrer is created lazily and inherits this inferrer's options.
  if (list_inferrer_ == nullptr) {
    list_inferrer_ = std::make_unique<TypeInferrer>(pandas_null_sentinels_,
                                                    validate_interval_, make_unions_);
  }
  ++list_count_;
  return list_inferrer_->VisitIterable(obj);
}
|
||||
|
||||
// Count a NumPy array as a list value.
//
// Object arrays are walked element by element like a list; for any other
// dtype the child type is inferred from the dtype alone.
Status VisitNdarray(PyObject* obj, bool* keep_going) {
  PyArray_Descr* dtype = PyArray_DESCR(reinterpret_cast<PyArrayObject*>(obj));
  const bool holds_objects = (dtype->type_num == NPY_OBJECT);
  if (holds_objects) {
    return VisitList(obj, keep_going);
  }

  // Not an object array: infer child Arrow type from dtype
  if (list_inferrer_ == nullptr) {
    list_inferrer_ = std::make_unique<TypeInferrer>(pandas_null_sentinels_,
                                                    validate_interval_, make_unions_);
  }
  ++list_count_;

  // XXX(wesm): In ARROW-4324 I added accounting to check whether
  // all of the non-null values have NumPy dtypes, but the
  // total_count is not otherwise incremented on this path, so bump the
  // child's counter by hand before recording the dtype.
  TypeInferrer& child = *list_inferrer_;
  ++child.total_count_;
  return child.VisitDType(dtype, keep_going);
}
|
||||
|
||||
// Count obj as a struct value and visit each of its entries.
//
// Keys must be str or bytes (TypeError otherwise). Each distinct key gets its
// own child inferrer, which observes the corresponding value.
Status VisitDict(PyObject* obj) {
  PyObject* py_key = nullptr;
  PyObject* py_value = nullptr;
  Py_ssize_t cursor = 0;

  while (PyDict_Next(obj, &cursor, &py_key, &py_value)) {
    std::string field_name;
    if (PyUnicode_Check(py_key)) {
      RETURN_NOT_OK(internal::PyUnicode_AsStdString(py_key, &field_name));
    } else if (PyBytes_Check(py_key)) {
      field_name = internal::PyBytes_AsStdString(py_key);
    } else {
      return Status::TypeError("Expected dict key of type str or bytes, got '",
                               Py_TYPE(py_key)->tp_name, "'");
    }

    // Get or create visitor for this key
    auto entry = struct_inferrers_.find(field_name);
    if (entry == struct_inferrers_.end()) {
      entry = struct_inferrers_
                  .emplace(field_name, TypeInferrer(pandas_null_sentinels_,
                                                    validate_interval_, make_unions_))
                  .first;
    }

    // We ignore termination signals from child visitors for now
    //
    // TODO(wesm): keep track of whether type inference has terminated for
    // the child visitors to avoid doing unneeded work
    bool child_keep_going = true;
    RETURN_NOT_OK(entry->second.Visit(py_value, &child_keep_going));
  }

  // We do not terminate visiting dicts since we want the union of all
  // observed keys
  ++struct_count_;
  return Status::OK();
}
|
||||
|
||||
// Build a struct type with one field per observed dict key.
//
// Fields follow the iteration order of struct_inferrers_ (a std::map keyed by
// field name); each field's type comes from its child inferrer.
Status GetStructType(std::shared_ptr<DataType>* out) {
  std::vector<std::shared_ptr<Field>> struct_fields;
  for (auto& entry : struct_inferrers_) {
    std::shared_ptr<DataType> inferred;
    RETURN_NOT_OK(entry.second.GetType(&inferred));
    struct_fields.push_back(field(entry.first, inferred));
  }
  *out = struct_(struct_fields);
  return Status::OK();
}
|
||||
|
||||
private:
|
||||
bool pandas_null_sentinels_;
|
||||
int64_t validate_interval_;
|
||||
bool make_unions_;
|
||||
int64_t total_count_;
|
||||
int64_t none_count_;
|
||||
int64_t bool_count_;
|
||||
int64_t int_count_;
|
||||
int64_t date_count_;
|
||||
int64_t time_count_;
|
||||
int64_t timestamp_micro_count_;
|
||||
std::string timezone_;
|
||||
int64_t duration_count_;
|
||||
int64_t float_count_;
|
||||
int64_t binary_count_;
|
||||
int64_t unicode_count_;
|
||||
int64_t decimal_count_;
|
||||
int64_t list_count_;
|
||||
int64_t struct_count_;
|
||||
int64_t arrow_scalar_count_;
|
||||
int64_t numpy_dtype_count_;
|
||||
int64_t interval_count_;
|
||||
std::unique_ptr<TypeInferrer> list_inferrer_;
|
||||
std::map<std::string, TypeInferrer> struct_inferrers_;
|
||||
std::shared_ptr<DataType> scalar_type_;
|
||||
|
||||
// If we observe a strongly-typed value in e.g. a NumPy array, we can store
|
||||
// it here to skip the type counting logic above
|
||||
NumPyDtypeUnifier numpy_unifier_;
|
||||
|
||||
internal::DecimalMetadata max_decimal_metadata_;
|
||||
|
||||
OwnedRefNoGIL decimal_type_;
|
||||
OwnedRefNoGIL interval_types_;
|
||||
};
|
||||
|
||||
// Non-exhaustive type inference
|
||||
Result<std::shared_ptr<DataType>> InferArrowType(PyObject* obj, PyObject* mask,
                                                 bool pandas_null_sentinels) {
  // ARROW-842: If pandas is not installed then null checks will be less
  // comprehensive, but that is okay.
  if (pandas_null_sentinels) {
    internal::InitPandasStaticData();
  }

  TypeInferrer inferrer(pandas_null_sentinels);
  RETURN_NOT_OK(inferrer.VisitSequence(obj, mask));

  std::shared_ptr<DataType> inferred;
  RETURN_NOT_OK(inferrer.GetType(&inferred));
  if (!inferred) {
    return Status::TypeError("Unable to determine data type");
  }
  // The return type is Result<>, not shared_ptr, so the move is a real
  // conversion rather than a copy-elision pessimization.
  return std::move(inferred);
}
|
||||
|
||||
// Exported wrapper around internal::PyBoolScalar_Check.
ARROW_PYTHON_EXPORT
bool IsPyBool(PyObject* obj) {
  const bool is_bool = internal::PyBoolScalar_Check(obj);
  return is_bool;
}
|
||||
|
||||
// Exported wrapper around internal::PyIntScalar_Check.
ARROW_PYTHON_EXPORT
bool IsPyInt(PyObject* obj) {
  const bool is_int = internal::PyIntScalar_Check(obj);
  return is_int;
}
|
||||
|
||||
// Exported wrapper around internal::PyFloatScalar_Check.
ARROW_PYTHON_EXPORT
bool IsPyFloat(PyObject* obj) {
  const bool is_float = internal::PyFloatScalar_Check(obj);
  return is_float;
}
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,64 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Functions for converting between CPython built-in data structures and Arrow
|
||||
// data structures
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/python/platform.h"
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/python/visibility.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
#include "common.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Array;
|
||||
class Status;
|
||||
|
||||
namespace py {
|
||||
|
||||
// These functions take a sequence input, not arbitrary iterables
|
||||
|
||||
/// \brief Infer Arrow type from a Python sequence
|
||||
/// \param[in] obj the sequence of values
|
||||
/// \param[in] mask an optional mask where True values are null. May
|
||||
/// be nullptr
|
||||
/// \param[in] pandas_null_sentinels use pandas's null value markers
|
||||
ARROW_PYTHON_EXPORT
|
||||
Result<std::shared_ptr<arrow::DataType>> InferArrowType(PyObject* obj, PyObject* mask,
|
||||
bool pandas_null_sentinels);
|
||||
|
||||
/// Checks whether the passed Python object is a boolean scalar
|
||||
ARROW_PYTHON_EXPORT
|
||||
bool IsPyBool(PyObject* obj);
|
||||
|
||||
/// Checks whether the passed Python object is an integer scalar
|
||||
ARROW_PYTHON_EXPORT
|
||||
bool IsPyInt(PyObject* obj);
|
||||
|
||||
/// Checks whether the passed Python object is a float scalar
|
||||
ARROW_PYTHON_EXPORT
|
||||
bool IsPyFloat(PyObject* obj);
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
387
venv/lib/python3.10/site-packages/pyarrow/src/arrow/python/io.cc
Normal file
387
venv/lib/python3.10/site-packages/pyarrow/src/arrow/python/io.cc
Normal file
@@ -0,0 +1,387 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "io.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
|
||||
#include "arrow/io/memory.h"
|
||||
#include "arrow/memory_pool.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/logging.h"
|
||||
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/python/pyarrow.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using arrow::io::TransformInputStream;
|
||||
|
||||
namespace py {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Python file
|
||||
|
||||
// A common interface to a Python file-like object. Must acquire GIL before
|
||||
// calling any methods
|
||||
class PythonFile {
|
||||
public:
|
||||
explicit PythonFile(PyObject* file) : file_(file), checked_read_buffer_(false) {
|
||||
Py_INCREF(file);
|
||||
}
|
||||
|
||||
Status CheckClosed() const {
|
||||
if (!file_) {
|
||||
return Status::Invalid("operation on closed Python file");
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Close() {
|
||||
if (file_) {
|
||||
PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "close", "()");
|
||||
Py_XDECREF(result);
|
||||
file_.reset();
|
||||
PY_RETURN_IF_ERROR(StatusCode::IOError);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Abort() {
|
||||
file_.reset();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
bool closed() const {
|
||||
if (!file_) {
|
||||
return true;
|
||||
}
|
||||
PyObject* result = PyObject_GetAttrString(file_.obj(), "closed");
|
||||
if (result == NULL) {
|
||||
// Can't propagate the error, so write it out and return an arbitrary value
|
||||
PyErr_WriteUnraisable(NULL);
|
||||
return true;
|
||||
}
|
||||
int ret = PyObject_IsTrue(result);
|
||||
Py_XDECREF(result);
|
||||
if (ret < 0) {
|
||||
PyErr_WriteUnraisable(NULL);
|
||||
return true;
|
||||
}
|
||||
return ret != 0;
|
||||
}
|
||||
|
||||
Status Seek(int64_t position, int whence) {
|
||||
RETURN_NOT_OK(CheckClosed());
|
||||
|
||||
// NOTE: `long long` is at least 64 bits in the C standard, the cast below is
|
||||
// therefore safe.
|
||||
|
||||
// whence: 0 for relative to start of file, 2 for end of file
|
||||
PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "seek", "(Li)",
|
||||
static_cast<long long>(position), whence);
|
||||
Py_XDECREF(result);
|
||||
PY_RETURN_IF_ERROR(StatusCode::IOError);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Read(int64_t nbytes, PyObject** out) {
|
||||
RETURN_NOT_OK(CheckClosed());
|
||||
|
||||
PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read", "(L)",
|
||||
static_cast<long long>(nbytes));
|
||||
PY_RETURN_IF_ERROR(StatusCode::IOError);
|
||||
*out = result;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status ReadBuffer(int64_t nbytes, PyObject** out) {
|
||||
PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read_buffer", "(L)",
|
||||
static_cast<long long>(nbytes));
|
||||
PY_RETURN_IF_ERROR(StatusCode::IOError);
|
||||
*out = result;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Write(const void* data, int64_t nbytes) {
|
||||
RETURN_NOT_OK(CheckClosed());
|
||||
|
||||
// Since the data isn't owned, we have to make a copy
|
||||
PyObject* py_data =
|
||||
PyBytes_FromStringAndSize(reinterpret_cast<const char*>(data), nbytes);
|
||||
PY_RETURN_IF_ERROR(StatusCode::IOError);
|
||||
|
||||
PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "write", "(O)", py_data);
|
||||
Py_XDECREF(py_data);
|
||||
Py_XDECREF(result);
|
||||
PY_RETURN_IF_ERROR(StatusCode::IOError);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Write(const std::shared_ptr<Buffer>& buffer) {
|
||||
RETURN_NOT_OK(CheckClosed());
|
||||
|
||||
PyObject* py_data = wrap_buffer(buffer);
|
||||
PY_RETURN_IF_ERROR(StatusCode::IOError);
|
||||
|
||||
PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "write", "(O)", py_data);
|
||||
Py_XDECREF(py_data);
|
||||
Py_XDECREF(result);
|
||||
PY_RETURN_IF_ERROR(StatusCode::IOError);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Result<int64_t> Tell() {
|
||||
RETURN_NOT_OK(CheckClosed());
|
||||
|
||||
PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "tell", "()");
|
||||
PY_RETURN_IF_ERROR(StatusCode::IOError);
|
||||
|
||||
int64_t position = PyLong_AsLongLong(result);
|
||||
Py_DECREF(result);
|
||||
|
||||
// PyLong_AsLongLong can raise OverflowError
|
||||
PY_RETURN_IF_ERROR(StatusCode::IOError);
|
||||
return position;
|
||||
}
|
||||
|
||||
std::mutex& lock() { return lock_; }
|
||||
|
||||
bool HasReadBuffer() {
|
||||
if (!checked_read_buffer_) { // we don't want to check this each time
|
||||
has_read_buffer_ = PyObject_HasAttrString(file_.obj(), "read_buffer") == 1;
|
||||
checked_read_buffer_ = true;
|
||||
}
|
||||
return has_read_buffer_;
|
||||
}
|
||||
|
||||
private:
|
||||
std::mutex lock_;
|
||||
OwnedRefNoGIL file_;
|
||||
bool has_read_buffer_;
|
||||
bool checked_read_buffer_;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Seekable input stream
|
||||
|
||||
// Wraps `file` in a PythonFile owned by this object.
PyReadableFile::PyReadableFile(PyObject* file) : file_(new PythonFile(file)) {}
|
||||
|
||||
// The destructor does not close the underlying Python file object, as
|
||||
// there may be multiple references to it. Instead let the Python
|
||||
// destructor do its job.
|
||||
PyReadableFile::~PyReadableFile() = default;
|
||||
|
||||
// Forwards to PythonFile::Abort() through SafeCallIntoPython.
Status PyReadableFile::Abort() {
  auto do_abort = [this]() { return file_->Abort(); };
  return SafeCallIntoPython(do_abort);
}
|
||||
|
||||
// Forwards to PythonFile::Close() through SafeCallIntoPython.
Status PyReadableFile::Close() {
  auto do_close = [this]() { return file_->Close(); };
  return SafeCallIntoPython(do_close);
}
|
||||
|
||||
bool PyReadableFile::closed() const {
|
||||
bool res;
|
||||
Status st = SafeCallIntoPython([this, &res]() {
|
||||
res = file_->closed();
|
||||
return Status::OK();
|
||||
});
|
||||
return res;
|
||||
}
|
||||
|
||||
// Seeks to `position`, passing whence 0 (relative to the start of the file).
Status PyReadableFile::Seek(int64_t position) {
  auto do_seek = [this, position]() {
    const int whence_start = 0;
    return file_->Seek(position, whence_start);
  };
  return SafeCallIntoPython(do_seek);
}
|
||||
|
||||
// Returns the position reported by PythonFile::Tell().
Result<int64_t> PyReadableFile::Tell() const {
  auto do_tell = [this]() -> Result<int64_t> { return file_->Tell(); };
  return SafeCallIntoPython(do_tell);
}
|
||||
|
||||
// Reads up to `nbytes` bytes from the Python file into `out` and returns the
// number of bytes copied.
//
// \param[in] nbytes number of bytes requested from file.read(); `out` is
//   treated as holding at least this many bytes
// \param[out] out destination buffer
// \return the byte count actually copied; TypeError if read() returned an
//   object without buffer support; IOError if read() returned more bytes than
//   requested (copying them would overrun `out`); any error from the
//   underlying read
Result<int64_t> PyReadableFile::Read(int64_t nbytes, void* out) {
  return SafeCallIntoPython([=]() -> Result<int64_t> {
    OwnedRef bytes;
    RETURN_NOT_OK(file_->Read(nbytes, bytes.ref()));
    PyObject* bytes_obj = bytes.obj();
    ARROW_DCHECK(bytes_obj != NULL);

    Py_buffer py_buf;
    if (PyObject_GetBuffer(bytes_obj, &py_buf, PyBUF_ANY_CONTIGUOUS) != 0) {
      return Status::TypeError(
          "Python file read() should have returned a bytes object or an object "
          "supporting the buffer protocol, got '",
          Py_TYPE(bytes_obj)->tp_name, "' (did you open the file in binary mode?)");
    }

    const int64_t len = static_cast<int64_t>(py_buf.len);
    // A file-like object is arbitrary Python code and may hand back more data
    // than was asked for; never copy past the caller's buffer.
    if (len > nbytes) {
      PyBuffer_Release(&py_buf);
      return Status::IOError("Python file read() returned ", len,
                             " bytes, more than the ", nbytes, " bytes requested");
    }
    if (len > 0) {
      std::memcpy(out, py_buf.buf, static_cast<size_t>(len));
    }
    PyBuffer_Release(&py_buf);
    return len;
  });
}
|
||||
|
||||
// Reads up to `nbytes` bytes and returns them as a Buffer built from the
// Python object the file returned. read_buffer() is used when the file object
// has it, read() otherwise.
Result<std::shared_ptr<Buffer>> PyReadableFile::Read(int64_t nbytes) {
  auto do_read = [this, nbytes]() -> Result<std::shared_ptr<Buffer>> {
    OwnedRef py_result;
    const bool use_read_buffer = file_->HasReadBuffer();
    if (!use_read_buffer) {
      RETURN_NOT_OK(file_->Read(nbytes, py_result.ref()));
    } else {
      RETURN_NOT_OK(file_->ReadBuffer(nbytes, py_result.ref()));
    }
    ARROW_DCHECK(py_result.obj() != NULL);

    return PyBuffer::FromPyObject(py_result.obj());
  };
  return SafeCallIntoPython(do_read);
}
|
||||
|
||||
// Seeks to `position` and reads up to `nbytes` bytes into `out`. The file's
// mutex is held across both steps so the seek and the read are not
// interleaved with another ReadAt().
Result<int64_t> PyReadableFile::ReadAt(int64_t position, int64_t nbytes, void* out) {
  std::lock_guard<std::mutex> guard(file_->lock());
  auto seek_then_read = [this, position, nbytes, out]() -> Result<int64_t> {
    RETURN_NOT_OK(Seek(position));
    return Read(nbytes, out);
  };
  return SafeCallIntoPython(seek_then_read);
}
|
||||
|
||||
// Seeks to `position` and reads up to `nbytes` bytes into a new Buffer. The
// file's mutex is held across both steps so the seek and the read are not
// interleaved with another ReadAt().
Result<std::shared_ptr<Buffer>> PyReadableFile::ReadAt(int64_t position, int64_t nbytes) {
  std::lock_guard<std::mutex> guard(file_->lock());
  auto seek_then_read = [this, position, nbytes]() -> Result<std::shared_ptr<Buffer>> {
    RETURN_NOT_OK(Seek(position));
    return Read(nbytes);
  };
  return SafeCallIntoPython(seek_then_read);
}
|
||||
|
||||
// Determines the file size by seeking to the end and asking for the position
// there, then seeks back to where the file was before the call.
Result<int64_t> PyReadableFile::GetSize() {
  auto measure = [this]() -> Result<int64_t> {
    ARROW_ASSIGN_OR_RAISE(int64_t saved_position, file_->Tell());

    // whence 2: relative to the end of the file
    RETURN_NOT_OK(file_->Seek(0, 2));
    ARROW_ASSIGN_OR_RAISE(int64_t size_in_bytes, file_->Tell());

    // Restore previous file position
    RETURN_NOT_OK(file_->Seek(saved_position, 0));

    return size_in_bytes;
  };
  return SafeCallIntoPython(measure);
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Output stream
|
||||
|
||||
// Wraps `file` in a PythonFile owned by this object; the tracked write
// position starts at zero.
PyOutputStream::PyOutputStream(PyObject* file)
    : file_(new PythonFile(file)), position_(0) {}
|
||||
|
||||
// The destructor does not close the underlying Python file object, as
|
||||
// there may be multiple references to it. Instead let the Python
|
||||
// destructor do its job.
|
||||
PyOutputStream::~PyOutputStream() = default;
|
||||
|
||||
// Forwards to PythonFile::Abort() through SafeCallIntoPython.
Status PyOutputStream::Abort() {
  auto do_abort = [this]() { return file_->Abort(); };
  return SafeCallIntoPython(do_abort);
}
|
||||
|
||||
// Forwards to PythonFile::Close() through SafeCallIntoPython.
Status PyOutputStream::Close() {
  auto do_close = [this]() { return file_->Close(); };
  return SafeCallIntoPython(do_close);
}
|
||||
|
||||
bool PyOutputStream::closed() const {
|
||||
bool res;
|
||||
Status st = SafeCallIntoPython([this, &res]() {
|
||||
res = file_->closed();
|
||||
return Status::OK();
|
||||
});
|
||||
return res;
|
||||
}
|
||||
|
||||
// Returns the locally tracked position_ counter; the Python file object is
// not consulted.
Result<int64_t> PyOutputStream::Tell() const {
  return position_;
}
|
||||
|
||||
// Writes `nbytes` bytes starting at `data` to the Python file.
//
// The tracked position (reported by Tell()) advances only when the write
// succeeds, so a failed write does not make Tell() count bytes that were
// never accepted by the file.
Status PyOutputStream::Write(const void* data, int64_t nbytes) {
  return SafeCallIntoPython([this, data, nbytes]() {
    RETURN_NOT_OK(file_->Write(data, nbytes));
    position_ += nbytes;
    return Status::OK();
  });
}
|
||||
|
||||
// Writes the contents of `buffer` to the Python file.
//
// The tracked position (reported by Tell()) advances only when the write
// succeeds, so a failed write does not make Tell() count bytes that were
// never accepted by the file.
Status PyOutputStream::Write(const std::shared_ptr<Buffer>& buffer) {
  // The lambda runs synchronously inside this call, so `buffer` can be
  // captured by reference instead of copying the shared_ptr.
  return SafeCallIntoPython([this, &buffer]() {
    RETURN_NOT_OK(file_->Write(buffer));
    position_ += buffer->size();
    return Status::OK();
  });
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Foreign buffer
|
||||
|
||||
// Creates a PyForeignBuffer over [data, data + size) that keeps `base` alive,
// and stores it in *out.
//
// A plain `new` expression reports allocation failure by throwing rather
// than by returning null, so there is no null result to test for here; the
// constructor is private, hence `new` instead of std::make_shared.
Status PyForeignBuffer::Make(const uint8_t* data, int64_t size, PyObject* base,
                             std::shared_ptr<Buffer>* out) {
  *out = std::shared_ptr<Buffer>(new PyForeignBuffer(data, size, base));
  return Status::OK();
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// TransformInputStream::TransformFunc wrapper
|
||||
|
||||
struct TransformFunctionWrapper {
|
||||
TransformFunctionWrapper(TransformCallback cb, PyObject* arg)
|
||||
: cb_(std::move(cb)), arg_(std::make_shared<OwnedRefNoGIL>(arg)) {
|
||||
Py_INCREF(arg);
|
||||
}
|
||||
|
||||
Result<std::shared_ptr<Buffer>> operator()(const std::shared_ptr<Buffer>& src) {
|
||||
return SafeCallIntoPython([=]() -> Result<std::shared_ptr<Buffer>> {
|
||||
std::shared_ptr<Buffer> dest;
|
||||
cb_(arg_->obj(), src, &dest);
|
||||
RETURN_NOT_OK(CheckPyError());
|
||||
return dest;
|
||||
});
|
||||
}
|
||||
|
||||
protected:
|
||||
// Need to wrap OwnedRefNoGIL because std::function needs the callable
|
||||
// to be copy-constructible...
|
||||
TransformCallback cb_;
|
||||
std::shared_ptr<OwnedRefNoGIL> arg_;
|
||||
};
|
||||
|
||||
std::shared_ptr<::arrow::io::InputStream> MakeTransformInputStream(
|
||||
std::shared_ptr<::arrow::io::InputStream> wrapped, TransformInputStreamVTable vtable,
|
||||
PyObject* handler) {
|
||||
TransformInputStream::TransformFunc transform(
|
||||
TransformFunctionWrapper{std::move(vtable.transform), handler});
|
||||
return std::make_shared<TransformInputStream>(std::move(wrapped), std::move(transform));
|
||||
}
|
||||
|
||||
// Returns a heap-allocated StreamWrapFunc that wraps any input stream it is
// given in a TransformInputStream driven by vtable.transform and `handler`.
std::shared_ptr<StreamWrapFunc> MakeStreamTransformFunc(TransformInputStreamVTable vtable,
                                                        PyObject* handler) {
  TransformFunctionWrapper wrapper{std::move(vtable.transform), handler};
  TransformInputStream::TransformFunc transform(std::move(wrapper));
  // Each invocation copies `transform` into the stream it creates.
  StreamWrapFunc wrap_stream =
      [transform](std::shared_ptr<::arrow::io::InputStream> wrapped) {
        return std::make_shared<TransformInputStream>(wrapped, transform);
      };
  return std::make_shared<StreamWrapFunc>(wrap_stream);
}
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
121
venv/lib/python3.10/site-packages/pyarrow/src/arrow/python/io.h
Normal file
121
venv/lib/python3.10/site-packages/pyarrow/src/arrow/python/io.h
Normal file
@@ -0,0 +1,121 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/io/interfaces.h"
|
||||
#include "arrow/io/transform.h"
|
||||
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/python/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
|
||||
class ARROW_NO_EXPORT PythonFile;
|
||||
|
||||
class ARROW_PYTHON_EXPORT PyReadableFile : public io::RandomAccessFile {
|
||||
public:
|
||||
explicit PyReadableFile(PyObject* file);
|
||||
~PyReadableFile() override;
|
||||
|
||||
Status Close() override;
|
||||
Status Abort() override;
|
||||
bool closed() const override;
|
||||
|
||||
Result<int64_t> Read(int64_t nbytes, void* out) override;
|
||||
Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
|
||||
|
||||
// Thread-safe version
|
||||
Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) override;
|
||||
|
||||
// Thread-safe version
|
||||
Result<std::shared_ptr<Buffer>> ReadAt(int64_t position, int64_t nbytes) override;
|
||||
|
||||
Result<int64_t> GetSize() override;
|
||||
|
||||
Status Seek(int64_t position) override;
|
||||
|
||||
Result<int64_t> Tell() const override;
|
||||
|
||||
private:
|
||||
std::unique_ptr<PythonFile> file_;
|
||||
};
|
||||
|
||||
class ARROW_PYTHON_EXPORT PyOutputStream : public io::OutputStream {
|
||||
public:
|
||||
explicit PyOutputStream(PyObject* file);
|
||||
~PyOutputStream() override;
|
||||
|
||||
Status Close() override;
|
||||
Status Abort() override;
|
||||
bool closed() const override;
|
||||
Result<int64_t> Tell() const override;
|
||||
Status Write(const void* data, int64_t nbytes) override;
|
||||
Status Write(const std::shared_ptr<Buffer>& buffer) override;
|
||||
|
||||
private:
|
||||
std::unique_ptr<PythonFile> file_;
|
||||
int64_t position_;
|
||||
};
|
||||
|
||||
// TODO(wesm): seekable output files
|
||||
|
||||
// A Buffer subclass that keeps a PyObject reference throughout its
|
||||
// lifetime, such that the Python object is kept alive as long as the
|
||||
// C++ buffer is still needed.
|
||||
// Keeping the reference in a Python wrapper would be incorrect as
|
||||
// the Python wrapper can get destroyed even though the wrapped C++
|
||||
// buffer is still alive (ARROW-2270).
|
||||
class ARROW_PYTHON_EXPORT PyForeignBuffer : public Buffer {
 public:
  // Factory: stores a new PyForeignBuffer over `data`/`size` in *out.
  static Status Make(const uint8_t* data, int64_t size, PyObject* base,
                     std::shared_ptr<Buffer>* out);

 private:
  // Takes a new reference to `base`, held in base_ for the buffer's lifetime.
  PyForeignBuffer(const uint8_t* data, int64_t size, PyObject* base)
      : Buffer(data, size), base_(base) {
    Py_INCREF(base);
  }

  OwnedRefNoGIL base_;
};
|
||||
|
||||
// All this rigamarole because Cython is really poor with std::function<>
|
||||
|
||||
using TransformCallback = std::function<void(
|
||||
PyObject*, const std::shared_ptr<Buffer>& src, std::shared_ptr<Buffer>* out)>;
|
||||
|
||||
struct TransformInputStreamVTable {
|
||||
TransformCallback transform;
|
||||
};
|
||||
|
||||
ARROW_PYTHON_EXPORT
|
||||
std::shared_ptr<::arrow::io::InputStream> MakeTransformInputStream(
|
||||
std::shared_ptr<::arrow::io::InputStream> wrapped, TransformInputStreamVTable vtable,
|
||||
PyObject* arg);
|
||||
|
||||
using StreamWrapFunc = std::function<Result<std::shared_ptr<io::InputStream>>(
|
||||
std::shared_ptr<io::InputStream>)>;
|
||||
ARROW_PYTHON_EXPORT
|
||||
std::shared_ptr<StreamWrapFunc> MakeStreamTransformFunc(TransformInputStreamVTable vtable,
|
||||
PyObject* handler);
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,135 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "ipc.h"
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/compute/cast.h"
|
||||
#include "arrow/python/pyarrow.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
|
||||
// Members are default-initialized; Init() supplies the schema and iterator.
PyRecordBatchReader::PyRecordBatchReader() = default;
|
||||
|
||||
// Stores `schema` and obtains an iterator over `iterable`. Returns the
// result of CheckPyError() after the PyObject_GetIter call.
Status PyRecordBatchReader::Init(std::shared_ptr<Schema> schema, PyObject* iterable) {
  schema_ = std::move(schema);

  PyObject* py_iterator = PyObject_GetIter(iterable);
  iterator_.reset(py_iterator);
  return CheckPyError();
}
|
||||
|
||||
// Returns the schema passed to Init().
std::shared_ptr<Schema> PyRecordBatchReader::schema() const {
  return schema_;
}
|
||||
|
||||
// Fetches the next item from the Python iterator and unwraps it into *batch.
//
// End of stream is signalled by resetting *batch and returning OK; the
// iterator is released at that point, so later calls keep reporting end of
// stream. An exception raised by the iterator is returned as an error.
Status PyRecordBatchReader::ReadNext(std::shared_ptr<RecordBatch>* batch) {
  PyAcquireGIL lock;

  if (!iterator_) {
    // End of stream
    batch->reset();
    return Status::OK();
  }

  OwnedRef py_batch(PyIter_Next(iterator_.obj()));
  if (py_batch) {
    return unwrap_batch(py_batch.obj()).Value(batch);
  }

  // No item: either the iterator raised, or it is exhausted.
  RETURN_IF_PYERROR();
  // End of stream
  batch->reset();
  iterator_.reset();
  return Status::OK();
}
|
||||
|
||||
// Factory: create a reader and initialize it with `schema` and `iterable`.
// Returns the error from Init() if initialization fails.
Result<std::shared_ptr<RecordBatchReader>> PyRecordBatchReader::Make(
    std::shared_ptr<Schema> schema, PyObject* iterable) {
  // The constructor is protected, so std::make_shared cannot be used here
  std::shared_ptr<PyRecordBatchReader> reader(new PyRecordBatchReader());
  ARROW_RETURN_NOT_OK(reader->Init(std::move(schema), iterable));
  return reader;
}
|
||||
|
||||
// Members are populated by Init(); nothing to do at construction time.
CastingRecordBatchReader::CastingRecordBatchReader() = default;
|
||||
|
||||
// Validate that every field of `parent`'s schema can be cast to the matching
// field of `schema`, then store both.
//
// Returns Invalid if the field counts differ and TypeError for the first field
// whose type is neither equal to nor castable to the target type. On failure
// the reader's members are left untouched.
Status CastingRecordBatchReader::Init(std::shared_ptr<RecordBatchReader> parent,
                                      std::shared_ptr<Schema> schema) {
  const std::shared_ptr<Schema> parent_schema = parent->schema();

  // The check for names has already been done in Python where it's easier to
  // generate a nice error message.
  const int field_count = schema->num_fields();
  if (parent_schema->num_fields() != field_count) {
    return Status::Invalid("Number of fields not equal");
  }

  // Ensure all columns can be cast before succeeding
  for (int idx = 0; idx < field_count; ++idx) {
    const auto& from_type = parent_schema->field(idx)->type();
    const auto& to_type = schema->field(idx)->type();
    if (from_type->Equals(to_type)) {
      continue;
    }
    if (compute::CanCast(*from_type, *to_type)) {
      continue;
    }
    return Status::TypeError("Field ", idx, " cannot be cast from ",
                             from_type->ToString(), " to ", to_type->ToString());
  }

  parent_ = std::move(parent);
  schema_ = std::move(schema);
  return Status::OK();
}
|
||||
|
||||
// Target schema that batches are cast to.
std::shared_ptr<Schema> CastingRecordBatchReader::schema() const {
  return schema_;
}
|
||||
|
||||
// Read the next batch from the parent reader and cast each column to the type
// of the corresponding field in the target schema (safe cast options).
//
// At end of stream `*batch` is reset to null. Returns Invalid if a column with
// nulls maps to a non-nullable target field, or the error from the parent
// reader / the cast.
Status CastingRecordBatchReader::ReadNext(std::shared_ptr<RecordBatch>* batch) {
  std::shared_ptr<RecordBatch> source_batch;
  ARROW_RETURN_NOT_OK(parent_->ReadNext(&source_batch));
  if (source_batch == nullptr) {
    batch->reset();
    return Status::OK();
  }

  const auto column_count = source_batch->num_columns();
  const auto cast_options = compute::CastOptions::Safe();
  ArrayVector casted(column_count);
  for (int col = 0; col < column_count; ++col) {
    const Array& source_column = *source_batch->column(col);
    const bool target_nullable = schema_->field(col)->nullable();
    if (!target_nullable && source_column.null_count() > 0) {
      return Status::Invalid(
          "Can't cast array that contains nulls to non-nullable field at index ", col);
    }

    ARROW_ASSIGN_OR_RAISE(
        casted[col],
        compute::Cast(source_column, schema_->field(col)->type(), cast_options));
  }

  *batch = RecordBatch::Make(schema_, source_batch->num_rows(), std::move(casted));
  return Status::OK();
}
|
||||
|
||||
// Factory: create a reader that casts the batches of `parent` to `schema`.
//
// Returns the error from Init() if the schemas are incompatible.
Result<std::shared_ptr<RecordBatchReader>> CastingRecordBatchReader::Make(
    std::shared_ptr<RecordBatchReader> parent, std::shared_ptr<Schema> schema) {
  // The constructor is protected, so std::make_shared cannot be used here
  std::shared_ptr<CastingRecordBatchReader> reader(new CastingRecordBatchReader());
  // Both arguments are sink parameters: hand them over instead of copying
  // (a shared_ptr copy costs an atomic refcount round-trip).
  ARROW_RETURN_NOT_OK(reader->Init(std::move(parent), std::move(schema)));
  return reader;
}
|
||||
|
||||
// Closing this reader closes the parent reader it wraps.
Status CastingRecordBatchReader::Close() {
  return parent_->Close();
}
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,72 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/python/visibility.h"
|
||||
#include "arrow/record_batch.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
|
||||
/// \brief RecordBatchReader whose batches are produced by a Python iterable.
class ARROW_PYTHON_EXPORT PyRecordBatchReader : public RecordBatchReader {
 public:
  /// \brief Schema supplied when the reader was created.
  std::shared_ptr<Schema> schema() const override;

  /// \brief Read the next record batch into `*batch`.
  Status ReadNext(std::shared_ptr<RecordBatch>* batch) override;

  // For use from Cython
  // Assumes that `iterable` is borrowed
  static Result<std::shared_ptr<RecordBatchReader>> Make(std::shared_ptr<Schema>,
                                                         PyObject* iterable);

 protected:
  // Instances are created through Make()
  PyRecordBatchReader();

  Status Init(std::shared_ptr<Schema>, PyObject* iterable);

  std::shared_ptr<Schema> schema_;
  // Python iterator object the batches are read from
  OwnedRefNoGIL iterator_;
};
|
||||
|
||||
/// \brief RecordBatchReader that wraps a parent reader and exposes a
/// different (target) schema.
class ARROW_PYTHON_EXPORT CastingRecordBatchReader : public RecordBatchReader {
 public:
  /// \brief Target schema of this reader.
  std::shared_ptr<Schema> schema() const override;

  /// \brief Read the next record batch into `*batch`.
  Status ReadNext(std::shared_ptr<RecordBatch>* batch) override;

  /// \brief Create a reader over `parent` that reports `schema`.
  static Result<std::shared_ptr<RecordBatchReader>> Make(
      std::shared_ptr<RecordBatchReader> parent, std::shared_ptr<Schema> schema);

  Status Close() override;

 protected:
  // Instances are created through Make()
  CastingRecordBatchReader();

  Status Init(std::shared_ptr<RecordBatchReader> parent, std::shared_ptr<Schema> schema);

  // Reader the source batches are pulled from
  std::shared_ptr<RecordBatchReader> parent_;
  // Target schema
  std::shared_ptr<Schema> schema_;
};
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,200 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/array/array_primitive.h"
|
||||
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/python/numpy_init.h"
|
||||
#include "arrow/python/numpy_internal.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
namespace internal {
|
||||
|
||||
using arrow::internal::checked_cast;
|
||||
|
||||
// Visit the Python sequence, calling the given callable on each element. If
|
||||
// the callable returns a non-OK status, iteration stops and the status is
|
||||
// returned.
|
||||
//
|
||||
// The call signature for Visitor must be
|
||||
//
|
||||
// Visit(PyObject* obj, int64_t index, bool* keep_going)
|
||||
//
|
||||
// If keep_going is set to false, the iteration terminates
|
||||
template <class VisitorFunc>
|
||||
inline Status VisitSequenceGeneric(PyObject* obj, int64_t offset, VisitorFunc&& func) {
|
||||
// VisitorFunc may set to false to terminate iteration
|
||||
bool keep_going = true;
|
||||
|
||||
if (has_numpy() && PyArray_Check(obj)) {
|
||||
PyArrayObject* arr_obj = reinterpret_cast<PyArrayObject*>(obj);
|
||||
if (PyArray_NDIM(arr_obj) != 1) {
|
||||
return Status::Invalid("Only 1D arrays accepted");
|
||||
}
|
||||
|
||||
if (PyArray_DESCR(arr_obj)->type_num == NPY_OBJECT) {
|
||||
// It's an array object, we can fetch object pointers directly
|
||||
const Ndarray1DIndexer<PyObject*> objects(arr_obj);
|
||||
for (int64_t i = offset; keep_going && i < objects.size(); ++i) {
|
||||
RETURN_NOT_OK(func(objects[i], i, &keep_going));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
// It's a non-object array, fall back on regular sequence access.
|
||||
// (note PyArray_GETITEM() is slightly different: it returns standard
|
||||
// Python types, not Numpy scalar types)
|
||||
// This code path is inefficient: callers should implement dedicated
|
||||
// logic for non-object arrays.
|
||||
}
|
||||
|
||||
if (PySequence_Check(obj)) {
|
||||
#ifdef Py_GIL_DISABLED
|
||||
if (PyTuple_Check(obj)) {
|
||||
#else
|
||||
if (PyList_Check(obj) || PyTuple_Check(obj)) {
|
||||
#endif
|
||||
// Use fast item access
|
||||
const Py_ssize_t size = PySequence_Fast_GET_SIZE(obj);
|
||||
for (Py_ssize_t i = offset; keep_going && i < size; ++i) {
|
||||
PyObject* value = PySequence_Fast_GET_ITEM(obj, i);
|
||||
RETURN_NOT_OK(func(value, static_cast<int64_t>(i), &keep_going));
|
||||
}
|
||||
} else {
|
||||
// Regular sequence: avoid making a potentially large copy
|
||||
const Py_ssize_t size = PySequence_Size(obj);
|
||||
RETURN_IF_PYERROR();
|
||||
for (Py_ssize_t i = offset; keep_going && i < size; ++i) {
|
||||
OwnedRef value_ref(PySequence_ITEM(obj, i));
|
||||
RETURN_IF_PYERROR();
|
||||
RETURN_NOT_OK(func(value_ref.obj(), static_cast<int64_t>(i), &keep_going));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return Status::TypeError("Object is not a sequence");
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Visit sequence with no null mask
|
||||
template <class VisitorFunc>
|
||||
inline Status VisitSequence(PyObject* obj, int64_t offset, VisitorFunc&& func) {
|
||||
return VisitSequenceGeneric(
|
||||
obj, offset, [&func](PyObject* value, int64_t i /* unused */, bool* keep_going) {
|
||||
return func(value, keep_going);
|
||||
});
|
||||
}
|
||||
|
||||
/// Visit sequence with null mask
|
||||
template <class VisitorFunc>
|
||||
inline Status VisitSequenceMasked(PyObject* obj, PyObject* mo, int64_t offset,
|
||||
VisitorFunc&& func) {
|
||||
if (has_numpy() && PyArray_Check(mo)) {
|
||||
PyArrayObject* mask = reinterpret_cast<PyArrayObject*>(mo);
|
||||
if (PyArray_NDIM(mask) != 1) {
|
||||
return Status::Invalid("Mask must be 1D array");
|
||||
}
|
||||
if (PyArray_SIZE(mask) != static_cast<int64_t>(PySequence_Size(obj))) {
|
||||
return Status::Invalid("Mask was a different length from sequence being converted");
|
||||
}
|
||||
|
||||
const int dtype = fix_numpy_type_num(PyArray_DESCR(mask)->type_num);
|
||||
if (dtype == NPY_BOOL) {
|
||||
Ndarray1DIndexer<uint8_t> mask_values(mask);
|
||||
|
||||
return VisitSequenceGeneric(
|
||||
obj, offset,
|
||||
[&func, &mask_values](PyObject* value, int64_t i, bool* keep_going) {
|
||||
return func(value, mask_values[i], keep_going);
|
||||
});
|
||||
} else {
|
||||
return Status::TypeError("Mask must be boolean dtype");
|
||||
}
|
||||
} else if (py::is_array(mo)) {
|
||||
auto unwrap_mask_result = unwrap_array(mo);
|
||||
ARROW_RETURN_NOT_OK(unwrap_mask_result);
|
||||
std::shared_ptr<Array> mask_ = unwrap_mask_result.ValueOrDie();
|
||||
if (mask_->type_id() != Type::type::BOOL) {
|
||||
return Status::TypeError("Mask must be an array of booleans");
|
||||
}
|
||||
|
||||
if (mask_->length() != PySequence_Size(obj)) {
|
||||
return Status::Invalid("Mask was a different length from sequence being converted");
|
||||
}
|
||||
|
||||
if (mask_->null_count() != 0) {
|
||||
return Status::TypeError("Mask must be an array of booleans");
|
||||
}
|
||||
|
||||
BooleanArray* boolmask = checked_cast<BooleanArray*>(mask_.get());
|
||||
return VisitSequenceGeneric(
|
||||
obj, offset, [&func, &boolmask](PyObject* value, int64_t i, bool* keep_going) {
|
||||
return func(value, boolmask->Value(i), keep_going);
|
||||
});
|
||||
} else if (PySequence_Check(mo)) {
|
||||
if (PySequence_Size(mo) != PySequence_Size(obj)) {
|
||||
return Status::Invalid("Mask was a different length from sequence being converted");
|
||||
}
|
||||
RETURN_IF_PYERROR();
|
||||
|
||||
return VisitSequenceGeneric(
|
||||
obj, offset, [&func, &mo](PyObject* value, int64_t i, bool* keep_going) {
|
||||
OwnedRef value_ref(PySequence_ITEM(mo, i));
|
||||
if (!PyBool_Check(value_ref.obj()))
|
||||
return Status::TypeError("Mask must be a sequence of booleans");
|
||||
return func(value, value_ref.obj() == Py_True, keep_going);
|
||||
});
|
||||
} else {
|
||||
return Status::Invalid("Null mask must be a NumPy array, Arrow array or a Sequence");
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Like IterateSequence, but accepts any generic iterable (including
|
||||
// non-restartable iterators, e.g. generators).
|
||||
//
|
||||
// The call signature for VisitorFunc must be Visit(PyObject*, bool*
|
||||
// keep_going). If keep_going is set to false, the iteration terminates
|
||||
template <class VisitorFunc>
|
||||
inline Status VisitIterable(PyObject* obj, VisitorFunc&& func) {
|
||||
if (PySequence_Check(obj)) {
|
||||
// Numpy arrays fall here as well
|
||||
return VisitSequence(obj, /*offset=*/0, std::forward<VisitorFunc>(func));
|
||||
}
|
||||
// Fall back on the iterator protocol
|
||||
OwnedRef iter_ref(PyObject_GetIter(obj));
|
||||
PyObject* iter = iter_ref.obj();
|
||||
RETURN_IF_PYERROR();
|
||||
PyObject* value;
|
||||
|
||||
bool keep_going = true;
|
||||
while (keep_going && (value = PyIter_Next(iter))) {
|
||||
OwnedRef value_ref(value);
|
||||
RETURN_NOT_OK(func(value_ref.obj(), &keep_going));
|
||||
}
|
||||
RETURN_IF_PYERROR(); // __next__() might have raised
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,563 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "arrow/python/numpy_interop.h"
|
||||
|
||||
#include "arrow/python/numpy_convert.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/sparse_tensor.h"
|
||||
#include "arrow/tensor.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/logging.h"
|
||||
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/python/pyarrow.h"
|
||||
#include "arrow/python/type_traits.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
|
||||
// Wrap `ao` as a Buffer. A reference to `ao` is taken (under the GIL) and kept
// in `arr_`. When `ao` is an ndarray the buffer's data pointer, size, capacity
// and mutability are taken from it; otherwise the buffer stays empty.
NumPyBuffer::NumPyBuffer(PyObject* ao) : Buffer(nullptr, 0) {
  PyAcquireGIL lock;
  arr_ = ao;
  Py_INCREF(ao);

  if (!PyArray_Check(ao)) {
    return;
  }

  PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(ao);
  uint8_t* raw_bytes = reinterpret_cast<uint8_t*>(PyArray_DATA(ndarray));
  data_ = const_cast<const uint8_t*>(raw_bytes);
  size_ = PyArray_NBYTES(ndarray);
  capacity_ = size_;
  // Mutable only if NumPy reports the array as writeable
  is_mutable_ = (PyArray_FLAGS(ndarray) & NPY_ARRAY_WRITEABLE) != 0;
}
|
||||
|
||||
// Drop the reference taken in the constructor; the GIL is acquired first.
NumPyBuffer::~NumPyBuffer() {
  PyAcquireGIL lock;
  Py_XDECREF(arr_);
}
|
||||
|
||||
// Expands to a `case NPY_<NPY_NAME>:` label that returns FACTORY().
#define TO_ARROW_TYPE_CASE(NPY_NAME, FACTORY) \
  case NPY_##NPY_NAME:                        \
    return FACTORY();
|
||||
|
||||
namespace {
|
||||
|
||||
// Map a numpy.dtype object to the Arrow type used for tensor data.
//
// Returns TypeError if `dtype` is not a numpy.dtype and NotImplemented for
// dtypes without a mapping. Note that NPY_BOOL maps to uint8 here.
Result<std::shared_ptr<DataType>> GetTensorType(PyObject* dtype) {
  if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) {
    return Status::TypeError("Did not pass numpy.dtype object");
  }
  PyArray_Descr* descr = reinterpret_cast<PyArray_Descr*>(dtype);

  switch (fix_numpy_type_num(descr->type_num)) {
    case NPY_BOOL:
      return uint8();
    case NPY_INT8:
      return int8();
    case NPY_INT16:
      return int16();
    case NPY_INT32:
      return int32();
    case NPY_INT64:
      return int64();
    case NPY_UINT8:
      return uint8();
    case NPY_UINT16:
      return uint16();
    case NPY_UINT32:
      return uint32();
    case NPY_UINT64:
      return uint64();
    case NPY_FLOAT16:
      return float16();
    case NPY_FLOAT32:
      return float32();
    case NPY_FLOAT64:
      return float64();
  }
  // The message reports the original (unfixed) type number
  return Status::NotImplemented("Unsupported numpy type ", descr->type_num);
}
|
||||
|
||||
// Map an Arrow tensor value type to the corresponding NumPy type number.
//
// On success the number is stored in `*type_num`; for unsupported types
// NotImplemented is returned and `*type_num` is left untouched.
Status GetNumPyType(const DataType& type, int* type_num) {
  int npy_type = 0;
  switch (type.id()) {
    case Type::UINT8:
      npy_type = NPY_UINT8;
      break;
    case Type::INT8:
      npy_type = NPY_INT8;
      break;
    case Type::UINT16:
      npy_type = NPY_UINT16;
      break;
    case Type::INT16:
      npy_type = NPY_INT16;
      break;
    case Type::UINT32:
      npy_type = NPY_UINT32;
      break;
    case Type::INT32:
      npy_type = NPY_INT32;
      break;
    case Type::UINT64:
      npy_type = NPY_UINT64;
      break;
    case Type::INT64:
      npy_type = NPY_INT64;
      break;
    case Type::HALF_FLOAT:
      npy_type = NPY_FLOAT16;
      break;
    case Type::FLOAT:
      npy_type = NPY_FLOAT32;
      break;
    case Type::DOUBLE:
      npy_type = NPY_FLOAT64;
      break;
    default:
      return Status::NotImplemented("Unsupported tensor type: ", type.ToString());
  }

  *type_num = npy_type;
  return Status::OK();
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Determine the Arrow type matching the dtype of a NumPy scalar.
Result<std::shared_ptr<DataType>> NumPyScalarToArrowDataType(PyObject* scalar) {
  PyArray_Descr* scalar_descr = PyArray_DescrFromScalar(scalar);
  // Hold the descriptor in an OwnedRef so it is released when this returns
  OwnedRef descr_guard(reinterpret_cast<PyObject*>(scalar_descr));
  return NumPyDtypeToArrow(scalar_descr);
}
|
||||
|
||||
// Convert a numpy.dtype Python object to an Arrow type.
// Returns TypeError if `dtype` is not a numpy.dtype instance.
Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyObject* dtype) {
  if (PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) {
    return NumPyDtypeToArrow(reinterpret_cast<PyArray_Descr*>(dtype));
  }
  return Status::TypeError("Did not pass numpy.dtype object");
}
|
||||
|
||||
// Convert a NumPy dtype descriptor to an Arrow type.
//
// Covers booleans, integers, floats, byte strings (binary), unicode strings
// (utf8), datetime64 (timestamp, or date32 for day resolution) and
// timedelta64 (duration). Anything else yields NotImplemented.
Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyArray_Descr* descr) {
  switch (fix_numpy_type_num(descr->type_num)) {
    case NPY_BOOL:
      return boolean();
    case NPY_INT8:
      return int8();
    case NPY_INT16:
      return int16();
    case NPY_INT32:
      return int32();
    case NPY_INT64:
      return int64();
    case NPY_UINT8:
      return uint8();
    case NPY_UINT16:
      return uint16();
    case NPY_UINT32:
      return uint32();
    case NPY_UINT64:
      return uint64();
    case NPY_FLOAT16:
      return float16();
    case NPY_FLOAT32:
      return float32();
    case NPY_FLOAT64:
      return float64();
    case NPY_STRING:
      return binary();
    case NPY_UNICODE:
      return utf8();
    case NPY_DATETIME: {
      // The time unit lives in the dtype's C metadata
      auto datetime_meta =
          reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(descr));
      switch (datetime_meta->meta.base) {
        case NPY_FR_s:
          return timestamp(TimeUnit::SECOND);
        case NPY_FR_ms:
          return timestamp(TimeUnit::MILLI);
        case NPY_FR_us:
          return timestamp(TimeUnit::MICRO);
        case NPY_FR_ns:
          return timestamp(TimeUnit::NANO);
        case NPY_FR_D:
          return date32();
        case NPY_FR_GENERIC:
          return Status::NotImplemented("Unbound or generic datetime64 time unit");
        default:
          return Status::NotImplemented("Unsupported datetime64 time unit");
      }
    } break;
    case NPY_TIMEDELTA: {
      auto timedelta_meta =
          reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(descr));
      switch (timedelta_meta->meta.base) {
        case NPY_FR_s:
          return duration(TimeUnit::SECOND);
        case NPY_FR_ms:
          return duration(TimeUnit::MILLI);
        case NPY_FR_us:
          return duration(TimeUnit::MICRO);
        case NPY_FR_ns:
          return duration(TimeUnit::NANO);
        case NPY_FR_GENERIC:
          return Status::NotImplemented("Unbound or generic timedelta64 time unit");
        default:
          return Status::NotImplemented("Unsupported timedelta64 time unit");
      }
    } break;
  }

  // The message reports the original (unfixed) type number
  return Status::NotImplemented("Unsupported numpy type ", descr->type_num);
}
|
||||
|
||||
#undef TO_ARROW_TYPE_CASE
|
||||
|
||||
// Wrap a NumPy ndarray as an Arrow Tensor that references the array's memory
// through a NumPyBuffer.
//
// Returns TypeError if `ao` is not an ndarray, Invalid if any stride is
// negative, and the error from GetTensorType() for unsupported dtypes.
// `pool` is not used by this function.
Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
                       const std::vector<std::string>& dim_names,
                       std::shared_ptr<Tensor>* out) {
  if (!PyArray_Check(ao)) {
    return Status::TypeError("Did not pass ndarray object");
  }
  PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(ao);

  // TODO(wesm): What do we want to do with non-contiguous memory and negative strides?

  const int ndim = PyArray_NDIM(ndarray);
  std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(ao);

  std::vector<int64_t> shape(ndim);
  std::vector<int64_t> strides(ndim);
  npy_intp* np_strides = PyArray_STRIDES(ndarray);
  npy_intp* np_shape = PyArray_SHAPE(ndarray);
  for (int dim = 0; dim < ndim; ++dim) {
    const npy_intp stride = np_strides[dim];
    if (stride < 0) {
      return Status::Invalid("Negative ndarray strides not supported");
    }
    shape[dim] = np_shape[dim];
    strides[dim] = stride;
  }

  PyObject* py_descr = reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray));
  ARROW_ASSIGN_OR_RAISE(auto type, GetTensorType(py_descr));
  *out = std::make_shared<Tensor>(type, data, shape, strides, dim_names);
  return Status::OK();
}
|
||||
|
||||
// Expose an Arrow Tensor as a NumPy ndarray that references the tensor's
// memory without copying.
//
// `base` becomes the ndarray's base object. When `base` is null or None, a
// Python wrapper of `tensor` is created and used instead. On success
// `*out` receives a new reference to the ndarray; on failure `*out` is left
// untouched and no reference is leaked.
Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,
                       PyObject** out) {
  int type_num = 0;
  RETURN_NOT_OK(GetNumPyType(*tensor->type(), &type_num));
  PyArray_Descr* dtype = PyArray_DescrNewFromType(type_num);
  RETURN_IF_PYERROR();

  const int ndim = tensor->ndim();
  std::vector<npy_intp> npy_shape(ndim);
  std::vector<npy_intp> npy_strides(ndim);

  for (int i = 0; i < ndim; ++i) {
    npy_shape[i] = tensor->shape()[i];
    npy_strides[i] = tensor->strides()[i];
  }

  const void* immutable_data = nullptr;
  if (tensor->data()) {
    immutable_data = tensor->data()->data();
  }

  // Remove const =(
  void* mutable_data = const_cast<void*>(immutable_data);

  int array_flags = 0;
  if (tensor->is_row_major()) {
    array_flags |= NPY_ARRAY_C_CONTIGUOUS;
  }
  if (tensor->is_column_major()) {
    array_flags |= NPY_ARRAY_F_CONTIGUOUS;
  }
  if (tensor->is_mutable()) {
    array_flags |= NPY_ARRAY_WRITEABLE;
  }

  // PyArray_NewFromDescr() steals the reference to `dtype`. The result is
  // held in an OwnedRef so it is released on every early return below.
  OwnedRef result(PyArray_NewFromDescr(&PyArray_Type, dtype, ndim, npy_shape.data(),
                                       npy_strides.data(), mutable_data, array_flags,
                                       nullptr));
  RETURN_IF_PYERROR();

  if (base == Py_None || base == nullptr) {
    base = py::wrap_tensor(tensor);
    // Wrapping can fail; do not hand a null base to NumPy
    RETURN_IF_PYERROR();
  } else {
    Py_INCREF(base);
  }
  // PyArray_SetBaseObject() steals the reference to `base`, also on failure
  if (PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(result.obj()), base) < 0) {
    RETURN_IF_PYERROR();
    return Status::UnknownError("Failed to set the base object of the ndarray");
  }
  *out = result.detach();
  return Status::OK();
}
|
||||
|
||||
// Wrap the dense data of a sparse tensor in a ndarray
|
||||
static Status SparseTensorDataToNdarray(const SparseTensor& sparse_tensor,
|
||||
std::vector<npy_intp> data_shape, PyObject* base,
|
||||
PyObject** out_data) {
|
||||
int type_num_data = 0;
|
||||
RETURN_NOT_OK(GetNumPyType(*sparse_tensor.type(), &type_num_data));
|
||||
PyArray_Descr* dtype_data = PyArray_DescrNewFromType(type_num_data);
|
||||
RETURN_IF_PYERROR();
|
||||
|
||||
const void* immutable_data = sparse_tensor.data()->data();
|
||||
// Remove const =(
|
||||
void* mutable_data = const_cast<void*>(immutable_data);
|
||||
int array_flags = NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS;
|
||||
if (sparse_tensor.is_mutable()) {
|
||||
array_flags |= NPY_ARRAY_WRITEABLE;
|
||||
}
|
||||
|
||||
*out_data = PyArray_NewFromDescr(&PyArray_Type, dtype_data,
|
||||
static_cast<int>(data_shape.size()), data_shape.data(),
|
||||
nullptr, mutable_data, array_flags, nullptr);
|
||||
RETURN_IF_PYERROR();
|
||||
Py_XINCREF(base);
|
||||
PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(*out_data), base);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Convert a sparse COO tensor into two ndarrays: the non-zero values
// (shape: non_zero_length x 1) and the coordinates tensor of its index.
// Outputs are only written when both conversions succeed.
Status SparseCOOTensorToNdarray(const std::shared_ptr<SparseCOOTensor>& sparse_tensor,
                                PyObject* base, PyObject** out_data,
                                PyObject** out_coords) {
  const auto& coo_index = arrow::internal::checked_cast<const SparseCOOIndex&>(
      *sparse_tensor->sparse_index());

  // Wrap tensor data
  const npy_intp value_count = static_cast<npy_intp>(sparse_tensor->non_zero_length());
  OwnedRef data_array;
  RETURN_NOT_OK(SparseTensorDataToNdarray(*sparse_tensor, {value_count, 1}, base,
                                          data_array.ref()));

  // Wrap indices
  PyObject* coords_array;
  RETURN_NOT_OK(TensorToNdarray(coo_index.indices(), base, &coords_array));

  *out_data = data_array.detach();
  *out_coords = coords_array;
  return Status::OK();
}
|
||||
|
||||
// Shared implementation for CSR and CSC matrices: produce ndarrays for the
// non-zero values (shape: non_zero_length x 1), the indptr tensor and the
// indices tensor. Any other sparse format yields NotImplemented. Outputs are
// only written when every conversion succeeds.
Status SparseCSXMatrixToNdarray(const std::shared_ptr<SparseTensor>& sparse_tensor,
                                PyObject* base, PyObject** out_data,
                                PyObject** out_indptr, PyObject** out_indices) {
  // Wrap indices
  OwnedRef indptr_array;
  OwnedRef indices_array;

  // CSR and CSC indices expose the same indptr()/indices() accessors
  auto wrap_index = [&](const auto& csx_index) -> Status {
    RETURN_NOT_OK(TensorToNdarray(csx_index.indptr(), base, indptr_array.ref()));
    RETURN_NOT_OK(TensorToNdarray(csx_index.indices(), base, indices_array.ref()));
    return Status::OK();
  };

  const auto format = sparse_tensor->format_id();
  if (format == SparseTensorFormat::CSR) {
    RETURN_NOT_OK(wrap_index(arrow::internal::checked_cast<const SparseCSRIndex&>(
        *sparse_tensor->sparse_index())));
  } else if (format == SparseTensorFormat::CSC) {
    RETURN_NOT_OK(wrap_index(arrow::internal::checked_cast<const SparseCSCIndex&>(
        *sparse_tensor->sparse_index())));
  } else {
    return Status::NotImplemented("Invalid SparseTensor type.");
  }

  // Wrap tensor data
  const npy_intp value_count = static_cast<npy_intp>(sparse_tensor->non_zero_length());
  OwnedRef data_array;
  RETURN_NOT_OK(SparseTensorDataToNdarray(*sparse_tensor, {value_count, 1}, base,
                                          data_array.ref()));

  *out_data = data_array.detach();
  *out_indptr = indptr_array.detach();
  *out_indices = indices_array.detach();
  return Status::OK();
}
|
||||
|
||||
// CSR entry point; forwards to the shared CSR/CSC implementation.
Status SparseCSRMatrixToNdarray(const std::shared_ptr<SparseCSRMatrix>& sparse_tensor,
                                PyObject* base, PyObject** out_data,
                                PyObject** out_indptr, PyObject** out_indices) {
  const std::shared_ptr<SparseTensor> generic_tensor = sparse_tensor;
  return SparseCSXMatrixToNdarray(generic_tensor, base, out_data, out_indptr,
                                  out_indices);
}
|
||||
|
||||
// CSC entry point; forwards to the shared CSR/CSC implementation.
Status SparseCSCMatrixToNdarray(const std::shared_ptr<SparseCSCMatrix>& sparse_tensor,
                                PyObject* base, PyObject** out_data,
                                PyObject** out_indptr, PyObject** out_indices) {
  const std::shared_ptr<SparseTensor> generic_tensor = sparse_tensor;
  return SparseCSXMatrixToNdarray(generic_tensor, base, out_data, out_indptr,
                                  out_indices);
}
|
||||
|
||||
// Convert a sparse CSF tensor into an ndarray of non-zero values (shape:
// non_zero_length x 1) plus two Python lists of ndarrays: one per indptr
// tensor (ndim - 1 entries) and one per indices tensor (ndim entries).
// Outputs are only written when every conversion succeeds.
Status SparseCSFTensorToNdarray(const std::shared_ptr<SparseCSFTensor>& sparse_tensor,
                                PyObject* base, PyObject** out_data,
                                PyObject** out_indptr, PyObject** out_indices) {
  const auto& sparse_index = arrow::internal::checked_cast<const SparseCSFIndex&>(
      *sparse_tensor->sparse_index());

  // Wrap tensor data
  OwnedRef result_data;
  RETURN_NOT_OK(SparseTensorDataToNdarray(
      *sparse_tensor, {static_cast<npy_intp>(sparse_tensor->non_zero_length()), 1}, base,
      result_data.ref()));

  // Wrap indices
  const int ndim = static_cast<int>(sparse_index.indices().size());
  // Check each allocation separately so the second PyList_New() is never
  // called with an exception already pending.
  OwnedRef indptr(PyList_New(ndim - 1));
  RETURN_IF_PYERROR();
  OwnedRef indices(PyList_New(ndim));
  RETURN_IF_PYERROR();

  for (int i = 0; i < ndim - 1; ++i) {
    PyObject* item = nullptr;
    RETURN_NOT_OK(TensorToNdarray(sparse_index.indptr()[i], base, &item));
    // PyList_SetItem() steals the reference to `item` even when it fails, so
    // the item must not be released again here.
    if (PyList_SetItem(indptr.obj(), i, item) < 0) {
      RETURN_IF_PYERROR();
    }
  }
  for (int i = 0; i < ndim; ++i) {
    PyObject* item = nullptr;
    RETURN_NOT_OK(TensorToNdarray(sparse_index.indices()[i], base, &item));
    // See above: the reference is consumed regardless of the outcome
    if (PyList_SetItem(indices.obj(), i, item) < 0) {
      RETURN_IF_PYERROR();
    }
  }

  *out_indptr = indptr.detach();
  *out_indices = indices.detach();
  *out_data = result_data.detach();
  return Status::OK();
}
|
||||
|
||||
// Build a SparseCOOTensor from a values ndarray and a coordinates ndarray.
// Both arrays are referenced, not copied.
//
// Returns TypeError if either argument is not an ndarray; errors from dtype
// mapping, tensor wrapping or index construction are propagated. The
// coordinates are expected to be int64 (checked with ARROW_CHECK_EQ).
Status NdarraysToSparseCOOTensor(MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao,
                                 const std::vector<int64_t>& shape,
                                 const std::vector<std::string>& dim_names,
                                 std::shared_ptr<SparseCOOTensor>* out) {
  const bool both_ndarrays = PyArray_Check(data_ao) && PyArray_Check(coords_ao);
  if (!both_ndarrays) {
    return Status::TypeError("Did not pass ndarray object");
  }

  PyArrayObject* values_array = reinterpret_cast<PyArrayObject*>(data_ao);
  std::shared_ptr<Buffer> values_buffer = std::make_shared<NumPyBuffer>(data_ao);
  PyObject* values_descr = reinterpret_cast<PyObject*>(PyArray_DESCR(values_array));
  ARROW_ASSIGN_OR_RAISE(auto value_type, GetTensorType(values_descr));

  std::shared_ptr<Tensor> coords;
  RETURN_NOT_OK(NdarrayToTensor(pool, coords_ao, {}, &coords));
  ARROW_CHECK_EQ(coords->type_id(), Type::INT64);  // Should be ensured by caller

  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<SparseCOOIndex> coo_index,
                        SparseCOOIndex::Make(coords));
  *out = std::make_shared<SparseTensorImpl<SparseCOOIndex>>(coo_index, value_type,
                                                            values_buffer, shape,
                                                            dim_names);
  return Status::OK();
}
|
||||
|
||||
template <class IndexType>
|
||||
Status NdarraysToSparseCSXMatrix(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao,
|
||||
PyObject* indices_ao, const std::vector<int64_t>& shape,
|
||||
const std::vector<std::string>& dim_names,
|
||||
std::shared_ptr<SparseTensorImpl<IndexType>>* out) {
|
||||
if (!PyArray_Check(data_ao) || !PyArray_Check(indptr_ao) ||
|
||||
!PyArray_Check(indices_ao)) {
|
||||
return Status::TypeError("Did not pass ndarray object");
|
||||
}
|
||||
|
||||
PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
|
||||
std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
|
||||
ARROW_ASSIGN_OR_RAISE(
|
||||
auto type_data,
|
||||
GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data))));
|
||||
|
||||
std::shared_ptr<Tensor> indptr, indices;
|
||||
RETURN_NOT_OK(NdarrayToTensor(pool, indptr_ao, {}, &indptr));
|
||||
RETURN_NOT_OK(NdarrayToTensor(pool, indices_ao, {}, &indices));
|
||||
ARROW_CHECK_EQ(indptr->type_id(), Type::INT64); // Should be ensured by caller
|
||||
ARROW_CHECK_EQ(indices->type_id(), Type::INT64); // Should be ensured by caller
|
||||
|
||||
auto sparse_index = std::make_shared<IndexType>(
|
||||
std::static_pointer_cast<NumericTensor<Int64Type>>(indptr),
|
||||
std::static_pointer_cast<NumericTensor<Int64Type>>(indices));
|
||||
*out = std::make_shared<SparseTensorImpl<IndexType>>(sparse_index, type_data, data,
|
||||
shape, dim_names);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Builds a SparseCSFTensor from NumPy inputs.
//
// `data_ao` must be an ndarray holding the non-zero values. `indptr_ao` and
// `indices_ao` are Python sequences of ndarrays: with ndim = shape.size(),
// the first ndim - 1 items of `indptr_ao` and the first ndim items of
// `indices_ao` are converted to tensors. Every index array must already be
// int64; this is asserted with ARROW_CHECK_EQ rather than reported via Status.
//
// Returns TypeError when `data_ao` or any index item is not an ndarray, and
// Invalid when `shape` is empty or a sequence holds fewer items than needed.
Status NdarraysToSparseCSFTensor(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao,
                                 PyObject* indices_ao, const std::vector<int64_t>& shape,
                                 const std::vector<int64_t>& axis_order,
                                 const std::vector<std::string>& dim_names,
                                 std::shared_ptr<SparseCSFTensor>* out) {
  if (!PyArray_Check(data_ao)) {
    return Status::TypeError("Did not pass ndarray object for data");
  }
  const int ndim = static_cast<int>(shape.size());
  PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
  std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
  ARROW_ASSIGN_OR_RAISE(
      auto type_data,
      GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data))));

  // With an empty shape, `ndim - 1` is -1 and would wrap around to an
  // enormous size when used as the vector length below.
  if (ndim < 1) {
    return Status::Invalid("Cannot build a SparseCSFTensor from an empty shape");
  }

  // The loops below read items by position without any bounds check, so make
  // sure the sequences are long enough first. `indptr_ao` is only inspected
  // when at least one indptr array is actually required.
  if (ndim > 1) {
    const Py_ssize_t num_indptr = PySequence_Size(indptr_ao);
    RETURN_IF_PYERROR();
    if (num_indptr < ndim - 1) {
      return Status::Invalid("Expected ", ndim - 1, " indptr arrays, got ", num_indptr);
    }
  }
  const Py_ssize_t num_indices = PySequence_Size(indices_ao);
  RETURN_IF_PYERROR();
  if (num_indices < ndim) {
    return Status::Invalid("Expected ", ndim, " indices arrays, got ", num_indices);
  }

  std::vector<std::shared_ptr<Tensor>> indptr(ndim - 1);
  std::vector<std::shared_ptr<Tensor>> indices(ndim);

  for (int i = 0; i < ndim - 1; ++i) {
#ifdef Py_GIL_DISABLED
    // Free-threaded builds take a new reference; OwnedRef holds it for the
    // rest of this iteration.
    PyObject* item = PySequence_ITEM(indptr_ao, i);
    RETURN_IF_PYERROR();
    OwnedRef item_ref(item);
#else
    PyObject* item = PySequence_Fast_GET_ITEM(indptr_ao, i);
#endif
    if (!PyArray_Check(item)) {
      return Status::TypeError("Did not pass ndarray object for indptr");
    }
    RETURN_NOT_OK(NdarrayToTensor(pool, item, {}, &indptr[i]));
    ARROW_CHECK_EQ(indptr[i]->type_id(), Type::INT64);  // Should be ensured by caller
  }

  for (int i = 0; i < ndim; ++i) {
#ifdef Py_GIL_DISABLED
    PyObject* item = PySequence_ITEM(indices_ao, i);
    RETURN_IF_PYERROR();
    OwnedRef item_ref(item);
#else
    PyObject* item = PySequence_Fast_GET_ITEM(indices_ao, i);
#endif
    if (!PyArray_Check(item)) {
      return Status::TypeError("Did not pass ndarray object for indices");
    }
    RETURN_NOT_OK(NdarrayToTensor(pool, item, {}, &indices[i]));
    ARROW_CHECK_EQ(indices[i]->type_id(), Type::INT64);  // Should be ensured by caller
  }

  auto sparse_index = std::make_shared<SparseCSFIndex>(indptr, indices, axis_order);
  *out = std::make_shared<SparseTensorImpl<SparseCSFIndex>>(sparse_index, type_data, data,
                                                            shape, dim_names);
  return Status::OK();
}
|
||||
|
||||
// Builds a SparseCSRMatrix from NumPy arrays by delegating to the generic
// compressed-sparse builder instantiated with the CSR index type.
Status NdarraysToSparseCSRMatrix(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao,
                                 PyObject* indices_ao, const std::vector<int64_t>& shape,
                                 const std::vector<std::string>& dim_names,
                                 std::shared_ptr<SparseCSRMatrix>* out) {
  using CsrIndex = SparseCSRIndex;
  return NdarraysToSparseCSXMatrix<CsrIndex>(pool, data_ao, indptr_ao, indices_ao, shape,
                                             dim_names, out);
}
|
||||
|
||||
// Builds a SparseCSCMatrix from NumPy arrays by delegating to the generic
// compressed-sparse builder instantiated with the CSC index type.
Status NdarraysToSparseCSCMatrix(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao,
                                 PyObject* indices_ao, const std::vector<int64_t>& shape,
                                 const std::vector<std::string>& dim_names,
                                 std::shared_ptr<SparseCSCMatrix>* out) {
  using CscIndex = SparseCSCIndex;
  return NdarraysToSparseCSXMatrix<CscIndex>(pool, data_ao, indptr_ao, indices_ao, shape,
                                             dim_names, out);
}
|
||||
|
||||
// Converts a dense tensor to a SparseCOOTensor. On success the result is
// stored in `*out`; on failure the error from SparseCOOTensor::Make is returned
// and `*out` is left untouched.
Status TensorToSparseCOOTensor(const std::shared_ptr<Tensor>& tensor,
                               std::shared_ptr<SparseCOOTensor>* out) {
  auto maybe_sparse = SparseCOOTensor::Make(*tensor);
  if (!maybe_sparse.ok()) {
    return maybe_sparse.status();
  }
  *out = *maybe_sparse;
  return Status::OK();
}
|
||||
|
||||
// Converts a dense tensor to a SparseCSRMatrix. On success the result is
// stored in `*out`; on failure the error from SparseCSRMatrix::Make is returned
// and `*out` is left untouched.
Status TensorToSparseCSRMatrix(const std::shared_ptr<Tensor>& tensor,
                               std::shared_ptr<SparseCSRMatrix>* out) {
  auto maybe_sparse = SparseCSRMatrix::Make(*tensor);
  if (!maybe_sparse.ok()) {
    return maybe_sparse.status();
  }
  *out = *maybe_sparse;
  return Status::OK();
}
|
||||
|
||||
// Converts a dense tensor to a SparseCSCMatrix. On success the result is
// stored in `*out`; on failure the error from SparseCSCMatrix::Make is returned
// and `*out` is left untouched.
Status TensorToSparseCSCMatrix(const std::shared_ptr<Tensor>& tensor,
                               std::shared_ptr<SparseCSCMatrix>* out) {
  auto maybe_sparse = SparseCSCMatrix::Make(*tensor);
  if (!maybe_sparse.ok()) {
    return maybe_sparse.status();
  }
  *out = *maybe_sparse;
  return Status::OK();
}
|
||||
|
||||
// Converts a dense tensor to a SparseCSFTensor. On success the result is
// stored in `*out`; on failure the error from SparseCSFTensor::Make is returned
// and `*out` is left untouched.
Status TensorToSparseCSFTensor(const std::shared_ptr<Tensor>& tensor,
                               std::shared_ptr<SparseCSFTensor>* out) {
  auto maybe_sparse = SparseCSFTensor::Make(*tensor);
  if (!maybe_sparse.ok()) {
    return maybe_sparse.status();
  }
  *out = *maybe_sparse;
  return Status::OK();
}
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,122 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Functions for converting between pandas's NumPy-based data representation
|
||||
// and Arrow data structures
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/python/platform.h"
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/python/visibility.h"
|
||||
#include "arrow/sparse_tensor.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class DataType;
|
||||
class MemoryPool;
|
||||
class Status;
|
||||
class Tensor;
|
||||
|
||||
namespace py {
|
||||
|
||||
// A Buffer subclass constructed from a Python object (`arr`).
// NOTE(review): presumably `arr` is a NumPy ndarray whose memory the buffer
// exposes and keeps alive until destruction -- the constructor and destructor
// are defined elsewhere, so confirm against their implementation.
class ARROW_PYTHON_EXPORT NumPyBuffer : public Buffer {
|
||||
public:
|
||||
// `explicit` prevents implicit conversion from a bare PyObject*.
explicit NumPyBuffer(PyObject* arr);
|
||||
virtual ~NumPyBuffer();
|
||||
|
||||
private:
|
||||
// The Python object this buffer was constructed from.
PyObject* arr_;
|
||||
};
|
||||
|
||||
ARROW_PYTHON_EXPORT
|
||||
Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyObject* dtype);
|
||||
ARROW_PYTHON_EXPORT
|
||||
Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyArray_Descr* descr);
|
||||
ARROW_PYTHON_EXPORT
|
||||
Result<std::shared_ptr<DataType>> NumPyScalarToArrowDataType(PyObject* scalar);
|
||||
|
||||
ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
|
||||
const std::vector<std::string>& dim_names,
|
||||
std::shared_ptr<Tensor>* out);
|
||||
|
||||
ARROW_PYTHON_EXPORT Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor,
|
||||
PyObject* base, PyObject** out);
|
||||
|
||||
ARROW_PYTHON_EXPORT Status
|
||||
SparseCOOTensorToNdarray(const std::shared_ptr<SparseCOOTensor>& sparse_tensor,
|
||||
PyObject* base, PyObject** out_data, PyObject** out_coords);
|
||||
|
||||
Status SparseCSXMatrixToNdarray(const std::shared_ptr<SparseTensor>& sparse_tensor,
|
||||
PyObject* base, PyObject** out_data,
|
||||
PyObject** out_indptr, PyObject** out_indices);
|
||||
|
||||
ARROW_PYTHON_EXPORT Status SparseCSRMatrixToNdarray(
|
||||
const std::shared_ptr<SparseCSRMatrix>& sparse_tensor, PyObject* base,
|
||||
PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
|
||||
|
||||
ARROW_PYTHON_EXPORT Status SparseCSCMatrixToNdarray(
|
||||
const std::shared_ptr<SparseCSCMatrix>& sparse_tensor, PyObject* base,
|
||||
PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
|
||||
|
||||
ARROW_PYTHON_EXPORT Status SparseCSFTensorToNdarray(
|
||||
const std::shared_ptr<SparseCSFTensor>& sparse_tensor, PyObject* base,
|
||||
PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
|
||||
|
||||
ARROW_PYTHON_EXPORT Status NdarraysToSparseCOOTensor(
|
||||
MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao,
|
||||
const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
|
||||
std::shared_ptr<SparseCOOTensor>* out);
|
||||
|
||||
ARROW_PYTHON_EXPORT Status NdarraysToSparseCSRMatrix(
|
||||
MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
|
||||
const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
|
||||
std::shared_ptr<SparseCSRMatrix>* out);
|
||||
|
||||
ARROW_PYTHON_EXPORT Status NdarraysToSparseCSCMatrix(
|
||||
MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
|
||||
const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
|
||||
std::shared_ptr<SparseCSCMatrix>* out);
|
||||
|
||||
ARROW_PYTHON_EXPORT Status NdarraysToSparseCSFTensor(
|
||||
MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
|
||||
const std::vector<int64_t>& shape, const std::vector<int64_t>& axis_order,
|
||||
const std::vector<std::string>& dim_names, std::shared_ptr<SparseCSFTensor>* out);
|
||||
|
||||
ARROW_PYTHON_EXPORT Status
|
||||
TensorToSparseCOOTensor(const std::shared_ptr<Tensor>& tensor,
|
||||
std::shared_ptr<SparseCOOTensor>* csparse_tensor);
|
||||
|
||||
ARROW_PYTHON_EXPORT Status
|
||||
TensorToSparseCSRMatrix(const std::shared_ptr<Tensor>& tensor,
|
||||
std::shared_ptr<SparseCSRMatrix>* csparse_tensor);
|
||||
|
||||
ARROW_PYTHON_EXPORT Status
|
||||
TensorToSparseCSCMatrix(const std::shared_ptr<Tensor>& tensor,
|
||||
std::shared_ptr<SparseCSCMatrix>* csparse_tensor);
|
||||
|
||||
ARROW_PYTHON_EXPORT Status
|
||||
TensorToSparseCSFTensor(const std::shared_ptr<Tensor>& tensor,
|
||||
std::shared_ptr<SparseCSFTensor>* csparse_tensor);
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,33 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Trigger the array import (inversion of NO_IMPORT_ARRAY)
|
||||
#define NUMPY_IMPORT_ARRAY
|
||||
|
||||
#include "arrow/python/numpy_init.h"
|
||||
#include "arrow/python/numpy_interop.h"
|
||||
|
||||
namespace arrow::py {
|
||||
bool numpy_imported = false;
|
||||
|
||||
int arrow_init_numpy() {
|
||||
numpy_imported = true;
|
||||
return arrow::py::import_numpy();
|
||||
}
|
||||
|
||||
bool has_numpy() { return numpy_imported; }
|
||||
} // namespace arrow::py
|
||||
@@ -0,0 +1,27 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/python/platform.h"
|
||||
#include "arrow/python/visibility.h"
|
||||
|
||||
namespace arrow::py {
|
||||
ARROW_PYTHON_EXPORT
|
||||
int arrow_init_numpy();
|
||||
bool has_numpy();
|
||||
} // namespace arrow::py
|
||||
@@ -0,0 +1,195 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Internal utilities for dealing with NumPy
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/python/numpy_init.h"
|
||||
#include "arrow/python/numpy_interop.h"
|
||||
|
||||
#include "arrow/status.h"
|
||||
|
||||
#include "arrow/python/platform.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
|
||||
/// Indexing convenience for interacting with strided 1-dim ndarray objects
|
||||
template <typename T>
|
||||
class Ndarray1DIndexer {
|
||||
public:
|
||||
typedef int64_t size_type;
|
||||
|
||||
Ndarray1DIndexer() : arr_(NULLPTR), data_(NULLPTR) {}
|
||||
|
||||
explicit Ndarray1DIndexer(PyArrayObject* arr) : Ndarray1DIndexer() {
|
||||
arr_ = arr;
|
||||
ARROW_DCHECK_EQ(1, PyArray_NDIM(arr)) << "Only works with 1-dimensional arrays";
|
||||
data_ = reinterpret_cast<uint8_t*>(PyArray_DATA(arr));
|
||||
stride_ = PyArray_STRIDES(arr)[0];
|
||||
}
|
||||
|
||||
~Ndarray1DIndexer() = default;
|
||||
|
||||
int64_t size() const { return PyArray_SIZE(arr_); }
|
||||
|
||||
const T* data() const { return reinterpret_cast<const T*>(data_); }
|
||||
|
||||
bool is_strided() const { return stride_ != sizeof(T); }
|
||||
|
||||
T& operator[](size_type index) {
|
||||
return *reinterpret_cast<T*>(data_ + index * stride_);
|
||||
}
|
||||
const T& operator[](size_type index) const {
|
||||
return *reinterpret_cast<const T*>(data_ + index * stride_);
|
||||
}
|
||||
|
||||
private:
|
||||
PyArrayObject* arr_;
|
||||
uint8_t* data_;
|
||||
int64_t stride_;
|
||||
};
|
||||
|
||||
// Handling of Numpy Types by their static numbers
|
||||
// (the NPY_TYPES enum and related defines)
|
||||
|
||||
static inline std::string GetNumPyTypeName(int npy_type) {
|
||||
#define TYPE_CASE(TYPE, NAME) \
|
||||
case NPY_##TYPE: \
|
||||
return NAME;
|
||||
|
||||
switch (npy_type) {
|
||||
TYPE_CASE(BOOL, "bool")
|
||||
TYPE_CASE(INT8, "int8")
|
||||
TYPE_CASE(INT16, "int16")
|
||||
TYPE_CASE(INT32, "int32")
|
||||
TYPE_CASE(INT64, "int64")
|
||||
#if !NPY_INT32_IS_INT
|
||||
TYPE_CASE(INT, "intc")
|
||||
#endif
|
||||
#if !NPY_INT64_IS_LONG_LONG
|
||||
TYPE_CASE(LONGLONG, "longlong")
|
||||
#endif
|
||||
TYPE_CASE(UINT8, "uint8")
|
||||
TYPE_CASE(UINT16, "uint16")
|
||||
TYPE_CASE(UINT32, "uint32")
|
||||
TYPE_CASE(UINT64, "uint64")
|
||||
#if !NPY_INT32_IS_INT
|
||||
TYPE_CASE(UINT, "uintc")
|
||||
#endif
|
||||
#if !NPY_INT64_IS_LONG_LONG
|
||||
TYPE_CASE(ULONGLONG, "ulonglong")
|
||||
#endif
|
||||
TYPE_CASE(FLOAT16, "float16")
|
||||
TYPE_CASE(FLOAT32, "float32")
|
||||
TYPE_CASE(FLOAT64, "float64")
|
||||
TYPE_CASE(DATETIME, "datetime64")
|
||||
TYPE_CASE(TIMEDELTA, "timedelta64")
|
||||
TYPE_CASE(OBJECT, "object")
|
||||
TYPE_CASE(VOID, "void")
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
#undef TYPE_CASE
|
||||
std::stringstream ss;
|
||||
ss << "unrecognized type (" << npy_type << ") in GetNumPyTypeName";
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
#define TYPE_VISIT_INLINE(TYPE) \
|
||||
case NPY_##TYPE: \
|
||||
return visitor->template Visit<NPY_##TYPE>(arr);
|
||||
|
||||
template <typename VISITOR>
|
||||
inline Status VisitNumpyArrayInline(PyArrayObject* arr, VISITOR* visitor) {
|
||||
switch (PyArray_TYPE(arr)) {
|
||||
TYPE_VISIT_INLINE(BOOL);
|
||||
TYPE_VISIT_INLINE(INT8);
|
||||
TYPE_VISIT_INLINE(UINT8);
|
||||
TYPE_VISIT_INLINE(INT16);
|
||||
TYPE_VISIT_INLINE(UINT16);
|
||||
TYPE_VISIT_INLINE(INT32);
|
||||
TYPE_VISIT_INLINE(UINT32);
|
||||
TYPE_VISIT_INLINE(INT64);
|
||||
TYPE_VISIT_INLINE(UINT64);
|
||||
#if !NPY_INT32_IS_INT
|
||||
TYPE_VISIT_INLINE(INT);
|
||||
TYPE_VISIT_INLINE(UINT);
|
||||
#endif
|
||||
#if !NPY_INT64_IS_LONG_LONG
|
||||
TYPE_VISIT_INLINE(LONGLONG);
|
||||
TYPE_VISIT_INLINE(ULONGLONG);
|
||||
#endif
|
||||
TYPE_VISIT_INLINE(FLOAT16);
|
||||
TYPE_VISIT_INLINE(FLOAT32);
|
||||
TYPE_VISIT_INLINE(FLOAT64);
|
||||
TYPE_VISIT_INLINE(DATETIME);
|
||||
TYPE_VISIT_INLINE(TIMEDELTA);
|
||||
TYPE_VISIT_INLINE(OBJECT);
|
||||
}
|
||||
return Status::NotImplemented("NumPy type not implemented: ",
|
||||
GetNumPyTypeName(PyArray_TYPE(arr)));
|
||||
}
|
||||
|
||||
#undef TYPE_VISIT_INLINE
|
||||
|
||||
namespace internal {
|
||||
|
||||
// True when `obj` is a Python float or, if NumPy has been imported, a NumPy
// floating-point scalar. The NumPy check is skipped when NumPy is unavailable.
inline bool PyFloatScalar_Check(PyObject* obj) {
  return PyFloat_Check(obj) || (has_numpy() && PyArray_IsScalar(obj, Floating));
}
|
||||
|
||||
// True when `obj` is a Python int or, if NumPy has been imported, a NumPy
// integer scalar. The NumPy check is skipped when NumPy is unavailable.
inline bool PyIntScalar_Check(PyObject* obj) {
  return PyLong_Check(obj) || (has_numpy() && PyArray_IsScalar(obj, Integer));
}
|
||||
|
||||
// True when `obj` is a Python bool or, if NumPy has been imported, a NumPy
// boolean scalar. The NumPy check is skipped when NumPy is unavailable.
inline bool PyBoolScalar_Check(PyObject* obj) {
  return PyBool_Check(obj) || (has_numpy() && PyArray_IsScalar(obj, Bool));
}
|
||||
|
||||
// Returns a NumPy descriptor for the given type number.
static inline PyArray_Descr* GetSafeNumPyDtype(int type) {
  switch (type) {
    case NPY_DATETIME:
    case NPY_TIMEDELTA:
      // It is not safe to mutate the result of DescrFromType for datetime and
      // timedelta descriptors, so hand out a fresh descriptor instead
      return PyArray_DescrNewFromType(type);
    default:
      return PyArray_DescrFromType(type);
  }
}
|
||||
|
||||
} // namespace internal
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,103 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/python/platform.h" // IWYU pragma: export
|
||||
|
||||
#include <numpy/numpyconfig.h> // IWYU pragma: export
|
||||
|
||||
// Don't use the deprecated Numpy functions
|
||||
#ifdef NPY_1_7_API_VERSION
|
||||
# define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
|
||||
#else
|
||||
# define NPY_ARRAY_NOTSWAPPED NPY_NOTSWAPPED
|
||||
# define NPY_ARRAY_ALIGNED NPY_ALIGNED
|
||||
# define NPY_ARRAY_WRITEABLE NPY_WRITEABLE
|
||||
# define NPY_ARRAY_UPDATEIFCOPY NPY_UPDATEIFCOPY
|
||||
#endif
|
||||
|
||||
// This is required to be able to access the NumPy C API properly in C++ files
|
||||
// other than init.cc.
|
||||
#define PY_ARRAY_UNIQUE_SYMBOL arrow_ARRAY_API
|
||||
#ifndef NUMPY_IMPORT_ARRAY
|
||||
# define NO_IMPORT_ARRAY
|
||||
#endif
|
||||
|
||||
#include <numpy/arrayobject.h> // IWYU pragma: export
|
||||
#include <numpy/arrayscalars.h> // IWYU pragma: export
|
||||
#include <numpy/ufuncobject.h> // IWYU pragma: export
|
||||
|
||||
// A bit subtle. Numpy has 5 canonical integer types:
|
||||
// (or, rather, type pairs: signed and unsigned)
|
||||
// NPY_BYTE, NPY_SHORT, NPY_INT, NPY_LONG, NPY_LONGLONG
|
||||
// It also has 4 fixed-width integer aliases.
|
||||
// When mapping Arrow integer types to these 4 fixed-width aliases,
|
||||
// we always miss one of the canonical types (even though it may
|
||||
// have the same width as one of the aliases).
|
||||
// Which one depends on the platform...
|
||||
// On a LP64 system, NPY_INT64 maps to NPY_LONG and
|
||||
// NPY_LONGLONG needs to be handled separately.
|
||||
// On a LLP64 system, NPY_INT32 maps to NPY_LONG and
|
||||
// NPY_INT needs to be handled separately.
|
||||
|
||||
#if NPY_BITSOF_LONG == 32 && NPY_BITSOF_LONGLONG == 64
|
||||
# define NPY_INT64_IS_LONG_LONG 1
|
||||
#else
|
||||
# define NPY_INT64_IS_LONG_LONG 0
|
||||
#endif
|
||||
|
||||
#if NPY_BITSOF_INT == 32 && NPY_BITSOF_LONG == 64
|
||||
# define NPY_INT32_IS_INT 1
|
||||
#else
|
||||
# define NPY_INT32_IS_INT 0
|
||||
#endif
|
||||
|
||||
// Backported NumPy 2 API (can be removed if numpy 2 is required)
|
||||
#if NPY_ABI_VERSION < 0x02000000
|
||||
# define PyDataType_ELSIZE(descr) ((descr)->elsize)
|
||||
# define PyDataType_C_METADATA(descr) ((descr)->c_metadata)
|
||||
# define PyDataType_FIELDS(descr) ((descr)->fields)
|
||||
#endif
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
|
||||
// Imports the NumPy array and ufunc C APIs, but only in the translation unit
// that defines NUMPY_IMPORT_ARRAY; everywhere else this is a no-op.
// Returns 0 on success. The import macros are given -1 as the value to
// return from this function when an import fails.
inline int import_numpy() {
#if defined(NUMPY_IMPORT_ARRAY)
  import_array1(-1);
  import_umath1(-1);
#endif
  return 0;
}
|
||||
|
||||
// See above about the missing Numpy integer type numbers.
// Maps the canonical integer type number that has no fixed-width alias on
// this platform onto the alias of the same width; any other type number is
// returned unchanged.
inline int fix_numpy_type_num(int type_num) {
  int fixed_type_num = type_num;
#if !NPY_INT32_IS_INT && NPY_BITSOF_INT == 32
  if (type_num == NPY_INT) {
    fixed_type_num = NPY_INT32;
  } else if (type_num == NPY_UINT) {
    fixed_type_num = NPY_UINT32;
  }
#endif
#if !NPY_INT64_IS_LONG_LONG && NPY_BITSOF_LONGLONG == 64
  if (type_num == NPY_LONGLONG) {
    fixed_type_num = NPY_INT64;
  } else if (type_num == NPY_ULONGLONG) {
    fixed_type_num = NPY_UINT64;
  }
#endif
  return fixed_type_num;
}
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,945 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Functions for pandas conversion via NumPy
|
||||
|
||||
#include "arrow/python/numpy_to_arrow.h"
|
||||
#include "arrow/python/numpy_interop.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array.h"
|
||||
#include "arrow/array/builder_binary.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/table.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/type_traits.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/bitmap_generate.h"
|
||||
#include "arrow/util/bitmap_ops.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/endian.h"
|
||||
#include "arrow/util/logging.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/string.h"
|
||||
#include "arrow/util/utf8.h"
|
||||
#include "arrow/visit_type_inline.h"
|
||||
|
||||
#include "arrow/compute/api_scalar.h"
|
||||
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/python/datetime.h"
|
||||
#include "arrow/python/helpers.h"
|
||||
#include "arrow/python/iterators.h"
|
||||
#include "arrow/python/numpy_convert.h"
|
||||
#include "arrow/python/numpy_internal.h"
|
||||
#include "arrow/python/python_to_arrow.h"
|
||||
#include "arrow/python/type_traits.h"
|
||||
#include "arrow/python/vendored/pythoncapi_compat.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using internal::checked_cast;
|
||||
using internal::CopyBitmap;
|
||||
using internal::GenerateBitsUnrolled;
|
||||
|
||||
namespace py {
|
||||
|
||||
using internal::NumPyTypeSize;
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Conversion utilities
|
||||
|
||||
namespace {
|
||||
|
||||
// Allocates a resizable buffer large enough to hold `length` bits from `pool`,
// clears every byte of it, and stores it in `*out`. Returns the allocation
// error if the buffer cannot be obtained.
Status AllocateNullBitmap(MemoryPool* pool, int64_t length,
                          std::shared_ptr<ResizableBuffer>* out) {
  const int64_t null_bytes = bit_util::BytesForBits(length);
  ARROW_ASSIGN_OR_RAISE(auto null_bitmap, AllocateResizableBuffer(null_bytes, pool));

  // Padding zeroed by AllocateResizableBuffer; the bitmap bytes themselves
  // are cleared here.
  std::fill_n(null_bitmap->mutable_data(), null_bytes, uint8_t{0});

  *out = std::move(null_bitmap);
  return Status::OK();
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Conversion from NumPy-in-Pandas to Arrow null bitmap
|
||||
|
||||
template <int TYPE>
|
||||
inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
|
||||
typedef internal::npy_traits<TYPE> traits;
|
||||
typedef typename traits::value_type T;
|
||||
|
||||
int64_t null_count = 0;
|
||||
|
||||
Ndarray1DIndexer<T> values(arr);
|
||||
for (int i = 0; i < values.size(); ++i) {
|
||||
if (traits::isnull(values[i])) {
|
||||
++null_count;
|
||||
} else {
|
||||
bit_util::SetBit(bitmap, i);
|
||||
}
|
||||
}
|
||||
|
||||
return null_count;
|
||||
}
|
||||
|
||||
class NumPyNullsConverter {
|
||||
public:
|
||||
/// Convert the given array's null values to a null bitmap.
|
||||
/// The null bitmap is only allocated if null values are ever possible.
|
||||
static Status Convert(MemoryPool* pool, PyArrayObject* arr, bool from_pandas,
|
||||
std::shared_ptr<ResizableBuffer>* out_null_bitmap_,
|
||||
int64_t* out_null_count) {
|
||||
NumPyNullsConverter converter(pool, arr, from_pandas);
|
||||
RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter));
|
||||
*out_null_bitmap_ = converter.null_bitmap_;
|
||||
*out_null_count = converter.null_count_;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <int TYPE>
|
||||
Status Visit(PyArrayObject* arr) {
|
||||
typedef internal::npy_traits<TYPE> traits;
|
||||
|
||||
const bool null_sentinels_possible =
|
||||
// Always treat Numpy's NaT as null
|
||||
TYPE == NPY_DATETIME || TYPE == NPY_TIMEDELTA ||
|
||||
// Observing pandas's null sentinels
|
||||
(from_pandas_ && traits::supports_nulls);
|
||||
|
||||
if (null_sentinels_possible) {
|
||||
RETURN_NOT_OK(AllocateNullBitmap(pool_, PyArray_SIZE(arr), &null_bitmap_));
|
||||
null_count_ = ValuesToBitmap<TYPE>(arr, null_bitmap_->mutable_data());
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
protected:
|
||||
NumPyNullsConverter(MemoryPool* pool, PyArrayObject* arr, bool from_pandas)
|
||||
: pool_(pool),
|
||||
arr_(arr),
|
||||
from_pandas_(from_pandas),
|
||||
null_bitmap_data_(nullptr),
|
||||
null_count_(0) {}
|
||||
|
||||
MemoryPool* pool_;
|
||||
PyArrayObject* arr_;
|
||||
bool from_pandas_;
|
||||
std::shared_ptr<ResizableBuffer> null_bitmap_;
|
||||
uint8_t* null_bitmap_data_;
|
||||
int64_t null_count_;
|
||||
};
|
||||
|
||||
// Returns null count
|
||||
int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
|
||||
int64_t null_count = 0;
|
||||
|
||||
if (!PyArray_Check(mask)) return -1;
|
||||
|
||||
Ndarray1DIndexer<uint8_t> mask_values(mask);
|
||||
for (int i = 0; i < length; ++i) {
|
||||
if (mask_values[i]) {
|
||||
++null_count;
|
||||
bit_util::ClearBit(bitmap, i);
|
||||
} else {
|
||||
bit_util::SetBit(bitmap, i);
|
||||
}
|
||||
}
|
||||
return null_count;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Conversion from NumPy arrays (possibly originating from pandas) to Arrow
|
||||
// format. Does not handle NPY_OBJECT dtype arrays; use ConvertPySequence for
|
||||
// that
|
||||
|
||||
class NumPyConverter {
|
||||
public:
|
||||
NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo,
|
||||
const std::shared_ptr<DataType>& type, bool from_pandas,
|
||||
const compute::CastOptions& cast_options = compute::CastOptions())
|
||||
: pool_(pool),
|
||||
type_(type),
|
||||
arr_(reinterpret_cast<PyArrayObject*>(arr)),
|
||||
dtype_(PyArray_DESCR(arr_)),
|
||||
mask_(nullptr),
|
||||
from_pandas_(from_pandas),
|
||||
cast_options_(cast_options),
|
||||
null_bitmap_data_(nullptr),
|
||||
null_count_(0) {
|
||||
if (mo != nullptr && mo != Py_None) {
|
||||
mask_ = reinterpret_cast<PyArrayObject*>(mo);
|
||||
}
|
||||
length_ = static_cast<int64_t>(PyArray_SIZE(arr_));
|
||||
itemsize_ = static_cast<int64_t>(PyArray_ITEMSIZE(arr_));
|
||||
stride_ = static_cast<int64_t>(PyArray_STRIDES(arr_)[0]);
|
||||
}
|
||||
|
||||
bool is_strided() const { return itemsize_ != stride_; }
|
||||
|
||||
Status Convert();
|
||||
|
||||
const ArrayVector& result() const { return out_arrays_; }
|
||||
|
||||
template <typename T>
|
||||
enable_if_primitive_ctype<T, Status> Visit(const T& type) {
|
||||
return VisitNative<T>();
|
||||
}
|
||||
|
||||
Status Visit(const HalfFloatType& type) { return VisitNative<UInt16Type>(); }
|
||||
|
||||
Status Visit(const Date32Type& type) { return VisitNative<Date32Type>(); }
|
||||
Status Visit(const Date64Type& type) { return VisitNative<Date64Type>(); }
|
||||
Status Visit(const TimestampType& type) { return VisitNative<TimestampType>(); }
|
||||
Status Visit(const Time32Type& type) { return VisitNative<Int32Type>(); }
|
||||
Status Visit(const Time64Type& type) { return VisitNative<Int64Type>(); }
|
||||
Status Visit(const DurationType& type) { return VisitNative<DurationType>(); }
|
||||
|
||||
Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); }
|
||||
|
||||
// NumPy ascii string arrays
|
||||
Status Visit(const BinaryType& type);
|
||||
Status Visit(const LargeBinaryType& type);
|
||||
Status Visit(const BinaryViewType& type);
|
||||
|
||||
// NumPy unicode arrays
|
||||
Status Visit(const StringType& type);
|
||||
Status Visit(const LargeStringType& type);
|
||||
Status Visit(const StringViewType& type);
|
||||
|
||||
Status Visit(const StructType& type);
|
||||
|
||||
Status Visit(const FixedSizeBinaryType& type);
|
||||
|
||||
// Default case
|
||||
Status Visit(const DataType& type) { return TypeNotImplemented(type.ToString()); }
|
||||
|
||||
protected:
|
||||
Status InitNullBitmap() {
|
||||
RETURN_NOT_OK(AllocateNullBitmap(pool_, length_, &null_bitmap_));
|
||||
null_bitmap_data_ = null_bitmap_->mutable_data();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Called before ConvertData to ensure Numpy input buffer is in expected
|
||||
// Arrow layout
|
||||
template <typename ArrowType>
|
||||
Status PrepareInputData(std::shared_ptr<Buffer>* data);
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Traditional visitor conversion for non-object arrays
|
||||
|
||||
template <typename ArrowType>
|
||||
Status ConvertData(std::shared_ptr<Buffer>* data);
|
||||
|
||||
template <typename T>
|
||||
Status PushBuilderResult(T* builder) {
|
||||
std::shared_ptr<Array> out;
|
||||
RETURN_NOT_OK(builder->Finish(&out));
|
||||
out_arrays_.emplace_back(out);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Wraps `data` in an Array via MakeArray() and appends it to out_arrays_.
// Always returns OK.
Status PushArray(const std::shared_ptr<ArrayData>& data) {
  std::shared_ptr<Array> wrapped = MakeArray(data);
  out_arrays_.push_back(std::move(wrapped));
  return Status::OK();
}
|
||||
|
||||
template <typename ArrowType>
|
||||
Status VisitNative() {
|
||||
if (mask_ != nullptr) {
|
||||
RETURN_NOT_OK(InitNullBitmap());
|
||||
null_count_ = MaskToBitmap(mask_, length_, null_bitmap_data_);
|
||||
if (null_count_ == -1) return Status::Invalid("Invalid mask type");
|
||||
} else {
|
||||
RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, from_pandas_, &null_bitmap_,
|
||||
&null_count_));
|
||||
}
|
||||
|
||||
std::shared_ptr<Buffer> data;
|
||||
RETURN_NOT_OK(ConvertData<ArrowType>(&data));
|
||||
|
||||
auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_, 0);
|
||||
return PushArray(arr_data);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Status VisitBinary(T* builder);
|
||||
|
||||
template <typename T>
|
||||
Status VisitString(T* builder);
|
||||
|
||||
// Builds the NotImplemented status returned for unsupported target types.
//
// \param type_name printable name of the type that cannot be converted;
//        taken by const reference since it is only read.
Status TypeNotImplemented(const std::string& type_name) {
  return Status::NotImplemented("NumPyConverter doesn't implement <", type_name,
                                "> conversion. ");
}
|
||||
|
||||
MemoryPool* pool_;
|
||||
std::shared_ptr<DataType> type_;
|
||||
PyArrayObject* arr_;
|
||||
PyArray_Descr* dtype_;
|
||||
PyArrayObject* mask_;
|
||||
int64_t length_;
|
||||
int64_t stride_;
|
||||
int64_t itemsize_;
|
||||
|
||||
bool from_pandas_;
|
||||
compute::CastOptions cast_options_;
|
||||
|
||||
// Used in visitor pattern
|
||||
ArrayVector out_arrays_;
|
||||
|
||||
std::shared_ptr<ResizableBuffer> null_bitmap_;
|
||||
uint8_t* null_bitmap_data_;
|
||||
int64_t null_count_;
|
||||
};
|
||||
|
||||
// Entry point of the conversion.
//
// Rejects anything that is not 1-dimensional. Object arrays are handed to
// ConvertPySequence(); every other dtype requires a target type and is
// dispatched to the matching Visit() overload.
Status NumPyConverter::Convert() {
  if (PyArray_NDIM(arr_) != 1) {
    return Status::Invalid("only handle 1-dimensional arrays");
  }

  const bool is_object_array = (dtype_->type_num == NPY_OBJECT);
  if (!is_object_array) {
    if (type_ == nullptr) {
      return Status::Invalid("Must pass data type for non-object arrays");
    }
    // Visit the type to perform conversion
    return VisitTypeInline(*type_, this);
  }

  // If an object array, convert it like a normal Python sequence
  PyConversionOptions options;
  options.type = type_;
  options.from_pandas = from_pandas_;

  PyObject* sequence = reinterpret_cast<PyObject*>(arr_);
  PyObject* mask = reinterpret_cast<PyObject*>(mask_);
  ARROW_ASSIGN_OR_RAISE(auto chunked, ConvertPySequence(sequence, mask, options, pool_));
  out_arrays_ = chunked->chunks();
  return Status::OK();
}
|
||||
|
||||
namespace {
|
||||
|
||||
// Casts a values buffer from `in_type` to `out_type` with compute::Cast.
//
// A temporary array is assembled from `valid_bitmap` and `input`, cast using
// `cast_options` with allocations from `pool`, and the values buffer
// (buffers[1]) of the result is stored in `*out`. `*out` is only written
// after the cast, so callers may pass the same shared_ptr as `input`.
Status CastBuffer(const std::shared_ptr<DataType>& in_type,
                  const std::shared_ptr<Buffer>& input, const int64_t length,
                  const std::shared_ptr<Buffer>& valid_bitmap, const int64_t null_count,
                  const std::shared_ptr<DataType>& out_type,
                  const compute::CastOptions& cast_options, MemoryPool* pool,
                  std::shared_ptr<Buffer>* out) {
  const std::shared_ptr<Array> source =
      MakeArray(ArrayData::Make(in_type, length, {valid_bitmap, input}, null_count));

  compute::ExecContext context(pool);
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> casted,
                        compute::Cast(*source, out_type, cast_options, &context));

  *out = casted->data()->buffers[1];
  return Status::OK();
}
|
||||
|
||||
template <typename FromType, typename ToType>
|
||||
Status StaticCastBuffer(const Buffer& input, const int64_t length, MemoryPool* pool,
|
||||
std::shared_ptr<Buffer>* out) {
|
||||
ARROW_ASSIGN_OR_RAISE(auto result, AllocateBuffer(sizeof(ToType) * length, pool));
|
||||
|
||||
auto in_values = reinterpret_cast<const FromType*>(input.data());
|
||||
auto out_values = reinterpret_cast<ToType*>(result->mutable_data());
|
||||
for (int64_t i = 0; i < length; ++i) {
|
||||
*out_values++ = static_cast<ToType>(*in_values++);
|
||||
}
|
||||
*out = std::move(result);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Gathers `length` elements of type T into the contiguous `output_data`,
// reading one element every `stride` bytes starting at `input_data`.
// memcpy is used for each element, so the source need not be aligned for T.
template <typename T>
void CopyStridedBytewise(int8_t* input_data, int64_t length, int64_t stride,
                         T* output_data) {
  // input_data stays non-const as a concession to PyObject*
  int8_t* cursor = input_data;
  for (int64_t i = 0; i < length; ++i, cursor += stride) {
    memcpy(&output_data[i], cursor, sizeof(T));
  }
}
|
||||
|
||||
// Gathers `length` elements into the contiguous `output_data`, reading every
// `stride`-th element (stride counted in elements of T, not bytes) starting
// at `input_data`.
template <typename T>
void CopyStridedNatural(T* input_data, int64_t length, int64_t stride, T* output_data) {
  // input_data stays non-const as a concession to PyObject*
  for (int64_t i = 0; i < length; ++i) {
    output_data[i] = input_data[i * stride];
  }
}
|
||||
|
||||
class NumPyStridedConverter {
|
||||
public:
|
||||
static Status Convert(PyArrayObject* arr, int64_t length, MemoryPool* pool,
|
||||
std::shared_ptr<Buffer>* out) {
|
||||
NumPyStridedConverter converter(arr, length, pool);
|
||||
RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter));
|
||||
*out = converter.buffer_;
|
||||
return Status::OK();
|
||||
}
|
||||
template <int TYPE>
|
||||
Status Visit(PyArrayObject* arr) {
|
||||
using traits = internal::npy_traits<TYPE>;
|
||||
using T = typename traits::value_type;
|
||||
|
||||
ARROW_ASSIGN_OR_RAISE(buffer_, AllocateBuffer(sizeof(T) * length_, pool_));
|
||||
|
||||
const int64_t stride = PyArray_STRIDES(arr)[0];
|
||||
// ARROW-16013: convert sizeof(T) to signed int64 first, otherwise dividing by it
|
||||
// would do an unsigned division. This cannot be caught by tests without ubsan, since
|
||||
// common signed overflow behavior and the fact that the sizeof(T) is currently always
|
||||
// a power of two here cause CopyStridedNatural to still produce correct results
|
||||
const int64_t element_size = sizeof(T);
|
||||
if (stride % element_size == 0) {
|
||||
const int64_t stride_elements = stride / element_size;
|
||||
CopyStridedNatural(reinterpret_cast<T*>(PyArray_DATA(arr)), length_,
|
||||
stride_elements, reinterpret_cast<T*>(buffer_->mutable_data()));
|
||||
} else {
|
||||
CopyStridedBytewise(reinterpret_cast<int8_t*>(PyArray_DATA(arr)), length_, stride,
|
||||
reinterpret_cast<T*>(buffer_->mutable_data()));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
protected:
|
||||
NumPyStridedConverter(PyArrayObject* arr, int64_t length, MemoryPool* pool)
|
||||
: arr_(arr), length_(length), pool_(pool), buffer_(nullptr) {}
|
||||
PyArrayObject* arr_;
|
||||
int64_t length_;
|
||||
MemoryPool* pool_;
|
||||
std::shared_ptr<Buffer> buffer_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
template <typename ArrowType>
|
||||
inline Status NumPyConverter::PrepareInputData(std::shared_ptr<Buffer>* data) {
|
||||
if (PyArray_ISBYTESWAPPED(arr_)) {
|
||||
// TODO
|
||||
return Status::NotImplemented("Byte-swapped arrays not supported");
|
||||
}
|
||||
|
||||
if (dtype_->type_num == NPY_BOOL) {
|
||||
int64_t nbytes = bit_util::BytesForBits(length_);
|
||||
ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(nbytes, pool_));
|
||||
|
||||
Ndarray1DIndexer<uint8_t> values(arr_);
|
||||
int64_t i = 0;
|
||||
const auto generate = [&values, &i]() -> bool { return values[i++] > 0; };
|
||||
GenerateBitsUnrolled(buffer->mutable_data(), 0, length_, generate);
|
||||
|
||||
*data = std::move(buffer);
|
||||
} else if (is_strided()) {
|
||||
RETURN_NOT_OK(NumPyStridedConverter::Convert(arr_, length_, pool_, data));
|
||||
} else {
|
||||
// Can zero-copy
|
||||
*data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename ArrowType>
|
||||
inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
|
||||
RETURN_NOT_OK(PrepareInputData<ArrowType>(data));
|
||||
|
||||
ARROW_ASSIGN_OR_RAISE(auto input_type, NumPyDtypeToArrow(dtype_));
|
||||
|
||||
if (!input_type->Equals(*type_)) {
|
||||
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_,
|
||||
cast_options_, pool_, data));
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* data) {
|
||||
std::shared_ptr<DataType> input_type;
|
||||
|
||||
RETURN_NOT_OK(PrepareInputData<Date32Type>(data));
|
||||
|
||||
auto date_dtype =
|
||||
reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(dtype_));
|
||||
if (dtype_->type_num == NPY_DATETIME) {
|
||||
// If we have inbound datetime64[D] data, this needs to be downcasted
|
||||
// separately here from int64_t to int32_t, because this data is not
|
||||
// supported in compute::Cast
|
||||
if (date_dtype->meta.base == NPY_FR_D) {
|
||||
// TODO(wesm): How pedantic do we really want to be about checking for int32
|
||||
// overflow here?
|
||||
Status s = StaticCastBuffer<int64_t, int32_t>(**data, length_, pool_, data);
|
||||
RETURN_NOT_OK(s);
|
||||
} else {
|
||||
ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
|
||||
if (!input_type->Equals(*type_)) {
|
||||
// The null bitmap was already computed in VisitNative()
|
||||
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
|
||||
type_, cast_options_, pool_, data));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
|
||||
if (!input_type->Equals(*type_)) {
|
||||
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
|
||||
type_, cast_options_, pool_, data));
|
||||
}
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* data) {
|
||||
constexpr int64_t kMillisecondsInDay = 86400000;
|
||||
std::shared_ptr<DataType> input_type;
|
||||
|
||||
RETURN_NOT_OK(PrepareInputData<Date64Type>(data));
|
||||
|
||||
auto date_dtype =
|
||||
reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(dtype_));
|
||||
if (dtype_->type_num == NPY_DATETIME) {
|
||||
// If we have inbound datetime64[D] data, this needs to be downcasted
|
||||
// separately here from int64_t to int32_t, because this data is not
|
||||
// supported in compute::Cast
|
||||
if (date_dtype->meta.base == NPY_FR_D) {
|
||||
ARROW_ASSIGN_OR_RAISE(auto result,
|
||||
AllocateBuffer(sizeof(int64_t) * length_, pool_));
|
||||
|
||||
auto in_values = reinterpret_cast<const int64_t*>((*data)->data());
|
||||
auto out_values = reinterpret_cast<int64_t*>(result->mutable_data());
|
||||
for (int64_t i = 0; i < length_; ++i) {
|
||||
*out_values++ = kMillisecondsInDay * (*in_values++);
|
||||
}
|
||||
*data = std::move(result);
|
||||
} else {
|
||||
ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
|
||||
if (!input_type->Equals(*type_)) {
|
||||
// The null bitmap was already computed in VisitNative()
|
||||
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
|
||||
type_, cast_options_, pool_, data));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
|
||||
if (!input_type->Equals(*type_)) {
|
||||
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
|
||||
type_, cast_options_, pool_, data));
|
||||
}
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Create 16MB chunks for binary data
|
||||
constexpr int32_t kBinaryChunksize = 1 << 24;
|
||||
|
||||
template <typename T>
|
||||
Status NumPyConverter::VisitBinary(T* builder) {
|
||||
auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
|
||||
|
||||
auto AppendNotNull = [builder, this](const uint8_t* data) {
|
||||
// This is annoying. NumPy allows strings to have nul-terminators, so
|
||||
// we must check for them here
|
||||
const size_t item_size =
|
||||
strnlen(reinterpret_cast<const char*>(data), static_cast<size_t>(itemsize_));
|
||||
return builder->Append(data, static_cast<int32_t>(item_size));
|
||||
};
|
||||
|
||||
if (mask_ != nullptr) {
|
||||
Ndarray1DIndexer<uint8_t> mask_values(mask_);
|
||||
for (int64_t i = 0; i < length_; ++i) {
|
||||
if (mask_values[i]) {
|
||||
RETURN_NOT_OK(builder->AppendNull());
|
||||
} else {
|
||||
RETURN_NOT_OK(AppendNotNull(data));
|
||||
}
|
||||
data += stride_;
|
||||
}
|
||||
} else {
|
||||
for (int64_t i = 0; i < length_; ++i) {
|
||||
RETURN_NOT_OK(AppendNotNull(data));
|
||||
data += stride_;
|
||||
}
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Binary output: values are collected with a chunked builder limited to
// kBinaryChunksize per chunk, and every finished chunk is pushed.
Status NumPyConverter::Visit(const BinaryType& type) {
  ::arrow::internal::ChunkedBinaryBuilder builder(kBinaryChunksize, pool_);
  RETURN_NOT_OK(VisitBinary(&builder));

  ArrayVector chunks;
  RETURN_NOT_OK(builder.Finish(&chunks));

  for (const auto& chunk : chunks) {
    RETURN_NOT_OK(PushArray(chunk->data()));
  }
  return Status::OK();
}
|
||||
|
||||
// LargeBinary output: values are collected with a single LargeBinaryBuilder
// and pushed as one array.
Status NumPyConverter::Visit(const LargeBinaryType& type) {
  ::arrow::LargeBinaryBuilder builder(pool_);
  RETURN_NOT_OK(VisitBinary(&builder));

  std::shared_ptr<Array> finished;
  RETURN_NOT_OK(builder.Finish(&finished));
  return PushArray(finished->data());
}
|
||||
|
||||
// BinaryView output: values are collected with a single BinaryViewBuilder
// and pushed as one array.
Status NumPyConverter::Visit(const BinaryViewType& type) {
  ::arrow::BinaryViewBuilder builder(pool_);
  RETURN_NOT_OK(VisitBinary(&builder));

  std::shared_ptr<Array> finished;
  RETURN_NOT_OK(builder.Finish(&finished));
  return PushArray(finished->data());
}
|
||||
|
||||
// FixedSizeBinary output. The NumPy item size must equal the target byte
// width; each element is then appended verbatim, with masked elements
// appended as nulls.
Status NumPyConverter::Visit(const FixedSizeBinaryType& type) {
  const auto byte_width = type.byte_width();
  if (itemsize_ != byte_width) {
    return Status::Invalid("Got bytestring of length ", itemsize_, " (expected ",
                           byte_width, ")");
  }

  FixedSizeBinaryBuilder builder(::arrow::fixed_size_binary(byte_width), pool_);
  const uint8_t* cursor = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));

  if (mask_ == nullptr) {
    for (int64_t i = 0; i < length_; ++i, cursor += stride_) {
      RETURN_NOT_OK(builder.Append(cursor));
    }
  } else {
    Ndarray1DIndexer<uint8_t> is_null(mask_);
    RETURN_NOT_OK(builder.Reserve(length_));
    for (int64_t i = 0; i < length_; ++i, cursor += stride_) {
      if (is_null[i]) {
        RETURN_NOT_OK(builder.AppendNull());
      } else {
        RETURN_NOT_OK(builder.Append(cursor));
      }
    }
  }

  std::shared_ptr<Array> finished;
  RETURN_NOT_OK(builder.Finish(&finished));
  return PushArray(finished->data());
}
|
||||
|
||||
namespace {
|
||||
|
||||
// NumPy unicode is UCS4/UTF32 always
|
||||
constexpr int kNumPyUnicodeSize = 4;
|
||||
|
||||
template <typename T>
|
||||
Status AppendUTF32(const char* data, int64_t itemsize, int byteorder, T* builder) {
|
||||
// The binary \x00\x00\x00\x00 indicates a nul terminator in NumPy unicode,
|
||||
// so we need to detect that here to truncate if necessary. Yep.
|
||||
Py_ssize_t actual_length = 0;
|
||||
for (; actual_length < itemsize / kNumPyUnicodeSize; ++actual_length) {
|
||||
const char* code_point = data + actual_length * kNumPyUnicodeSize;
|
||||
if ((*code_point == '\0') && (*(code_point + 1) == '\0') &&
|
||||
(*(code_point + 2) == '\0') && (*(code_point + 3) == '\0')) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
OwnedRef unicode_obj(PyUnicode_DecodeUTF32(data, actual_length * kNumPyUnicodeSize,
|
||||
nullptr, &byteorder));
|
||||
RETURN_IF_PYERROR();
|
||||
OwnedRef utf8_obj(PyUnicode_AsUTF8String(unicode_obj.obj()));
|
||||
if (utf8_obj.obj() == NULL) {
|
||||
PyErr_Clear();
|
||||
return Status::Invalid("failed converting UTF32 to UTF8");
|
||||
}
|
||||
|
||||
const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(utf8_obj.obj()));
|
||||
return builder->Append(
|
||||
reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(utf8_obj.obj())), length);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
template <typename T>
|
||||
Status NumPyConverter::VisitString(T* builder) {
|
||||
auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
|
||||
|
||||
char numpy_byteorder = dtype_->byteorder;
|
||||
|
||||
// For Python C API, -1 is little-endian, 1 is big-endian
|
||||
#if ARROW_LITTLE_ENDIAN
|
||||
// Yield little-endian from both '|' (native) and '<'
|
||||
int byteorder = numpy_byteorder == '>' ? 1 : -1;
|
||||
#else
|
||||
// Yield big-endian from both '|' (native) and '>'
|
||||
int byteorder = numpy_byteorder == '<' ? -1 : 1;
|
||||
#endif
|
||||
|
||||
PyAcquireGIL gil_lock;
|
||||
|
||||
const bool is_binary_type = dtype_->type_num == NPY_STRING;
|
||||
const bool is_unicode_type = dtype_->type_num == NPY_UNICODE;
|
||||
|
||||
if (!is_binary_type && !is_unicode_type) {
|
||||
const bool is_float_type = dtype_->kind == 'f';
|
||||
if (from_pandas_ && is_float_type) {
|
||||
// in case of from_pandas=True, accept an all-NaN float array as input
|
||||
RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, from_pandas_, &null_bitmap_,
|
||||
&null_count_));
|
||||
if (null_count_ == length_) {
|
||||
auto arr = std::make_shared<NullArray>(length_);
|
||||
compute::ExecContext context(pool_);
|
||||
ARROW_ASSIGN_OR_RAISE(
|
||||
std::shared_ptr<Array> out,
|
||||
compute::Cast(*arr, arrow::utf8(), cast_options_, &context));
|
||||
out_arrays_.emplace_back(out);
|
||||
return Status::OK();
|
||||
}
|
||||
}
|
||||
std::string dtype_string;
|
||||
RETURN_NOT_OK(internal::PyObject_StdStringStr(reinterpret_cast<PyObject*>(dtype_),
|
||||
&dtype_string));
|
||||
return Status::TypeError("Expected a string or bytes dtype, got ", dtype_string);
|
||||
}
|
||||
|
||||
auto AppendNonNullValue = [&](const uint8_t* data) {
|
||||
if (is_binary_type) {
|
||||
if (ARROW_PREDICT_TRUE(util::ValidateUTF8(data, itemsize_))) {
|
||||
return builder->Append(data, static_cast<int32_t>(itemsize_));
|
||||
} else {
|
||||
return Status::Invalid("Encountered non-UTF8 binary value: ",
|
||||
HexEncode(data, itemsize_));
|
||||
}
|
||||
} else {
|
||||
// is_unicode_type case
|
||||
return AppendUTF32(reinterpret_cast<const char*>(data), itemsize_, byteorder,
|
||||
builder);
|
||||
}
|
||||
};
|
||||
|
||||
if (mask_ != nullptr) {
|
||||
Ndarray1DIndexer<uint8_t> mask_values(mask_);
|
||||
for (int64_t i = 0; i < length_; ++i) {
|
||||
if (mask_values[i]) {
|
||||
RETURN_NOT_OK(builder->AppendNull());
|
||||
} else {
|
||||
RETURN_NOT_OK(AppendNonNullValue(data));
|
||||
}
|
||||
data += stride_;
|
||||
}
|
||||
} else {
|
||||
for (int64_t i = 0; i < length_; ++i) {
|
||||
RETURN_NOT_OK(AppendNonNullValue(data));
|
||||
data += stride_;
|
||||
}
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// String output: values are collected with a chunked builder limited to
// kBinaryChunksize per chunk, and every finished chunk is pushed.
Status NumPyConverter::Visit(const StringType& type) {
  util::InitializeUTF8();

  ::arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_);
  RETURN_NOT_OK(VisitString(&builder));

  ArrayVector chunks;
  RETURN_NOT_OK(builder.Finish(&chunks));

  for (const auto& chunk : chunks) {
    RETURN_NOT_OK(PushArray(chunk->data()));
  }
  return Status::OK();
}
|
||||
|
||||
// LargeString output: values are collected with a single LargeStringBuilder
// and pushed as one array.
Status NumPyConverter::Visit(const LargeStringType& type) {
  util::InitializeUTF8();

  ::arrow::LargeStringBuilder builder(pool_);
  RETURN_NOT_OK(VisitString(&builder));

  std::shared_ptr<Array> finished;
  RETURN_NOT_OK(builder.Finish(&finished));

  RETURN_NOT_OK(PushArray(finished->data()));
  return Status::OK();
}
|
||||
|
||||
// StringView output: values are collected with a single StringViewBuilder
// and pushed as one array.
Status NumPyConverter::Visit(const StringViewType& type) {
  util::InitializeUTF8();

  ::arrow::StringViewBuilder builder(pool_);
  RETURN_NOT_OK(VisitString(&builder));

  std::shared_ptr<Array> finished;
  RETURN_NOT_OK(builder.Finish(&finished));

  RETURN_NOT_OK(PushArray(finished->data()));
  return Status::OK();
}
|
||||
|
||||
// Struct output from a NumPy structured array.
//
// For every field of the target struct type the matching NumPy field is
// extracted with PyArray_GetField() and converted by its own NumPyConverter.
// The optional mask is turned into a validity bitmap, wrapped in a
// BooleanArray so that it is rechunked together with the children, and one
// struct array is pushed per resulting chunk.
//
// Fails with TypeError if the dtype has no field dictionary, and with Invalid
// if a field is missing or the mask cannot be interpreted.
Status NumPyConverter::Visit(const StructType& type) {
  std::vector<NumPyConverter> sub_converters;
  std::vector<OwnedRefNoGIL> sub_arrays;

  {
    PyAcquireGIL gil_lock;

    // Create converters for each struct type field
    if (PyDataType_FIELDS(dtype_) == NULL || !PyDict_Check(PyDataType_FIELDS(dtype_))) {
      return Status::TypeError("Expected struct array");
    }

    for (const auto& field : type.fields()) {
      PyObject* tup;
      PyDict_GetItemStringRef(PyDataType_FIELDS(dtype_), field->name().c_str(), &tup);
      RETURN_IF_PYERROR();
      OwnedRef tupref(tup);
      if (tup == NULL) {
        return Status::Invalid("Missing field '", field->name(), "' in struct array");
      }
      PyArray_Descr* sub_dtype =
          reinterpret_cast<PyArray_Descr*>(PyTuple_GET_ITEM(tup, 0));
      ARROW_DCHECK(PyObject_TypeCheck(sub_dtype, &PyArrayDescr_Type));
      int offset = static_cast<int>(PyLong_AsLong(PyTuple_GET_ITEM(tup, 1)));
      RETURN_IF_PYERROR();
      Py_INCREF(sub_dtype); /* PyArray_GetField() steals ref */
      PyObject* sub_array = PyArray_GetField(arr_, sub_dtype, offset);
      RETURN_IF_PYERROR();
      sub_arrays.emplace_back(sub_array);
      sub_converters.emplace_back(pool_, sub_array, nullptr /* mask */, field->type(),
                                  from_pandas_);
    }
  }

  std::vector<ArrayVector> groups;
  int64_t null_count = 0;

  // Compute null bitmap and store it as a Boolean Array to include it
  // in the rechunking below
  {
    if (mask_ != nullptr) {
      RETURN_NOT_OK(InitNullBitmap());
      null_count = MaskToBitmap(mask_, length_, null_bitmap_data_);
      // Check the value just returned by MaskToBitmap (the local), not the
      // null_count_ member, which is not written on this path.
      if (null_count == -1) return Status::Invalid("Invalid mask type");
    }
    groups.push_back({std::make_shared<BooleanArray>(length_, null_bitmap_)});
  }

  // Convert child data
  for (auto& converter : sub_converters) {
    RETURN_NOT_OK(converter.Convert());
    groups.push_back(converter.result());
  }
  // Ensure the different array groups are chunked consistently
  groups = ::arrow::internal::RechunkArraysConsistently(groups);

  // Make struct array chunks by combining groups
  size_t ngroups = groups.size();
  size_t nchunks = groups[0].size();
  for (size_t chunk = 0; chunk < nchunks; chunk++) {
    // First group has the null bitmaps as Boolean Arrays
    const auto& null_data = groups[0][chunk]->data();
    ARROW_DCHECK_EQ(null_data->type->id(), Type::BOOL);
    ARROW_DCHECK_EQ(null_data->buffers.size(), 2);
    const auto& null_buffer = null_data->buffers[1];
    // Careful: the rechunked null bitmap may have a non-zero offset
    // to its buffer, and it may not even start on a byte boundary
    int64_t null_offset = null_data->offset;
    std::shared_ptr<Buffer> fixed_null_buffer;

    if (!null_buffer) {
      // No mask was given: the struct chunk gets no validity bitmap.
      fixed_null_buffer = null_buffer;
    } else if (null_offset % 8 == 0) {
      // Byte-aligned: slice the existing bitmap without copying.
      fixed_null_buffer =
          std::make_shared<Buffer>(null_buffer,
                                   // byte offset
                                   null_offset / 8,
                                   // byte size
                                   bit_util::BytesForBits(null_data->length));
    } else {
      // Not byte-aligned: copy the bits into a new bitmap starting at bit 0.
      ARROW_ASSIGN_OR_RAISE(
          fixed_null_buffer,
          CopyBitmap(pool_, null_buffer->data(), null_offset, null_data->length));
    }

    // Create struct array chunk and populate it
    auto arr_data =
        ArrayData::Make(type_, null_data->length, null_count ? kUnknownNullCount : 0, 0);
    arr_data->buffers.push_back(fixed_null_buffer);
    // Append child chunks
    for (size_t i = 1; i < ngroups; i++) {
      arr_data->child_data.push_back(groups[i][chunk]->data());
    }
    RETURN_NOT_OK(PushArray(arr_data));
  }

  return Status::OK();
}
|
||||
|
||||
// Converts the 1-D NumPy array `ao` (with optional mask `mo`) into a
// ChunkedArray of `type`, applying `cast_options` where a cast is needed.
// Returns TypeError if `ao` is not a NumPy array and Invalid if it is not
// 1-dimensional.
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
                      const std::shared_ptr<DataType>& type,
                      const compute::CastOptions& cast_options,
                      std::shared_ptr<ChunkedArray>* out) {
  if (!PyArray_Check(ao)) {
    // This code path cannot be reached by Python unit tests currently so this
    // is only a sanity check.
    return Status::TypeError("Input object was not a NumPy array");
  }

  PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(ao);
  if (PyArray_NDIM(ndarray) != 1) {
    return Status::Invalid("only handle 1-dimensional arrays");
  }

  NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options);
  RETURN_NOT_OK(converter.Convert());

  const auto& chunks = converter.result();
  ARROW_DCHECK_GT(chunks.size(), 0);
  *out = std::make_shared<ChunkedArray>(chunks);
  return Status::OK();
}
|
||||
|
||||
// Convenience overload: same conversion with default-constructed
// compute::CastOptions.
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
                      const std::shared_ptr<DataType>& type,
                      std::shared_ptr<ChunkedArray>* out) {
  const compute::CastOptions default_options;
  return NdarrayToArrow(pool, ao, mo, from_pandas, type, default_options, out);
}
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,72 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Converting from pandas memory representation to Arrow data structures
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/python/platform.h"
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/compute/api.h"
|
||||
#include "arrow/python/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Array;
|
||||
class ChunkedArray;
|
||||
class DataType;
|
||||
class MemoryPool;
|
||||
class Status;
|
||||
|
||||
namespace py {
|
||||
|
||||
/// Convert NumPy arrays to Arrow. If the target data type is not known, pass a
/// null type.
|
||||
///
|
||||
/// \param[in] pool Memory pool for any memory allocations
|
||||
/// \param[in] ao an ndarray with the array data
|
||||
/// \param[in] mo an ndarray with a null mask (True is null), optional
|
||||
/// \param[in] from_pandas If true, use pandas's null sentinels to determine
|
||||
/// whether values are null
|
||||
/// \param[in] type a specific type to cast to, may be null
|
||||
/// \param[in] cast_options casting options
|
||||
/// \param[out] out a ChunkedArray, to accommodate chunked output
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
|
||||
const std::shared_ptr<DataType>& type,
|
||||
const compute::CastOptions& cast_options,
|
||||
std::shared_ptr<ChunkedArray>* out);
|
||||
|
||||
/// Safely convert NumPy arrays to Arrow. If the target data type is not known,
/// pass a null type.
|
||||
///
|
||||
/// \param[in] pool Memory pool for any memory allocations
|
||||
/// \param[in] ao an ndarray with the array data
|
||||
/// \param[in] mo an ndarray with a null mask (True is null), optional
|
||||
/// \param[in] from_pandas If true, use pandas's null sentinels to determine
|
||||
/// whether values are null
|
||||
/// \param[in] type a specific type to cast to, may be null
|
||||
/// \param[out] out a ChunkedArray, to accommodate chunked output
|
||||
ARROW_PYTHON_EXPORT
|
||||
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
|
||||
const std::shared_ptr<DataType>& type,
|
||||
std::shared_ptr<ChunkedArray>* out);
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,98 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "arrow/python/parquet_encryption.h"
|
||||
#include "parquet/exception.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
namespace parquet {
|
||||
namespace encryption {
|
||||
|
||||
// Stores the Python handler object and the vtable of callbacks used to reach
// it. The handler's reference count is incremented because handler_ keeps the
// pointer beyond this call.
PyKmsClient::PyKmsClient(PyObject* handler, PyKmsClientVtable vtable)
    : handler_(handler),
      vtable_(std::move(vtable)) {
  Py_INCREF(handler);
}
|
||||
|
||||
// Nothing to do explicitly; members clean up after themselves.
PyKmsClient::~PyKmsClient() = default;
|
||||
|
||||
// Wraps `key` by invoking the Python-side wrap_key callback with
// `master_key_identifier` and returns the string it produced.
// Throws ParquetStatusException if the Python call reports an error.
std::string PyKmsClient::WrapKey(const ::arrow::util::SecureString& key,
                                 const std::string& master_key_identifier) {
  std::string wrapped_key;
  const auto status = SafeCallIntoPython([&]() -> Status {
    vtable_.wrap_key(handler_.obj(), key, master_key_identifier, &wrapped_key);
    return CheckPyError();
  });

  if (status.ok()) {
    return wrapped_key;
  }
  throw ::parquet::ParquetStatusException(status);
}
|
||||
|
||||
// Unwraps `wrapped_key` by invoking the Python-side unwrap_key callback with
// `master_key_identifier` and returns the key it produced.
// Throws ParquetStatusException if the Python call reports an error.
::arrow::util::SecureString PyKmsClient::UnwrapKey(
    const std::string& wrapped_key, const std::string& master_key_identifier) {
  ::arrow::util::SecureString plain_key;
  const auto status = SafeCallIntoPython([&]() -> Status {
    vtable_.unwrap_key(handler_.obj(), wrapped_key, master_key_identifier, &plain_key);
    return CheckPyError();
  });

  if (status.ok()) {
    return plain_key;
  }
  throw ::parquet::ParquetStatusException(status);
}
|
||||
|
||||
// Stores the Python handler object and the vtable of callbacks used to reach
// it. The handler's reference count is incremented because handler_ keeps the
// pointer beyond this call.
PyKmsClientFactory::PyKmsClientFactory(PyObject* handler, PyKmsClientFactoryVtable vtable)
    : handler_(handler),
      vtable_(std::move(vtable)) {
  Py_INCREF(handler);
}
|
||||
|
||||
// Nothing to do explicitly; members clean up after themselves.
PyKmsClientFactory::~PyKmsClientFactory() = default;
|
||||
|
||||
std::shared_ptr<::parquet::encryption::KmsClient> PyKmsClientFactory::CreateKmsClient(
|
||||
const ::parquet::encryption::KmsConnectionConfig& kms_connection_config) {
|
||||
std::shared_ptr<::parquet::encryption::KmsClient> kms_client;
|
||||
auto st = SafeCallIntoPython([&]() -> Status {
|
||||
vtable_.create_kms_client(handler_.obj(), kms_connection_config, &kms_client);
|
||||
return CheckPyError();
|
||||
});
|
||||
if (!st.ok()) {
|
||||
throw ::parquet::ParquetStatusException(st);
|
||||
}
|
||||
return kms_client;
|
||||
}
|
||||
|
||||
// Result-returning wrapper around GetFileEncryptionProperties(): the call is
// made inside PARQUET_CATCH_AND_RETURN instead of being invoked directly.
arrow::Result<std::shared_ptr<::parquet::FileEncryptionProperties>>
PyCryptoFactory::SafeGetFileEncryptionProperties(
    const ::parquet::encryption::KmsConnectionConfig& kms_connection_config,
    const ::parquet::encryption::EncryptionConfiguration& encryption_config) {
  PARQUET_CATCH_AND_RETURN(
      GetFileEncryptionProperties(kms_connection_config, encryption_config));
}
|
||||
|
||||
// Result-returning wrapper around GetFileDecryptionProperties(): the call is
// made inside PARQUET_CATCH_AND_RETURN instead of being invoked directly.
arrow::Result<std::shared_ptr<::parquet::FileDecryptionProperties>>
PyCryptoFactory::SafeGetFileDecryptionProperties(
    const ::parquet::encryption::KmsConnectionConfig& kms_connection_config,
    const ::parquet::encryption::DecryptionConfiguration& decryption_config) {
  PARQUET_CATCH_AND_RETURN(
      GetFileDecryptionProperties(kms_connection_config, decryption_config));
}
|
||||
|
||||
} // namespace encryption
|
||||
} // namespace parquet
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,134 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/python/visibility.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/secure_string.h"
|
||||
#include "parquet/encryption/crypto_factory.h"
|
||||
#include "parquet/encryption/kms_client.h"
|
||||
#include "parquet/encryption/kms_client_factory.h"
|
||||
|
||||
#if defined(_WIN32) || defined(__CYGWIN__) // Windows
|
||||
# if defined(_MSC_VER)
|
||||
# pragma warning(disable : 4251)
|
||||
# else
|
||||
# pragma GCC diagnostic ignored "-Wattributes"
|
||||
# endif
|
||||
|
||||
# ifdef ARROW_PYTHON_STATIC
|
||||
# define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT
|
||||
# elif defined(ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORTING)
|
||||
# define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT __declspec(dllexport)
|
||||
# else
|
||||
# define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT __declspec(dllimport)
|
||||
# endif
|
||||
|
||||
#else // Not Windows
|
||||
# ifndef ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT
|
||||
# define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT __attribute__((visibility("default")))
|
||||
# endif
|
||||
#endif // Non-Windows
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
namespace parquet {
|
||||
namespace encryption {
|
||||
|
||||
/// \brief A table of function pointers for calling from C++ into
|
||||
/// Python.
|
||||
class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClientVtable {
|
||||
public:
|
||||
std::function<void(PyObject*, const ::arrow::util::SecureString& key,
|
||||
const std::string& master_key_identifier, std::string* out)>
|
||||
wrap_key;
|
||||
std::function<void(PyObject*, const std::string& wrapped_key,
|
||||
const std::string& master_key_identifier,
|
||||
::arrow::util::SecureString* out)>
|
||||
unwrap_key;
|
||||
};
|
||||
|
||||
/// \brief A helper for KmsClient implementation in Python.
|
||||
class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClient
|
||||
: public ::parquet::encryption::KmsClient {
|
||||
public:
|
||||
PyKmsClient(PyObject* handler, PyKmsClientVtable vtable);
|
||||
~PyKmsClient() override;
|
||||
|
||||
std::string WrapKey(const ::arrow::util::SecureString& key,
|
||||
const std::string& master_key_identifier) override;
|
||||
|
||||
::arrow::util::SecureString UnwrapKey(
|
||||
const std::string& wrapped_key, const std::string& master_key_identifier) override;
|
||||
|
||||
private:
|
||||
OwnedRefNoGIL handler_;
|
||||
PyKmsClientVtable vtable_;
|
||||
};
|
||||
|
||||
/// \brief A table of function pointers for calling from C++ into
|
||||
/// Python.
|
||||
class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClientFactoryVtable {
|
||||
public:
|
||||
std::function<void(
|
||||
PyObject*, const ::parquet::encryption::KmsConnectionConfig& kms_connection_config,
|
||||
std::shared_ptr<::parquet::encryption::KmsClient>* out)>
|
||||
create_kms_client;
|
||||
};
|
||||
|
||||
/// \brief A helper for KmsClientFactory implementation in Python.
|
||||
class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClientFactory
|
||||
: public ::parquet::encryption::KmsClientFactory {
|
||||
public:
|
||||
PyKmsClientFactory(PyObject* handler, PyKmsClientFactoryVtable vtable);
|
||||
~PyKmsClientFactory() override;
|
||||
|
||||
std::shared_ptr<::parquet::encryption::KmsClient> CreateKmsClient(
|
||||
const ::parquet::encryption::KmsConnectionConfig& kms_connection_config) override;
|
||||
|
||||
private:
|
||||
OwnedRefNoGIL handler_;
|
||||
PyKmsClientFactoryVtable vtable_;
|
||||
};
|
||||
|
||||
/// \brief A CryptoFactory that returns Results instead of throwing exceptions.
|
||||
class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyCryptoFactory
|
||||
: public ::parquet::encryption::CryptoFactory {
|
||||
public:
|
||||
arrow::Result<std::shared_ptr<::parquet::FileEncryptionProperties>>
|
||||
SafeGetFileEncryptionProperties(
|
||||
const ::parquet::encryption::KmsConnectionConfig& kms_connection_config,
|
||||
const ::parquet::encryption::EncryptionConfiguration& encryption_config);
|
||||
|
||||
/// The returned FileDecryptionProperties object will use the cache inside this
|
||||
/// CryptoFactory object, so please keep this
|
||||
/// CryptoFactory object alive along with the returned
|
||||
/// FileDecryptionProperties object.
|
||||
arrow::Result<std::shared_ptr<::parquet::FileDecryptionProperties>>
|
||||
SafeGetFileDecryptionProperties(
|
||||
const ::parquet::encryption::KmsConnectionConfig& kms_connection_config,
|
||||
const ::parquet::encryption::DecryptionConfiguration& decryption_config);
|
||||
};
|
||||
|
||||
} // namespace encryption
|
||||
} // namespace parquet
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,41 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Platform setup for including the CPython headers (Python.h and datetime.h),
|
||||
// together with workarounds for macro conflicts under MSVC
|
||||
|
||||
#pragma once
|
||||
|
||||
// If PY_SSIZE_T_CLEAN is defined, argument parsing functions treat #-specifier
|
||||
// to mean Py_ssize_t (defining this to suppress deprecation warning)
|
||||
#define PY_SSIZE_T_CLEAN
|
||||
|
||||
#include <Python.h> // IWYU pragma: export
|
||||
#include <datetime.h>
|
||||
|
||||
// Work around C2528 error
|
||||
#ifdef _MSC_VER
|
||||
# if _MSC_VER >= 1900
|
||||
# undef timezone
|
||||
# endif
|
||||
|
||||
// https://bugs.python.org/issue36020
|
||||
// TODO(wjones127): Can remove once we drop support for CPython 3.9
|
||||
# ifdef snprintf
|
||||
# undef snprintf
|
||||
# endif
|
||||
#endif
|
||||
@@ -0,0 +1,100 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "arrow/python/pyarrow.h"
|
||||
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/array.h"
|
||||
#include "arrow/table.h"
|
||||
#include "arrow/tensor.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/logging.h"
|
||||
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/python/datetime.h"
|
||||
namespace {
|
||||
#include "arrow/python/pyarrow_api.h"
|
||||
}
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
|
||||
// Build the TypeError Status reported when `obj` could not be unwrapped into
// the C++ type named by `expected_type`. The message includes the Python type
// name of `obj`.
static Status UnwrapError(PyObject* obj, const char* expected_type) {
  const char* actual_type_name = Py_TYPE(obj)->tp_name;
  return Status::TypeError("Could not unwrap ", expected_type,
                           " from Python object of type '", actual_type_name, "'");
}
|
||||
|
||||
int import_pyarrow() {
|
||||
#ifdef PYPY_VERSION
|
||||
PyDateTime_IMPORT;
|
||||
#else
|
||||
internal::InitDatetime();
|
||||
#endif
|
||||
return ::import_pyarrow__lib();
|
||||
}
|
||||
|
||||
#define DEFINE_WRAP_FUNCTIONS(FUNC_SUFFIX, TYPE_NAME) \
|
||||
bool is_##FUNC_SUFFIX(PyObject* obj) { return ::pyarrow_is_##FUNC_SUFFIX(obj) != 0; } \
|
||||
\
|
||||
PyObject* wrap_##FUNC_SUFFIX(const std::shared_ptr<TYPE_NAME>& src) { \
|
||||
return ::pyarrow_wrap_##FUNC_SUFFIX(src); \
|
||||
} \
|
||||
Result<std::shared_ptr<TYPE_NAME>> unwrap_##FUNC_SUFFIX(PyObject* obj) { \
|
||||
auto out = ::pyarrow_unwrap_##FUNC_SUFFIX(obj); \
|
||||
if (out) { \
|
||||
return std::move(out); \
|
||||
} else { \
|
||||
return UnwrapError(obj, #TYPE_NAME); \
|
||||
} \
|
||||
}
|
||||
|
||||
DEFINE_WRAP_FUNCTIONS(buffer, Buffer)
|
||||
|
||||
DEFINE_WRAP_FUNCTIONS(data_type, DataType)
|
||||
DEFINE_WRAP_FUNCTIONS(field, Field)
|
||||
DEFINE_WRAP_FUNCTIONS(schema, Schema)
|
||||
|
||||
DEFINE_WRAP_FUNCTIONS(scalar, Scalar)
|
||||
|
||||
DEFINE_WRAP_FUNCTIONS(array, Array)
|
||||
DEFINE_WRAP_FUNCTIONS(chunked_array, ChunkedArray)
|
||||
|
||||
DEFINE_WRAP_FUNCTIONS(sparse_coo_tensor, SparseCOOTensor)
|
||||
DEFINE_WRAP_FUNCTIONS(sparse_csc_matrix, SparseCSCMatrix)
|
||||
DEFINE_WRAP_FUNCTIONS(sparse_csf_tensor, SparseCSFTensor)
|
||||
DEFINE_WRAP_FUNCTIONS(sparse_csr_matrix, SparseCSRMatrix)
|
||||
DEFINE_WRAP_FUNCTIONS(tensor, Tensor)
|
||||
|
||||
DEFINE_WRAP_FUNCTIONS(batch, RecordBatch)
|
||||
DEFINE_WRAP_FUNCTIONS(table, Table)
|
||||
|
||||
#undef DEFINE_WRAP_FUNCTIONS
|
||||
|
||||
namespace internal {
|
||||
|
||||
int check_status(const Status& status) { return ::pyarrow_internal_check_status(status); }
|
||||
|
||||
PyObject* convert_status(const Status& status) {
|
||||
ARROW_DCHECK(!status.ok());
|
||||
return ::pyarrow_internal_convert_status(status);
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,89 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/python/platform.h"
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/python/visibility.h"
|
||||
|
||||
#include "arrow/sparse_tensor.h"
|
||||
|
||||
// Work around ARROW-2317 (C linkage warning from Cython)
|
||||
extern "C++" {
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Array;
|
||||
class Buffer;
|
||||
class DataType;
|
||||
class Field;
|
||||
class RecordBatch;
|
||||
class Schema;
|
||||
class Status;
|
||||
class Table;
|
||||
class Tensor;
|
||||
|
||||
namespace py {
|
||||
|
||||
// Returns 0 on success, -1 on error.
|
||||
ARROW_PYTHON_EXPORT int import_pyarrow();
|
||||
|
||||
#define DECLARE_WRAP_FUNCTIONS(FUNC_SUFFIX, TYPE_NAME) \
|
||||
ARROW_PYTHON_EXPORT bool is_##FUNC_SUFFIX(PyObject*); \
|
||||
ARROW_PYTHON_EXPORT Result<std::shared_ptr<TYPE_NAME>> unwrap_##FUNC_SUFFIX( \
|
||||
PyObject*); \
|
||||
ARROW_PYTHON_EXPORT PyObject* wrap_##FUNC_SUFFIX(const std::shared_ptr<TYPE_NAME>&);
|
||||
|
||||
DECLARE_WRAP_FUNCTIONS(buffer, Buffer)
|
||||
|
||||
DECLARE_WRAP_FUNCTIONS(data_type, DataType)
|
||||
DECLARE_WRAP_FUNCTIONS(field, Field)
|
||||
DECLARE_WRAP_FUNCTIONS(schema, Schema)
|
||||
|
||||
DECLARE_WRAP_FUNCTIONS(scalar, Scalar)
|
||||
|
||||
DECLARE_WRAP_FUNCTIONS(array, Array)
|
||||
DECLARE_WRAP_FUNCTIONS(chunked_array, ChunkedArray)
|
||||
|
||||
DECLARE_WRAP_FUNCTIONS(sparse_coo_tensor, SparseCOOTensor)
|
||||
DECLARE_WRAP_FUNCTIONS(sparse_csc_matrix, SparseCSCMatrix)
|
||||
DECLARE_WRAP_FUNCTIONS(sparse_csf_tensor, SparseCSFTensor)
|
||||
DECLARE_WRAP_FUNCTIONS(sparse_csr_matrix, SparseCSRMatrix)
|
||||
DECLARE_WRAP_FUNCTIONS(tensor, Tensor)
|
||||
|
||||
DECLARE_WRAP_FUNCTIONS(batch, RecordBatch)
|
||||
DECLARE_WRAP_FUNCTIONS(table, Table)
|
||||
|
||||
#undef DECLARE_WRAP_FUNCTIONS
|
||||
|
||||
namespace internal {
|
||||
|
||||
// If status is ok, return 0.
|
||||
// If status is not ok, set Python error indicator and return -1.
|
||||
ARROW_PYTHON_EXPORT int check_status(const Status& status);
|
||||
|
||||
// Convert status to a Python exception object. Status must not be ok.
|
||||
ARROW_PYTHON_EXPORT PyObject* convert_status(const Status& status);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
|
||||
} // extern "C++"
|
||||
@@ -0,0 +1,19 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// For backward compatibility.
|
||||
#include "arrow/python/lib_api.h"
|
||||
@@ -0,0 +1,19 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// For backward compatibility.
|
||||
#include "arrow/python/lib.h"
|
||||
@@ -0,0 +1,894 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include "platform.h"
|
||||
|
||||
#include "arrow/array.h"
|
||||
#include "arrow/array/builder_binary.h"
|
||||
#include "arrow/table.h"
|
||||
#include "arrow/util/decimal.h"
|
||||
#include "arrow/util/logging.h"
|
||||
|
||||
#include "arrow/python/arrow_to_pandas.h"
|
||||
#include "arrow/python/decimal.h"
|
||||
#include "arrow/python/helpers.h"
|
||||
#include "arrow/python/numpy_convert.h"
|
||||
#include "arrow/python/numpy_interop.h"
|
||||
#include "arrow/python/python_test.h"
|
||||
#include "arrow/python/python_to_arrow.h"
|
||||
|
||||
// Lightweight assertion macros for Status-returning test functions: when a
// check fails, the enclosing function returns Status::Invalid with a message
// that quotes the checked expression(s).

// Fails unless `x == y` (evaluated as `!(x != y)`).
#define ASSERT_EQ(x, y)                                                       \
  {                                                                           \
    auto&& _lhs = (x);                                                        \
    auto&& _rhs = (y);                                                        \
    if (_lhs != _rhs) {                                                       \
      return Status::Invalid("Expected equality between `", #x, "` and `", #y, \
                             "`, but ", arrow::py::testing::ToString(_lhs),   \
                             " != ", arrow::py::testing::ToString(_rhs));     \
    }                                                                         \
  }

// Fails when `x == y`.
#define ASSERT_NE(x, y)                                                         \
  {                                                                             \
    auto&& _lhs = (x);                                                          \
    auto&& _rhs = (y);                                                          \
    if (_lhs == _rhs) {                                                         \
      return Status::Invalid("Expected inequality between `", #x, "` and `", #y, \
                             "`, but ", arrow::py::testing::ToString(_lhs),     \
                             " == ", arrow::py::testing::ToString(_rhs));       \
    }                                                                           \
  }

// Fails when `v` is truthy.
#define ASSERT_FALSE(v)                                                            \
  {                                                                                \
    auto&& _val = (v);                                                             \
    if (!!_val) {                                                                  \
      return Status::Invalid("Expected `", #v, "` to evaluate to false, but got ", \
                             arrow::py::testing::ToString(_val));                  \
    }                                                                              \
  }

// Fails when `v` is falsy.
#define ASSERT_TRUE(v)                                                            \
  {                                                                               \
    auto&& _val = (v);                                                            \
    if (!_val) {                                                                  \
      return Status::Invalid("Expected `", #v, "` to evaluate to true, but got ", \
                             arrow::py::testing::ToString(_val));                 \
    }                                                                             \
  }

// Like ASSERT_FALSE, with `msg` appended to the failure message.
#define ASSERT_FALSE_MSG(v, msg)                                                   \
  {                                                                                \
    auto&& _val = (v);                                                             \
    if (!!_val) {                                                                  \
      return Status::Invalid("Expected `", #v, "` to evaluate to false, but got ", \
                             arrow::py::testing::ToString(_val), ": ", msg);       \
    }                                                                              \
  }

// Like ASSERT_TRUE, with `msg` appended to the failure message.
#define ASSERT_TRUE_MSG(v, msg)                                                   \
  {                                                                               \
    auto&& _val = (v);                                                            \
    if (!_val) {                                                                  \
      return Status::Invalid("Expected `", #v, "` to evaluate to true, but got ", \
                             arrow::py::testing::ToString(_val), ": ", msg);      \
    }                                                                             \
  }

// Fails unless `expr` converts to an OK Status.
#define ASSERT_OK(expr)                                                     \
  {                                                                         \
    ::arrow::Status _st = ::arrow::ToStatus((expr));                        \
    if (!_st.ok()) {                                                        \
      return Status::Invalid("`", #expr, "` failed with ", _st.ToString()); \
    }                                                                       \
  }

// Fails unless `expr` converts to a Status for which Is<code>() holds.
#define ASSERT_RAISES(code, expr)                                           \
  {                                                                         \
    ::arrow::Status _st_expr = ::arrow::ToStatus((expr));                   \
    if (!_st_expr.Is##code()) {                                             \
      return Status::Invalid("Expected `", #expr, "` to fail with ", #code, \
                             ", but got ", _st_expr.ToString());            \
    }                                                                       \
  }
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using internal::checked_cast;
|
||||
|
||||
namespace py {
|
||||
namespace testing {
|
||||
|
||||
// ARROW-17938: Some standard libraries have ambiguous operator<<(nullptr_t),
|
||||
// work around it using a custom printer function.
|
||||
|
||||
// Render `t` by streaming it into a string stream.
template <typename T>
std::string ToString(const T& t) {
  std::ostringstream out;
  out << t;
  return out.str();
}

// nullptr is never streamed; it is rendered as the fixed text "nullptr".
template <>
std::string ToString(const std::nullptr_t&) {
  return std::string("nullptr");
}
|
||||
|
||||
namespace {
|
||||
|
||||
// Checks that moving an OwnedRef into a vector empties the source and that
// both lists end up with a reference count of 1.
Status TestOwnedRefMoves() {
  std::vector<OwnedRef> vec;
  PyObject* u = PyList_New(0);
  PyObject* v = PyList_New(0);

  {
    OwnedRef ref(u);
    vec.push_back(std::move(ref));
    // The moved-from wrapper must no longer point at the list.
    ASSERT_EQ(ref.obj(), nullptr);
  }
  vec.emplace_back(v);
  ASSERT_EQ(Py_REFCNT(u), 1);
  ASSERT_EQ(Py_REFCNT(v), 1);
  return Status::OK();
}
|
||||
|
||||
// Same scenario as the OwnedRef move test, but the moved-from wrapper is an
// OwnedRefNoGIL and the `lock` guard is released except while the lists are
// being created.
Status TestOwnedRefNoGILMoves() {
  PyAcquireGIL lock;
  lock.release();

  std::vector<OwnedRef> vec;
  PyObject* u = nullptr;
  PyObject* v = nullptr;
  {
    // Only hold the lock while creating the Python lists.
    lock.acquire();
    u = PyList_New(0);
    v = PyList_New(0);
    lock.release();
  }
  {
    OwnedRefNoGIL ref(u);
    vec.push_back(std::move(ref));
    // The moved-from wrapper must no longer point at the list.
    ASSERT_EQ(ref.obj(), nullptr);
  }
  vec.emplace_back(v);
  ASSERT_EQ(Py_REFCNT(u), 1);
  ASSERT_EQ(Py_REFCNT(v), 1);
  return Status::OK();
}
|
||||
|
||||
// Build the text "Python exception: <class>: <value>\n" that the tests below
// compare against a Status detail.
std::string FormatPythonException(const std::string& exc_class_name,
                                  const std::string& exc_value) {
  std::string formatted = "Python exception: ";
  formatted += exc_class_name;
  formatted += ": ";
  formatted += exc_value;
  formatted += "\n";
  return formatted;
}
|
||||
|
||||
// Checks how CheckPyError() maps pending Python exceptions of various types
// to Status codes, messages and (where checked) details, and that it clears
// the Python error indicator.
Status TestCheckPyErrorStatus() {
  Status st;

  // Converts the pending Python error into `st` and validates its message and,
  // when `expected_detail` is non-empty, its detail string.
  auto check_error = [](Status& st, const char* expected_message = "some error",
                        const std::string& expected_detail = "") {
    st = CheckPyError();
    ASSERT_EQ(st.message(), expected_message);
    ASSERT_FALSE(PyErr_Occurred());
    if (!expected_detail.empty()) {
      auto detail = st.detail();
      ASSERT_NE(detail, nullptr);
      ASSERT_EQ(detail->ToString(), expected_detail);
    }
    return Status::OK();
  };

  for (PyObject* exc_type : {PyExc_Exception, PyExc_SyntaxError}) {
    PyErr_SetString(exc_type, "some error");
    ASSERT_OK(check_error(st));
    ASSERT_TRUE(st.IsUnknownError());
  }

  PyErr_SetString(PyExc_TypeError, "some error");
  ASSERT_OK(
      check_error(st, "some error", FormatPythonException("TypeError", "some error")));
  ASSERT_TRUE(st.IsTypeError());

  PyErr_SetString(PyExc_ValueError, "some error");
  ASSERT_OK(check_error(st));
  ASSERT_TRUE(st.IsInvalid());

  // The expected message for KeyError carries quotes around the text.
  PyErr_SetString(PyExc_KeyError, "some error");
  ASSERT_OK(check_error(st, "'some error'"));
  ASSERT_TRUE(st.IsKeyError());

  for (PyObject* exc_type : {PyExc_OSError, PyExc_IOError}) {
    PyErr_SetString(exc_type, "some error");
    ASSERT_OK(check_error(st));
    ASSERT_TRUE(st.IsIOError());
  }

  PyErr_SetString(PyExc_NotImplementedError, "some error");
  ASSERT_OK(check_error(st, "some error",
                        FormatPythonException("NotImplementedError", "some error")));
  ASSERT_TRUE(st.IsNotImplemented());

  // No override if a specific status code is given
  PyErr_SetString(PyExc_TypeError, "some error");
  st = CheckPyError(StatusCode::SerializationError);
  ASSERT_TRUE(st.IsSerializationError());
  ASSERT_EQ(st.message(), "some error");
  ASSERT_FALSE(PyErr_Occurred());

  return Status::OK();
}
|
||||
|
||||
// Checks that a Status produced by ConvertPyError() can still be inspected
// after the `lock` guard has been released.
Status TestCheckPyErrorStatusNoGIL() {
  PyAcquireGIL lock;

  Status st;
  PyErr_SetString(PyExc_ZeroDivisionError, "zzzt");
  st = ConvertPyError();
  ASSERT_FALSE(PyErr_Occurred());
  // Everything below only reads the Status.
  lock.release();
  ASSERT_TRUE(st.IsUnknownError());
  ASSERT_EQ(st.message(), "zzzt");
  ASSERT_EQ(st.detail()->ToString(),
            FormatPythonException("ZeroDivisionError", "zzzt"));
  return Status::OK();
}
|
||||
|
||||
// Round-trips a Python exception: converts it to a Status, restores it with
// RestorePyError() and checks that the restored exception has the original
// type and message.
Status TestRestorePyErrorBasics() {
  PyErr_SetString(PyExc_ZeroDivisionError, "zzzt");
  auto st = ConvertPyError();
  ASSERT_FALSE(PyErr_Occurred());
  ASSERT_TRUE(st.IsUnknownError());
  ASSERT_EQ(st.message(), "zzzt");
  ASSERT_EQ(st.detail()->ToString(), FormatPythonException("ZeroDivisionError", "zzzt"));

  RestorePyError(st);
  ASSERT_TRUE(PyErr_Occurred());
  PyObject* exc_type = nullptr;
  PyObject* exc_value = nullptr;
  PyObject* exc_traceback = nullptr;
  PyErr_Fetch(&exc_type, &exc_value, &exc_traceback);
  // PyErr_Fetch() transfers ownership of the three objects (any of which may
  // be null) to the caller. Hand them to guards so they are released on every
  // exit path, including early returns from failed assertions.
  OwnedRef exc_type_guard(exc_type);
  OwnedRef exc_value_guard(exc_value);
  OwnedRef exc_traceback_guard(exc_traceback);
  ASSERT_TRUE(PyErr_GivenExceptionMatches(exc_type, PyExc_ZeroDivisionError));
  std::string py_message;
  ASSERT_OK(internal::PyObject_StdStringStr(exc_value, &py_message));
  ASSERT_EQ(py_message, "zzzt");

  return Status::OK();
}
|
||||
|
||||
// Checks that PyBuffer::FromPyObject() on None fails with a Python-originated
// error, leaves no Python error pending and does not change the reference
// count of its input.
Status TestPyBufferInvalidInputObject() {
  PyObject* input = Py_None;
  auto old_refcnt = Py_REFCNT(input);
  {
    Status st = PyBuffer::FromPyObject(input).status();
    ASSERT_TRUE_MSG(IsPyError(st), st.ToString());
    ASSERT_FALSE(PyErr_Occurred());
  }
  ASSERT_EQ(old_refcnt, Py_REFCNT(input));
  return Status::OK();
}
|
||||
|
||||
// Because of how it is declared, the Numpy C API instance initialized
|
||||
// within libarrow_python.dll may not be visible in this test under Windows
|
||||
// ("unresolved external symbol arrow_ARRAY_API referenced").
|
||||
#ifndef _WIN32
|
||||
// Checks PyBuffer::FromPyObject() on a NumPy array: the buffer aliases the
// array data, holds one extra reference while alive, and reports mutability
// according to the array's WRITEABLE flag.
Status TestPyBufferNumpyArray() {
  npy_intp dims[] = {10};

  OwnedRef arr_ref(PyArray_SimpleNew(1, dims, NPY_FLOAT));
  PyObject* arr = arr_ref.obj();
  ASSERT_NE(arr, nullptr);
  auto old_refcnt = Py_REFCNT(arr);

  // Writable array
  auto buf = std::move(PyBuffer::FromPyObject(arr)).ValueOrDie();
  ASSERT_TRUE(buf->is_cpu());
  ASSERT_EQ(buf->data(), PyArray_DATA(reinterpret_cast<PyArrayObject*>(arr)));
  ASSERT_TRUE(buf->is_mutable());
  ASSERT_EQ(buf->mutable_data(), buf->data());
  ASSERT_EQ(old_refcnt + 1, Py_REFCNT(arr));
  // Dropping the buffer must give the reference back.
  buf.reset();
  ASSERT_EQ(old_refcnt, Py_REFCNT(arr));

  // Read-only
  PyArray_CLEARFLAGS(reinterpret_cast<PyArrayObject*>(arr), NPY_ARRAY_WRITEABLE);
  buf = std::move(PyBuffer::FromPyObject(arr)).ValueOrDie();
  ASSERT_TRUE(buf->is_cpu());
  ASSERT_EQ(buf->data(), PyArray_DATA(reinterpret_cast<PyArrayObject*>(arr)));
  ASSERT_FALSE(buf->is_mutable());
  ASSERT_EQ(old_refcnt + 1, Py_REFCNT(arr));
  buf.reset();
  ASSERT_EQ(old_refcnt, Py_REFCNT(arr));

  return Status::OK();
}
|
||||
|
||||
// Checks NumPyBuffer on a NumPy array: the buffer aliases the array data,
// holds one extra reference while alive, and reports mutability according to
// the array's WRITEABLE flag.
Status TestNumPyBufferNumpyArray() {
  npy_intp dims[] = {10};

  OwnedRef arr_ref(PyArray_SimpleNew(1, dims, NPY_FLOAT));
  PyObject* arr = arr_ref.obj();
  ASSERT_NE(arr, nullptr);
  auto old_refcnt = Py_REFCNT(arr);

  // Writable array
  auto buf = std::make_shared<NumPyBuffer>(arr);
  ASSERT_TRUE(buf->is_cpu());
  ASSERT_EQ(buf->data(), PyArray_DATA(reinterpret_cast<PyArrayObject*>(arr)));
  ASSERT_TRUE(buf->is_mutable());
  ASSERT_EQ(buf->mutable_data(), buf->data());
  ASSERT_EQ(old_refcnt + 1, Py_REFCNT(arr));
  // Dropping the buffer must give the reference back.
  buf.reset();
  ASSERT_EQ(old_refcnt, Py_REFCNT(arr));

  // Read-only
  PyArray_CLEARFLAGS(reinterpret_cast<PyArrayObject*>(arr), NPY_ARRAY_WRITEABLE);
  buf = std::make_shared<NumPyBuffer>(arr);
  ASSERT_TRUE(buf->is_cpu());
  ASSERT_EQ(buf->data(), PyArray_DATA(reinterpret_cast<PyArrayObject*>(arr)));
  ASSERT_FALSE(buf->is_mutable());
  ASSERT_EQ(old_refcnt + 1, Py_REFCNT(arr));
  buf.reset();
  ASSERT_EQ(old_refcnt, Py_REFCNT(arr));

  return Status::OK();
}
|
||||
#endif
|
||||
|
||||
// Checks that a Python Decimal built from a string can be converted back to a
// string without error.
Status TestPythonDecimalToString() {
  OwnedRef decimal_constructor_;
  OwnedRef decimal_module;

  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
  RETURN_NOT_OK(
      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));

  std::string decimal_string("-39402950693754869342983");
  PyObject* python_object =
      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string);
  // Release the created Decimal on every exit path instead of leaking it.
  // NOTE(review): assumes DecimalFromString returns a new reference -- confirm.
  OwnedRef python_object_guard(python_object);
  ASSERT_NE(python_object, nullptr);

  std::string string_result;
  ASSERT_OK(internal::PythonDecimalToString(python_object, &string_result));
  // NOTE(review): string_result is not compared against decimal_string here.

  return Status::OK();
}
|
||||
|
||||
// Checks the precision and scale that DecimalMetadata infers for a negative
// decimal with 18 integer digits and 5 fractional digits.
Status TestInferPrecisionAndScale() {
  OwnedRef decimal_constructor_;
  OwnedRef decimal_module;

  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
  RETURN_NOT_OK(
      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));

  std::string decimal_string("-394029506937548693.42983");
  PyObject* python_decimal =
      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string);
  // Release the created Decimal on every exit path instead of leaking it.
  // NOTE(review): assumes DecimalFromString returns a new reference -- confirm.
  OwnedRef python_decimal_guard(python_decimal);
  // Fail cleanly rather than passing a null object on.
  ASSERT_NE(python_decimal, nullptr);

  internal::DecimalMetadata metadata;
  ASSERT_OK(metadata.Update(python_decimal));

  const auto expected_precision =
      static_cast<int32_t>(decimal_string.size() - 2);  // 1 for -, 1 for .
  const int32_t expected_scale = 5;

  ASSERT_EQ(expected_precision, metadata.precision());
  ASSERT_EQ(expected_scale, metadata.scale());

  return Status::OK();
}
|
||||
|
||||
// Checks the precision and scale that DecimalMetadata infers for a decimal
// written in exponential notation with a positive exponent.
Status TestInferPrecisionAndNegativeScale() {
  OwnedRef decimal_constructor_;
  OwnedRef decimal_module;

  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
  RETURN_NOT_OK(
      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));

  std::string decimal_string("-3.94042983E+10");
  PyObject* python_decimal =
      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string);
  // Release the created Decimal on every exit path instead of leaking it.
  // NOTE(review): assumes DecimalFromString returns a new reference -- confirm.
  OwnedRef python_decimal_guard(python_decimal);
  // Fail cleanly rather than passing a null object on.
  ASSERT_NE(python_decimal, nullptr);

  internal::DecimalMetadata metadata;
  ASSERT_OK(metadata.Update(python_decimal));

  const auto expected_precision = 11;
  const int32_t expected_scale = 0;

  ASSERT_EQ(expected_precision, metadata.precision());
  ASSERT_EQ(expected_scale, metadata.scale());

  return Status::OK();
}
|
||||
|
||||
// Checks precision/scale inference for a value below one ("0.001"): the test
// expects both the precision and the scale to be 3.
Status TestInferAllLeadingZeros() {
  OwnedRef decimal_constructor_;
  OwnedRef decimal_module;

  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
  RETURN_NOT_OK(
      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));

  std::string decimal_string("0.001");
  // Owned wrapper: the Decimal is released when the test returns.
  OwnedRef python_decimal(
      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string));
  ASSERT_NE(python_decimal.obj(), nullptr);

  internal::DecimalMetadata metadata;
  ASSERT_OK(metadata.Update(python_decimal.obj()));
  ASSERT_EQ(3, metadata.precision());
  ASSERT_EQ(3, metadata.scale());

  return Status::OK();
}
|
||||
|
||||
// Checks precision/scale inference for "0.01E5": the test expects a precision
// of 4 and a scale of 0.
Status TestInferAllLeadingZerosExponentialNotationPositive() {
  OwnedRef decimal_constructor_;
  OwnedRef decimal_module;

  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
  RETURN_NOT_OK(
      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));

  std::string decimal_string("0.01E5");
  // Owned wrapper: the Decimal is released when the test returns.
  OwnedRef python_decimal(
      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string));
  ASSERT_NE(python_decimal.obj(), nullptr);

  internal::DecimalMetadata metadata;
  ASSERT_OK(metadata.Update(python_decimal.obj()));
  ASSERT_EQ(4, metadata.precision());
  ASSERT_EQ(0, metadata.scale());

  return Status::OK();
}
|
||||
|
||||
// Checks precision/scale inference for "0.01E3": the test expects a precision
// of 2 and a scale of 0.
// NOTE(review): despite the "Negative" in the name, the literal below uses a
// positive exponent; confirm whether a negative exponent was intended.
Status TestInferAllLeadingZerosExponentialNotationNegative() {
  OwnedRef decimal_constructor_;
  OwnedRef decimal_module;

  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
  RETURN_NOT_OK(
      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));

  std::string decimal_string("0.01E3");
  // Owned wrapper: the Decimal is released when the test returns.
  OwnedRef python_decimal(
      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string));
  ASSERT_NE(python_decimal.obj(), nullptr);

  internal::DecimalMetadata metadata;
  ASSERT_OK(metadata.Update(python_decimal.obj()));
  ASSERT_EQ(2, metadata.precision());
  ASSERT_EQ(0, metadata.scale());

  return Status::OK();
}
|
||||
|
||||
// Builds a three-column utf8 table whose cells all hold the single byte 0xf1
// and verifies that converting it to pandas (with threads enabled and the GIL
// released around the call) reports an UnknownError.
Status TestObjectBlockWriteFails() {
  const char value[] = {'\xf1', '\0'};
  const auto value_length = static_cast<int32_t>(strlen(value));

  StringBuilder builder;
  for (int row = 0; row < 1000; ++row) {
    ASSERT_OK(builder.Append(value, value_length));
  }

  std::shared_ptr<Array> arr;
  ASSERT_OK(builder.Finish(&arr));

  // Every column shares the same underlying array.
  std::vector<std::shared_ptr<Field>> fields = {
      field("f1", utf8()), field("f2", utf8()), field("f3", utf8())};
  std::vector<std::shared_ptr<Array>> cols = {arr, arr, arr};

  auto table = Table::Make(::arrow::schema(fields), cols);

  Status st;
  // The locals below live inside the scope opened by Py_BEGIN_ALLOW_THREADS.
  Py_BEGIN_ALLOW_THREADS;
  PyObject* out;
  PandasOptions options;
  options.use_threads = true;
  st = ConvertTableToPandas(options, table, &out);
  Py_END_ALLOW_THREADS;
  ASSERT_RAISES(UnknownError, st);

  return Status::OK();
}
|
||||
|
||||
// Verifies that a list holding a str, an int and a float cannot be converted
// by ConvertPySequence with inferred type: a TypeError is expected.
Status TestMixedTypeFails() {
  OwnedRef list_ref(PyList_New(3));
  PyObject* py_list = list_ref.obj();
  ASSERT_NE(py_list, nullptr);

  PyObject* text_item = PyUnicode_FromString("abc");
  ASSERT_NE(text_item, nullptr);

  PyObject* int_item = PyLong_FromLong(1234L);
  ASSERT_NE(int_item, nullptr);

  PyObject* float_item = PyFloat_FromDouble(123.0234);
  ASSERT_NE(float_item, nullptr);

  // PyList_SetItem steals the reference to each item, so only the list itself
  // has to be released (list_ref takes care of that).
  ASSERT_EQ(PyList_SetItem(py_list, 0, text_item), 0);
  ASSERT_EQ(PyList_SetItem(py_list, 1, int_item), 0);
  ASSERT_EQ(PyList_SetItem(py_list, 2, float_item), 0);

  ASSERT_RAISES(TypeError, ConvertPySequence(py_list, nullptr, {}));

  return Status::OK();
}
|
||||
|
||||
template <typename DecimalValue>
|
||||
Status DecimalTestFromPythonDecimalRescale(std::shared_ptr<DataType> type,
|
||||
PyObject* python_decimal,
|
||||
std::optional<int> expected) {
|
||||
DecimalValue value;
|
||||
const auto& decimal_type = checked_cast<const DecimalType&>(*type);
|
||||
|
||||
if (expected.has_value()) {
|
||||
ASSERT_OK(internal::DecimalFromPythonDecimal(python_decimal, decimal_type, &value));
|
||||
ASSERT_EQ(expected.value(), value);
|
||||
|
||||
ASSERT_OK(internal::DecimalFromPyObject(python_decimal, decimal_type, &value));
|
||||
ASSERT_EQ(expected.value(), value);
|
||||
} else {
|
||||
ASSERT_RAISES(Invalid, internal::DecimalFromPythonDecimal(python_decimal,
|
||||
decimal_type, &value));
|
||||
ASSERT_RAISES(Invalid,
|
||||
internal::DecimalFromPyObject(python_decimal, decimal_type, &value));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Verifies that converting Decimal("1.001") to a decimal type of scale 2 is
// rejected, for both Decimal128 and Decimal256.
Status TestFromPythonDecimalRescaleNotTruncateable() {
  OwnedRef decimal_constructor_;
  OwnedRef decimal_module;

  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
  RETURN_NOT_OK(
      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));

  std::string decimal_string("1.001");
  // Owned wrapper: the Decimal is released when the test returns.
  OwnedRef python_decimal(
      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string));
  ASSERT_NE(python_decimal.obj(), nullptr);

  // We fail when truncating values that would lose data if cast to a decimal type with
  // lower scale
  ASSERT_OK(DecimalTestFromPythonDecimalRescale<Decimal128>(
      ::arrow::decimal128(10, 2), python_decimal.obj(), {}));
  ASSERT_OK(DecimalTestFromPythonDecimalRescale<Decimal256>(
      ::arrow::decimal256(10, 2), python_decimal.obj(), {}));

  return Status::OK();
}
|
||||
|
||||
// Verifies that Decimal("1.000") can be converted to a decimal type of scale 2
// (expected unscaled value 100), for both Decimal128 and Decimal256.
Status TestFromPythonDecimalRescaleTruncateable() {
  OwnedRef decimal_constructor_;
  OwnedRef decimal_module;

  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
  RETURN_NOT_OK(
      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));

  std::string decimal_string("1.000");
  // Owned wrapper: the Decimal is released when the test returns.
  OwnedRef python_decimal(
      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string));
  ASSERT_NE(python_decimal.obj(), nullptr);

  // We allow truncation of values that do not lose precision when dividing by 10 to the
  // power of the difference between the scales, e.g., 1.000 -> 1.00
  ASSERT_OK(DecimalTestFromPythonDecimalRescale<Decimal128>(
      ::arrow::decimal128(10, 2), python_decimal.obj(), 100));
  ASSERT_OK(DecimalTestFromPythonDecimalRescale<Decimal256>(
      ::arrow::decimal256(10, 2), python_decimal.obj(), 100));

  return Status::OK();
}
|
||||
|
||||
// Verifies that Decimal("-1.000") converted to a decimal type of scale 9
// yields the unscaled value -1000000000, for both Decimal128 and Decimal256.
Status TestFromPythonNegativeDecimalRescale() {
  OwnedRef decimal_constructor_;
  OwnedRef decimal_module;

  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
  RETURN_NOT_OK(
      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));

  std::string decimal_string("-1.000");
  // Owned wrapper: the Decimal is released when the test returns.
  OwnedRef python_decimal(
      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string));
  ASSERT_NE(python_decimal.obj(), nullptr);

  ASSERT_OK(DecimalTestFromPythonDecimalRescale<Decimal128>(
      ::arrow::decimal128(10, 9), python_decimal.obj(), -1000000000));
  ASSERT_OK(DecimalTestFromPythonDecimalRescale<Decimal256>(
      ::arrow::decimal256(10, 9), python_decimal.obj(), -1000000000));

  return Status::OK();
}
|
||||
|
||||
// Converts the Python integer 42 to decimal128(10, 2) and expects the unscaled
// value 4200.
Status TestDecimal128FromPythonInteger() {
  auto target_type = ::arrow::decimal128(10, 2);
  const auto& decimal_type = checked_cast<const DecimalType&>(*target_type);

  OwnedRef py_int(PyLong_FromLong(42));

  Decimal128 converted;
  ASSERT_OK(internal::DecimalFromPyObject(py_int.obj(), decimal_type, &converted));
  ASSERT_EQ(4200, converted);
  return Status::OK();
}
|
||||
|
||||
// Converts the Python integer 42 to decimal256(10, 2) and expects the unscaled
// value 4200.
Status TestDecimal256FromPythonInteger() {
  auto target_type = ::arrow::decimal256(10, 2);
  const auto& decimal_type = checked_cast<const DecimalType&>(*target_type);

  OwnedRef py_int(PyLong_FromLong(42));

  Decimal256 converted;
  ASSERT_OK(internal::DecimalFromPyObject(py_int.obj(), decimal_type, &converted));
  ASSERT_EQ(4200, converted);
  return Status::OK();
}
|
||||
|
||||
// Verifies that a 38-digit Decimal with scale 1 is inferred as (38, 1) and that
// converting it to a (38, 38) decimal type is rejected with Invalid.
Status TestDecimal128OverflowFails() {
  Decimal128 value;
  OwnedRef decimal_constructor_;
  OwnedRef decimal_module;

  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
  RETURN_NOT_OK(
      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));

  std::string decimal_string("9999999999999999999999999999999999999.9");
  // Owned wrapper: the Decimal is released when the test returns.
  OwnedRef python_decimal(
      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string));
  ASSERT_NE(python_decimal.obj(), nullptr);

  internal::DecimalMetadata metadata;
  ASSERT_OK(metadata.Update(python_decimal.obj()));
  ASSERT_EQ(38, metadata.precision());
  ASSERT_EQ(1, metadata.scale());

  auto type = ::arrow::smallest_decimal(38, 38);
  const auto& decimal_type = checked_cast<const DecimalType&>(*type);
  ASSERT_RAISES(Invalid, internal::DecimalFromPythonDecimal(python_decimal.obj(),
                                                            decimal_type, &value));
  return Status::OK();
}
|
||||
|
||||
// Verifies that a 76-digit Decimal with scale 1 is inferred as (76, 1) and that
// converting it to a (76, 76) decimal type is rejected with Invalid.
Status TestDecimal256OverflowFails() {
  Decimal256 value;
  OwnedRef decimal_constructor_;
  OwnedRef decimal_module;

  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
  RETURN_NOT_OK(
      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));

  std::string decimal_string(
      "999999999999999999999999999999999999999999999999999999999999999999999999999.9");
  // Owned wrapper: the Decimal is released when the test returns.
  OwnedRef python_decimal(
      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string));
  ASSERT_NE(python_decimal.obj(), nullptr);

  internal::DecimalMetadata metadata;
  ASSERT_OK(metadata.Update(python_decimal.obj()));
  ASSERT_EQ(76, metadata.precision());
  ASSERT_EQ(1, metadata.scale());

  auto type = ::arrow::smallest_decimal(76, 76);
  const auto& decimal_type = checked_cast<const DecimalType&>(*type);
  ASSERT_RAISES(Invalid, internal::DecimalFromPythonDecimal(python_decimal.obj(),
                                                            decimal_type, &value));
  return Status::OK();
}
|
||||
|
||||
// Builds the list [Decimal("1.234"), None, float NaN, Decimal("nan")] and
// checks that ConvertPySequence
//   * fails with TypeError under default options, and
//   * with from_pandas enabled produces a single chunk in which only the first
//     element is valid and the other three are null.
Status TestNoneAndNaN() {
  OwnedRef list_ref(PyList_New(4));
  PyObject* py_list = list_ref.obj();
  ASSERT_NE(py_list, nullptr);

  OwnedRef decimal_constructor_;
  OwnedRef decimal_module;
  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
  RETURN_NOT_OK(
      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));
  PyObject* decimal_ctor = decimal_constructor_.obj();

  PyObject* valid_item = internal::DecimalFromString(decimal_ctor, "1.234");
  ASSERT_NE(valid_item, nullptr);

  // The list will steal a reference, so take one on None first.
  Py_INCREF(Py_None);
  PyObject* none_item = Py_None;
  ASSERT_NE(none_item, nullptr);

  PyObject* float_nan_item = PyFloat_FromDouble(NPY_NAN);
  ASSERT_NE(float_nan_item, nullptr);

  PyObject* decimal_nan_item = internal::DecimalFromString(decimal_ctor, "nan");
  ASSERT_NE(decimal_nan_item, nullptr);

  // PyList_SetItem steals the reference to each item, so only the list itself
  // needs releasing afterwards.
  ASSERT_EQ(0, PyList_SetItem(py_list, 0, valid_item));
  ASSERT_EQ(0, PyList_SetItem(py_list, 1, none_item));
  ASSERT_EQ(0, PyList_SetItem(py_list, 2, float_nan_item));
  ASSERT_EQ(0, PyList_SetItem(py_list, 3, decimal_nan_item));

  PyConversionOptions options;
  ASSERT_RAISES(TypeError, ConvertPySequence(py_list, nullptr, options));

  options.from_pandas = true;
  auto conversion = ConvertPySequence(py_list, nullptr, options);
  auto chunked = std::move(conversion).ValueOrDie();
  ASSERT_EQ(chunked->num_chunks(), 1);

  auto first_chunk = chunked->chunk(0);
  ASSERT_TRUE(first_chunk->IsValid(0));
  ASSERT_TRUE(first_chunk->IsNull(1));
  ASSERT_TRUE(first_chunk->IsNull(2));
  ASSERT_TRUE(first_chunk->IsNull(3));

  return Status::OK();
}
|
||||
|
||||
// Converts a list of Decimals with differing precision/scale and checks that
// the resulting decimal type has precision 9 and scale 3.
Status TestMixedPrecisionAndScale() {
  const std::vector<std::string> strings{"0.001", "1.01E5", "1.01E5"};

  OwnedRef list_ref(PyList_New(static_cast<Py_ssize_t>(strings.size())));
  PyObject* list = list_ref.obj();

  ASSERT_NE(list, nullptr);

  OwnedRef decimal_constructor_;
  OwnedRef decimal_module;
  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
  RETURN_NOT_OK(
      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));
  // PyList_SetItem steals a reference to the item so we don't decref it later
  PyObject* decimal_constructor = decimal_constructor_.obj();
  for (Py_ssize_t i = 0; i < static_cast<Py_ssize_t>(strings.size()); ++i) {
    PyObject* item = internal::DecimalFromString(decimal_constructor, strings.at(i));
    // PyList_SetItem accepts a NULL item without complaint, so a failed
    // construction has to be caught here rather than inside the conversion.
    ASSERT_NE(item, nullptr);
    const int result = PyList_SetItem(list, i, item);
    ASSERT_EQ(0, result);
  }

  auto arr = std::move(ConvertPySequence(list, nullptr, {})).ValueOrDie();
  const auto& type = checked_cast<const DecimalType&>(*arr->type());

  int32_t expected_precision = 9;
  int32_t expected_scale = 3;
  ASSERT_EQ(expected_precision, type.precision());
  ASSERT_EQ(expected_scale, type.scale());

  return Status::OK();
}
|
||||
|
||||
// Converts [Decimal("0.01"), Decimal("0.001")] and checks that the result is a
// Decimal128 type with precision 3 and scale 3.
Status TestMixedPrecisionAndScaleSequenceConvert() {
  OwnedRef decimal_constructor_;
  OwnedRef decimal_module;

  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
  RETURN_NOT_OK(
      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));

  // Create the list before the items so that a failed allocation is detected
  // (as the sibling tests do) before any item reference exists.
  OwnedRef list_ref(PyList_New(2));
  PyObject* list = list_ref.obj();
  ASSERT_NE(list, nullptr);

  std::string decimal_string_1("0.01");
  PyObject* value1 =
      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string_1);
  ASSERT_NE(value1, nullptr);
  // This steals the reference to value1, so only the list needs releasing.
  ASSERT_EQ(PyList_SetItem(list, 0, value1), 0);

  std::string decimal_string_2("0.001");
  PyObject* value2 =
      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string_2);
  ASSERT_NE(value2, nullptr);
  // Likewise for value2.
  ASSERT_EQ(PyList_SetItem(list, 1, value2), 0);

  auto arr = std::move(ConvertPySequence(list, nullptr, {})).ValueOrDie();
  const auto& type = checked_cast<const Decimal128Type&>(*arr->type());
  ASSERT_EQ(3, type.precision());
  ASSERT_EQ(3, type.scale());

  return Status::OK();
}
|
||||
|
||||
// Checks that Decimal("0.01") is inferred with precision 2 and scale 2.
Status TestSimpleInference() {
  OwnedRef decimal_constructor_;
  OwnedRef decimal_module;

  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
  RETURN_NOT_OK(
      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));

  std::string decimal_string("0.01");
  // Owned wrapper: the Decimal is released when the test returns.
  OwnedRef value(
      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string));
  ASSERT_NE(value.obj(), nullptr);

  internal::DecimalMetadata metadata;
  ASSERT_OK(metadata.Update(value.obj()));
  ASSERT_EQ(2, metadata.precision());
  ASSERT_EQ(2, metadata.scale());

  return Status::OK();
}
|
||||
|
||||
// Checks that updating DecimalMetadata with Decimal("nan") succeeds and leaves
// precision and scale at std::numeric_limits<int32_t>::min().
Status TestUpdateWithNaN() {
  internal::DecimalMetadata metadata;
  OwnedRef decimal_constructor_;
  OwnedRef decimal_module;
  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
  RETURN_NOT_OK(
      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));

  std::string decimal_string("nan");
  // Owned wrapper: the Decimal is released when the test returns.
  OwnedRef nan_value(
      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string));
  ASSERT_NE(nan_value.obj(), nullptr);

  ASSERT_OK(metadata.Update(nan_value.obj()));
  ASSERT_EQ(std::numeric_limits<int32_t>::min(), metadata.precision());
  ASSERT_EQ(std::numeric_limits<int32_t>::min(), metadata.scale());

  return Status::OK();
}
|
||||
|
||||
} // namespace
|
||||
|
||||
std::vector<TestCase> GetCppTestCases() {
|
||||
return {
|
||||
{"test_owned_ref_moves", TestOwnedRefMoves},
|
||||
{"test_owned_ref_nogil_moves", TestOwnedRefNoGILMoves},
|
||||
{"test_check_pyerror_status", TestCheckPyErrorStatus},
|
||||
{"test_check_pyerror_status_nogil", TestCheckPyErrorStatusNoGIL},
|
||||
{"test_restore_pyerror_basics", TestRestorePyErrorBasics},
|
||||
{"test_pybuffer_invalid_input_object", TestPyBufferInvalidInputObject},
|
||||
#ifndef _WIN32
|
||||
{"test_pybuffer_numpy_array", TestPyBufferNumpyArray},
|
||||
{"test_numpybuffer_numpy_array", TestNumPyBufferNumpyArray},
|
||||
#endif
|
||||
{"test_python_decimal_to_string", TestPythonDecimalToString},
|
||||
{"test_infer_precision_and_scale", TestInferPrecisionAndScale},
|
||||
{"test_infer_precision_and_negative_scale", TestInferPrecisionAndNegativeScale},
|
||||
{"test_infer_all_leading_zeros", TestInferAllLeadingZeros},
|
||||
{"test_infer_all_leading_zeros_exponential_notation_positive",
|
||||
TestInferAllLeadingZerosExponentialNotationPositive},
|
||||
{"test_infer_all_leading_zeros_exponential_notation_negative",
|
||||
TestInferAllLeadingZerosExponentialNotationNegative},
|
||||
{"test_object_block_write_fails_pandas_convert", TestObjectBlockWriteFails},
|
||||
{"test_mixed_type_fails", TestMixedTypeFails},
|
||||
{"test_from_python_decimal_rescale_not_truncateable",
|
||||
TestFromPythonDecimalRescaleNotTruncateable},
|
||||
{"test_from_python_decimal_rescale_truncateable",
|
||||
TestFromPythonDecimalRescaleTruncateable},
|
||||
{"test_from_python_negative_decimal_rescale", TestFromPythonNegativeDecimalRescale},
|
||||
{"test_decimal128_from_python_integer", TestDecimal128FromPythonInteger},
|
||||
{"test_decimal256_from_python_integer", TestDecimal256FromPythonInteger},
|
||||
{"test_decimal128_overflow_fails", TestDecimal128OverflowFails},
|
||||
{"test_decimal256_overflow_fails", TestDecimal256OverflowFails},
|
||||
{"test_none_and_nan", TestNoneAndNaN},
|
||||
{"test_mixed_precision_and_scale", TestMixedPrecisionAndScale},
|
||||
{"test_mixed_precision_and_scale_sequence_convert",
|
||||
TestMixedPrecisionAndScaleSequenceConvert},
|
||||
{"test_simple_inference", TestSimpleInference},
|
||||
{"test_update_with_nan", TestUpdateWithNaN},
|
||||
};
|
||||
}
|
||||
|
||||
} // namespace testing
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,42 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/status.h"
|
||||
|
||||
#include "arrow/python/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
namespace testing {
|
||||
|
||||
/// \brief A named C++ test: `name` identifies the test and `func` is the
/// callable that runs it and returns its Status.
struct TestCase {
|
||||
std::string name;
|
||||
std::function<Status()> func;
|
||||
};
|
||||
|
||||
/// \brief Return the list of available C++ test cases (name plus callable).
ARROW_PYTHON_EXPORT
|
||||
std::vector<TestCase> GetCppTestCases();
|
||||
|
||||
} // namespace testing
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,80 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Functions for converting between CPython built-in data structures and Arrow
|
||||
// data structures
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/python/platform.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/python/visibility.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
#include "arrow/python/common.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Array;
|
||||
class Status;
|
||||
|
||||
namespace py {
|
||||
|
||||
struct PyConversionOptions {
|
||||
PyConversionOptions() = default;
|
||||
|
||||
PyConversionOptions(const std::shared_ptr<DataType>& type, int64_t size,
|
||||
MemoryPool* pool, bool from_pandas)
|
||||
: type(type), size(size), from_pandas(from_pandas) {}
|
||||
|
||||
// Set to null if to be inferred
|
||||
std::shared_ptr<DataType> type;
|
||||
|
||||
// Default is -1, which indicates the size should the same as the input sequence
|
||||
int64_t size = -1;
|
||||
|
||||
bool from_pandas = false;
|
||||
|
||||
/// Used to maintain backwards compatibility for
|
||||
/// timezone bugs (see ARROW-9528). Should be removed
|
||||
/// after Arrow 2.0 release.
|
||||
bool ignore_timezone = false;
|
||||
|
||||
bool strict = false;
|
||||
};
|
||||
|
||||
/// \brief Convert sequence (list, generator, NumPy array with dtype object) of
|
||||
/// Python objects.
|
||||
/// \param[in] obj the sequence to convert
|
||||
/// \param[in] mask a NumPy array of true/false values to indicate whether
|
||||
/// values in the sequence are null (true) or not null (false). This parameter
|
||||
/// may be null
|
||||
/// \param[in] options various conversion options
|
||||
/// \param[in] pool MemoryPool to use for allocations
|
||||
/// \return the converted values as a ChunkedArray, or an error if the conversion failed
|
||||
ARROW_PYTHON_EXPORT
|
||||
Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(
|
||||
PyObject* obj, PyObject* mask, PyConversionOptions options,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
} // namespace py
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,353 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Internal header
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/python/platform.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
|
||||
#include "arrow/python/numpy_interop.h"
|
||||
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/float16.h"
|
||||
#include "arrow/util/logging.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace py {
|
||||
|
||||
// Value used as the null marker for timestamp-like data: the minimum int64.
static constexpr int64_t kPandasTimestampNull = std::numeric_limits<int64_t>::min();
|
||||
// 24 * 60 * 60 * 10^9 nanoseconds.
constexpr int64_t kNanosecondsInDay = 86400000000000LL;
|
||||
|
||||
namespace internal {
|
||||
|
||||
//
|
||||
// Type traits for Numpy -> Arrow equivalence
|
||||
//
|
||||
// Primary template: intentionally empty; each supported NPY_* type code gets an
// explicit specialization below.
template <int TYPE>
|
||||
struct npy_traits {};
|
||||
|
||||
template <>
|
||||
struct npy_traits<NPY_BOOL> {
|
||||
typedef uint8_t value_type;
|
||||
using TypeClass = BooleanType;
|
||||
using BuilderClass = BooleanBuilder;
|
||||
|
||||
static constexpr bool supports_nulls = false;
|
||||
static inline bool isnull(uint8_t v) { return false; }
|
||||
};
|
||||
|
||||
#define NPY_INT_DECL(TYPE, CapType, T) \
|
||||
template <> \
|
||||
struct npy_traits<NPY_##TYPE> { \
|
||||
typedef T value_type; \
|
||||
using TypeClass = CapType##Type; \
|
||||
using BuilderClass = CapType##Builder; \
|
||||
\
|
||||
static constexpr bool supports_nulls = false; \
|
||||
static inline bool isnull(T v) { return false; } \
|
||||
};
|
||||
|
||||
NPY_INT_DECL(INT8, Int8, int8_t);
|
||||
NPY_INT_DECL(INT16, Int16, int16_t);
|
||||
NPY_INT_DECL(INT32, Int32, int32_t);
|
||||
NPY_INT_DECL(INT64, Int64, int64_t);
|
||||
|
||||
NPY_INT_DECL(UINT8, UInt8, uint8_t);
|
||||
NPY_INT_DECL(UINT16, UInt16, uint16_t);
|
||||
NPY_INT_DECL(UINT32, UInt32, uint32_t);
|
||||
NPY_INT_DECL(UINT64, UInt64, uint64_t);
|
||||
|
||||
#if !NPY_INT32_IS_INT && NPY_BITSOF_INT == 32
|
||||
NPY_INT_DECL(INT, Int32, int32_t);
|
||||
NPY_INT_DECL(UINT, UInt32, uint32_t);
|
||||
#endif
|
||||
#if !NPY_INT64_IS_LONG_LONG && NPY_BITSOF_LONGLONG == 64
|
||||
NPY_INT_DECL(LONGLONG, Int64, int64_t);
|
||||
NPY_INT_DECL(ULONGLONG, UInt64, uint64_t);
|
||||
#endif
|
||||
|
||||
template <>
|
||||
struct npy_traits<NPY_FLOAT16> {
|
||||
typedef uint16_t value_type;
|
||||
using TypeClass = HalfFloatType;
|
||||
using BuilderClass = HalfFloatBuilder;
|
||||
|
||||
static constexpr uint16_t na_sentinel =
|
||||
std::numeric_limits<arrow::util::Float16>::quiet_NaN().bits();
|
||||
|
||||
static constexpr bool supports_nulls = true;
|
||||
|
||||
static inline bool isnull(uint16_t v) {
|
||||
return arrow::util::Float16::FromBits(v).is_nan();
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct npy_traits<NPY_FLOAT32> {
|
||||
typedef float value_type;
|
||||
using TypeClass = FloatType;
|
||||
using BuilderClass = FloatBuilder;
|
||||
|
||||
// We need to use quiet_NaN here instead of the NAN macro as on Windows
|
||||
// the NAN macro leads to "division-by-zero" compile-time error with clang.
|
||||
static constexpr float na_sentinel = std::numeric_limits<float>::quiet_NaN();
|
||||
|
||||
static constexpr bool supports_nulls = true;
|
||||
|
||||
static inline bool isnull(float v) { return v != v; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct npy_traits<NPY_FLOAT64> {
|
||||
typedef double value_type;
|
||||
using TypeClass = DoubleType;
|
||||
using BuilderClass = DoubleBuilder;
|
||||
|
||||
static constexpr double na_sentinel = std::numeric_limits<double>::quiet_NaN();
|
||||
|
||||
static constexpr bool supports_nulls = true;
|
||||
|
||||
static inline bool isnull(double v) { return v != v; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct npy_traits<NPY_DATETIME> {
|
||||
typedef int64_t value_type;
|
||||
using TypeClass = TimestampType;
|
||||
using BuilderClass = TimestampBuilder;
|
||||
|
||||
static constexpr bool supports_nulls = true;
|
||||
|
||||
static inline bool isnull(int64_t v) {
|
||||
// NaT = -2**63
|
||||
// = -0x8000000000000000
|
||||
// = -9223372036854775808;
|
||||
// = std::numeric_limits<int64_t>::min()
|
||||
return v == std::numeric_limits<int64_t>::min();
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct npy_traits<NPY_TIMEDELTA> {
|
||||
typedef int64_t value_type;
|
||||
using TypeClass = DurationType;
|
||||
using BuilderClass = DurationBuilder;
|
||||
|
||||
static constexpr bool supports_nulls = true;
|
||||
|
||||
static inline bool isnull(int64_t v) {
|
||||
// NaT = -2**63 = std::numeric_limits<int64_t>::min()
|
||||
return v == std::numeric_limits<int64_t>::min();
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct npy_traits<NPY_OBJECT> {
|
||||
typedef PyObject* value_type;
|
||||
static constexpr bool supports_nulls = true;
|
||||
|
||||
static inline bool isnull(PyObject* v) { return v == Py_None; }
|
||||
};
|
||||
|
||||
//
|
||||
// Type traits for Arrow -> Numpy equivalence
|
||||
// Note *supports_nulls* means the equivalent Numpy type support nulls
|
||||
//
|
||||
// Primary template: intentionally empty; each supported Arrow Type::type id
// gets an explicit specialization below.
template <int TYPE>
|
||||
struct arrow_traits {};
|
||||
|
||||
template <>
|
||||
struct arrow_traits<Type::BOOL> {
|
||||
static constexpr int npy_type = NPY_BOOL;
|
||||
static constexpr bool supports_nulls = false;
|
||||
typedef typename npy_traits<NPY_BOOL>::value_type T;
|
||||
};
|
||||
|
||||
#define INT_DECL(TYPE) \
|
||||
template <> \
|
||||
struct arrow_traits<Type::TYPE> { \
|
||||
static constexpr int npy_type = NPY_##TYPE; \
|
||||
static constexpr bool supports_nulls = false; \
|
||||
static constexpr double na_value = std::numeric_limits<double>::quiet_NaN(); \
|
||||
typedef typename npy_traits<NPY_##TYPE>::value_type T; \
|
||||
};
|
||||
|
||||
INT_DECL(INT8);
|
||||
INT_DECL(INT16);
|
||||
INT_DECL(INT32);
|
||||
INT_DECL(INT64);
|
||||
INT_DECL(UINT8);
|
||||
INT_DECL(UINT16);
|
||||
INT_DECL(UINT32);
|
||||
INT_DECL(UINT64);
|
||||
|
||||
template <>
|
||||
struct arrow_traits<Type::HALF_FLOAT> {
|
||||
static constexpr int npy_type = NPY_FLOAT16;
|
||||
static constexpr bool supports_nulls = true;
|
||||
static constexpr uint16_t na_value =
|
||||
std::numeric_limits<arrow::util::Float16>::quiet_NaN().bits();
|
||||
typedef typename npy_traits<NPY_FLOAT16>::value_type T;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct arrow_traits<Type::FLOAT> {
|
||||
static constexpr int npy_type = NPY_FLOAT32;
|
||||
static constexpr bool supports_nulls = true;
|
||||
static constexpr float na_value = std::numeric_limits<float>::quiet_NaN();
|
||||
typedef typename npy_traits<NPY_FLOAT32>::value_type T;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct arrow_traits<Type::DOUBLE> {
|
||||
static constexpr int npy_type = NPY_FLOAT64;
|
||||
static constexpr bool supports_nulls = true;
|
||||
static constexpr double na_value = std::numeric_limits<double>::quiet_NaN();
|
||||
typedef typename npy_traits<NPY_FLOAT64>::value_type T;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct arrow_traits<Type::TIMESTAMP> {
|
||||
static constexpr int npy_type = NPY_DATETIME;
|
||||
static constexpr int64_t npy_shift = 1;
|
||||
|
||||
static constexpr bool supports_nulls = true;
|
||||
static constexpr int64_t na_value = kPandasTimestampNull;
|
||||
typedef typename npy_traits<NPY_DATETIME>::value_type T;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct arrow_traits<Type::DURATION> {
|
||||
static constexpr int npy_type = NPY_TIMEDELTA;
|
||||
static constexpr int64_t npy_shift = 1;
|
||||
|
||||
static constexpr bool supports_nulls = true;
|
||||
static constexpr int64_t na_value = kPandasTimestampNull;
|
||||
typedef typename npy_traits<NPY_TIMEDELTA>::value_type T;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct arrow_traits<Type::DATE32> {
|
||||
// Data stores as FR_D day unit
|
||||
static constexpr int npy_type = NPY_DATETIME;
|
||||
static constexpr int64_t npy_shift = 1;
|
||||
|
||||
static constexpr bool supports_nulls = true;
|
||||
typedef typename npy_traits<NPY_DATETIME>::value_type T;
|
||||
|
||||
static constexpr int64_t na_value = kPandasTimestampNull;
|
||||
static inline bool isnull(int64_t v) { return npy_traits<NPY_DATETIME>::isnull(v); }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct arrow_traits<Type::DATE64> {
|
||||
// Data stores as FR_D day unit
|
||||
static constexpr int npy_type = NPY_DATETIME;
|
||||
|
||||
// There are 1000 * 60 * 60 * 24 = 86400000ms in a day
|
||||
static constexpr int64_t npy_shift = 86400000;
|
||||
|
||||
static constexpr bool supports_nulls = true;
|
||||
typedef typename npy_traits<NPY_DATETIME>::value_type T;
|
||||
|
||||
static constexpr int64_t na_value = kPandasTimestampNull;
|
||||
static inline bool isnull(int64_t v) { return npy_traits<NPY_DATETIME>::isnull(v); }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct arrow_traits<Type::TIME32> {
|
||||
static constexpr int npy_type = NPY_OBJECT;
|
||||
static constexpr bool supports_nulls = true;
|
||||
static constexpr int64_t na_value = kPandasTimestampNull;
|
||||
typedef typename npy_traits<NPY_DATETIME>::value_type T;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct arrow_traits<Type::TIME64> {
|
||||
static constexpr int npy_type = NPY_OBJECT;
|
||||
static constexpr bool supports_nulls = true;
|
||||
typedef typename npy_traits<NPY_DATETIME>::value_type T;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct arrow_traits<Type::STRING> {
|
||||
static constexpr int npy_type = NPY_OBJECT;
|
||||
static constexpr bool supports_nulls = true;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct arrow_traits<Type::BINARY> {
|
||||
static constexpr int npy_type = NPY_OBJECT;
|
||||
static constexpr bool supports_nulls = true;
|
||||
};
|
||||
|
||||
// Map an Arrow time unit onto the corresponding NumPy datetime unit.
//
// SECOND, MILLI and MICRO map to NPY_FR_s, NPY_FR_ms and NPY_FR_us; every
// other value (i.e. NANO) maps to NPY_FR_ns.
static inline NPY_DATETIMEUNIT NumPyFrequency(TimeUnit::type unit) {
  switch (unit) {
    case TimestampType::Unit::SECOND:
      return NPY_FR_s;
    case TimestampType::Unit::MILLI:
      return NPY_FR_ms;
    case TimestampType::Unit::MICRO:
      return NPY_FR_us;
    default:
      // NANO
      return NPY_FR_ns;
  }
}
|
||||
|
||||
static inline int NumPyTypeSize(int npy_type) {
|
||||
npy_type = fix_numpy_type_num(npy_type);
|
||||
|
||||
switch (npy_type) {
|
||||
case NPY_BOOL:
|
||||
case NPY_INT8:
|
||||
case NPY_UINT8:
|
||||
return 1;
|
||||
case NPY_INT16:
|
||||
case NPY_UINT16:
|
||||
return 2;
|
||||
case NPY_INT32:
|
||||
case NPY_UINT32:
|
||||
return 4;
|
||||
case NPY_INT64:
|
||||
case NPY_UINT64:
|
||||
return 8;
|
||||
case NPY_FLOAT16:
|
||||
return 2;
|
||||
case NPY_FLOAT32:
|
||||
return 4;
|
||||
case NPY_FLOAT64:
|
||||
return 8;
|
||||
case NPY_DATETIME:
|
||||
return 8;
|
||||
case NPY_OBJECT:
|
||||
return sizeof(void*);
|
||||
default:
|
||||
ARROW_CHECK(false) << "unhandled numpy type";
|
||||
break;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,707 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "arrow/python/udf.h"
|
||||
|
||||
#include "arrow/array/array_nested.h"
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/buffer_builder.h"
|
||||
#include "arrow/compute/api_aggregate.h"
|
||||
#include "arrow/compute/api_vector.h"
|
||||
#include "arrow/compute/function.h"
|
||||
#include "arrow/compute/kernel.h"
|
||||
#include "arrow/compute/row/grouper.h"
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/python/vendored/pythoncapi_compat.h"
|
||||
#include "arrow/table.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/logging.h"
|
||||
|
||||
namespace arrow {
|
||||
using compute::ExecSpan;
|
||||
using compute::Grouper;
|
||||
using compute::KernelContext;
|
||||
using compute::KernelState;
|
||||
using internal::checked_cast;
|
||||
|
||||
namespace py {
|
||||
namespace {
|
||||
|
||||
struct PythonUdfKernelState : public compute::KernelState {
|
||||
// NOTE: this KernelState constructor doesn't require the GIL.
|
||||
// If it did, the corresponding KernelInit::operator() should be wrapped
|
||||
// within SafeCallIntoPython (GH-43487).
|
||||
explicit PythonUdfKernelState(std::shared_ptr<OwnedRefNoGIL> function)
|
||||
: function(std::move(function)) {}
|
||||
|
||||
std::shared_ptr<OwnedRefNoGIL> function;
|
||||
};
|
||||
|
||||
struct PythonUdfKernelInit {
|
||||
explicit PythonUdfKernelInit(std::shared_ptr<OwnedRefNoGIL> function)
|
||||
: function(std::move(function)) {}
|
||||
|
||||
Result<std::unique_ptr<compute::KernelState>> operator()(
|
||||
compute::KernelContext*, const compute::KernelInitArgs&) {
|
||||
return std::make_unique<PythonUdfKernelState>(function);
|
||||
}
|
||||
|
||||
std::shared_ptr<OwnedRefNoGIL> function;
|
||||
};
|
||||
|
||||
struct ScalarUdfAggregator : public compute::KernelState {
|
||||
virtual Status Consume(compute::KernelContext* ctx, const compute::ExecSpan& batch) = 0;
|
||||
virtual Status MergeFrom(compute::KernelContext* ctx, compute::KernelState&& src) = 0;
|
||||
virtual Status Finalize(compute::KernelContext* ctx, Datum* out) = 0;
|
||||
};
|
||||
|
||||
struct HashUdfAggregator : public compute::KernelState {
|
||||
virtual Status Resize(KernelContext* ctx, int64_t size) = 0;
|
||||
virtual Status Consume(KernelContext* ctx, const ExecSpan& batch) = 0;
|
||||
virtual Status Merge(KernelContext* ct, KernelState&& other, const ArrayData&) = 0;
|
||||
virtual Status Finalize(KernelContext* ctx, Datum* out) = 0;
|
||||
};
|
||||
|
||||
// Kernel consume hook: forwards the batch to the ScalarUdfAggregator stored in
// the kernel context.
Status AggregateUdfConsume(compute::KernelContext* ctx, const compute::ExecSpan& batch) {
  auto* aggregator = checked_cast<ScalarUdfAggregator*>(ctx->state());
  return aggregator->Consume(ctx, batch);
}
|
||||
|
||||
// Kernel merge hook: merges `src` into the ScalarUdfAggregator pointed to by `dst`.
Status AggregateUdfMerge(compute::KernelContext* ctx, compute::KernelState&& src,
                         compute::KernelState* dst) {
  auto* target = checked_cast<ScalarUdfAggregator*>(dst);
  return target->MergeFrom(ctx, std::move(src));
}
|
||||
|
||||
// Kernel finalize hook: asks the ScalarUdfAggregator stored in the kernel
// context to write its result to `out`.
Status AggregateUdfFinalize(compute::KernelContext* ctx, arrow::Datum* out) {
  auto* aggregator = checked_cast<ScalarUdfAggregator*>(ctx->state());
  return aggregator->Finalize(ctx, out);
}
|
||||
|
||||
// Hash kernel resize hook: forwards to the HashUdfAggregator stored in the
// kernel context.
Status HashAggregateUdfResize(KernelContext* ctx, int64_t size) {
  auto* aggregator = checked_cast<HashUdfAggregator*>(ctx->state());
  return aggregator->Resize(ctx, size);
}
|
||||
|
||||
// Hash kernel consume hook: forwards the batch to the HashUdfAggregator stored
// in the kernel context.
Status HashAggregateUdfConsume(KernelContext* ctx, const ExecSpan& batch) {
  auto* aggregator = checked_cast<HashUdfAggregator*>(ctx->state());
  return aggregator->Consume(ctx, batch);
}
|
||||
|
||||
// Hash kernel merge hook: merges `src` into the HashUdfAggregator stored in
// the kernel context, passing the group id mapping through.
Status HashAggregateUdfMerge(KernelContext* ctx, KernelState&& src,
                             const ArrayData& group_id_mapping) {
  auto* aggregator = checked_cast<HashUdfAggregator*>(ctx->state());
  return aggregator->Merge(ctx, std::move(src), group_id_mapping);
}
|
||||
|
||||
// Hash kernel finalize hook: asks the HashUdfAggregator stored in the kernel
// context to write its result to `out`.
Status HashAggregateUdfFinalize(KernelContext* ctx, Datum* out) {
  auto* aggregator = checked_cast<HashUdfAggregator*>(ctx->state());
  return aggregator->Finalize(ctx, out);
}
|
||||
|
||||
struct PythonTableUdfKernelInit {
|
||||
PythonTableUdfKernelInit(std::shared_ptr<OwnedRefNoGIL> function_maker,
|
||||
UdfWrapperCallback cb)
|
||||
: function_maker(std::move(function_maker)), cb(std::move(cb)) {}
|
||||
|
||||
Result<std::unique_ptr<compute::KernelState>> operator()(
|
||||
compute::KernelContext* ctx, const compute::KernelInitArgs&) {
|
||||
return SafeCallIntoPython(
|
||||
[this, ctx]() -> Result<std::unique_ptr<compute::KernelState>> {
|
||||
UdfContext udf_context{ctx->memory_pool(), /*batch_length=*/0};
|
||||
OwnedRef empty_tuple(PyTuple_New(0));
|
||||
auto function = std::make_shared<OwnedRefNoGIL>(
|
||||
cb(function_maker->obj(), udf_context, empty_tuple.obj()));
|
||||
RETURN_NOT_OK(CheckPyError());
|
||||
if (!PyCallable_Check(function->obj())) {
|
||||
return Status::TypeError("Expected a callable Python object.");
|
||||
}
|
||||
return std::make_unique<PythonUdfKernelState>(std::move(function));
|
||||
});
|
||||
}
|
||||
|
||||
std::shared_ptr<OwnedRefNoGIL> function_maker;
|
||||
UdfWrapperCallback cb;
|
||||
};
|
||||
|
||||
struct PythonUdfScalarAggregatorImpl : public ScalarUdfAggregator {
|
||||
PythonUdfScalarAggregatorImpl(std::shared_ptr<OwnedRefNoGIL> function,
|
||||
UdfWrapperCallback cb,
|
||||
std::vector<std::shared_ptr<DataType>> input_types,
|
||||
std::shared_ptr<DataType> output_type)
|
||||
: function(std::move(function)),
|
||||
cb(std::move(cb)),
|
||||
output_type(std::move(output_type)) {
|
||||
std::vector<std::shared_ptr<Field>> fields;
|
||||
for (size_t i = 0; i < input_types.size(); i++) {
|
||||
fields.push_back(field("", input_types[i]));
|
||||
}
|
||||
input_schema = schema(std::move(fields));
|
||||
};
|
||||
|
||||
Status Consume(compute::KernelContext* ctx, const compute::ExecSpan& batch) override {
|
||||
ARROW_ASSIGN_OR_RAISE(
|
||||
auto rb, batch.ToExecBatch().ToRecordBatch(input_schema, ctx->memory_pool()));
|
||||
values.push_back(std::move(rb));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status MergeFrom(compute::KernelContext* ctx, compute::KernelState&& src) override {
|
||||
auto& other_values = checked_cast<PythonUdfScalarAggregatorImpl&>(src).values;
|
||||
values.insert(values.end(), std::make_move_iterator(other_values.begin()),
|
||||
std::make_move_iterator(other_values.end()));
|
||||
|
||||
other_values.erase(other_values.begin(), other_values.end());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Finalize(compute::KernelContext* ctx, Datum* out) override {
|
||||
auto state =
|
||||
arrow::internal::checked_cast<PythonUdfScalarAggregatorImpl*>(ctx->state());
|
||||
const int num_args = input_schema->num_fields();
|
||||
|
||||
// Note: The way that batches are concatenated together
|
||||
// would result in using double amount of the memory.
|
||||
// This is OK for now because non decomposable aggregate
|
||||
// UDF is supposed to be used with segmented aggregation
|
||||
// where the size of the segment is more or less constant
|
||||
// so doubling that is not a big deal. This can be also
|
||||
// improved in the future to use more efficient way to
|
||||
// concatenate.
|
||||
ARROW_ASSIGN_OR_RAISE(auto table,
|
||||
arrow::Table::FromRecordBatches(input_schema, values));
|
||||
ARROW_ASSIGN_OR_RAISE(table, table->CombineChunks(ctx->memory_pool()));
|
||||
UdfContext udf_context{ctx->memory_pool(), table->num_rows()};
|
||||
|
||||
if (table->num_rows() == 0) {
|
||||
return Status::Invalid("Finalized is called with empty inputs");
|
||||
}
|
||||
|
||||
RETURN_NOT_OK(SafeCallIntoPython([&] {
|
||||
std::unique_ptr<OwnedRef> result;
|
||||
OwnedRef arg_tuple(PyTuple_New(num_args));
|
||||
RETURN_NOT_OK(CheckPyError());
|
||||
|
||||
for (int arg_id = 0; arg_id < num_args; arg_id++) {
|
||||
// Since we combined chunks there is only one chunk
|
||||
std::shared_ptr<Array> c_data = table->column(arg_id)->chunk(0);
|
||||
PyObject* data = wrap_array(c_data);
|
||||
PyTuple_SetItem(arg_tuple.obj(), arg_id, data);
|
||||
}
|
||||
result =
|
||||
std::make_unique<OwnedRef>(cb(function->obj(), udf_context, arg_tuple.obj()));
|
||||
RETURN_NOT_OK(CheckPyError());
|
||||
// unwrapping the output for expected output type
|
||||
if (is_scalar(result->obj())) {
|
||||
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> val, unwrap_scalar(result->obj()));
|
||||
if (*output_type != *val->type) {
|
||||
return Status::TypeError("Expected output datatype ", output_type->ToString(),
|
||||
", but function returned datatype ",
|
||||
val->type->ToString());
|
||||
}
|
||||
out->value = std::move(val);
|
||||
return Status::OK();
|
||||
}
|
||||
return Status::TypeError("Unexpected output type: ",
|
||||
Py_TYPE(result->obj())->tp_name, " (expected Scalar)");
|
||||
}));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
std::shared_ptr<OwnedRefNoGIL> function;
|
||||
UdfWrapperCallback cb;
|
||||
std::vector<std::shared_ptr<RecordBatch>> values;
|
||||
std::shared_ptr<Schema> input_schema;
|
||||
std::shared_ptr<DataType> output_type;
|
||||
};
|
||||
|
||||
struct PythonUdfHashAggregatorImpl : public HashUdfAggregator {
|
||||
PythonUdfHashAggregatorImpl(std::shared_ptr<OwnedRefNoGIL> function,
|
||||
UdfWrapperCallback cb,
|
||||
std::vector<std::shared_ptr<DataType>> input_types,
|
||||
std::shared_ptr<DataType> output_type)
|
||||
: function(std::move(function)),
|
||||
cb(std::move(cb)),
|
||||
output_type(std::move(output_type)) {
|
||||
std::vector<std::shared_ptr<Field>> fields;
|
||||
fields.reserve(input_types.size());
|
||||
for (size_t i = 0; i < input_types.size(); i++) {
|
||||
fields.push_back(field("", input_types[i]));
|
||||
}
|
||||
input_schema = schema(std::move(fields));
|
||||
};
|
||||
|
||||
// same as ApplyGrouping in partition.cc
|
||||
// replicated the code here to avoid complicating the dependencies
|
||||
static Result<RecordBatchVector> ApplyGroupings(
|
||||
const ListArray& groupings, const std::shared_ptr<RecordBatch>& batch) {
|
||||
ARROW_ASSIGN_OR_RAISE(Datum sorted,
|
||||
compute::Take(batch, groupings.data()->child_data[0]));
|
||||
|
||||
const auto& sorted_batch = *sorted.record_batch();
|
||||
|
||||
RecordBatchVector out(static_cast<size_t>(groupings.length()));
|
||||
for (size_t i = 0; i < out.size(); ++i) {
|
||||
out[i] = sorted_batch.Slice(groupings.value_offset(i), groupings.value_length(i));
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
Status Resize(KernelContext* ctx, int64_t new_num_groups) override {
|
||||
// We only need to change num_groups in resize
|
||||
// similar to other hash aggregate kernels
|
||||
num_groups = new_num_groups;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Consume(KernelContext* ctx, const ExecSpan& batch) override {
|
||||
ARROW_ASSIGN_OR_RAISE(
|
||||
std::shared_ptr<RecordBatch> rb,
|
||||
batch.ToExecBatch().ToRecordBatch(input_schema, ctx->memory_pool()));
|
||||
|
||||
// This is similar to GroupedListImpl
|
||||
// last array is the group id
|
||||
const ArraySpan& groups_array_data = batch[batch.num_values() - 1].array;
|
||||
ARROW_DCHECK_EQ(groups_array_data.offset, 0);
|
||||
int64_t batch_num_values = groups_array_data.length;
|
||||
const auto* batch_groups = groups_array_data.GetValues<uint32_t>(1);
|
||||
RETURN_NOT_OK(groups.Append(batch_groups, batch_num_values));
|
||||
values.push_back(std::move(rb));
|
||||
num_values += batch_num_values;
|
||||
return Status::OK();
|
||||
}
|
||||
Status Merge(KernelContext* ctx, KernelState&& other_state,
|
||||
const ArrayData& group_id_mapping) override {
|
||||
// This is similar to GroupedListImpl
|
||||
auto& other = checked_cast<PythonUdfHashAggregatorImpl&>(other_state);
|
||||
auto& other_values = other.values;
|
||||
const uint32_t* other_raw_groups = other.groups.data();
|
||||
values.insert(values.end(), std::make_move_iterator(other_values.begin()),
|
||||
std::make_move_iterator(other_values.end()));
|
||||
|
||||
auto g = group_id_mapping.GetValues<uint32_t>(1);
|
||||
for (uint32_t other_g = 0; static_cast<int64_t>(other_g) < other.num_values;
|
||||
++other_g) {
|
||||
// Different state can have different group_id mappings, so we
|
||||
// need to translate the ids
|
||||
RETURN_NOT_OK(groups.Append(g[other_raw_groups[other_g]]));
|
||||
}
|
||||
|
||||
num_values += other.num_values;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Finalize(KernelContext* ctx, Datum* out) override {
|
||||
// Exclude the last column which is the group id
|
||||
const int num_args = input_schema->num_fields() - 1;
|
||||
|
||||
ARROW_ASSIGN_OR_RAISE(auto groups_buffer, groups.Finish());
|
||||
ARROW_ASSIGN_OR_RAISE(auto groupings,
|
||||
Grouper::MakeGroupings(UInt32Array(num_values, groups_buffer),
|
||||
static_cast<uint32_t>(num_groups)));
|
||||
|
||||
ARROW_ASSIGN_OR_RAISE(auto table,
|
||||
arrow::Table::FromRecordBatches(input_schema, values));
|
||||
ARROW_ASSIGN_OR_RAISE(auto rb, table->CombineChunksToBatch(ctx->memory_pool()));
|
||||
UdfContext udf_context{ctx->memory_pool(), table->num_rows()};
|
||||
|
||||
if (rb->num_rows() == 0) {
|
||||
*out = Datum();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
ARROW_ASSIGN_OR_RAISE(RecordBatchVector rbs, ApplyGroupings(*groupings, rb));
|
||||
|
||||
return SafeCallIntoPython([&] {
|
||||
ARROW_ASSIGN_OR_RAISE(std::unique_ptr<ArrayBuilder> builder,
|
||||
MakeBuilder(output_type, ctx->memory_pool()));
|
||||
for (auto& group_rb : rbs) {
|
||||
std::unique_ptr<OwnedRef> result;
|
||||
OwnedRef arg_tuple(PyTuple_New(num_args));
|
||||
RETURN_NOT_OK(CheckPyError());
|
||||
|
||||
for (int arg_id = 0; arg_id < num_args; arg_id++) {
|
||||
// Since we combined chunks there is only one chunk
|
||||
std::shared_ptr<Array> c_data = group_rb->column(arg_id);
|
||||
PyObject* data = wrap_array(c_data);
|
||||
PyTuple_SetItem(arg_tuple.obj(), arg_id, data);
|
||||
}
|
||||
|
||||
result =
|
||||
std::make_unique<OwnedRef>(cb(function->obj(), udf_context, arg_tuple.obj()));
|
||||
RETURN_NOT_OK(CheckPyError());
|
||||
|
||||
// unwrapping the output for expected output type
|
||||
if (is_scalar(result->obj())) {
|
||||
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> val,
|
||||
unwrap_scalar(result->obj()));
|
||||
if (*output_type != *val->type) {
|
||||
return Status::TypeError("Expected output datatype ", output_type->ToString(),
|
||||
", but function returned datatype ",
|
||||
val->type->ToString());
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(builder->AppendScalar(std::move(*val)));
|
||||
} else {
|
||||
return Status::TypeError("Unexpected output type: ",
|
||||
Py_TYPE(result->obj())->tp_name, " (expected Scalar)");
|
||||
}
|
||||
}
|
||||
ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish());
|
||||
out->value = std::move(result->data());
|
||||
return Status::OK();
|
||||
});
|
||||
}
|
||||
|
||||
std::shared_ptr<OwnedRefNoGIL> function;
|
||||
UdfWrapperCallback cb;
|
||||
// Accumulated input batches
|
||||
std::vector<std::shared_ptr<RecordBatch>> values;
|
||||
// Group ids - extracted from the last column from the batch
|
||||
TypedBufferBuilder<uint32_t> groups;
|
||||
int64_t num_groups = 0;
|
||||
int64_t num_values = 0;
|
||||
std::shared_ptr<Schema> input_schema;
|
||||
std::shared_ptr<DataType> output_type;
|
||||
};
|
||||
|
||||
struct PythonUdf : public PythonUdfKernelState {
|
||||
PythonUdf(std::shared_ptr<OwnedRefNoGIL> function, UdfWrapperCallback cb,
|
||||
std::vector<TypeHolder> input_types, compute::OutputType output_type)
|
||||
: PythonUdfKernelState(std::move(function)),
|
||||
cb(std::move(cb)),
|
||||
input_types(std::move(input_types)),
|
||||
output_type(std::move(output_type)) {}
|
||||
|
||||
UdfWrapperCallback cb;
|
||||
std::vector<TypeHolder> input_types;
|
||||
compute::OutputType output_type;
|
||||
TypeHolder resolved_type;
|
||||
|
||||
Result<TypeHolder> ResolveType(compute::KernelContext* ctx,
|
||||
const std::vector<TypeHolder>& types) {
|
||||
if (input_types == types) {
|
||||
if (!resolved_type) {
|
||||
ARROW_ASSIGN_OR_RAISE(resolved_type, output_type.Resolve(ctx, input_types));
|
||||
}
|
||||
return resolved_type;
|
||||
}
|
||||
return output_type.Resolve(ctx, types);
|
||||
}
|
||||
|
||||
Status Exec(compute::KernelContext* ctx, const compute::ExecSpan& batch,
|
||||
compute::ExecResult* out) {
|
||||
auto state = arrow::internal::checked_cast<PythonUdfKernelState*>(ctx->state());
|
||||
PyObject* function = state->function->obj();
|
||||
const int num_args = batch.num_values();
|
||||
UdfContext udf_context{ctx->memory_pool(), batch.length};
|
||||
|
||||
OwnedRef arg_tuple(PyTuple_New(num_args));
|
||||
RETURN_NOT_OK(CheckPyError());
|
||||
for (int arg_id = 0; arg_id < num_args; arg_id++) {
|
||||
if (batch[arg_id].is_scalar()) {
|
||||
std::shared_ptr<Scalar> c_data = batch[arg_id].scalar->GetSharedPtr();
|
||||
PyObject* data = wrap_scalar(c_data);
|
||||
PyTuple_SetItem(arg_tuple.obj(), arg_id, data);
|
||||
} else {
|
||||
std::shared_ptr<Array> c_data = batch[arg_id].array.ToArray();
|
||||
PyObject* data = wrap_array(c_data);
|
||||
PyTuple_SetItem(arg_tuple.obj(), arg_id, data);
|
||||
}
|
||||
}
|
||||
|
||||
OwnedRef result(cb(function, udf_context, arg_tuple.obj()));
|
||||
RETURN_NOT_OK(CheckPyError());
|
||||
// unwrapping the output for expected output type
|
||||
if (is_array(result.obj())) {
|
||||
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> val, unwrap_array(result.obj()));
|
||||
ARROW_ASSIGN_OR_RAISE(TypeHolder type, ResolveType(ctx, batch.GetTypes()));
|
||||
if (type.type == NULLPTR) {
|
||||
return Status::TypeError("expected output datatype is null");
|
||||
}
|
||||
if (*type.type != *val->type()) {
|
||||
return Status::TypeError("Expected output datatype ", type.type->ToString(),
|
||||
", but function returned datatype ",
|
||||
val->type()->ToString());
|
||||
}
|
||||
out->value = std::move(val->data());
|
||||
return Status::OK();
|
||||
} else {
|
||||
return Status::TypeError("Unexpected output type: ", Py_TYPE(result.obj())->tp_name,
|
||||
" (expected Array)");
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
};
|
||||
|
||||
// Kernel exec entry point: fetches the PythonUdf stored in the kernel's data
// and runs it through SafeCallIntoPython.
Status PythonUdfExec(compute::KernelContext* ctx, const compute::ExecSpan& batch,
                     compute::ExecResult* out) {
  PythonUdf* const udf = static_cast<PythonUdf*>(ctx->kernel()->data.get());
  return SafeCallIntoPython(
      [udf, ctx, &batch, out]() -> Status { return udf->Exec(ctx, batch, out); });
}
|
||||
|
||||
template <class Function, class Kernel>
|
||||
Status RegisterUdf(PyObject* function, compute::KernelInit kernel_init,
|
||||
UdfWrapperCallback cb, const UdfOptions& options,
|
||||
compute::FunctionRegistry* registry) {
|
||||
if (!PyCallable_Check(function)) {
|
||||
return Status::TypeError("Expected a callable Python object.");
|
||||
}
|
||||
auto scalar_func =
|
||||
std::make_shared<Function>(options.func_name, options.arity, options.func_doc);
|
||||
std::vector<compute::InputType> input_types;
|
||||
for (const auto& in_dtype : options.input_types) {
|
||||
input_types.emplace_back(in_dtype);
|
||||
}
|
||||
compute::OutputType output_type(options.output_type);
|
||||
// Take reference before wrapping with OwnedRefNoGIL
|
||||
Py_INCREF(function);
|
||||
auto udf_data = std::make_shared<PythonUdf>(
|
||||
std::make_shared<OwnedRefNoGIL>(function), cb,
|
||||
TypeHolder::FromTypes(options.input_types), options.output_type);
|
||||
Kernel kernel(
|
||||
compute::KernelSignature::Make(std::move(input_types), std::move(output_type),
|
||||
options.arity.is_varargs),
|
||||
PythonUdfExec, kernel_init);
|
||||
kernel.data = std::move(udf_data);
|
||||
|
||||
kernel.mem_allocation = compute::MemAllocation::NO_PREALLOCATE;
|
||||
kernel.null_handling = compute::NullHandling::COMPUTED_NO_PREALLOCATE;
|
||||
RETURN_NOT_OK(scalar_func->AddKernel(std::move(kernel)));
|
||||
if (registry == NULLPTR) {
|
||||
registry = compute::GetFunctionRegistry();
|
||||
}
|
||||
RETURN_NOT_OK(registry->AddFunction(std::move(scalar_func)));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Register `function` as a scalar compute function described by `options`.
// Errors are those reported by RegisterUdf.
Status RegisterScalarFunction(PyObject* function, UdfWrapperCallback cb,
                              const UdfOptions& options,
                              compute::FunctionRegistry* registry) {
  // OwnedRefNoGIL releases one reference when it is destroyed, so give it a
  // reference of its own rather than the caller's borrowed one.
  Py_XINCREF(function);
  return RegisterUdf<compute::ScalarFunction, compute::ScalarKernel>(
      function, PythonUdfKernelInit{std::make_shared<OwnedRefNoGIL>(function)}, cb,
      options, registry);
}
|
||||
|
||||
// Register `function` as a vector compute function described by `options`.
// Errors are those reported by RegisterUdf.
Status RegisterVectorFunction(PyObject* function, UdfWrapperCallback cb,
                              const UdfOptions& options,
                              compute::FunctionRegistry* registry) {
  // OwnedRefNoGIL releases one reference when it is destroyed, so give it a
  // reference of its own rather than the caller's borrowed one.
  Py_XINCREF(function);
  return RegisterUdf<compute::VectorFunction, compute::VectorKernel>(
      function, PythonUdfKernelInit{std::make_shared<OwnedRefNoGIL>(function)}, cb,
      options, registry);
}
|
||||
|
||||
// Register `function` as a tabular function: a zero-argument scalar function
// with struct output whose kernel init calls `function` to obtain the callable
// that is actually executed.
//
// Returns NotImplemented for a non-zero or varargs arity, Invalid for a
// non-struct output type, otherwise any error reported by RegisterUdf.
Status RegisterTabularFunction(PyObject* function, UdfWrapperCallback cb,
                               const UdfOptions& options,
                               compute::FunctionRegistry* registry) {
  if (options.arity.num_args != 0 || options.arity.is_varargs) {
    return Status::NotImplemented("tabular function of non-null arity");
  }
  if (options.output_type->id() != Type::type::STRUCT) {
    return Status::Invalid("tabular function with non-struct output");
  }
  // OwnedRefNoGIL releases one reference when it is destroyed, so give it a
  // reference of its own rather than the caller's borrowed one.
  Py_XINCREF(function);
  return RegisterUdf<compute::ScalarFunction, compute::ScalarKernel>(
      function, PythonTableUdfKernelInit{std::make_shared<OwnedRefNoGIL>(function), cb},
      cb, options, registry);
}
|
||||
|
||||
// Register `function` as a scalar aggregate function described by `options`
// in `registry` (the global registry when `registry` is null).
//
// Returns TypeError if `function` is not callable, otherwise any error from
// adding the kernel or registering the function.
Status RegisterScalarAggregateFunction(PyObject* function, UdfWrapperCallback cb,
                                       const UdfOptions& options,
                                       compute::FunctionRegistry* registry) {
  if (!PyCallable_Check(function)) {
    return Status::TypeError("Expected a callable Python object.");
  }

  if (registry == NULLPTR) {
    registry = compute::GetFunctionRegistry();
  }

  // The function stores a pointer to its default options, so they need static
  // storage duration.
  static auto default_scalar_aggregate_options =
      compute::ScalarAggregateOptions::Defaults();
  auto aggregate_func = std::make_shared<compute::ScalarAggregateFunction>(
      options.func_name, options.arity, options.func_doc,
      &default_scalar_aggregate_options);

  std::vector<compute::InputType> kernel_inputs;
  for (const auto& dtype : options.input_types) {
    kernel_inputs.emplace_back(dtype);
  }
  compute::OutputType kernel_output(options.output_type);

  // Take reference before wrapping with OwnedRefNoGIL
  Py_INCREF(function);
  auto function_ref = std::make_shared<OwnedRefNoGIL>(function);

  // Every kernel invocation gets a fresh aggregator sharing the same callable.
  compute::KernelInit init =
      [cb, function_ref, options](compute::KernelContext*,
                                  const compute::KernelInitArgs&)
      -> Result<std::unique_ptr<compute::KernelState>> {
    return std::make_unique<PythonUdfScalarAggregatorImpl>(
        function_ref, cb, options.input_types, options.output_type);
  };

  auto signature = compute::KernelSignature::Make(
      std::move(kernel_inputs), std::move(kernel_output), options.arity.is_varargs);
  compute::ScalarAggregateKernel kernel(std::move(signature), std::move(init),
                                        AggregateUdfConsume, AggregateUdfMerge,
                                        AggregateUdfFinalize, /*ordered=*/false);
  RETURN_NOT_OK(aggregate_func->AddKernel(std::move(kernel)));
  RETURN_NOT_OK(registry->AddFunction(std::move(aggregate_func)));
  return Status::OK();
}
|
||||
|
||||
/// \brief Create a new UdfOptions with adjustment for hash kernel
|
||||
/// \param options User provided udf options
|
||||
UdfOptions AdjustForHashAggregate(const UdfOptions& options) {
|
||||
UdfOptions hash_options;
|
||||
// Append hash_ before the function name to separate from the scalar
|
||||
// version
|
||||
hash_options.func_name = "hash_" + options.func_name;
|
||||
// Extend input types with group id. Group id is appended by the group
|
||||
// aggregation node. Here we change both arity and input types
|
||||
if (options.arity.is_varargs) {
|
||||
hash_options.arity = options.arity;
|
||||
} else {
|
||||
hash_options.arity = compute::Arity(options.arity.num_args + 1, false);
|
||||
}
|
||||
// Changing the function doc shouldn't be necessarily because group id
|
||||
// is not user visible, however, this is currently needed to pass the
|
||||
// function validation. The name group_id_array is consistent with
|
||||
// hash kernels in hash_aggregate.cc
|
||||
hash_options.func_doc = options.func_doc;
|
||||
hash_options.func_doc.arg_names.emplace_back("group_id_array");
|
||||
std::vector<std::shared_ptr<DataType>> input_dtypes = options.input_types;
|
||||
input_dtypes.emplace_back(uint32());
|
||||
hash_options.input_types = std::move(input_dtypes);
|
||||
hash_options.output_type = options.output_type;
|
||||
return hash_options;
|
||||
}
|
||||
|
||||
// Register `function` as a hash (grouped) aggregate function in `registry`
// (the global registry when `registry` is null). The options are first
// adjusted with AdjustForHashAggregate.
//
// Returns TypeError if `function` is not callable, otherwise any error from
// adding the kernel or registering the function.
Status RegisterHashAggregateFunction(PyObject* function, UdfWrapperCallback cb,
                                     const UdfOptions& options,
                                     compute::FunctionRegistry* registry) {
  if (!PyCallable_Check(function)) {
    return Status::TypeError("Expected a callable Python object.");
  }

  if (registry == NULLPTR) {
    registry = compute::GetFunctionRegistry();
  }

  UdfOptions hash_options = AdjustForHashAggregate(options);

  std::vector<compute::InputType> kernel_inputs;
  for (const auto& dtype : hash_options.input_types) {
    kernel_inputs.emplace_back(dtype);
  }
  compute::OutputType kernel_output(hash_options.output_type);

  // The function stores a pointer to its default options, so they need static
  // storage duration.
  static auto default_hash_aggregate_options =
      compute::ScalarAggregateOptions::Defaults();
  auto hash_aggregate_func = std::make_shared<compute::HashAggregateFunction>(
      hash_options.func_name, hash_options.arity, hash_options.func_doc,
      &default_hash_aggregate_options);

  // Take reference before wrapping with OwnedRefNoGIL
  Py_INCREF(function);
  auto function_ref = std::make_shared<OwnedRefNoGIL>(function);

  // Every kernel invocation gets a fresh aggregator sharing the same callable.
  compute::KernelInit init =
      [function_ref, cb, hash_options](compute::KernelContext*,
                                       const compute::KernelInitArgs&)
      -> Result<std::unique_ptr<compute::KernelState>> {
    return std::make_unique<PythonUdfHashAggregatorImpl>(
        function_ref, cb, hash_options.input_types, hash_options.output_type);
  };

  auto signature = compute::KernelSignature::Make(std::move(kernel_inputs),
                                                  std::move(kernel_output),
                                                  hash_options.arity.is_varargs);

  compute::HashAggregateKernel kernel(
      std::move(signature), std::move(init), HashAggregateUdfResize,
      HashAggregateUdfConsume, HashAggregateUdfMerge, HashAggregateUdfFinalize,
      /*ordered=*/false);
  RETURN_NOT_OK(hash_aggregate_func->AddKernel(std::move(kernel)));
  RETURN_NOT_OK(registry->AddFunction(std::move(hash_aggregate_func)));
  return Status::OK();
}
|
||||
|
||||
// Register `function` as both a scalar aggregate and a hash aggregate
// function. The scalar version is registered first; if the hash registration
// then fails, the scalar registration is not rolled back.
Status RegisterAggregateFunction(PyObject* function, UdfWrapperCallback cb,
                                 const UdfOptions& options,
                                 compute::FunctionRegistry* registry) {
  ARROW_RETURN_NOT_OK(RegisterScalarAggregateFunction(function, cb, options, registry));
  ARROW_RETURN_NOT_OK(RegisterHashAggregateFunction(function, cb, options, registry));
  return Status::OK();
}
|
||||
|
||||
/// \brief Look up a registered tabular function and expose its output as a
/// RecordBatchReader.
///
/// The function must take no arguments, be registered as a scalar function of
/// arity zero, and have exactly one kernel whose output type is a fixed struct
/// type. The fields of that struct type form the schema of the returned reader.
/// Each batch read from the reader executes the function once; a zero-length
/// result ends the stream.
///
/// \param[in] func_name name of the function to look up in the registry
/// \param[in] args call arguments; must be empty
/// \param[in] registry registry to search; when null, the registry returned by
///            compute::GetFunctionRegistry() is used
/// \return a reader over the function's batches, or an error status
Result<std::shared_ptr<RecordBatchReader>> CallTabularFunction(
    const std::string& func_name, const std::vector<Datum>& args,
    compute::FunctionRegistry* registry) {
  if (!args.empty()) {
    return Status::NotImplemented("non-empty arguments to tabular function");
  }
  if (registry == NULLPTR) {
    registry = compute::GetFunctionRegistry();
  }

  // Validate the shape of the registered function before building the reader.
  ARROW_ASSIGN_OR_RAISE(auto function, registry->GetFunction(func_name));
  if (function->kind() != compute::Function::SCALAR) {
    return Status::Invalid("tabular function of non-scalar kind");
  }
  const auto function_arity = function->arity();
  if (function_arity.is_varargs || function_arity.num_args != 0) {
    return Status::NotImplemented("tabular function of non-null arity");
  }
  auto scalar_kernels =
      arrow::internal::checked_pointer_cast<compute::ScalarFunction>(function)
          ->kernels();
  if (scalar_kernels.size() != 1) {
    return Status::NotImplemented("tabular function with non-single kernel");
  }

  // The single kernel must produce a fixed struct type: its fields become the schema.
  const compute::ScalarKernel* only_kernel = scalar_kernels.front();
  auto output_type = only_kernel->signature->out_type();
  if (output_type.kind() != compute::OutputType::FIXED) {
    return Status::Invalid("tabular kernel of non-fixed kind");
  }
  auto output_datatype = output_type.type();
  if (output_datatype->id() != Type::STRUCT) {
    return Status::Invalid("tabular kernel with non-struct output");
  }
  const auto& struct_type =
      arrow::internal::checked_cast<const StructType&>(*output_datatype);
  auto schema = ::arrow::schema(struct_type.fields());

  std::vector<TypeHolder> no_input_types;
  ARROW_ASSIGN_OR_RAISE(
      auto func_exec, GetFunctionExecutor(func_name, no_input_types, NULLPTR, registry));

  // Generator invoked once per batch requested from the reader.
  auto read_next_batch = [schema, func_exec = std::move(func_exec)]()
      -> Result<std::shared_ptr<RecordBatch>> {
    std::vector<Datum> no_args;
    // passed_length of -1 or 0 with args.size() of 0 leads to an empty ExecSpanIterator
    // in exec.cc and to never invoking the source function, so 1 is passed instead
    // TODO: GH-33612: Support batch size in user-defined tabular functions
    ARROW_ASSIGN_OR_RAISE(auto result, func_exec->Execute(no_args, /*passed_length=*/1));
    if (!result.is_array()) {
      return Status::Invalid("UDF result of non-array kind");
    }
    std::shared_ptr<Array> struct_array = result.make_array();
    // A zero-length array is the end-of-stream marker.
    if (struct_array->length() == 0) {
      return IterationTraits<std::shared_ptr<RecordBatch>>::End();
    }
    ARROW_ASSIGN_OR_RAISE(auto record_batch,
                          RecordBatch::FromStructArray(std::move(struct_array)));
    if (!schema->Equals(record_batch->schema())) {
      return Status::Invalid("UDF result with shape not conforming to schema");
    }
    return std::move(record_batch);
  };

  return RecordBatchReader::MakeFromIterator(
      MakeFunctionIterator(std::move(read_next_batch)), schema);
}
|
||||
|
||||
} // namespace py
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,81 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/compute/exec.h"
|
||||
#include "arrow/compute/function.h"
|
||||
#include "arrow/compute/registry.h"
|
||||
#include "arrow/python/platform.h"
|
||||
#include "arrow/record_batch.h"
|
||||
#include "arrow/util/iterator.h"
|
||||
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/python/pyarrow.h"
|
||||
#include "arrow/python/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace py {
|
||||
|
||||
// TODO(ARROW-16041): UDF Options are not exposed to the Python
|
||||
// users. This feature will be included when extending to provide advanced
|
||||
// options for the users.
|
||||
struct ARROW_PYTHON_EXPORT UdfOptions {
|
||||
std::string func_name;
|
||||
compute::Arity arity;
|
||||
compute::FunctionDoc func_doc;
|
||||
std::vector<std::shared_ptr<DataType>> input_types;
|
||||
std::shared_ptr<DataType> output_type;
|
||||
};
|
||||
|
||||
/// \brief A context passed as the first argument of UDF functions.
|
||||
struct ARROW_PYTHON_EXPORT UdfContext {
|
||||
MemoryPool* pool;
|
||||
int64_t batch_length;
|
||||
};
|
||||
|
||||
using UdfWrapperCallback = std::function<PyObject*(
|
||||
PyObject* user_function, const UdfContext& context, PyObject* inputs)>;
|
||||
|
||||
/// \brief register a Scalar user-defined-function from Python
|
||||
Status ARROW_PYTHON_EXPORT RegisterScalarFunction(
|
||||
PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options,
|
||||
compute::FunctionRegistry* registry = NULLPTR);
|
||||
|
||||
/// \brief register a Table user-defined-function from Python
|
||||
Status ARROW_PYTHON_EXPORT RegisterTabularFunction(
|
||||
PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options,
|
||||
compute::FunctionRegistry* registry = NULLPTR);
|
||||
|
||||
/// \brief register an Aggregate user-defined-function from Python
|
||||
Status ARROW_PYTHON_EXPORT RegisterAggregateFunction(
|
||||
PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options,
|
||||
compute::FunctionRegistry* registry = NULLPTR);
|
||||
|
||||
/// \brief register a Vector user-defined-function from Python
|
||||
Status ARROW_PYTHON_EXPORT RegisterVectorFunction(
|
||||
PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options,
|
||||
compute::FunctionRegistry* registry = NULLPTR);
|
||||
|
||||
Result<std::shared_ptr<RecordBatchReader>> ARROW_PYTHON_EXPORT
|
||||
CallTabularFunction(const std::string& func_name, const std::vector<Datum>& args,
|
||||
compute::FunctionRegistry* registry = NULLPTR);
|
||||
|
||||
} // namespace py
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,50 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "arrow/python/util.h"
|
||||
|
||||
#include "arrow/array.h"
|
||||
#include "arrow/python/common.h"
|
||||
|
||||
namespace arrow ::py {
|
||||
|
||||
/// \brief Build an int64 array holding `start, start + step, ...` up to, but
/// not including, `stop`.
///
/// An empty array is returned when the range contains no values (a positive
/// step with `stop <= start`, or a negative step with `stop >= start`).
///
/// The element count is computed in unsigned arithmetic so that extreme bounds
/// (for example `start = INT64_MIN` with `stop = INT64_MAX`, or
/// `step = INT64_MIN`) cannot trigger signed integer overflow.
///
/// \param[in] start first value of the sequence
/// \param[in] stop exclusive end of the sequence
/// \param[in] step increment between consecutive values; must not be zero
/// \param[in] pool memory pool used to allocate the values buffer
/// \return the resulting array; Status::Invalid if `step` is zero or if the
///         range holds so many values that its byte size does not fit in int64
Result<std::shared_ptr<Array>> Arange(int64_t start, int64_t stop, int64_t step,
                                      MemoryPool* pool) {
  if (step == 0) {
    return Status::Invalid("Step must not be zero");
  }
  const bool ascending = step > 0;
  if (ascending ? stop <= start : stop >= start) {
    return MakeEmptyArray(int64());
  }

  // |stop - start| and |step| as uint64_t. Unsigned subtraction is modular, so
  // these are exact even when the signed difference or `-step` would overflow.
  const uint64_t distance =
      ascending ? static_cast<uint64_t>(stop) - static_cast<uint64_t>(start)
                : static_cast<uint64_t>(start) - static_cast<uint64_t>(stop);
  const uint64_t stride =
      ascending ? static_cast<uint64_t>(step) : uint64_t{0} - static_cast<uint64_t>(step);

  // Ceiling division; `distance >= 1` here, so `distance - 1` cannot wrap.
  const uint64_t count = (distance - 1) / stride + 1;

  // Reject ranges whose buffer size in bytes would not fit in an int64_t.
  if (count > static_cast<uint64_t>(INT64_MAX) / sizeof(int64_t)) {
    return Status::Invalid("Range is too large to be materialized as an array");
  }
  const int64_t size = static_cast<int64_t>(count);

  std::shared_ptr<Buffer> data_buffer;
  ARROW_ASSIGN_OR_RAISE(
      data_buffer, AllocateBuffer(size * static_cast<int64_t>(sizeof(int64_t)), pool));
  auto values = reinterpret_cast<int64_t*>(data_buffer->mutable_data());

  // Fill incrementally: every stored value lies between `start` and `stop`, so
  // each addition stays in range, whereas `i * step` alone could overflow.
  values[0] = start;
  for (int64_t i = 1; i < size; ++i) {
    values[i] = values[i - 1] + step;
  }

  auto data = ArrayData::Make(int64(), size, {nullptr, data_buffer}, 0);
  return MakeArray(data);
}
|
||||
|
||||
} // namespace arrow::py
|
||||
@@ -0,0 +1,40 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/python/common.h"
|
||||
#include "arrow/python/visibility.h"
|
||||
|
||||
namespace arrow::py {
|
||||
|
||||
/// \brief Create an array of evenly spaced values within a given interval.
|
||||
/// This function is similar to Python's `range` function.
|
||||
/// The resulting array will contain values starting from `start` up to but not
|
||||
/// including `stop`, with a step size of `step`. If `step` is zero, the function
|
||||
/// will return an error.
|
||||
/// The resulting array will have a data type of `int64`.
|
||||
/// \param[in] start initial value of the sequence.
|
||||
/// \param[in] stop final value of the sequence (exclusive).
|
||||
/// \param[in] step step size between consecutive values.
|
||||
/// \param[in] pool Memory pool for any memory allocations.
|
||||
/// \return Result Array
|
||||
ARROW_PYTHON_EXPORT
|
||||
Result<std::shared_ptr<Array>> Arange(int64_t start, int64_t stop, int64_t step,
|
||||
MemoryPool* pool);
|
||||
|
||||
} // namespace arrow::py
|
||||
@@ -0,0 +1,18 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
arrow_install_all_headers("arrow/python/vendored")
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,39 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#if defined(_WIN32) || defined(__CYGWIN__) // Windows
|
||||
# if defined(_MSC_VER)
|
||||
# pragma warning(disable : 4251)
|
||||
# else
|
||||
# pragma GCC diagnostic ignored "-Wattributes"
|
||||
# endif
|
||||
|
||||
# ifdef ARROW_PYTHON_STATIC
|
||||
# define ARROW_PYTHON_EXPORT
|
||||
# elif defined(ARROW_PYTHON_EXPORTING)
|
||||
# define ARROW_PYTHON_EXPORT __declspec(dllexport)
|
||||
# else
|
||||
# define ARROW_PYTHON_EXPORT __declspec(dllimport)
|
||||
# endif
|
||||
|
||||
#else // Not Windows
|
||||
# ifndef ARROW_PYTHON_EXPORT
|
||||
# define ARROW_PYTHON_EXPORT __attribute__((visibility("default")))
|
||||
# endif
|
||||
#endif // Non-Windows
|
||||
Reference in New Issue
Block a user