Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,42 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from libcpp.memory cimport shared_ptr
from pyarrow.includes.libarrow cimport (CArray, CBuffer, CDataType,
CField, CRecordBatch, CSchema,
CTable, CTensor, CSparseCOOTensor,
CSparseCSRMatrix, CSparseCSCMatrix,
CSparseCSFTensor)
cdef extern from "arrow/python/pyarrow.h" namespace "arrow::py":
    # Initialize the pyarrow C API; must be called before any wrap_* helper.
    # Returns -1 with a Python exception set on failure (hence `except -1`).
    cdef int import_pyarrow() except -1

    # Each wrap_* helper converts an Arrow C++ shared_ptr into the
    # corresponding Python-level pyarrow object.
    cdef object wrap_buffer(const shared_ptr[CBuffer]& buffer)
    cdef object wrap_data_type(const shared_ptr[CDataType]& type)
    cdef object wrap_field(const shared_ptr[CField]& field)
    cdef object wrap_schema(const shared_ptr[CSchema]& schema)
    cdef object wrap_array(const shared_ptr[CArray]& sp_array)
    cdef object wrap_tensor(const shared_ptr[CTensor]& sp_tensor)
    # Sparse tensor wrappers, one per supported sparse layout
    # (COO, CSR, CSC, CSF).
    cdef object wrap_sparse_tensor_coo(
        const shared_ptr[CSparseCOOTensor]& sp_sparse_tensor)
    cdef object wrap_sparse_tensor_csr(
        const shared_ptr[CSparseCSRMatrix]& sp_sparse_tensor)
    cdef object wrap_sparse_tensor_csc(
        const shared_ptr[CSparseCSCMatrix]& sp_sparse_tensor)
    cdef object wrap_sparse_tensor_csf(
        const shared_ptr[CSparseCSFTensor]& sp_sparse_tensor)
    cdef object wrap_table(const shared_ptr[CTable]& ctable)
    cdef object wrap_batch(const shared_ptr[CRecordBatch]& cbatch)

View File

@@ -0,0 +1,441 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# flake8: noqa
"""
PyArrow is the python implementation of Apache Arrow.
Apache Arrow is a cross-language development platform for in-memory data.
It specifies a standardized language-independent columnar memory format for
flat and hierarchical data, organized for efficient analytic operations on
modern hardware. It also provides computational libraries and zero-copy
streaming messaging and interprocess communication.
For more information see the official page at https://arrow.apache.org
"""
import gc as _gc
import importlib as _importlib
import os as _os
import platform as _platform
import sys as _sys
import warnings as _warnings
# Resolve __version__: prefer the version file generated at build time,
# then fall back to deriving it from git metadata, then to None.
try:
    from ._generated_version import version as __version__
except ImportError:
    # Package is not installed, parse git tag at runtime
    try:
        import setuptools_scm

        # Code duplicated from setup.py to avoid a dependency on each other
        def parse_git(root, **kwargs):
            """
            Parse function for setuptools_scm that ignores tags for non-C++
            subprojects, e.g. apache-arrow-js-XXX tags.
            """
            from setuptools_scm.git import parse
            # Only match C++/Python release tags (apache-arrow-<digits>...).
            kwargs['describe_command'] = \
                "git describe --dirty --tags --long --match 'apache-arrow-[0-9]*.*'"
            return parse(root, **kwargs)
        __version__ = setuptools_scm.get_version('../',
                                                 parse=parse_git)
    except ImportError:
        # Neither the generated version file nor setuptools_scm is
        # available; leave the version undetermined.
        __version__ = None
import pyarrow.lib as _lib
from pyarrow.lib import (BuildInfo, CppBuildInfo, RuntimeInfo, set_timezone_db_path,
MonthDayNano, VersionInfo, build_info, cpp_build_info,
cpp_version, cpp_version_info, runtime_info,
cpu_count, set_cpu_count, enable_signal_handlers,
io_thread_count, set_io_thread_count)
def show_versions():
    """
    Print various version information, to help with error reporting.
    """
    cpp_info = build_info.cpp_build_info
    # "not indicated" substitutes for an empty package-kind string.
    package_kind = cpp_info.package_kind if cpp_info.package_kind else "not indicated"
    entries = [
        ("Package kind", package_kind),
        ("Arrow C++ library version", cpp_info.version),
        ("Arrow C++ compiler",
         f"{cpp_info.compiler_id} {cpp_info.compiler_version}"),
        ("Arrow C++ compiler flags", cpp_info.compiler_flags),
        ("Arrow C++ git revision", cpp_info.git_id),
        ("Arrow C++ git description", cpp_info.git_description),
        ("Arrow C++ build type", cpp_info.build_type),
        ("PyArrow build type", build_info.build_type),
    ]

    print("pyarrow version info\n--------------------")
    for label, value in entries:
        print(f"{label: <26}: {value: <8}")
def _module_is_available(module):
try:
_importlib.import_module(f'pyarrow.{module}')
except ImportError:
return False
else:
return True
def _filesystem_is_available(fs):
try:
import pyarrow.fs
except ImportError:
return False
try:
getattr(pyarrow.fs, fs)
except (ImportError, AttributeError):
return False
else:
return True
def show_info():
    """
    Print detailed version and platform information, for error reporting
    """
    show_versions()

    # Local helper for the aligned two-column report lines.
    def print_entry(label, value):
        print(f" {label: <20}: {value: <8}")

    print("\nPlatform:")
    print_entry("OS / Arch", f"{_platform.system()} {_platform.machine()}")
    print_entry("SIMD Level", runtime_info().simd_level)
    print_entry("Detected SIMD Level", runtime_info().detected_simd_level)

    # Memory-pool statistics for the currently configured default pool.
    pool = default_memory_pool()
    print("\nMemory:")
    print_entry("Default backend", pool.backend_name)
    print_entry("Bytes allocated", f"{pool.bytes_allocated()} bytes")
    print_entry("Max memory", f"{pool.max_memory()} bytes")
    print_entry("Supported Backends", ', '.join(supported_memory_backends()))

    # Optional components: probed by attempting an import.
    print("\nOptional modules:")
    modules = ["csv", "cuda", "dataset", "feather", "flight", "fs", "gandiva", "json",
               "orc", "parquet"]
    for module in modules:
        status = "Enabled" if _module_is_available(module) else "-"
        print(f" {module: <20}: {status: <8}")

    # Filesystem implementations compiled into this build.
    print("\nFilesystems:")
    filesystems = ["AzureFileSystem", "GcsFileSystem",
                   "HadoopFileSystem", "S3FileSystem"]
    for fs in filesystems:
        status = "Enabled" if _filesystem_is_available(fs) else "-"
        print(f" {fs: <20}: {status: <8}")

    # Compression codecs compiled into the Arrow C++ library.
    print("\nCompression Codecs:")
    codecs = ["brotli", "bz2", "gzip", "lz4_frame", "lz4", "snappy", "zstd"]
    for codec in codecs:
        status = "Enabled" if Codec.is_available(codec) else "-"
        print(f" {codec: <20}: {status: <8}")
from pyarrow.lib import (null, bool_,
int8, int16, int32, int64,
uint8, uint16, uint32, uint64,
time32, time64, timestamp, date32, date64, duration,
month_day_nano_interval,
float16, float32, float64,
binary, string, utf8, binary_view, string_view,
large_binary, large_string, large_utf8,
decimal32, decimal64, decimal128, decimal256,
list_, large_list, list_view, large_list_view,
map_, struct,
union, sparse_union, dense_union,
dictionary,
run_end_encoded,
bool8, fixed_shape_tensor, json_, opaque, uuid,
field,
type_for_alias,
DataType, DictionaryType, StructType,
ListType, LargeListType, FixedSizeListType,
ListViewType, LargeListViewType,
MapType, UnionType, SparseUnionType, DenseUnionType,
TimestampType, Time32Type, Time64Type, DurationType,
FixedSizeBinaryType,
Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type,
BaseExtensionType, ExtensionType,
RunEndEncodedType, Bool8Type, FixedShapeTensorType,
JsonType, OpaqueType, UuidType,
UnknownExtensionType,
register_extension_type, unregister_extension_type,
DictionaryMemo,
KeyValueMetadata,
Field,
Schema,
schema,
unify_schemas,
Array, Tensor,
array, chunked_array, record_batch, nulls, repeat,
SparseCOOTensor, SparseCSRMatrix, SparseCSCMatrix,
SparseCSFTensor,
infer_type, from_numpy_dtype,
arange,
NullArray,
NumericArray, IntegerArray, FloatingPointArray,
BooleanArray,
Int8Array, UInt8Array,
Int16Array, UInt16Array,
Int32Array, UInt32Array,
Int64Array, UInt64Array,
HalfFloatArray, FloatArray, DoubleArray,
ListArray, LargeListArray, FixedSizeListArray,
ListViewArray, LargeListViewArray,
MapArray, UnionArray,
BinaryArray, StringArray,
LargeBinaryArray, LargeStringArray,
BinaryViewArray, StringViewArray,
FixedSizeBinaryArray,
DictionaryArray,
Date32Array, Date64Array, TimestampArray,
Time32Array, Time64Array, DurationArray,
MonthDayNanoIntervalArray,
Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array,
StructArray, ExtensionArray,
RunEndEncodedArray, Bool8Array, FixedShapeTensorArray,
JsonArray, OpaqueArray, UuidArray,
scalar, NA, _NULL as NULL, Scalar,
NullScalar, BooleanScalar,
Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar,
HalfFloatScalar, FloatScalar, DoubleScalar,
Decimal32Scalar, Decimal64Scalar, Decimal128Scalar, Decimal256Scalar,
ListScalar, LargeListScalar, FixedSizeListScalar,
ListViewScalar, LargeListViewScalar,
Date32Scalar, Date64Scalar,
Time32Scalar, Time64Scalar,
TimestampScalar, DurationScalar,
MonthDayNanoIntervalScalar,
BinaryScalar, LargeBinaryScalar, BinaryViewScalar,
StringScalar, LargeStringScalar, StringViewScalar,
FixedSizeBinaryScalar, DictionaryScalar,
MapScalar, StructScalar, UnionScalar,
RunEndEncodedScalar, Bool8Scalar, ExtensionScalar,
FixedShapeTensorScalar, JsonScalar, OpaqueScalar, UuidScalar)
# Buffers, allocation
from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager,
default_cpu_memory_manager)
from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer,
Codec, compress, decompress, allocate_buffer)
from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool,
total_allocated_bytes, set_memory_pool,
default_memory_pool, system_memory_pool,
jemalloc_memory_pool, mimalloc_memory_pool,
logging_memory_pool, proxy_memory_pool,
log_memory_allocations, jemalloc_set_decay_ms,
supported_memory_backends)
# I/O
from pyarrow.lib import (NativeFile, PythonFile,
BufferedInputStream, BufferedOutputStream, CacheOptions,
CompressedInputStream, CompressedOutputStream,
TransformInputStream, transcoding_input_stream,
FixedSizeBufferWriter,
BufferReader, BufferOutputStream,
OSFile, MemoryMappedFile, memory_map,
create_memory_map, MockOutputStream,
input_stream, output_stream,
have_libhdfs)
from pyarrow.lib import (ChunkedArray, RecordBatch, Table, table,
concat_arrays, concat_tables, TableGroupBy,
RecordBatchReader, concat_batches)
# Exceptions
from pyarrow.lib import (ArrowCancelled,
ArrowCapacityError,
ArrowException,
ArrowKeyError,
ArrowIndexError,
ArrowInvalid,
ArrowIOError,
ArrowMemoryError,
ArrowNotImplementedError,
ArrowTypeError,
ArrowSerializationError)
from pyarrow.ipc import serialize_pandas, deserialize_pandas
import pyarrow.ipc as ipc
import pyarrow.types as types
# ----------------------------------------------------------------------
# Deprecations
from pyarrow.util import _deprecate_api, _deprecate_class
# TODO: Deprecate these somehow in the pyarrow namespace
from pyarrow.ipc import (Message, MessageReader, MetadataVersion,
RecordBatchFileReader, RecordBatchFileWriter,
RecordBatchStreamReader, RecordBatchStreamWriter)
# ----------------------------------------------------------------------
# Returning absolute path to the pyarrow include directory (if bundled, e.g. in
# wheels)
def get_include():
    """
    Return absolute path to directory containing Arrow C++ include
    headers. Similar to numpy.get_include
    """
    package_dir = _os.path.dirname(__file__)
    return _os.path.join(package_dir, 'include')
def _get_pkg_config_executable():
return _os.environ.get('PKG_CONFIG', 'pkg-config')
def _has_pkg_config(pkgname):
    # True when pkg-config is installed and knows about `pkgname`.
    import subprocess
    cmd = [_get_pkg_config_executable(), '--exists', pkgname]
    try:
        exit_code = subprocess.call(cmd)
    except FileNotFoundError:
        # pkg-config executable itself is missing.
        return False
    return exit_code == 0
def _read_pkg_config_variable(pkgname, cli_args):
    """
    Run pkg-config for `pkgname` with the extra `cli_args` and return its
    stripped stdout decoded as UTF-8.

    Raises
    ------
    RuntimeError
        If pkg-config exits with a non-zero status; the message includes
        the decoded stderr output.
    """
    import subprocess
    cmd = [_get_pkg_config_executable(), pkgname] + cli_args
    # subprocess.run replaces the manual Popen/communicate dance and is the
    # recommended high-level API since Python 3.5.
    proc = subprocess.run(cmd, stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)
    if proc.returncode != 0:
        raise RuntimeError("pkg-config failed: " + proc.stderr.decode('utf8'))
    return proc.stdout.rstrip().decode('utf8')
def get_libraries():
    """
    Return list of library names to include in the `libraries` argument for C
    or Cython extensions using pyarrow
    """
    libs = ['arrow_python', 'arrow']
    return libs
def create_library_symlinks():
    """
    With Linux and macOS wheels, the bundled shared libraries have an embedded
    ABI version like libarrow.so.17 or libarrow.17.dylib and so linking to them
    with -larrow won't work unless we create symlinks at locations like
    site-packages/pyarrow/libarrow.so. This unfortunate workaround addresses
    prior problems we had with shipping two copies of the shared libraries to
    permit third party projects like turbodbc to build their C++ extensions
    against the pyarrow wheels.

    This function must only be invoked once and only when the shared libraries
    are bundled with the Python package, which should only apply to wheel-based
    installs. It requires write access to the site-packages/pyarrow directory
    and so depending on your system may need to be run with root.
    """
    import glob
    if _sys.platform == 'win32':
        # Windows does not use the versioned-soname linking scheme.
        return
    package_cwd = _os.path.dirname(__file__)

    if _sys.platform == 'linux':
        bundled_libs = glob.glob(_os.path.join(package_cwd, '*.so.*'))

        def get_symlink_path(hard_path):
            # e.g. libarrow.so.17 -> libarrow.so
            return hard_path.rsplit('.', 1)[0]
    else:
        bundled_libs = glob.glob(_os.path.join(package_cwd, '*.*.dylib'))

        def get_symlink_path(hard_path):
            # e.g. libarrow.17.dylib -> libarrow.dylib
            return '.'.join((hard_path.rsplit('.', 2)[0], 'dylib'))

    for lib_hard_path in bundled_libs:
        symlink_path = get_symlink_path(lib_hard_path)
        if _os.path.exists(symlink_path):
            continue
        try:
            _os.symlink(lib_hard_path, symlink_path)
        except PermissionError:
            # Bug fix: the original message contained a literal "{}"
            # placeholder that was never formatted; interpolate the
            # symlink path so the warning is actionable.
            print(f"Tried creating symlink {symlink_path}. If you need to "
                  "link to bundled shared libraries, run "
                  "pyarrow.create_library_symlinks() as root")
def get_library_dirs():
    """
    Return lists of directories likely to contain Arrow C++ libraries for
    linking C or Cython extensions using pyarrow
    """
    package_cwd = _os.path.dirname(__file__)
    library_dirs = [package_cwd]

    def append_library_dir(library_dir):
        # De-duplicate while preserving insertion order.
        if library_dir not in library_dirs:
            library_dirs.append(library_dir)

    # Search library paths via pkg-config. This is necessary if the user
    # installed libarrow and the other shared libraries manually and they
    # are not shipped inside the pyarrow package (see also ARROW-2976).
    # (Fix: removed an unused local that duplicated
    # _get_pkg_config_executable().)
    for pkgname in ["arrow", "arrow_python"]:
        if _has_pkg_config(pkgname):
            library_dir = _read_pkg_config_variable(pkgname,
                                                    ["--libs-only-L"])
            # pkg-config output could be empty if Arrow is installed
            # as a system package.
            if library_dir:
                if not library_dir.startswith("-L"):
                    raise ValueError(
                        "pkg-config --libs-only-L returned unexpected "
                        f"value {library_dir!r}")
                append_library_dir(library_dir[2:])

    if _sys.platform == 'win32':
        # TODO(wesm): Is this necessary, or does setuptools within a conda
        # installation add Library\lib to the linker path for MSVC?
        python_base_install = _os.path.dirname(_sys.executable)
        library_dir = _os.path.join(python_base_install, 'Library', 'lib')
        if _os.path.exists(_os.path.join(library_dir, 'arrow.lib')):
            append_library_dir(library_dir)

        # GH-45530: Add pyarrow.libs dir containing delvewheel-mangled
        # msvcp140.dll
        pyarrow_libs_dir = _os.path.abspath(
            _os.path.join(_os.path.dirname(__file__), _os.pardir, "pyarrow.libs")
        )
        if _os.path.exists(pyarrow_libs_dir):
            append_library_dir(pyarrow_libs_dir)

    # ARROW-4074: Allow for ARROW_HOME to be set to some other directory
    if _os.environ.get('ARROW_HOME'):
        append_library_dir(_os.path.join(_os.environ['ARROW_HOME'], 'lib'))
    else:
        # Python wheels bundle the Arrow libraries in the pyarrow directory.
        append_library_dir(_os.path.dirname(_os.path.abspath(__file__)))

    return library_dirs

View File

@@ -0,0 +1,44 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from pyarrow.lib cimport *
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_acero cimport *
cdef class ExecNodeOptions(_Weakrefable):
    # Shared-ownership handle to the underlying C++ options object.
    cdef:
        shared_ptr[CExecNodeOptions] wrapped

    # Bind this wrapper to an existing C++ options instance.
    cdef void init(self, const shared_ptr[CExecNodeOptions]& sp)
    # Return the wrapped C++ options (callable without the GIL).
    cdef inline shared_ptr[CExecNodeOptions] unwrap(self) nogil
cdef class Declaration(_Weakrefable):
    # Wrapped C++ declaration, stored by value.
    cdef:
        CDeclaration decl

    # Copy an existing C++ declaration into this wrapper.
    cdef void init(self, const CDeclaration& c_decl)

    # Construct a new Python Declaration around a C++ declaration.
    @staticmethod
    cdef wrap(const CDeclaration& c_decl)

    # Return a copy of the wrapped C++ declaration (callable without the GIL).
    cdef inline CDeclaration unwrap(self) nogil

View File

@@ -0,0 +1,610 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# ---------------------------------------------------------------------
# Low-level Acero bindings
# cython: profile=False
# distutils: language = c++
# cython: language_level = 3
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_acero cimport *
from pyarrow.lib cimport (Table, pyarrow_unwrap_table, pyarrow_wrap_table,
RecordBatchReader)
from pyarrow.lib import frombytes, tobytes
from pyarrow._compute cimport (
Expression, FunctionOptions, _ensure_field_ref, _true,
unwrap_null_placement, unwrap_sort_keys
)
cdef class ExecNodeOptions(_Weakrefable):
    """
    Base class for the node options.

    Use one of the subclasses to construct an options object.
    """
    __slots__ = ()  # avoid mistakenly creating attributes

    cdef void init(self, const shared_ptr[CExecNodeOptions]& sp):
        # Take shared ownership of the C++ options object.
        self.wrapped = sp

    cdef inline shared_ptr[CExecNodeOptions] unwrap(self) nogil:
        # Hand out the wrapped C++ options for node construction.
        return self.wrapped
cdef class _TableSourceNodeOptions(ExecNodeOptions):

    def _set_options(self, Table table):
        cdef:
            shared_ptr[CTable] c_table

        # Unwrap the Python Table and build the C++ options around it.
        c_table = pyarrow_unwrap_table(table)
        self.wrapped.reset(
            new CTableSourceNodeOptions(c_table)
        )
class TableSourceNodeOptions(_TableSourceNodeOptions):
    """
    A Source node which accepts a table.

    This is the option class for the "table_source" node factory.

    Parameters
    ----------
    table : pyarrow.Table
        The table which acts as the data source.
    """

    def __init__(self, Table table):
        self._set_options(table)
cdef class _FilterNodeOptions(ExecNodeOptions):

    def _set_options(self, Expression filter_expression not None):
        # `not None` rejects a missing expression at the Cython level.
        self.wrapped.reset(
            new CFilterNodeOptions(<CExpression>filter_expression.unwrap())
        )
class FilterNodeOptions(_FilterNodeOptions):
    """
    Make a node which excludes some rows from batches passed through it.

    This is the option class for the "filter" node factory.

    The "filter" operation provides an option to define data filtering
    criteria. It selects rows where the given expression evaluates to true.
    Filters can be written using pyarrow.compute.Expression, and the
    expression must have a return type of boolean.

    Parameters
    ----------
    filter_expression : pyarrow.compute.Expression
    """

    def __init__(self, Expression filter_expression):
        self._set_options(filter_expression)
cdef class _ProjectNodeOptions(ExecNodeOptions):

    def _set_options(self, expressions, names=None):
        cdef:
            Expression expr
            vector[CExpression] c_expressions
            vector[c_string] c_names

        # Convert each Python Expression to its C++ counterpart.
        for expr in expressions:
            c_expressions.push_back(expr.unwrap())

        if names is not None:
            # Output names are optional, but when given must pair 1:1
            # with the expressions.
            if len(names) != len(expressions):
                raise ValueError(
                    "The number of names should be equal to the number of expressions"
                )

            for name in names:
                c_names.push_back(<c_string>tobytes(name))
            self.wrapped.reset(
                new CProjectNodeOptions(c_expressions, c_names)
            )
        else:
            # Without names, the C++ side derives column names itself.
            self.wrapped.reset(
                new CProjectNodeOptions(c_expressions)
            )
class ProjectNodeOptions(_ProjectNodeOptions):
    """
    Make a node which executes expressions on input batches,
    producing batches of the same length with new columns.

    This is the option class for the "project" node factory.

    The "project" operation rearranges, deletes, transforms, and
    creates columns. Each output column is computed by evaluating
    an expression against the source record batch. These must be
    scalar expressions (expressions consisting of scalar literals,
    field references and scalar functions, i.e. elementwise functions
    that return one value for each input row independent of the value
    of all other rows).

    Parameters
    ----------
    expressions : list of pyarrow.compute.Expression
        List of expressions to evaluate against the source batch. This must
        be scalar expressions.
    names : list of str, optional
        List of names for each of the output columns (same length as
        `expressions`). If `names` is not provided, the string
        representations of exprs will be used.
    """

    def __init__(self, expressions, names=None):
        self._set_options(expressions, names)
cdef class _AggregateNodeOptions(ExecNodeOptions):

    def _set_options(self, aggregates, keys=None):
        cdef:
            CAggregate c_aggr
            vector[CAggregate] c_aggregations
            vector[CFieldRef] c_keys

        # Each aggregate spec is (target column(s), function name,
        # options-or-None, output name).
        for arg_names, func_name, opts, name in aggregates:
            c_aggr.function = tobytes(func_name)
            if opts is not None:
                c_aggr.options = (<FunctionOptions?>opts).wrapped
            else:
                # Null options: the function's defaults apply.
                c_aggr.options = <shared_ptr[CFunctionOptions]>nullptr
            # A single field reference is accepted as shorthand for a
            # one-element target list.
            if not isinstance(arg_names, (list, tuple)):
                arg_names = [arg_names]
            for arg in arg_names:
                c_aggr.target.push_back(_ensure_field_ref(arg))
            c_aggr.name = tobytes(name)

            # move() transfers c_aggr's contents; c_aggr is reused on the
            # next iteration by reassigning each field.
            c_aggregations.push_back(move(c_aggr))

        if keys is None:
            keys = []
        for name in keys:
            c_keys.push_back(_ensure_field_ref(name))

        self.wrapped.reset(
            new CAggregateNodeOptions(c_aggregations, c_keys)
        )
class AggregateNodeOptions(_AggregateNodeOptions):
    """
    Make a node which aggregates input batches, optionally grouped by keys.

    This is the option class for the "aggregate" node factory.

    Acero supports two types of aggregates: "scalar" aggregates,
    and "hash" aggregates. Scalar aggregates reduce an array or scalar
    input to a single scalar output (e.g. computing the mean of a column).
    Hash aggregates act like GROUP BY in SQL and first partition data
    based on one or more key columns, then reduce the data in each partition.
    The aggregate node supports both types of computation, and can compute
    any number of aggregations at once.

    Parameters
    ----------
    aggregates : list of tuples
        Aggregations which will be applied to the targeted fields.
        Specified as a list of tuples, where each tuple is one aggregation
        specification and consists of: aggregation target column(s) followed
        by function name, aggregation function options object and the
        output field name.
        The target column(s) specification can be a single field reference,
        an empty list or a list of fields unary, nullary and n-ary aggregation
        functions respectively. Each field reference can be a string
        column name or expression.
    keys : list of field references, optional
        Keys by which aggregations will be grouped. Each key can reference
        a field using a string name or expression.
    """

    def __init__(self, aggregates, keys=None):
        self._set_options(aggregates, keys)
cdef class _OrderByNodeOptions(ExecNodeOptions):

    def _set_options(self, sort_keys, null_placement):
        # Build the C++ Ordering from the (name, order) sort keys and the
        # null-placement spec; allow_str=False means a bare string is not
        # accepted as the sort-keys argument here.
        self.wrapped.reset(
            new COrderByNodeOptions(
                COrdering(unwrap_sort_keys(sort_keys, allow_str=False),
                          unwrap_null_placement(null_placement))
            )
        )
class OrderByNodeOptions(_OrderByNodeOptions):
    """
    Make a node which applies a new ordering to the data.

    Currently this node works by accumulating all data, sorting, and then
    emitting the new data with an updated batch index.
    Larger-than-memory sort is not currently supported.

    This is the option class for the "order_by" node factory.

    Parameters
    ----------
    sort_keys : sequence of (name, order) tuples
        Names of field/column keys to sort the input on,
        along with the order each field/column is sorted in.
        Accepted values for `order` are "ascending", "descending".
        Each field reference can be a string column name or expression.
    null_placement : str, default "at_end"
        Where nulls in input should be sorted, only applying to
        columns/fields mentioned in `sort_keys`.
        Accepted values are "at_start", "at_end".
    """

    def __init__(self, sort_keys=(), *, null_placement="at_end"):
        self._set_options(sort_keys, null_placement)
cdef class _HashJoinNodeOptions(ExecNodeOptions):

    def _set_options(
        self, join_type, left_keys, right_keys, left_output=None, right_output=None,
        output_suffix_for_left="", output_suffix_for_right="", Expression filter_expression=None,
    ):
        cdef:
            CJoinType c_join_type
            vector[CFieldRef] c_left_keys
            vector[CFieldRef] c_right_keys
            vector[CFieldRef] c_left_output
            vector[CFieldRef] c_right_output
            CExpression c_filter_expression

        # join type: map the string spelling onto the C++ enum.
        if join_type == "left semi":
            c_join_type = CJoinType_LEFT_SEMI
        elif join_type == "right semi":
            c_join_type = CJoinType_RIGHT_SEMI
        elif join_type == "left anti":
            c_join_type = CJoinType_LEFT_ANTI
        elif join_type == "right anti":
            c_join_type = CJoinType_RIGHT_ANTI
        elif join_type == "inner":
            c_join_type = CJoinType_INNER
        elif join_type == "left outer":
            c_join_type = CJoinType_LEFT_OUTER
        elif join_type == "right outer":
            c_join_type = CJoinType_RIGHT_OUTER
        elif join_type == "full outer":
            c_join_type = CJoinType_FULL_OUTER
        else:
            raise ValueError("Unsupported join type")

        # left/right keys: a single field reference is accepted as
        # shorthand for a one-element list.
        if not isinstance(left_keys, (list, tuple)):
            left_keys = [left_keys]
        for key in left_keys:
            c_left_keys.push_back(_ensure_field_ref(key))
        if not isinstance(right_keys, (list, tuple)):
            right_keys = [right_keys]
        for key in right_keys:
            c_right_keys.push_back(_ensure_field_ref(key))

        # No residual filter means "always true".
        if filter_expression is None:
            c_filter_expression = _true
        else:
            c_filter_expression = filter_expression.unwrap()

        # left/right output fields: only when BOTH are given do we use the
        # C++ constructor overload with explicit output field lists;
        # otherwise the overload that outputs all valid fields is used.
        if left_output is not None and right_output is not None:
            for colname in left_output:
                c_left_output.push_back(_ensure_field_ref(colname))
            for colname in right_output:
                c_right_output.push_back(_ensure_field_ref(colname))

            self.wrapped.reset(
                new CHashJoinNodeOptions(
                    c_join_type, c_left_keys, c_right_keys,
                    c_left_output, c_right_output,
                    c_filter_expression,
                    <c_string>tobytes(output_suffix_for_left),
                    <c_string>tobytes(output_suffix_for_right)
                )
            )
        else:
            self.wrapped.reset(
                new CHashJoinNodeOptions(
                    c_join_type, c_left_keys, c_right_keys,
                    c_filter_expression,
                    <c_string>tobytes(output_suffix_for_left),
                    <c_string>tobytes(output_suffix_for_right)
                )
            )
class HashJoinNodeOptions(_HashJoinNodeOptions):
    """
    Make a node which implements join operation using hash join strategy.

    This is the option class for the "hashjoin" node factory.

    Parameters
    ----------
    join_type : str
        Type of join. One of "left semi", "right semi", "left anti",
        "right anti", "inner", "left outer", "right outer", "full outer".
    left_keys : str, Expression or list
        Key fields from left input. Each key can be a string column name
        or a field expression, or a list of such field references.
    right_keys : str, Expression or list
        Key fields from right input. See `left_keys` for details.
    left_output : list, optional
        List of output fields passed from left input. If left and right
        output fields are not specified, all valid fields from both left and
        right input will be output. Each field can be a string column name
        or a field expression.
    right_output : list, optional
        List of output fields passed from right input. If left and right
        output fields are not specified, all valid fields from both left and
        right input will be output. Each field can be a string column name
        or a field expression.
    output_suffix_for_left : str
        Suffix added to names of output fields coming from left input
        (used to distinguish, if necessary, between fields of the same
        name in left and right input and can be left empty if there are
        no name collisions).
    output_suffix_for_right : str
        Suffix added to names of output fields coming from right input,
        see `output_suffix_for_left` for details.
    filter_expression : pyarrow.compute.Expression
        Residual filter which is applied to matching row.
    """

    def __init__(
        self, join_type, left_keys, right_keys, left_output=None, right_output=None,
        output_suffix_for_left="", output_suffix_for_right="", filter_expression=None,
    ):
        self._set_options(
            join_type, left_keys, right_keys, left_output, right_output,
            output_suffix_for_left, output_suffix_for_right, filter_expression
        )
cdef class _AsofJoinNodeOptions(ExecNodeOptions):

    def _set_options(self, left_on, left_by, right_on, right_by, tolerance):
        cdef:
            vector[CFieldRef] c_left_by
            vector[CFieldRef] c_right_by
            CAsofJoinKeys c_left_keys
            CAsofJoinKeys c_right_keys
            vector[CAsofJoinKeys] c_input_keys

        # Prepare left AsofJoinNodeOption::Keys
        # (a single "by" field reference is accepted as shorthand).
        if not isinstance(left_by, (list, tuple)):
            left_by = [left_by]
        for key in left_by:
            c_left_by.push_back(_ensure_field_ref(key))

        c_left_keys.on_key = _ensure_field_ref(left_on)
        c_left_keys.by_key = c_left_by

        c_input_keys.push_back(c_left_keys)

        # Prepare right AsofJoinNodeOption::Keys
        if not isinstance(right_by, (list, tuple)):
            right_by = [right_by]
        for key in right_by:
            c_right_by.push_back(_ensure_field_ref(key))

        c_right_keys.on_key = _ensure_field_ref(right_on)
        c_right_keys.by_key = c_right_by

        c_input_keys.push_back(c_right_keys)

        self.wrapped.reset(
            new CAsofJoinNodeOptions(
                c_input_keys,
                tolerance,
            )
        )
class AsofJoinNodeOptions(_AsofJoinNodeOptions):
    """
    Make a node which implements 'as of join' operation.

    This is the option class for the "asofjoin" node factory.

    Parameters
    ----------
    left_on : str, Expression
        The left key on which the join operation should be performed.
        Can be a string column name or a field expression.

        An inexact match is used on the "on" key, i.e. a row is considered a
        match if and only if ``right.on - left.on`` is in the range
        ``[min(0, tolerance), max(0, tolerance)]``.

        The input dataset must be sorted by the "on" key. Must be a single
        field of a common type.

        Currently, the "on" key must be an integer, date, or timestamp type.
    left_by: str, Expression or list
        The left keys on which the join operation should be performed.
        Exact equality is used for each field of the "by" keys.
        Each key can be a string column name or a field expression,
        or a list of such field references.
    right_on : str, Expression
        The right key on which the join operation should be performed.
        See `left_on` for details.
    right_by: str, Expression or list
        The right keys on which the join operation should be performed.
        See `left_by` for details.
    tolerance : int
        The tolerance to use for the asof join. The tolerance is interpreted in
        the same units as the "on" key.
    """

    def __init__(self, left_on, left_by, right_on, right_by, tolerance):
        self._set_options(left_on, left_by, right_on, right_by, tolerance)
cdef class Declaration(_Weakrefable):
    """
    Helper class for declaring the nodes of an ExecPlan.
    A Declaration represents an unconstructed ExecNode, and potentially
    more since its inputs may also be Declarations or when constructed
    with ``from_sequence``.
    The possible ExecNodes to use are registered with a name,
    the "factory name", and need to be specified using this name, together
    with its corresponding ExecNodeOptions subclass.
    Parameters
    ----------
    factory_name : str
        The ExecNode factory name, such as "table_source", "filter",
        "project" etc. See the ExecNodeOptions subclasses for the exact
        factory names to use.
    options : ExecNodeOptions
        Corresponding ExecNodeOptions subclass (matching the factory name).
    inputs : list of Declaration, optional
        Input nodes for this declaration. Optional if the node is a source
        node, or when the declaration gets combined later with
        ``from_sequence``.
    Returns
    -------
    Declaration
    """
    # Adopt the C++ declaration (copied by value into self.decl).
    cdef void init(self, const CDeclaration& c_decl):
        self.decl = c_decl
    @staticmethod
    cdef wrap(const CDeclaration& c_decl):
        # Allocate without running __init__, then adopt c_decl.
        cdef Declaration self = Declaration.__new__(Declaration)
        self.init(c_decl)
        return self
    cdef inline CDeclaration unwrap(self) nogil:
        # Returns a by-value copy of the wrapped CDeclaration; declared nogil
        # so it can be called while the GIL is released (see to_table).
        return self.decl
    def __init__(self, factory_name, ExecNodeOptions options, inputs=None):
        cdef:
            c_string c_factory_name
            CDeclaration c_decl
            vector[CDeclaration.Input] c_inputs
        c_factory_name = tobytes(factory_name)
        if inputs is not None:
            # Each input Declaration is wrapped as a CDeclaration.Input.
            for ipt in inputs:
                c_inputs.push_back(
                    CDeclaration.Input((<Declaration>ipt).unwrap())
                )
        c_decl = CDeclaration(c_factory_name, c_inputs, options.unwrap())
        self.init(c_decl)
    @staticmethod
    def from_sequence(decls):
        """
        Convenience factory for the common case of a simple sequence of nodes.
        Each of the declarations will be appended to the inputs of the
        subsequent declaration, and the final modified declaration will
        be returned.
        Parameters
        ----------
        decls : list of Declaration
        Returns
        -------
        Declaration
        """
        cdef:
            vector[CDeclaration] c_decls
            CDeclaration c_decl
        for decl in decls:
            c_decls.push_back((<Declaration> decl).unwrap())
        # The actual chaining of inputs is done in C++ by
        # CDeclaration.Sequence.
        c_decl = CDeclaration.Sequence(c_decls)
        return Declaration.wrap(c_decl)
    def __str__(self):
        # Human-readable rendering is produced by Acero itself.
        return frombytes(GetResultValue(DeclarationToString(self.decl)))
    def __repr__(self):
        return f"<pyarrow.acero.Declaration>\n{self}"
    def to_table(self, bint use_threads=True):
        """
        Run the declaration and collect the results into a table.
        This method will implicitly add a sink node to the declaration
        to collect results into a table. It will then create an ExecPlan
        from the declaration, start the exec plan, block until the plan
        has finished, and return the created table.
        Parameters
        ----------
        use_threads : bool, default True
            If set to False, then all CPU work will be done on the calling
            thread. I/O tasks will still happen on the I/O executor
            and may be multi-threaded (but should not use significant CPU
            resources).
        Returns
        -------
        pyarrow.Table
        """
        cdef:
            shared_ptr[CTable] c_table
        # Release the GIL while the plan executes (this call blocks until
        # the plan has finished).
        with nogil:
            c_table = GetResultValue(DeclarationToTable(self.unwrap(), use_threads))
        return pyarrow_wrap_table(c_table)
    def to_reader(self, bint use_threads=True):
        """Run the declaration and return results as a RecordBatchReader.
        For details about the parameters, see `to_table`.
        Returns
        -------
        pyarrow.RecordBatchReader
        """
        cdef:
            RecordBatchReader reader
        reader = RecordBatchReader.__new__(RecordBatchReader)
        # Transfer ownership of the C++ reader into the Python wrapper
        # (release() hands the raw pointer over to reader.reader).
        reader.reader.reset(
            GetResultValue(DeclarationToReader(self.unwrap(), use_threads)).release()
        )
        return reader

View File

@@ -0,0 +1,188 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from pyarrow.lib import frombytes, tobytes
from pyarrow.includes.libarrow_fs cimport *
from pyarrow._fs cimport FileSystem
cdef class AzureFileSystem(FileSystem):
    """
    Azure Blob Storage backed FileSystem implementation
    This implementation supports flat namespace and hierarchical namespace (HNS) a.k.a.
    Data Lake Gen2 storage accounts. HNS will be automatically detected and HNS specific
    features will be used when they provide a performance advantage. Azurite emulator is
    also supported. Note: `/` is the only supported delimiter.
    The storage account is considered the root of the filesystem. When enabled, containers
    will be created or deleted during relevant directory operations. Obviously, this also
    requires authentication with the additional permissions.
    By default `DefaultAzureCredential <https://github.com/Azure/azure-sdk-for-cpp/blob/main/sdk/identity/azure-identity/README.md#defaultazurecredential>`__
    is used for authentication. This means it will try several types of authentication
    and go with the first one that works. If any authentication parameters are provided when
    initialising the FileSystem, they will be used instead of the default credential.
    Parameters
    ----------
    account_name : str
        Azure Blob Storage account name. This is the globally unique identifier for the
        storage account.
    account_key : str, default None
        Account key of the storage account. If sas_token and account_key are None the
        default credential will be used. The parameters account_key and sas_token are
        mutually exclusive.
    blob_storage_authority : str, default None
        hostname[:port] of the Blob Service. Defaults to `.blob.core.windows.net`. Useful
        for connecting to a local emulator, like Azurite.
    blob_storage_scheme : str, default None
        Either `http` or `https`. Defaults to `https`. Useful for connecting to a local
        emulator, like Azurite.
    client_id : str, default None
        The client ID (Application ID) for Azure Active Directory authentication.
        Its interpretation depends on the credential type being used:
        - For `ClientSecretCredential`: It is the Application (client) ID of your
          registered Azure AD application (Service Principal). It must be provided
          together with `tenant_id` and `client_secret` to use ClientSecretCredential.
        - For `ManagedIdentityCredential`: It is the client ID of a specific
          user-assigned managed identity. This is only necessary if you are using a
          user-assigned managed identity and need to explicitly specify which one
          (e.g., if the resource has multiple user-assigned identities). For
          system-assigned managed identities, this parameter is typically not required.
    client_secret : str, default None
        Client secret for Azure Active Directory authentication. Must be provided together
        with `tenant_id` and `client_id` to use ClientSecretCredential.
    dfs_storage_authority : str, default None
        hostname[:port] of the Data Lake Gen 2 Service. Defaults to
        `.dfs.core.windows.net`. Useful for connecting to a local emulator, like Azurite.
    dfs_storage_scheme : str, default None
        Either `http` or `https`. Defaults to `https`. Useful for connecting to a local
        emulator, like Azurite.
    sas_token : str, default None
        SAS token for the storage account, used as an alternative to account_key. If sas_token
        and account_key are None the default credential will be used. The parameters
        account_key and sas_token are mutually exclusive.
    tenant_id : str, default None
        Tenant ID for Azure Active Directory authentication. Must be provided together with
        `client_id` and `client_secret` to use ClientSecretCredential.
    Examples
    --------
    >>> from pyarrow import fs
    >>> azure_fs = fs.AzureFileSystem(account_name='myaccount')
    >>> azurite_fs = fs.AzureFileSystem(
    ...     account_name='devstoreaccount1',
    ...     account_key='Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==',
    ...     blob_storage_authority='127.0.0.1:10000',
    ...     dfs_storage_authority='127.0.0.1:10000',
    ...     blob_storage_scheme='http',
    ...     dfs_storage_scheme='http',
    ... )
    For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`.
    """
    cdef:
        CAzureFileSystem* azurefs
        # Credential material is stashed on the Python wrapper because it is
        # read back by __reduce__ for pickling (only the fields that were
        # actually provided get set; the rest stay empty c_strings).
        c_string account_key
        c_string sas_token
        c_string tenant_id
        c_string client_id
        c_string client_secret
    def __init__(self, account_name, *, account_key=None, blob_storage_authority=None,
                 blob_storage_scheme=None, client_id=None, client_secret=None,
                 dfs_storage_authority=None, dfs_storage_scheme=None,
                 sas_token=None, tenant_id=None):
        cdef:
            CAzureOptions options
            shared_ptr[CAzureFileSystem] wrapped
        options.account_name = tobytes(account_name)
        # Endpoint overrides: only set when provided (falsy values, including
        # empty strings, leave the C++-side defaults in place).
        if blob_storage_authority:
            options.blob_storage_authority = tobytes(blob_storage_authority)
        if dfs_storage_authority:
            options.dfs_storage_authority = tobytes(dfs_storage_authority)
        if blob_storage_scheme:
            options.blob_storage_scheme = tobytes(blob_storage_scheme)
        if dfs_storage_scheme:
            options.dfs_storage_scheme = tobytes(dfs_storage_scheme)
        # Credential selection, in order of precedence: Azure AD parameters
        # (client_id/tenant_id/client_secret), then account_key, then
        # sas_token, then the SDK's DefaultAzureCredential chain.
        if account_key and sas_token:
            raise ValueError("Cannot specify both account_key and sas_token.")
        if (tenant_id or client_id or client_secret):
            if not client_id:
                raise ValueError("client_id must be specified")
            if not tenant_id and not client_secret:
                # Only client_id given: user-assigned managed identity.
                options.ConfigureManagedIdentityCredential(tobytes(client_id))
                self.client_id = tobytes(client_id)
            elif tenant_id and client_secret:
                # Full service-principal triple: client-secret credential.
                options.ConfigureClientSecretCredential(
                    tobytes(tenant_id), tobytes(client_id), tobytes(client_secret)
                )
                self.tenant_id = tobytes(tenant_id)
                self.client_id = tobytes(client_id)
                self.client_secret = tobytes(client_secret)
            else:
                # Exactly one of tenant_id/client_secret given: ambiguous.
                raise ValueError(
                    "Invalid Azure credential configuration: "
                    "For ManagedIdentityCredential, provide only client_id. "
                    "For ClientSecretCredential, provide tenant_id, client_id, and client_secret."
                )
        elif account_key:
            options.ConfigureAccountKeyCredential(tobytes(account_key))
            self.account_key = tobytes(account_key)
        elif sas_token:
            options.ConfigureSASCredential(tobytes(sas_token))
            self.sas_token = tobytes(sas_token)
        else:
            # No explicit credentials: let DefaultAzureCredential try its
            # chain of authentication methods.
            options.ConfigureDefaultCredential()
        with nogil:
            wrapped = GetResultValue(CAzureFileSystem.Make(options))
        self.init(<shared_ptr[CFileSystem]> wrapped)
    cdef init(self, const shared_ptr[CFileSystem]& wrapped):
        # Chain up to FileSystem.init, then keep a typed pointer for
        # Azure-specific access (used by __reduce__ via options()).
        FileSystem.init(self, wrapped)
        self.azurefs = <CAzureFileSystem*> wrapped.get()
    @staticmethod
    def _reconstruct(kwargs):
        # __reduce__ doesn't allow passing named arguments directly to the
        # reconstructor, hence this wrapper.
        return AzureFileSystem(**kwargs)
    def __reduce__(self):
        # Secrets come from the copies stashed on self (CAzureOptions does not
        # hand them back); fields that were never set decode to "" and are
        # treated as "not provided" by __init__ on reconstruction.
        cdef CAzureOptions opts = self.azurefs.options()
        return (
            AzureFileSystem._reconstruct, (dict(
                account_name=frombytes(opts.account_name),
                account_key=frombytes(self.account_key),
                blob_storage_authority=frombytes(opts.blob_storage_authority),
                blob_storage_scheme=frombytes(opts.blob_storage_scheme),
                client_id=frombytes(self.client_id),
                client_secret=frombytes(self.client_secret),
                dfs_storage_authority=frombytes(opts.dfs_storage_authority),
                dfs_storage_scheme=frombytes(opts.dfs_storage_scheme),
                sas_token=frombytes(self.sas_token),
                tenant_id=frombytes(self.tenant_id)
            ),))

View File

@@ -0,0 +1,72 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from pyarrow.lib cimport *
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
cdef class UdfContext(_Weakrefable):
    # Per-call context handed to user-defined functions; wraps the C++
    # CUdfContext by value. Implementation lives in the .pyx file.
    cdef:
        CUdfContext c_context
    cdef void init(self, const CUdfContext& c_context)
cdef class FunctionOptions(_Weakrefable):
    # Base wrapper class for compute function options.
    cdef:
        shared_ptr[CFunctionOptions] wrapped
    cdef const CFunctionOptions* get_options(self) except NULL
    cdef void init(self, const shared_ptr[CFunctionOptions]& sp)
    cdef inline shared_ptr[CFunctionOptions] unwrap(self)
cdef class _SortOptions(FunctionOptions):
    pass
# Bind the `filter` expression against `schema`, returning the bound C++
# expression (declared here so sibling Cython modules can cimport it).
cdef CExpression _bind(Expression filter, Schema schema) except *
cdef class Expression(_Weakrefable):
    # Python wrapper around a C++ CExpression held by value.
    cdef:
        CExpression expr
    cdef void init(self, const CExpression& sp)
    @staticmethod
    cdef wrap(const CExpression& sp)
    cdef inline CExpression unwrap(self)
    @staticmethod
    cdef Expression _expr_or_scalar(object expr)
# Module-level constant: the always-true expression.
cdef CExpression _true
# Shared helpers for converting Python arguments to Arrow C++ types.
cdef CFieldRef _ensure_field_ref(value) except *
cdef vector[CSortKey] unwrap_sort_keys(sort_keys, allow_str=*) except *
cdef CSortOrder unwrap_sort_order(order) except *
cdef CNullPlacement unwrap_null_placement(null_placement) except *

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,56 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Custom documentation additions for compute functions.
"""
# Hand-written "Examples" sections that get appended to the auto-generated
# docstrings of the corresponding compute functions, keyed by function name.
function_doc_additions = {
    "filter": """
Examples
--------
>>> import pyarrow as pa
>>> arr = pa.array(["a", "b", "c", None, "e"])
>>> mask = pa.array([True, False, None, False, True])
>>> arr.filter(mask)
<pyarrow.lib.StringArray object at ...>
[
  "a",
  "e"
]
>>> arr.filter(mask, null_selection_behavior='emit_null')
<pyarrow.lib.StringArray object at ...>
[
  "a",
  null,
  "e"
]
""",
    "mode": """
Examples
--------
>>> import pyarrow as pa
>>> import pyarrow.compute as pc
>>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2])
>>> modes = pc.mode(arr, 2)
>>> modes[0]
<pyarrow.StructScalar: [('mode', 2), ('count', 5)]>
>>> modes[1]
<pyarrow.StructScalar: [('mode', 1), ('count', 2)]>
""",
}

View File

@@ -0,0 +1,55 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport _Weakrefable
cdef class ConvertOptions(_Weakrefable):
    # Wraps CCSVConvertOptions (how CSV values are converted to Arrow types).
    cdef:
        unique_ptr[CCSVConvertOptions] options
    @staticmethod
    cdef ConvertOptions wrap(CCSVConvertOptions options)
cdef class ParseOptions(_Weakrefable):
    # Wraps CCSVParseOptions; the Python-level invalid-row handler is kept
    # here as well (presumably to keep it alive while registered with the
    # C++ options — see the .pyx implementation).
    cdef:
        unique_ptr[CCSVParseOptions] options
        object _invalid_row_handler
    @staticmethod
    cdef ParseOptions wrap(CCSVParseOptions options)
cdef class ReadOptions(_Weakrefable):
    # Wraps CCSVReadOptions; `encoding` is tracked on the Python side.
    cdef:
        unique_ptr[CCSVReadOptions] options
        public object encoding
    @staticmethod
    cdef ReadOptions wrap(CCSVReadOptions options)
cdef class WriteOptions(_Weakrefable):
    # Wraps CCSVWriteOptions (CSV writing settings).
    cdef:
        unique_ptr[CCSVWriteOptions] options
    @staticmethod
    cdef WriteOptions wrap(CCSVWriteOptions options)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,67 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from pyarrow.lib cimport *
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_cuda cimport *
cdef class Context(_Weakrefable):
    # Wraps a CUDA context together with the device number it belongs to.
    cdef:
        shared_ptr[CCudaContext] context
        int device_number
    cdef void init(self, const shared_ptr[CCudaContext]& ctx)
cdef class IpcMemHandle(_Weakrefable):
    # Handle for sharing device memory across processes (CUDA IPC).
    cdef:
        shared_ptr[CCudaIpcMemHandle] handle
    cdef void init(self, shared_ptr[CCudaIpcMemHandle]& h)
cdef class CudaBuffer(Buffer):
    # Device-resident buffer; `base` keeps the owning Python object alive.
    cdef:
        shared_ptr[CCudaBuffer] cuda_buffer
        object base
    cdef void init_cuda(self,
                        const shared_ptr[CCudaBuffer]& buffer,
                        object base)
cdef class HostBuffer(Buffer):
    # Host-side buffer allocated through the CUDA host allocator.
    cdef:
        shared_ptr[CCudaHostBuffer] host_buffer
    cdef void init_host(self, const shared_ptr[CCudaHostBuffer]& buffer)
cdef class BufferReader(NativeFile):
    # File-like reader over a CudaBuffer.
    cdef:
        CCudaBufferReader* reader
        CudaBuffer buffer
cdef class BufferWriter(NativeFile):
    # File-like writer into a CudaBuffer.
    cdef:
        CCudaBufferWriter* writer
        CudaBuffer buffer

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,183 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
"""Dataset is currently unstable. APIs subject to change without notice."""
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow_dataset cimport *
from pyarrow.lib cimport *
from pyarrow._fs cimport FileSystem, FileInfo
# Build a CFileSource from a path or file object (optionally with an explicit
# filesystem and known file size).
cdef CFileSource _make_file_source(object file, FileSystem filesystem=*, object file_size=*)
cdef class DatasetFactory(_Weakrefable):
    # NOTE(review): SharedPtrNoGIL appears to be a shared_ptr variant declared
    # elsewhere in the project (presumably releasing the GIL on destruction) —
    # confirm at its declaration site.
    cdef:
        SharedPtrNoGIL[CDatasetFactory] wrapped
        CDatasetFactory* factory
    cdef init(self, const shared_ptr[CDatasetFactory]& sp)
    @staticmethod
    cdef wrap(const shared_ptr[CDatasetFactory]& sp)
    cdef inline shared_ptr[CDatasetFactory] unwrap(self) nogil
cdef class Dataset(_Weakrefable):
    cdef:
        SharedPtrNoGIL[CDataset] wrapped
        CDataset* dataset
        # Python-level default scan options carried alongside the C++ dataset.
        public dict _scan_options
    cdef void init(self, const shared_ptr[CDataset]& sp)
    @staticmethod
    cdef wrap(const shared_ptr[CDataset]& sp)
    cdef shared_ptr[CDataset] unwrap(self) nogil
cdef class Scanner(_Weakrefable):
    cdef:
        SharedPtrNoGIL[CScanner] wrapped
        CScanner* scanner
    cdef void init(self, const shared_ptr[CScanner]& sp)
    @staticmethod
    cdef wrap(const shared_ptr[CScanner]& sp)
    cdef shared_ptr[CScanner] unwrap(self)
    @staticmethod
    cdef shared_ptr[CScanOptions] _make_scan_options(Dataset dataset, dict py_scanoptions) except *
cdef class FragmentScanOptions(_Weakrefable):
    cdef:
        shared_ptr[CFragmentScanOptions] wrapped
    cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp)
    @staticmethod
    cdef wrap(const shared_ptr[CFragmentScanOptions]& sp)
cdef class FileFormat(_Weakrefable):
    cdef:
        shared_ptr[CFileFormat] wrapped
        CFileFormat* format
    cdef void init(self, const shared_ptr[CFileFormat]& sp)
    @staticmethod
    cdef wrap(const shared_ptr[CFileFormat]& sp)
    cdef inline shared_ptr[CFileFormat] unwrap(self)
    cdef _set_default_fragment_scan_options(self, FragmentScanOptions options)
    # Return a WrittenFile after a file was written.
    # May be overridden by subclasses, e.g. to add metadata.
    cdef WrittenFile _finish_write(self, path, base_dir,
                                   CFileWriter* file_writer)
cdef class FileWriteOptions(_Weakrefable):
    cdef:
        shared_ptr[CFileWriteOptions] wrapped
        CFileWriteOptions* c_options
    cdef void init(self, const shared_ptr[CFileWriteOptions]& sp)
    @staticmethod
    cdef wrap(const shared_ptr[CFileWriteOptions]& sp)
    cdef inline shared_ptr[CFileWriteOptions] unwrap(self)
cdef class Fragment(_Weakrefable):
    cdef:
        SharedPtrNoGIL[CFragment] wrapped
        CFragment* fragment
    cdef void init(self, const shared_ptr[CFragment]& sp)
    @staticmethod
    cdef wrap(const shared_ptr[CFragment]& sp)
    cdef inline shared_ptr[CFragment] unwrap(self)
cdef class FileFragment(Fragment):
    # Fragment backed by a single file; adds a typed pointer to the subclass.
    cdef:
        CFileFragment* file_fragment
    cdef void init(self, const shared_ptr[CFragment]& sp)
cdef class Partitioning(_Weakrefable):
    cdef:
        shared_ptr[CPartitioning] wrapped
        CPartitioning* partitioning
    cdef init(self, const shared_ptr[CPartitioning]& sp)
    @staticmethod
    cdef wrap(const shared_ptr[CPartitioning]& sp)
    cdef inline shared_ptr[CPartitioning] unwrap(self)
cdef class PartitioningFactory(_Weakrefable):
    cdef:
        shared_ptr[CPartitioningFactory] wrapped
        CPartitioningFactory* factory
        # Python callable and options stored with the factory — presumably
        # used to re-create it (e.g. for pickling); see the .pyx file.
        object constructor
        object options
    cdef init(self, const shared_ptr[CPartitioningFactory]& sp)
    @staticmethod
    cdef wrap(const shared_ptr[CPartitioningFactory]& sp,
              object constructor, object options)
    cdef inline shared_ptr[CPartitioningFactory] unwrap(self)
cdef class WrittenFile(_Weakrefable):
    # The full path to the created file
    cdef public str path
    # Optional Parquet metadata
    # This metadata will have the file path attribute set to the path of
    # the written file.
    cdef public object metadata
    # The size of the file in bytes
    cdef public int64_t size

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,51 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
"""Dataset support for ORC file format."""
from pyarrow.lib cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_dataset cimport *
from pyarrow._dataset cimport FileFormat
cdef class OrcFileFormat(FileFormat):
    """FileFormat for reading ORC files in a Dataset."""

    def __init__(self):
        # Wrap a freshly constructed C++ COrcFileFormat; the ORC format
        # carries no configurable options.
        self.init(shared_ptr[CFileFormat](new COrcFileFormat()))

    def equals(self, OrcFileFormat other):
        """
        Parameters
        ----------
        other : pyarrow.dataset.OrcFileFormat

        Returns
        -------
        True
        """
        # The format is stateless, so any two instances are interchangeable.
        return True

    @property
    def default_extname(self):
        return "orc"

    def __reduce__(self):
        # Stateless: pickling only needs the class itself.
        return OrcFileFormat, ()

View File

@@ -0,0 +1,43 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
"""Dataset support for Parquet file format."""
from pyarrow.includes.libarrow_dataset cimport *
from pyarrow.includes.libarrow_dataset_parquet cimport *
from pyarrow._dataset cimport FragmentScanOptions, FileWriteOptions
cdef class ParquetFragmentScanOptions(FragmentScanOptions):
    # Parquet-specific scan options; decryption settings are tracked on the
    # Python side in addition to the wrapped C++ options.
    cdef:
        CParquetFragmentScanOptions* parquet_options
        object _parquet_decryption_config
        object _decryption_properties
    cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp)
    # Accessors into the underlying Parquet reader properties.
    cdef CReaderProperties* reader_properties(self)
    cdef ArrowReaderProperties* arrow_reader_properties(self)
cdef class ParquetFileWriteOptions(FileWriteOptions):
    cdef:
        CParquetFileWriteOptions* parquet_options
        # Python-side copy of the writer properties — exact use lives in the
        # .pyx implementation.
        object _properties

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,178 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
"""Dataset support for Parquet encryption."""
from pyarrow.includes.libarrow_dataset_parquet cimport *
from pyarrow._parquet_encryption cimport *
from pyarrow._dataset_parquet cimport ParquetFragmentScanOptions, ParquetFileWriteOptions
cdef class ParquetEncryptionConfig(_Weakrefable):
    """
    Core configuration class encapsulating parameters for high-level encryption
    within the Parquet framework.
    The ParquetEncryptionConfig class serves as a bridge for passing encryption-related
    parameters to the appropriate components within the Parquet library. It maintains references
    to objects that define the encryption strategy, Key Management Service (KMS) configuration,
    and specific encryption configurations for Parquet data.
    Parameters
    ----------
    crypto_factory : pyarrow.parquet.encryption.CryptoFactory
        Shared pointer to a `CryptoFactory` object. The `CryptoFactory` is responsible for
        creating cryptographic components, such as encryptors and decryptors.
    kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig
        Shared pointer to a `KmsConnectionConfig` object. This object holds the configuration
        parameters necessary for connecting to a Key Management Service (KMS).
    encryption_config : pyarrow.parquet.encryption.EncryptionConfiguration
        Shared pointer to an `EncryptionConfiguration` object. This object defines specific
        encryption settings for Parquet data, including the keys assigned to different columns.
    Raises
    ------
    ValueError
        Raised if `crypto_factory`, `kms_connection_config`, or
        `encryption_config` is None.
    """
    cdef:
        shared_ptr[CParquetEncryptionConfig] c_config
    # Avoid mistakenly creating attributes
    __slots__ = ()
    def __cinit__(self, CryptoFactory crypto_factory, KmsConnectionConfig kms_connection_config,
                  EncryptionConfiguration encryption_config):
        cdef shared_ptr[CEncryptionConfiguration] c_encryption_config
        # Validate eagerly: unwrapping None would yield null shared_ptrs
        # that only fail later, deep inside the C++ layer.
        if crypto_factory is None:
            raise ValueError("crypto_factory cannot be None")
        if kms_connection_config is None:
            raise ValueError("kms_connection_config cannot be None")
        if encryption_config is None:
            raise ValueError("encryption_config cannot be None")
        self.c_config.reset(new CParquetEncryptionConfig())
        c_encryption_config = pyarrow_unwrap_encryptionconfig(
            encryption_config)
        self.c_config.get().crypto_factory = pyarrow_unwrap_cryptofactory(crypto_factory)
        self.c_config.get().kms_connection_config = pyarrow_unwrap_kmsconnectionconfig(
            kms_connection_config)
        self.c_config.get().encryption_config = c_encryption_config
    @staticmethod
    cdef wrap(shared_ptr[CParquetEncryptionConfig] c_config):
        # Adopt an existing C++ config without re-running validation.
        cdef ParquetEncryptionConfig python_config = ParquetEncryptionConfig.__new__(ParquetEncryptionConfig)
        python_config.c_config = c_config
        return python_config
    cdef shared_ptr[CParquetEncryptionConfig] unwrap(self):
        # Expose the shared_ptr for consumers such as set_encryption_config().
        return self.c_config
cdef class ParquetDecryptionConfig(_Weakrefable):
    """
    Core configuration class encapsulating parameters for high-level decryption
    within the Parquet framework.
    ParquetDecryptionConfig is designed to pass decryption-related parameters to
    the appropriate decryption components within the Parquet library. It holds references to
    objects that define the decryption strategy, Key Management Service (KMS) configuration,
    and specific decryption configurations for reading encrypted Parquet data.
    Parameters
    ----------
    crypto_factory : pyarrow.parquet.encryption.CryptoFactory
        Shared pointer to a `CryptoFactory` object, pivotal in creating cryptographic
        components for the decryption process.
    kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig
        Shared pointer to a `KmsConnectionConfig` object, containing parameters necessary
        for connecting to a Key Management Service (KMS) during decryption.
    decryption_config : pyarrow.parquet.encryption.DecryptionConfiguration
        Shared pointer to a `DecryptionConfiguration` object, specifying decryption settings
        for reading encrypted Parquet data.
    Raises
    ------
    ValueError
        Raised if `crypto_factory`, `kms_connection_config`, or
        `decryption_config` is None.
    """
    cdef:
        shared_ptr[CParquetDecryptionConfig] c_config
    # Avoid mistakenly creating attributes
    __slots__ = ()
    def __cinit__(self, CryptoFactory crypto_factory, KmsConnectionConfig kms_connection_config,
                  DecryptionConfiguration decryption_config):
        cdef shared_ptr[CDecryptionConfiguration] c_decryption_config
        # Validate all three inputs up front, mirroring ParquetEncryptionConfig:
        # unwrapping None would produce null shared_ptrs that only fail later,
        # deep inside the C++ layer.
        if crypto_factory is None:
            raise ValueError("crypto_factory cannot be None")
        if kms_connection_config is None:
            raise ValueError("kms_connection_config cannot be None")
        if decryption_config is None:
            raise ValueError(
                "decryption_config cannot be None")
        self.c_config.reset(new CParquetDecryptionConfig())
        c_decryption_config = pyarrow_unwrap_decryptionconfig(
            decryption_config)
        self.c_config.get().crypto_factory = pyarrow_unwrap_cryptofactory(crypto_factory)
        self.c_config.get().kms_connection_config = pyarrow_unwrap_kmsconnectionconfig(
            kms_connection_config)
        self.c_config.get().decryption_config = c_decryption_config
    @staticmethod
    cdef wrap(shared_ptr[CParquetDecryptionConfig] c_config):
        # Adopt an existing C++ config without re-running validation.
        cdef ParquetDecryptionConfig python_config = ParquetDecryptionConfig.__new__(ParquetDecryptionConfig)
        python_config.c_config = c_config
        return python_config
    cdef shared_ptr[CParquetDecryptionConfig] unwrap(self):
        # Expose the shared_ptr for consumers such as set_decryption_config().
        return self.c_config
def set_encryption_config(
    ParquetFileWriteOptions opts not None,
    ParquetEncryptionConfig config not None
):
    """Attach a high-level Parquet encryption config to dataset write options."""
    # Store the shared_ptr on the C++ write options; the C++ layer consumes
    # it when writing files.
    cdef shared_ptr[CParquetEncryptionConfig] c_config = config.unwrap()
    opts.parquet_options.parquet_encryption_config = c_config
def set_decryption_properties(
    ParquetFragmentScanOptions opts not None,
    FileDecryptionProperties config not None
):
    """Attach file decryption properties to Parquet fragment scan options."""
    # The reader properties object is mutated in place through a raw pointer.
    cdef CReaderProperties* reader_props = opts.reader_properties()
    reader_props.file_decryption_properties(config.unwrap())
def set_decryption_config(
    ParquetFragmentScanOptions opts not None,
    ParquetDecryptionConfig config not None
):
    """Attach a dataset decryption configuration to Parquet fragment scan options."""
    cdef shared_ptr[CParquetDecryptionConfig] c_decryption_config = config.unwrap()
    opts.parquet_options.parquet_decryption_config = c_decryption_config

View File

@@ -0,0 +1,46 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
cimport cpython
from cpython.pycapsule cimport PyCapsule_New
cdef void dlpack_pycapsule_deleter(object dltensor) noexcept:
    """PyCapsule destructor for DLPack tensors.

    Frees the wrapped DLManagedTensor unless the capsule was renamed to
    "used_dltensor" by a consumer; per the DLPack protocol a consumed
    capsule must not be freed by the producer.
    """
    cdef DLManagedTensor* dlm_tensor
    cdef PyObject* err_type
    cdef PyObject* err_value
    cdef PyObject* err_traceback

    # Do nothing if the capsule has been consumed
    if cpython.PyCapsule_IsValid(dltensor, "used_dltensor"):
        return

    # An exception may be in-flight, we must save it in case
    # we create another one
    cpython.PyErr_Fetch(&err_type, &err_value, &err_traceback)

    dlm_tensor = <DLManagedTensor*>cpython.PyCapsule_GetPointer(dltensor, 'dltensor')
    if dlm_tensor == NULL:
        # GetPointer raised; we are in a destructor and cannot propagate
        # exceptions, so report it as unraisable.
        cpython.PyErr_WriteUnraisable(dltensor)
    # The deleter can be NULL if there is no way for the caller
    # to provide a reasonable destructor
    elif dlm_tensor.deleter:
        dlm_tensor.deleter(dlm_tensor)
        assert (not cpython.PyErr_Occurred())

    # Set the error indicator from err_type, err_value, err_traceback
    cpython.PyErr_Restore(err_type, err_value, err_traceback)

View File

@@ -0,0 +1,117 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# ---------------------------------------------------------------------
# Implement Feather file format
# cython: profile=False
# distutils: language = c++
# cython: language_level=3
from cython.operator cimport dereference as deref
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_feather cimport *
from pyarrow.lib cimport (check_status, Table, _Weakrefable,
get_writer, get_reader, pyarrow_wrap_table)
from pyarrow.lib import tobytes
class FeatherError(Exception):
    """Error raised for Feather-format-specific failures."""
    pass
def write_feather(Table table, object dest, compression=None,
                  compression_level=None, chunksize=None, version=2):
    """
    Write a Table to a Feather file.

    Parameters
    ----------
    table : Table
        Data to write.
    dest : str, path or file-like object
        Destination; anything accepted by ``get_writer``.
    compression : str, default None
        'zstd' or 'lz4'. Any other value (including None) results in
        uncompressed output.
    compression_level : int, default None
        Codec-specific level; the C++ default is used when None.
    chunksize : int, default None
        Rows per chunk; the C++ default is used when None.
    version : int, default 2
        Feather format version; 2 selects the V2 (Arrow IPC) format,
        any other value falls back to the legacy V1 format.
    """
    cdef shared_ptr[COutputStream] sink
    get_writer(dest, &sink)

    cdef CFeatherProperties properties
    if version == 2:
        properties.version = kFeatherV2Version
    else:
        properties.version = kFeatherV1Version

    # NOTE(review): unrecognized compression strings silently fall back to
    # uncompressed here; presumably validated by the caller — confirm.
    if compression == 'zstd':
        properties.compression = CCompressionType_ZSTD
    elif compression == 'lz4':
        properties.compression = CCompressionType_LZ4_FRAME
    else:
        properties.compression = CCompressionType_UNCOMPRESSED

    if chunksize is not None:
        properties.chunksize = chunksize

    if compression_level is not None:
        properties.compression_level = compression_level

    # Release the GIL for the actual serialization work.
    with nogil:
        check_status(WriteFeather(deref(table.table), sink.get(),
                                  properties))
cdef class FeatherReader(_Weakrefable):
    """Reads a Feather file (V1 or V2/Arrow IPC) into Arrow Tables."""
    cdef:
        shared_ptr[CFeatherReader] reader

    def __cinit__(self, source, c_bool use_memory_map, c_bool use_threads):
        """
        Open `source` (path or file-like object, per ``get_reader``).

        use_memory_map toggles memory-mapping of on-disk sources;
        use_threads toggles multi-threaded deserialization.
        """
        cdef:
            shared_ptr[CRandomAccessFile] reader
            CIpcReadOptions options = CIpcReadOptions.Defaults()
        options.use_threads = use_threads

        get_reader(source, use_memory_map, &reader)
        with nogil:
            self.reader = GetResultValue(CFeatherReader.Open(reader, options))

    @property
    def version(self):
        # Feather format version (1 or 2) as reported by the C++ reader.
        return self.reader.get().version()

    def read(self):
        """Read the entire file as a pyarrow.Table."""
        cdef shared_ptr[CTable] sp_table
        with nogil:
            check_status(self.reader.get()
                         .Read(&sp_table))
        return pyarrow_wrap_table(sp_table)

    def read_indices(self, indices):
        """Read only the columns at the given integer positions."""
        cdef:
            shared_ptr[CTable] sp_table
            vector[int] c_indices
        for index in indices:
            c_indices.push_back(index)
        with nogil:
            check_status(self.reader.get()
                         .Read(c_indices, &sp_table))
        return pyarrow_wrap_table(sp_table)

    def read_names(self, names):
        """Read only the columns with the given names."""
        cdef:
            shared_ptr[CTable] sp_table
            vector[c_string] c_names
        for name in names:
            c_names.push_back(tobytes(name))
        with nogil:
            check_status(self.reader.get()
                         .Read(c_names, &sp_table))
        return pyarrow_wrap_table(sp_table)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,91 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow_fs cimport *
from pyarrow.lib import _detect_compression, frombytes, tobytes
from pyarrow.lib cimport *
cpdef enum FileType:
    # Python-visible mirror of the C++ arrow::fs::FileType enum.
    NotFound = <int8_t> CFileType_NotFound
    Unknown = <int8_t> CFileType_Unknown
    File = <int8_t> CFileType_File
    Directory = <int8_t> CFileType_Directory
cdef class FileInfo(_Weakrefable):
    # Wraps a C++ CFileInfo by value.
    cdef:
        CFileInfo info

    @staticmethod
    cdef wrap(CFileInfo info)

    cdef inline CFileInfo unwrap(self) nogil

    # Accepts any object; validates/converts before unwrapping.
    @staticmethod
    cdef CFileInfo unwrap_safe(obj)
cdef class FileSelector(_Weakrefable):
    # Wraps a C++ CFileSelector by value.
    cdef:
        CFileSelector selector

    @staticmethod
    cdef FileSelector wrap(CFileSelector selector)

    cdef inline CFileSelector unwrap(self) nogil
cdef class FileSystem(_Weakrefable):
    # Base wrapper: owns the shared_ptr and keeps a raw pointer for
    # cheap typed access.
    cdef:
        shared_ptr[CFileSystem] wrapped
        CFileSystem* fs

    cdef init(self, const shared_ptr[CFileSystem]& wrapped)

    # Constructs the appropriate Python subclass for a C++ filesystem.
    @staticmethod
    cdef wrap(const shared_ptr[CFileSystem]& sp)

    cdef inline shared_ptr[CFileSystem] unwrap(self) nogil
cdef class LocalFileSystem(FileSystem):
    # No extra state; overrides init only.
    cdef init(self, const shared_ptr[CFileSystem]& wrapped)
cdef class SubTreeFileSystem(FileSystem):
    # Caches a typed pointer to the C++ subtree filesystem.
    cdef:
        CSubTreeFileSystem* subtreefs

    cdef init(self, const shared_ptr[CFileSystem]& wrapped)
cdef class _MockFileSystem(FileSystem):
    # In-memory filesystem for testing; caches a typed pointer.
    cdef:
        CMockFileSystem* mockfs

    cdef init(self, const shared_ptr[CFileSystem]& wrapped)
cdef class PyFileSystem(FileSystem):
    # Filesystem implemented by a Python handler object; caches a
    # typed pointer to the C++ bridge.
    cdef:
        CPyFileSystem* pyfs

    cdef init(self, const shared_ptr[CFileSystem]& wrapped)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,209 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from pyarrow.lib cimport (pyarrow_wrap_metadata,
pyarrow_unwrap_metadata)
from pyarrow.lib import frombytes, tobytes, ensure_metadata
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_fs cimport *
from pyarrow._fs cimport FileSystem, TimePoint_to_ns, PyDateTime_to_TimePoint
from datetime import datetime, timedelta, timezone
cdef class GcsFileSystem(FileSystem):
    """
    Google Cloud Storage (GCS) backed FileSystem implementation

    By default uses the process described in https://google.aip.dev/auth/4110
    to resolve credentials. If not running on Google Cloud Platform (GCP),
    this generally requires the environment variable
    GOOGLE_APPLICATION_CREDENTIALS to point to a JSON file
    containing credentials.

    Note: GCS buckets are special and the operations available on them may be
    limited or more expensive than expected compared to local file systems.

    Note: When pickling a GcsFileSystem that uses default credentials, resolution
    credentials are not stored in the serialized data. Therefore, when unpickling
    it is assumed that the necessary credentials are in place for the target
    process.

    Parameters
    ----------
    anonymous : boolean, default False
        Whether to connect anonymously.
        If true, will not attempt to look up credentials using standard GCP
        configuration methods.
    access_token : str, default None
        GCP access token. If provided, temporary credentials will be fetched by
        assuming this role; also, a `credential_token_expiration` must be
        specified as well.
    target_service_account : str, default None
        An optional service account to try to impersonate when accessing GCS. This
        requires the specified credential user or service account to have the necessary
        permissions.
    credential_token_expiration : datetime, default None
        Expiration for credential generated with an access token. Must be specified
        if `access_token` is specified.
    default_bucket_location : str, default 'US'
        GCP region to create buckets in.
    scheme : str, default 'https'
        GCS connection transport scheme.
    endpoint_override : str, default None
        Override endpoint with a connect string such as "localhost:9000"
    default_metadata : mapping or pyarrow.KeyValueMetadata, default None
        Default metadata for `open_output_stream`. This will be ignored if
        non-empty metadata is passed to `open_output_stream`.
    retry_time_limit : timedelta, default None
        Set the maximum amount of time the GCS client will attempt to retry
        transient errors. Subsecond granularity is ignored.
    project_id : str, default None
        The GCP project identifier to use for creating buckets.
        If not set, the library uses the GOOGLE_CLOUD_PROJECT environment
        variable. Most I/O operations do not need a project id, only applications
        that create new buckets need a project id.
    """
    cdef:
        CGcsFileSystem* gcsfs

    def __init__(self, *, bint anonymous=False, access_token=None,
                 target_service_account=None, credential_token_expiration=None,
                 default_bucket_location='US',
                 scheme=None,
                 endpoint_override=None,
                 default_metadata=None,
                 retry_time_limit=None,
                 project_id=None):
        cdef:
            CGcsOptions options
            shared_ptr[CGcsFileSystem] wrapped
            double time_limit_seconds

        # Intentional use of truthiness because empty strings aren't valid and
        # for reconstruction from pickling will give empty strings.
        if anonymous and (target_service_account or access_token):
            raise ValueError(
                'anonymous option is not compatible with target_service_account and '
                'access_token'
            )
        elif bool(access_token) != bool(credential_token_expiration):
            raise ValueError(
                'access_token and credential_token_expiration must be '
                'specified together'
            )
        elif anonymous:
            options = CGcsOptions.Anonymous()
        elif access_token:
            if not isinstance(credential_token_expiration, datetime):
                raise ValueError(
                    "credential_token_expiration must be a datetime")
            options = CGcsOptions.FromAccessToken(
                tobytes(access_token),
                PyDateTime_to_TimePoint(<PyDateTime_DateTime*>credential_token_expiration))
        else:
            options = CGcsOptions.Defaults()

        # Target service account requires base credentials so
        # it is not part of the if/else chain above which only
        # handles base credentials.
        if target_service_account:
            options = CGcsOptions.FromImpersonatedServiceAccount(
                options.credentials, tobytes(target_service_account))

        options.default_bucket_location = tobytes(default_bucket_location)

        if scheme is not None:
            options.scheme = tobytes(scheme)
        if endpoint_override is not None:
            options.endpoint_override = tobytes(endpoint_override)
        if default_metadata is not None:
            options.default_metadata = pyarrow_unwrap_metadata(
                ensure_metadata(default_metadata))
        if retry_time_limit is not None:
            # Subsecond granularity is dropped by the C++ layer (see docstring).
            time_limit_seconds = retry_time_limit.total_seconds()
            options.retry_limit_seconds = time_limit_seconds
        if project_id is not None:
            options.project_id = <c_string>tobytes(project_id)

        with nogil:
            wrapped = GetResultValue(CGcsFileSystem.Make(options))

        self.init(<shared_ptr[CFileSystem]> wrapped)

    cdef init(self, const shared_ptr[CFileSystem]& wrapped):
        # Cache a typed pointer alongside the base-class shared_ptr.
        FileSystem.init(self, wrapped)
        self.gcsfs = <CGcsFileSystem*> wrapped.get()

    def _expiration_datetime_from_options(self):
        # An expiration of 0 ns means "no expiration recorded"; return None.
        expiration_ns = TimePoint_to_ns(
            self.gcsfs.options().credentials.expiration())
        if expiration_ns == 0:
            return None
        return datetime.fromtimestamp(expiration_ns / 1.0e9, timezone.utc)

    @staticmethod
    def _reconstruct(kwargs):
        # __reduce__ doesn't allow passing named arguments directly to the
        # reconstructor, hence this wrapper.
        return GcsFileSystem(**kwargs)

    def __reduce__(self):
        # Serialize by extracting constructor arguments back out of the
        # C++ options (see class docstring for the caveat on default
        # credentials).
        cdef CGcsOptions opts = self.gcsfs.options()
        service_account = frombytes(opts.credentials.target_service_account())
        expiration_dt = self._expiration_datetime_from_options()
        retry_time_limit = None
        if opts.retry_limit_seconds.has_value():
            retry_time_limit = timedelta(
                seconds=opts.retry_limit_seconds.value())
        project_id = None
        if opts.project_id.has_value():
            project_id = frombytes(opts.project_id.value())
        return (
            GcsFileSystem._reconstruct, (dict(
                access_token=frombytes(opts.credentials.access_token()),
                anonymous=opts.credentials.anonymous(),
                credential_token_expiration=expiration_dt,
                target_service_account=service_account,
                scheme=frombytes(opts.scheme),
                endpoint_override=frombytes(opts.endpoint_override),
                default_bucket_location=frombytes(
                    opts.default_bucket_location),
                default_metadata=pyarrow_wrap_metadata(opts.default_metadata),
                retry_time_limit=retry_time_limit,
                project_id=project_id
            ),))

    @property
    def default_bucket_location(self):
        """
        The GCP location this filesystem will write to.
        """
        return frombytes(self.gcsfs.options().default_bucket_location)

    @property
    def project_id(self):
        """
        The GCP project id this filesystem will use.
        """
        # Implicitly returns None when no project id was configured.
        if self.gcsfs.options().project_id.has_value():
            return frombytes(self.gcsfs.options().project_id.value())

View File

@@ -0,0 +1,34 @@
# file generated by setuptools-scm
# don't change, don't track in version control
__all__ = [
"__version__",
"__version_tuple__",
"version",
"version_tuple",
"__commit_id__",
"commit_id",
]
TYPE_CHECKING = False
if TYPE_CHECKING:
from typing import Tuple
from typing import Union
VERSION_TUPLE = Tuple[Union[int, str], ...]
COMMIT_ID = Union[str, None]
else:
VERSION_TUPLE = object
COMMIT_ID = object
version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID
__version__ = version = '22.0.0'
__version_tuple__ = version_tuple = (22, 0, 0)
__commit_id__ = commit_id = None

View File

@@ -0,0 +1,157 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_fs cimport *
from pyarrow._fs cimport FileSystem
from pyarrow.lib import frombytes, tobytes
from pyarrow.util import _stringify_path
cdef class HadoopFileSystem(FileSystem):
    """
    HDFS backed FileSystem implementation

    Parameters
    ----------
    host : str
        HDFS host to connect to. Set to "default" for fs.defaultFS from
        core-site.xml.
    port : int, default 8020
        HDFS port to connect to. Set to 0 for default or logical (HA) nodes.
    user : str, default None
        Username when connecting to HDFS; None implies login user.
    replication : int, default 3
        Number of copies each block will have.
    buffer_size : int, default 0
        If 0, no buffering will happen otherwise the size of the temporary read
        and write buffer.
    default_block_size : int, default None
        None means the default configuration for HDFS, a typical block size is
        128 MB.
    kerb_ticket : string or path, default None
        If not None, the path to the Kerberos ticket cache.
    extra_conf : dict, default None
        Extra key/value pairs for configuration; will override any
        hdfs-site.xml properties.

    Examples
    --------
    >>> from pyarrow import fs
    >>> hdfs = fs.HadoopFileSystem(host, port, user=user, kerb_ticket=ticket_cache_path) # doctest: +SKIP

    For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`.
    """
    cdef:
        CHadoopFileSystem* hdfs

    def __init__(self, str host, int port=8020, *, str user=None,
                 int replication=3, int buffer_size=0,
                 default_block_size=None, kerb_ticket=None,
                 extra_conf=None):
        cdef:
            CHdfsOptions options
            shared_ptr[CHadoopFileSystem] wrapped

        # Prefix bare hostnames with hdfs:// ("default" is passed through).
        if not host.startswith(('hdfs://', 'viewfs://')) and host != "default":
            # TODO(kszucs): do more sanitization
            host = f'hdfs://{host}'

        options.ConfigureEndPoint(tobytes(host), int(port))
        options.ConfigureReplication(replication)
        options.ConfigureBufferSize(buffer_size)

        if user is not None:
            options.ConfigureUser(tobytes(user))
        if default_block_size is not None:
            options.ConfigureBlockSize(default_block_size)
        if kerb_ticket is not None:
            options.ConfigureKerberosTicketCachePath(
                tobytes(_stringify_path(kerb_ticket)))
        if extra_conf is not None:
            for k, v in extra_conf.items():
                options.ConfigureExtraConf(tobytes(k), tobytes(v))

        with nogil:
            wrapped = GetResultValue(CHadoopFileSystem.Make(options))
        self.init(<shared_ptr[CFileSystem]> wrapped)

    cdef init(self, const shared_ptr[CFileSystem]& wrapped):
        # Cache a typed pointer alongside the base-class shared_ptr.
        FileSystem.init(self, wrapped)
        self.hdfs = <CHadoopFileSystem*> wrapped.get()

    @staticmethod
    def from_uri(uri):
        """
        Instantiate HadoopFileSystem object from an URI string.

        The following two calls are equivalent

        * ``HadoopFileSystem.from_uri('hdfs://localhost:8020/?user=test\
&replication=1')``
        * ``HadoopFileSystem('localhost', port=8020, user='test', \
replication=1)``

        Parameters
        ----------
        uri : str
            A string URI describing the connection to HDFS.
            In order to change the user, replication, buffer_size or
            default_block_size pass the values as query parts.

        Returns
        -------
        HadoopFileSystem
        """
        cdef:
            HadoopFileSystem self = HadoopFileSystem.__new__(HadoopFileSystem)
            shared_ptr[CHadoopFileSystem] wrapped
            CHdfsOptions options

        options = GetResultValue(CHdfsOptions.FromUriString(tobytes(uri)))
        with nogil:
            wrapped = GetResultValue(CHadoopFileSystem.Make(options))

        self.init(<shared_ptr[CFileSystem]> wrapped)
        return self

    @staticmethod
    def _reconstruct(kwargs):
        # __reduce__ doesn't allow passing named arguments directly to the
        # reconstructor, hence this wrapper.
        return HadoopFileSystem(**kwargs)

    def __reduce__(self):
        # Serialize by extracting constructor arguments back out of the
        # C++ options.
        cdef CHdfsOptions opts = self.hdfs.options()
        return (
            HadoopFileSystem._reconstruct, (dict(
                host=frombytes(opts.connection_config.host),
                port=opts.connection_config.port,
                user=frombytes(opts.connection_config.user),
                replication=opts.replication,
                buffer_size=opts.buffer_size,
                default_block_size=opts.default_block_size,
                kerb_ticket=frombytes(opts.connection_config.kerb_ticket),
                extra_conf={frombytes(k): frombytes(v)
                            for k, v in opts.connection_config.extra_conf},
            ),)
        )

View File

@@ -0,0 +1,36 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport _Weakrefable
cdef class ParseOptions(_Weakrefable):
    # Owns a C++ CJSONParseOptions by value; wrap() copies one in.
    cdef:
        CJSONParseOptions options

    @staticmethod
    cdef ParseOptions wrap(CJSONParseOptions options)
cdef class ReadOptions(_Weakrefable):
    # Owns a C++ CJSONReadOptions by value; wrap() copies one in.
    cdef:
        CJSONReadOptions options

    @staticmethod
    cdef ReadOptions wrap(CJSONReadOptions options)

View File

@@ -0,0 +1,386 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: profile=False
# distutils: language = c++
# cython: language_level = 3
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport (_Weakrefable, Schema,
RecordBatchReader, MemoryPool,
maybe_unbox_memory_pool,
get_input_stream, pyarrow_wrap_table,
pyarrow_wrap_schema, pyarrow_unwrap_schema)
cdef class ReadOptions(_Weakrefable):
    """
    Options for reading JSON files.

    Parameters
    ----------
    use_threads : bool, optional (default True)
        Whether to use multiple threads to accelerate reading
    block_size : int, optional
        How much bytes to process at a time from the input stream.
        This will determine multi-threading granularity as well as
        the size of individual chunks in the Table.
    """
    # Avoid mistakenly creating attributes
    __slots__ = ()

    def __init__(self, use_threads=None, block_size=None):
        # Start from the C++ defaults and override only what was given.
        self.options = CJSONReadOptions.Defaults()
        if use_threads is not None:
            self.use_threads = use_threads
        if block_size is not None:
            self.block_size = block_size

    @property
    def use_threads(self):
        """
        Whether to use multiple threads to accelerate reading.
        """
        return self.options.use_threads

    @use_threads.setter
    def use_threads(self, value):
        self.options.use_threads = value

    @property
    def block_size(self):
        """
        How much bytes to process at a time from the input stream.

        This will determine multi-threading granularity as well as the size of
        individual chunks in the Table.
        """
        return self.options.block_size

    @block_size.setter
    def block_size(self, value):
        self.options.block_size = value

    def __reduce__(self):
        # Pickle via the constructor arguments.
        return ReadOptions, (
            self.use_threads,
            self.block_size
        )

    def equals(self, ReadOptions other):
        """
        Parameters
        ----------
        other : pyarrow.json.ReadOptions

        Returns
        -------
        bool
        """
        return (
            self.use_threads == other.use_threads and
            self.block_size == other.block_size
        )

    def __eq__(self, other):
        try:
            return self.equals(other)
        except TypeError:
            # `other` is not a ReadOptions instance.
            return False

    @staticmethod
    cdef ReadOptions wrap(CJSONReadOptions options):
        out = ReadOptions()
        out.options = options  # shallow copy
        return out
cdef class ParseOptions(_Weakrefable):
    """
    Options for parsing JSON files.

    Parameters
    ----------
    explicit_schema : Schema, optional (default None)
        Optional explicit schema (no type inference, ignores other fields).
    newlines_in_values : bool, optional (default False)
        Whether objects may be printed across multiple lines (for example
        pretty printed). If false, input must end with an empty line.
    unexpected_field_behavior : str, default "infer"
        How JSON fields outside of explicit_schema (if given) are treated.
        Possible behaviors:

        - "ignore": unexpected JSON fields are ignored
        - "error": error out on unexpected JSON fields
        - "infer": unexpected JSON fields are type-inferred and included in
          the output
    """
    # Avoid mistakenly creating attributes
    __slots__ = ()

    def __init__(self, explicit_schema=None, newlines_in_values=None,
                 unexpected_field_behavior=None):
        # Start from the C++ defaults and override only what was given.
        self.options = CJSONParseOptions.Defaults()
        if explicit_schema is not None:
            self.explicit_schema = explicit_schema
        if newlines_in_values is not None:
            self.newlines_in_values = newlines_in_values
        if unexpected_field_behavior is not None:
            self.unexpected_field_behavior = unexpected_field_behavior

    def __reduce__(self):
        # Pickle via the constructor arguments.
        return ParseOptions, (
            self.explicit_schema,
            self.newlines_in_values,
            self.unexpected_field_behavior
        )

    @property
    def explicit_schema(self):
        """
        Optional explicit schema (no type inference, ignores other fields)
        """
        if self.options.explicit_schema.get() == NULL:
            return None
        else:
            return pyarrow_wrap_schema(self.options.explicit_schema)

    @explicit_schema.setter
    def explicit_schema(self, value):
        self.options.explicit_schema = pyarrow_unwrap_schema(value)

    @property
    def newlines_in_values(self):
        """
        Whether newline characters are allowed in JSON values.

        Setting this to True reduces the performance of multi-threaded
        JSON reading.
        """
        return self.options.newlines_in_values

    @newlines_in_values.setter
    def newlines_in_values(self, value):
        self.options.newlines_in_values = value

    @property
    def unexpected_field_behavior(self):
        """
        How JSON fields outside of explicit_schema (if given) are treated.

        Possible behaviors:

        - "ignore": unexpected JSON fields are ignored
        - "error": error out on unexpected JSON fields
        - "infer": unexpected JSON fields are type-inferred and included in
          the output

        Set to "infer" by default.
        """
        # Map the C++ enum back to its string spelling.
        v = self.options.unexpected_field_behavior
        if v == CUnexpectedFieldBehavior_Ignore:
            return "ignore"
        elif v == CUnexpectedFieldBehavior_Error:
            return "error"
        elif v == CUnexpectedFieldBehavior_InferType:
            return "infer"
        else:
            raise ValueError('Unexpected value for unexpected_field_behavior')

    @unexpected_field_behavior.setter
    def unexpected_field_behavior(self, value):
        cdef CUnexpectedFieldBehavior v

        if value == "ignore":
            v = CUnexpectedFieldBehavior_Ignore
        elif value == "error":
            v = CUnexpectedFieldBehavior_Error
        elif value == "infer":
            v = CUnexpectedFieldBehavior_InferType
        else:
            raise ValueError(
                f"Unexpected value `{value}` for `unexpected_field_behavior`, pass "
                f"either `ignore`, `error` or `infer`."
            )

        self.options.unexpected_field_behavior = v

    def equals(self, ParseOptions other):
        """
        Parameters
        ----------
        other : pyarrow.json.ParseOptions

        Returns
        -------
        bool
        """
        return (
            self.explicit_schema == other.explicit_schema and
            self.newlines_in_values == other.newlines_in_values and
            self.unexpected_field_behavior == other.unexpected_field_behavior
        )

    def __eq__(self, other):
        try:
            return self.equals(other)
        except TypeError:
            # `other` is not a ParseOptions instance.
            return False

    @staticmethod
    cdef ParseOptions wrap(CJSONParseOptions options):
        out = ParseOptions()
        out.options = options  # shallow copy
        return out
cdef _get_reader(input_file, shared_ptr[CInputStream]* out):
    # JSON reading never memory-maps its input.
    get_input_stream(input_file, False, out)
cdef _get_read_options(ReadOptions read_options, CJSONReadOptions* out):
    # Copy the user-supplied options, falling back to the C++ defaults.
    if read_options is not None:
        out[0] = read_options.options
    else:
        out[0] = CJSONReadOptions.Defaults()
cdef _get_parse_options(ParseOptions parse_options, CJSONParseOptions* out):
    # Copy the user-supplied options, falling back to the C++ defaults.
    if parse_options is not None:
        out[0] = parse_options.options
    else:
        out[0] = CJSONParseOptions.Defaults()
cdef class JSONStreamingReader(RecordBatchReader):
    """An object that reads record batches incrementally from a JSON file.

    Should not be instantiated directly by user code.
    """
    cdef readonly:
        Schema schema

    def __init__(self):
        raise TypeError(f"Do not call {self.__class__.__name__}'s "
                        "constructor directly, "
                        "use pyarrow.json.open_json() instead.")

    cdef _open(self, shared_ptr[CInputStream] stream,
               CJSONReadOptions c_read_options,
               CJSONParseOptions c_parse_options,
               MemoryPool memory_pool):
        # Builds the underlying C++ streaming reader; invoked by open_json().
        cdef:
            shared_ptr[CSchema] c_schema
            CIOContext io_context

        io_context = CIOContext(maybe_unbox_memory_pool(memory_pool))

        with nogil:
            self.reader = <shared_ptr[CRecordBatchReader]> GetResultValue(
                CJSONStreamingReader.Make(stream, move(c_read_options),
                                          move(c_parse_options), io_context))
            c_schema = self.reader.get().schema()

        # Wrapping the schema needs the GIL again.
        self.schema = pyarrow_wrap_schema(c_schema)
def read_json(input_file, read_options=None, parse_options=None,
              MemoryPool memory_pool=None):
    """
    Read a Table from a stream of JSON data.

    Parameters
    ----------
    input_file : str, path or file-like object
        The location of JSON data. Currently only the line-delimited JSON
        format is supported.
    read_options : pyarrow.json.ReadOptions, optional
        Options for the JSON reader (see ReadOptions constructor for defaults).
    parse_options : pyarrow.json.ParseOptions, optional
        Options for the JSON parser
        (see ParseOptions constructor for defaults).
    memory_pool : MemoryPool, optional
        Pool to allocate Table memory from.

    Returns
    -------
    :class:`pyarrow.Table`
        Contents of the JSON file as a in-memory table.
    """
    cdef:
        shared_ptr[CInputStream] stream
        CJSONReadOptions c_read_options
        CJSONParseOptions c_parse_options
        shared_ptr[CJSONReader] reader
        shared_ptr[CTable] table

    _get_reader(input_file, &stream)
    _get_read_options(read_options, &c_read_options)
    _get_parse_options(parse_options, &c_parse_options)

    # Reader construction happens with the GIL held; only the actual
    # parse/read below releases it.
    reader = GetResultValue(
        CJSONReader.Make(maybe_unbox_memory_pool(memory_pool),
                         stream, c_read_options, c_parse_options))

    with nogil:
        table = GetResultValue(reader.get().Read())

    return pyarrow_wrap_table(table)
def open_json(input_file, read_options=None, parse_options=None,
              MemoryPool memory_pool=None):
    """
    Open a streaming reader of JSON data.

    Reading using this function is always single-threaded.

    Parameters
    ----------
    input_file : string, path or file-like object
        The location of JSON data. If a string or path, and if it ends
        with a recognized compressed file extension (e.g. ".gz" or ".bz2"),
        the data is automatically decompressed when reading.
    read_options : pyarrow.json.ReadOptions, optional
        Options for the JSON reader (see pyarrow.json.ReadOptions constructor
        for defaults)
    parse_options : pyarrow.json.ParseOptions, optional
        Options for the JSON parser
        (see pyarrow.json.ParseOptions constructor for defaults)
    memory_pool : MemoryPool, optional
        Pool to allocate RecordBatch memory from

    Returns
    -------
    :class:`pyarrow.json.JSONStreamingReader`
    """
    cdef:
        shared_ptr[CInputStream] stream
        CJSONReadOptions c_read_options
        CJSONParseOptions c_parse_options
        JSONStreamingReader reader

    _get_reader(input_file, &stream)
    _get_read_options(read_options, &c_read_options)
    _get_parse_options(parse_options, &c_parse_options)

    # Bypass __init__ (which deliberately raises) and open the C++
    # streaming reader directly.
    reader = JSONStreamingReader.__new__(JSONStreamingReader)
    reader._open(stream, move(c_read_options), move(c_parse_options),
                 memory_pool)
    return reader

View File

@@ -0,0 +1,134 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# distutils: language = c++
# cython: language_level = 3
from libcpp cimport bool as c_bool
from libc.string cimport const_char
from libcpp.vector cimport vector as std_vector
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport (CArray, CSchema, CStatus,
CResult, CTable, CMemoryPool,
CKeyValueMetadata,
CRecordBatch,
CTable, CCompressionType,
CRandomAccessFile, COutputStream,
TimeUnit)
# Declarations mirroring the ORC writer option types in the C++ header
# arrow/adapters/orc/options.h. The quoted strings give Cython the exact
# C++ names to emit for each identifier.
cdef extern from "arrow/adapters/orc/options.h" \
        namespace "arrow::adapters::orc" nogil:

    # Whether the ORC compressor should favor speed or compression ratio.
    cdef enum CompressionStrategy \
            " arrow::adapters::orc::CompressionStrategy":
        _CompressionStrategy_SPEED \
            " arrow::adapters::orc::CompressionStrategy::kSpeed"
        _CompressionStrategy_COMPRESSION \
            " arrow::adapters::orc::CompressionStrategy::kCompression"

    # Identifier of the implementation that wrote an ORC file.
    cdef enum WriterId" arrow::adapters::orc::WriterId":
        _WriterId_ORC_JAVA_WRITER" arrow::adapters::orc::WriterId::kOrcJava"
        _WriterId_ORC_CPP_WRITER" arrow::adapters::orc::WriterId::kOrcCpp"
        _WriterId_PRESTO_WRITER" arrow::adapters::orc::WriterId::kPresto"
        _WriterId_SCRITCHLEY_GO \
            " arrow::adapters::orc::WriterId::kScritchleyGo"
        _WriterId_TRINO_WRITER" arrow::adapters::orc::WriterId::kTrino"
        _WriterId_UNKNOWN_WRITER" arrow::adapters::orc::WriterId::kUnknown"

    # Version tag of the writer that produced an ORC file.
    cdef enum WriterVersion" arrow::adapters::orc::WriterVersion":
        _WriterVersion_ORIGINAL \
            " arrow::adapters::orc::WriterVersion::kOriginal"
        _WriterVersion_HIVE_8732 \
            " arrow::adapters::orc::WriterVersion::kHive8732"
        _WriterVersion_HIVE_4243 \
            " arrow::adapters::orc::WriterVersion::kHive4243"
        _WriterVersion_HIVE_12055 \
            " arrow::adapters::orc::WriterVersion::kHive12055"
        _WriterVersion_HIVE_13083 \
            " arrow::adapters::orc::WriterVersion::kHive13083"
        _WriterVersion_ORC_101" arrow::adapters::orc::WriterVersion::kOrc101"
        _WriterVersion_ORC_135" arrow::adapters::orc::WriterVersion::kOrc135"
        _WriterVersion_ORC_517" arrow::adapters::orc::WriterVersion::kOrc517"
        _WriterVersion_ORC_203" arrow::adapters::orc::WriterVersion::kOrc203"
        _WriterVersion_ORC_14" arrow::adapters::orc::WriterVersion::kOrc14"
        _WriterVersion_MAX" arrow::adapters::orc::WriterVersion::kMax"

    # ORC file-format version (major.minor), e.g. FileVersion(0, 12).
    cdef cppclass FileVersion" arrow::adapters::orc::FileVersion":
        FileVersion(uint32_t major_version, uint32_t minor_version)
        uint32_t major_version()
        uint32_t minor_version()
        c_string ToString()

    # Writer options; mirrors the C++ WriteOptions struct field-for-field
    # (populated by _create_write_options in _orc.pyx).
    cdef struct WriteOptions" arrow::adapters::orc::WriteOptions":
        int64_t batch_size
        FileVersion file_version
        int64_t stripe_size
        CCompressionType compression
        int64_t compression_block_size
        CompressionStrategy compression_strategy
        int64_t row_index_stride
        double padding_tolerance
        double dictionary_key_size_threshold
        std_vector[int64_t] bloom_filter_columns
        double bloom_filter_fpp
# Declarations for the ORC reader/writer classes in the C++ header
# arrow/adapters/orc/adapter.h.
cdef extern from "arrow/adapters/orc/adapter.h" \
        namespace "arrow::adapters::orc" nogil:

    cdef cppclass ORCFileReader:
        # Factory: open an ORC file from a random-access handle.
        @staticmethod
        CResult[unique_ptr[ORCFileReader]] Open(
            const shared_ptr[CRandomAccessFile]& file,
            CMemoryPool* pool)

        CResult[shared_ptr[const CKeyValueMetadata]] ReadMetadata()
        CResult[shared_ptr[CSchema]] ReadSchema()
        # ReadStripe/Read overloads: the vector variants restrict the
        # result to the named columns.
        CResult[shared_ptr[CRecordBatch]] ReadStripe(int64_t stripe)
        CResult[shared_ptr[CRecordBatch]] ReadStripe(
            int64_t stripe, std_vector[c_string])
        CResult[shared_ptr[CTable]] Read()
        CResult[shared_ptr[CTable]] Read(std_vector[c_string])

        # File-level inspection accessors.
        int64_t NumberOfStripes()
        int64_t NumberOfRows()
        FileVersion GetFileVersion()
        c_string GetSoftwareVersion()
        CResult[CCompressionType] GetCompression()
        int64_t GetCompressionSize()
        int64_t GetRowIndexStride()
        WriterId GetWriterId()
        int32_t GetWriterIdValue()
        WriterVersion GetWriterVersion()
        int64_t GetNumberOfStripeStatistics()
        int64_t GetContentLength()
        int64_t GetStripeStatisticsLength()
        int64_t GetFileFooterLength()
        int64_t GetFilePostscriptLength()
        int64_t GetFileLength()
        c_string GetSerializedFileTail()

    cdef cppclass ORCFileWriter:
        # Factory: open a writer onto an output stream with given options.
        @staticmethod
        CResult[unique_ptr[ORCFileWriter]] Open(
            COutputStream* output_stream, const WriteOptions& writer_options)
        CStatus Write(const CTable& table)
        CStatus Close()

View File

@@ -0,0 +1,445 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: profile=False
# distutils: language = c++
from cython.operator cimport dereference as deref
from libcpp.vector cimport vector as std_vector
from libcpp.utility cimport move
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport (check_status, _Weakrefable,
MemoryPool, maybe_unbox_memory_pool,
pyarrow_wrap_schema,
pyarrow_wrap_batch,
Table,
pyarrow_wrap_table,
pyarrow_wrap_metadata,
pyarrow_unwrap_table,
get_reader,
get_writer)
from pyarrow.lib import frombytes, tobytes
from pyarrow.util import _stringify_path
cdef compression_type_from_enum(CCompressionType compression_type):
    # Translate an Arrow compression enum into the ORC CompressionKind name.
    names = {
        CCompressionType_UNCOMPRESSED: 'UNCOMPRESSED',
        CCompressionType_GZIP: 'ZLIB',
        CCompressionType_SNAPPY: 'SNAPPY',
        CCompressionType_LZ4: 'LZ4',
        CCompressionType_ZSTD: 'ZSTD',
    }
    try:
        return names[compression_type]
    except KeyError:
        raise ValueError('Unsupported compression')
cdef CCompressionType compression_type_from_name(name) except *:
    # Translate an ORC CompressionKind name into Arrow's compression enum.
    if not isinstance(name, str):
        raise TypeError('compression must be a string')
    name = name.upper()
    if name == 'UNCOMPRESSED':
        return CCompressionType_UNCOMPRESSED
    if name == 'ZLIB':
        return CCompressionType_GZIP
    if name == 'SNAPPY':
        return CCompressionType_SNAPPY
    if name == 'LZ4':
        return CCompressionType_LZ4
    if name == 'ZSTD':
        return CCompressionType_ZSTD
    raise ValueError(f'Unknown CompressionKind: {name}')
cdef compression_strategy_from_enum(
    CompressionStrategy compression_strategy
):
    # Translate an ORC compression-strategy enum into its symbolic name.
    if compression_strategy == _CompressionStrategy_SPEED:
        return 'SPEED'
    if compression_strategy == _CompressionStrategy_COMPRESSION:
        return 'COMPRESSION'
    raise ValueError('Unsupported compression strategy')
cdef CompressionStrategy compression_strategy_from_name(name) except *:
    # Translate a strategy name ('SPEED'/'COMPRESSION') into the ORC enum.
    if not isinstance(name, str):
        raise TypeError('compression strategy must be a string')
    name = name.upper()
    if name == 'SPEED':
        return _CompressionStrategy_SPEED
    if name == 'COMPRESSION':
        return _CompressionStrategy_COMPRESSION
    raise ValueError(f'Unknown CompressionStrategy: {name}')
cdef file_version_from_class(FileVersion file_version):
    # Render the ORC FileVersion object as a Python str.
    raw = file_version.ToString()
    return frombytes(raw)
cdef writer_id_from_enum(WriterId writer_id):
    """Translate an ORC WriterId enum value into a writer name.

    Returns 'UNKNOWN' for kUnknown so callers (see ORCReader.writer) can
    fall back to the numeric writer id; raises ValueError for values
    outside the declared enum.
    """
    writer_id_map = {
        _WriterId_ORC_JAVA_WRITER: 'ORC_JAVA',
        _WriterId_ORC_CPP_WRITER: 'ORC_CPP',
        _WriterId_PRESTO_WRITER: 'PRESTO',
        _WriterId_SCRITCHLEY_GO: 'SCRITCHLEY_GO',
        _WriterId_TRINO_WRITER: 'TRINO',
        # Bug fix: this entry was missing, so kUnknown raised ValueError
        # here and ORCReader.writer()'s 'UNKNOWN' fallback (returning the
        # raw writer id value) was unreachable.
        _WriterId_UNKNOWN_WRITER: 'UNKNOWN',
    }
    if writer_id in writer_id_map:
        return writer_id_map[writer_id]
    raise ValueError('Unsupported writer ID')
cdef writer_version_from_enum(WriterVersion writer_version):
    # Map the ORC WriterVersion enum onto its symbolic name.
    names = {
        _WriterVersion_ORIGINAL: 'ORIGINAL',
        _WriterVersion_HIVE_8732: 'HIVE_8732',
        _WriterVersion_HIVE_4243: 'HIVE_4243',
        _WriterVersion_HIVE_12055: 'HIVE_12055',
        _WriterVersion_HIVE_13083: 'HIVE_13083',
        _WriterVersion_ORC_101: 'ORC_101',
        _WriterVersion_ORC_135: 'ORC_135',
        _WriterVersion_ORC_517: 'ORC_517',
        _WriterVersion_ORC_203: 'ORC_203',
        _WriterVersion_ORC_14: 'ORC_14',
    }
    try:
        return names[writer_version]
    except KeyError:
        raise ValueError('Unsupported writer version')
cdef shared_ptr[WriteOptions] _create_write_options(
        file_version=None,
        batch_size=None,
        stripe_size=None,
        compression=None,
        compression_block_size=None,
        compression_strategy=None,
        row_index_stride=None,
        padding_tolerance=None,
        dictionary_key_size_threshold=None,
        bloom_filter_columns=None,
        bloom_filter_fpp=None
) except *:
    """Validate keyword arguments and build an ORC WriteOptions.

    Every argument is optional; ``None`` leaves the corresponding field at
    its C++ default. Bad values raise ValueError, bad types TypeError.
    """
    cdef:
        shared_ptr[WriteOptions] options
    options = make_shared[WriteOptions]()

    # batch_size: positive int
    if batch_size is not None:
        if isinstance(batch_size, int) and batch_size > 0:
            deref(options).batch_size = batch_size
        else:
            raise ValueError(f"Invalid ORC writer batch size: {batch_size}")

    # file_version: only ORC format versions "0.11" and "0.12" are accepted
    if file_version is not None:
        if file_version == "0.12":
            deref(options).file_version = FileVersion(0, 12)
        elif file_version == "0.11":
            deref(options).file_version = FileVersion(0, 11)
        else:
            raise ValueError(f"Unsupported ORC file version: {file_version}")

    # stripe_size: positive int
    if stripe_size is not None:
        if isinstance(stripe_size, int) and stripe_size > 0:
            deref(options).stripe_size = stripe_size
        else:
            raise ValueError(f"Invalid ORC stripe size: {stripe_size}")

    # compression: name resolved via compression_type_from_name
    if compression is not None:
        if isinstance(compression, str):
            deref(options).compression = compression_type_from_name(
                compression)
        else:
            raise TypeError("Unsupported ORC compression type: "
                            f"{compression}")

    # compression_block_size: positive int
    if compression_block_size is not None:
        if (isinstance(compression_block_size, int) and
                compression_block_size > 0):
            deref(options).compression_block_size = compression_block_size
        else:
            raise ValueError("Invalid ORC compression block size: "
                             f"{compression_block_size}")

    # compression_strategy
    if compression_strategy is not None:
        # Bug fix: this previously tested `isinstance(compression, str)`,
        # so the strategy was validated against the unrelated `compression`
        # argument (a valid strategy was rejected whenever `compression`
        # was not a string).
        if isinstance(compression_strategy, str):
            deref(options).compression_strategy = \
                compression_strategy_from_name(compression_strategy)
        else:
            raise TypeError("Unsupported ORC compression strategy: "
                            f"{compression_strategy}")

    # row_index_stride: positive int
    if row_index_stride is not None:
        if isinstance(row_index_stride, int) and row_index_stride > 0:
            deref(options).row_index_stride = row_index_stride
        else:
            raise ValueError("Invalid ORC row index stride: "
                             f"{row_index_stride}")

    # padding_tolerance: anything convertible to float
    if padding_tolerance is not None:
        try:
            padding_tolerance = float(padding_tolerance)
            deref(options).padding_tolerance = padding_tolerance
        except Exception:
            raise ValueError("Invalid ORC padding tolerance: "
                             f"{padding_tolerance}")

    # dictionary_key_size_threshold: fraction in [0, 1]
    if dictionary_key_size_threshold is not None:
        try:
            dictionary_key_size_threshold = float(
                dictionary_key_size_threshold)
            assert 0 <= dictionary_key_size_threshold <= 1
            deref(options).dictionary_key_size_threshold = \
                dictionary_key_size_threshold
        except Exception:
            raise ValueError("Invalid ORC dictionary key size threshold: "
                             f"{dictionary_key_size_threshold}")

    # bloom_filter_columns: iterable of non-negative column indices
    if bloom_filter_columns is not None:
        try:
            bloom_filter_columns = list(bloom_filter_columns)
            for col in bloom_filter_columns:
                assert isinstance(col, int) and col >= 0
            deref(options).bloom_filter_columns = bloom_filter_columns
        except Exception:
            raise ValueError("Invalid ORC BloomFilter columns: "
                             f"{bloom_filter_columns}")

    # Max false positive rate of the Bloom Filter, in [0, 1]
    if bloom_filter_fpp is not None:
        try:
            bloom_filter_fpp = float(bloom_filter_fpp)
            assert 0 <= bloom_filter_fpp <= 1
            deref(options).bloom_filter_fpp = bloom_filter_fpp
        except Exception:
            raise ValueError("Invalid ORC BloomFilter false positive rate: "
                             f"{bloom_filter_fpp}")

    return options
cdef class ORCReader(_Weakrefable):
    """Low-level reader over a single ORC file (wraps C++ ORCFileReader)."""
    cdef:
        object source                        # Python source object retained after open()
        CMemoryPool* allocator               # memory pool used for all reads
        unique_ptr[ORCFileReader] reader

    def __cinit__(self, MemoryPool memory_pool=None):
        self.allocator = maybe_unbox_memory_pool(memory_pool)

    def open(self, object source, c_bool use_memory_map=True):
        """Open `source` (path or file-like object) for reading."""
        cdef:
            shared_ptr[CRandomAccessFile] rd_handle
        self.source = source
        get_reader(source, use_memory_map, &rd_handle)
        with nogil:
            self.reader = move(GetResultValue(
                ORCFileReader.Open(rd_handle, self.allocator)
            ))

    def metadata(self):
        """
        The arrow metadata for this file.

        Returns
        -------
        metadata : pyarrow.KeyValueMetadata
        """
        cdef:
            shared_ptr[const CKeyValueMetadata] sp_arrow_metadata
        with nogil:
            sp_arrow_metadata = GetResultValue(
                deref(self.reader).ReadMetadata()
            )
        return pyarrow_wrap_metadata(sp_arrow_metadata)

    def schema(self):
        """
        The arrow schema for this file.

        Returns
        -------
        schema : pyarrow.Schema
        """
        cdef:
            shared_ptr[CSchema] sp_arrow_schema
        with nogil:
            sp_arrow_schema = GetResultValue(deref(self.reader).ReadSchema())
        return pyarrow_wrap_schema(sp_arrow_schema)

    def nrows(self):
        """Number of rows in the file."""
        return deref(self.reader).NumberOfRows()

    def nstripes(self):
        """Number of stripes in the file."""
        return deref(self.reader).NumberOfStripes()

    def file_version(self):
        """ORC format version of the file, as a string."""
        return file_version_from_class(deref(self.reader).GetFileVersion())

    def software_version(self):
        """Software version string recorded by the writer."""
        return frombytes(deref(self.reader).GetSoftwareVersion())

    def compression(self):
        """Compression kind name used in the file."""
        return compression_type_from_enum(
            GetResultValue(deref(self.reader).GetCompression()))

    def compression_size(self):
        return deref(self.reader).GetCompressionSize()

    def row_index_stride(self):
        return deref(self.reader).GetRowIndexStride()

    def writer(self):
        """Name of the writer implementation, or its raw numeric id.

        Falls back to the numeric writer id when the writer is not a
        recognized implementation.
        """
        writer_name = writer_id_from_enum(deref(self.reader).GetWriterId())
        if writer_name == 'UNKNOWN':
            return deref(self.reader).GetWriterIdValue()
        else:
            return writer_name

    def writer_version(self):
        """Symbolic version of the writer that produced the file."""
        return writer_version_from_enum(deref(self.reader).GetWriterVersion())

    def nstripe_statistics(self):
        return deref(self.reader).GetNumberOfStripeStatistics()

    def content_length(self):
        return deref(self.reader).GetContentLength()

    def stripe_statistics_length(self):
        return deref(self.reader).GetStripeStatisticsLength()

    def file_footer_length(self):
        return deref(self.reader).GetFileFooterLength()

    def file_postscript_length(self):
        return deref(self.reader).GetFilePostscriptLength()

    def file_length(self):
        return deref(self.reader).GetFileLength()

    def serialized_file_tail(self):
        # Returned as raw bytes (no decoding applied).
        return deref(self.reader).GetSerializedFileTail()

    def read_stripe(self, n, columns=None):
        """Read stripe `n` as a RecordBatch, optionally restricted to
        the named `columns`."""
        cdef:
            shared_ptr[CRecordBatch] sp_record_batch
            int64_t stripe
            std_vector[c_string] c_names

        stripe = n

        if columns is None:
            with nogil:
                sp_record_batch = GetResultValue(
                    deref(self.reader).ReadStripe(stripe)
                )
        else:
            c_names = [tobytes(name) for name in columns]
            with nogil:
                sp_record_batch = GetResultValue(
                    deref(self.reader).ReadStripe(stripe, c_names)
                )
        return pyarrow_wrap_batch(sp_record_batch)

    def read(self, columns=None):
        """Read the whole file as a Table, optionally restricted to
        the named `columns`."""
        cdef:
            shared_ptr[CTable] sp_table
            std_vector[c_string] c_names

        if columns is None:
            with nogil:
                sp_table = GetResultValue(deref(self.reader).Read())
        else:
            c_names = [tobytes(name) for name in columns]
            with nogil:
                sp_table = GetResultValue(deref(self.reader).Read(c_names))
        return pyarrow_wrap_table(sp_table)
cdef class ORCWriter(_Weakrefable):
    """Low-level writer producing a single ORC file (wraps C++ ORCFileWriter)."""
    cdef:
        unique_ptr[ORCFileWriter] writer
        shared_ptr[COutputStream] sink
        c_bool own_sink                      # True when we opened the stream and must close it

    def open(self, object where, *,
             file_version=None,
             batch_size=None,
             stripe_size=None,
             compression=None,
             compression_block_size=None,
             compression_strategy=None,
             row_index_stride=None,
             padding_tolerance=None,
             dictionary_key_size_threshold=None,
             bloom_filter_columns=None,
             bloom_filter_fpp=None):
        """Open `where` (path or file-like object) for writing.

        Keyword arguments are validated and forwarded to the C++ writer
        via _create_write_options.
        """
        cdef:
            shared_ptr[WriteOptions] write_options
            c_string c_where
        try:
            where = _stringify_path(where)
        except TypeError:
            # Not path-like: treat as a file-like object; the caller
            # remains responsible for its lifetime.
            get_writer(where, &self.sink)
            self.own_sink = False
        else:
            c_where = tobytes(where)
            with nogil:
                self.sink = GetResultValue(FileOutputStream.Open(c_where))
            self.own_sink = True

        write_options = _create_write_options(
            file_version=file_version,
            batch_size=batch_size,
            stripe_size=stripe_size,
            compression=compression,
            compression_block_size=compression_block_size,
            compression_strategy=compression_strategy,
            row_index_stride=row_index_stride,
            padding_tolerance=padding_tolerance,
            dictionary_key_size_threshold=dictionary_key_size_threshold,
            bloom_filter_columns=bloom_filter_columns,
            bloom_filter_fpp=bloom_filter_fpp
        )

        with nogil:
            self.writer = move(GetResultValue(
                ORCFileWriter.Open(self.sink.get(),
                                   deref(write_options))))

    def write(self, Table table):
        """Append the contents of `table` to the file."""
        cdef:
            shared_ptr[CTable] sp_table
        sp_table = pyarrow_unwrap_table(table)
        with nogil:
            check_status(deref(self.writer).Write(deref(sp_table)))

    def close(self):
        """Finalize the ORC file; also closes the stream if we own it."""
        with nogil:
            check_status(deref(self.writer).Close())
            if self.own_sink:
                check_status(deref(self.sink).Close())

View File

@@ -0,0 +1,152 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# distutils: language = c++
# cython: language_level = 3
from pyarrow.includes.libparquet cimport *
from pyarrow.lib cimport _Weakrefable
cdef class FileEncryptionProperties:
    """File-level encryption properties for the low-level API"""
    cdef:
        shared_ptr[CFileEncryptionProperties] properties

    @staticmethod
    cdef inline FileEncryptionProperties wrap(
            shared_ptr[CFileEncryptionProperties] properties):
        # Wrap a C++ properties handle in a new Python-visible object.
        result = FileEncryptionProperties()
        result.properties = properties
        return result

    cdef inline shared_ptr[CFileEncryptionProperties] unwrap(self):
        # Hand back the underlying C++ handle.
        return self.properties
# Declaration of the writer-properties factory implemented in the .pyx;
# `=*` marks each keyword argument as having a default value there.
cdef shared_ptr[WriterProperties] _create_writer_properties(
    use_dictionary=*,
    compression=*,
    version=*,
    write_statistics=*,
    data_page_size=*,
    compression_level=*,
    use_byte_stream_split=*,
    column_encoding=*,
    data_page_version=*,
    FileEncryptionProperties encryption_properties=*,
    write_batch_size=*,
    dictionary_pagesize_limit=*,
    write_page_index=*,
    write_page_checksum=*,
    sorting_columns=*,
    store_decimal_as_integer=*,
    use_content_defined_chunking=*
) except *
# Declaration of the Arrow-specific writer-properties factory implemented
# in the .pyx (keyword defaults live there).
cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties(
    use_deprecated_int96_timestamps=*,
    coerce_timestamps=*,
    allow_truncated_timestamps=*,
    writer_engine_version=*,
    use_compliant_nested_type=*,
    store_schema=*,
) except *
# Unwrap the "list_type" argument for ArrowReaderProperties into the
# corresponding Type enum value (implemented in the .pyx).
cdef Type _unwrap_list_type(obj) except *
cdef class ParquetSchema(_Weakrefable):
    cdef:
        FileMetaData parent  # the FileMetaData owning the SchemaDescriptor
        const SchemaDescriptor* schema  # non-owning; lifetime tied to `parent`
cdef class FileMetaData(_Weakrefable):
    cdef:
        shared_ptr[CFileMetaData] sp_metadata   # owning handle
        CFileMetaData* _metadata                # raw pointer borrowed from sp_metadata
        ParquetSchema _schema                   # schema wrapper (populated outside this declaration)

    cdef inline init(self, const shared_ptr[CFileMetaData]& metadata):
        # Keep the owning pointer and cache a raw pointer for fast access.
        self.sp_metadata = metadata
        self._metadata = metadata.get()
cdef class RowGroupMetaData(_Weakrefable):
    cdef:
        int index  # for pickling support
        unique_ptr[CRowGroupMetaData] up_metadata   # owning handle
        CRowGroupMetaData* metadata                 # raw pointer borrowed from up_metadata
        FileMetaData parent                         # retained so the file metadata stays alive

    cdef inline init(self, FileMetaData parent, int index):
        # Bounds-check against the parent's row-group count before binding.
        if index < 0 or index >= parent.num_row_groups:
            raise IndexError('{0} out of bounds'.format(index))
        self.up_metadata = parent._metadata.RowGroup(index)
        self.metadata = self.up_metadata.get()
        self.parent = parent
        self.index = index
cdef class ColumnChunkMetaData(_Weakrefable):
    cdef:
        unique_ptr[CColumnChunkMetaData] up_metadata   # owning handle
        CColumnChunkMetaData* metadata                 # raw pointer borrowed from up_metadata
        RowGroupMetaData parent                        # retained so the row-group metadata stays alive

    cdef inline init(self, RowGroupMetaData parent, int i):
        # NOTE(review): unlike RowGroupMetaData.init, no bounds check on
        # `i` here — presumably callers validate; confirm.
        self.up_metadata = parent.metadata.ColumnChunk(i)
        self.metadata = self.up_metadata.get()
        self.parent = parent
cdef class Statistics(_Weakrefable):
    cdef:
        shared_ptr[CStatistics] statistics
        ColumnChunkMetaData parent   # retained so the column metadata stays alive

    cdef inline init(self, const shared_ptr[CStatistics]& statistics,
                     ColumnChunkMetaData parent):
        self.statistics = statistics
        self.parent = parent
cdef class GeoStatistics(_Weakrefable):
    cdef:
        shared_ptr[CParquetGeoStatistics] statistics
        ColumnChunkMetaData parent   # retained so the column metadata stays alive

    cdef inline init(self, const shared_ptr[CParquetGeoStatistics]& statistics,
                     ColumnChunkMetaData parent):
        self.statistics = statistics
        self.parent = parent
cdef class FileDecryptionProperties:
    """File-level decryption properties for the low-level API"""
    cdef:
        shared_ptr[CFileDecryptionProperties] properties

    @staticmethod
    cdef inline FileDecryptionProperties wrap(
            shared_ptr[CFileDecryptionProperties] properties):
        # Wrap a C++ properties handle in a new Python-visible object.
        result = FileDecryptionProperties()
        result.properties = properties
        return result

    cdef inline shared_ptr[CFileDecryptionProperties] unwrap(self):
        # Hand back the underlying C++ handle.
        return self.properties

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,56 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# distutils: language = c++
# cython: language_level = 3
from pyarrow.includes.common cimport *
from pyarrow.includes.libparquet_encryption cimport *
from pyarrow._parquet cimport (ParquetCipher,
CFileEncryptionProperties,
CFileDecryptionProperties,
FileEncryptionProperties,
FileDecryptionProperties,
ParquetCipher_AES_GCM_V1,
ParquetCipher_AES_GCM_CTR_V1)
from pyarrow.lib cimport _Weakrefable
cdef class CryptoFactory(_Weakrefable):
    # Wraps the C++ crypto factory; implementation lives in the .pyx.
    cdef shared_ptr[CPyCryptoFactory] factory
    cdef init(self, callable_client_factory)
    cdef inline shared_ptr[CPyCryptoFactory] unwrap(self)
cdef class EncryptionConfiguration(_Weakrefable):
    # Holds the C++ encryption configuration; implemented in the .pyx.
    cdef shared_ptr[CEncryptionConfiguration] configuration
    cdef inline shared_ptr[CEncryptionConfiguration] unwrap(self) nogil
cdef class DecryptionConfiguration(_Weakrefable):
    # Holds the C++ decryption configuration; implemented in the .pyx.
    cdef shared_ptr[CDecryptionConfiguration] configuration
    cdef inline shared_ptr[CDecryptionConfiguration] unwrap(self) nogil
cdef class KmsConnectionConfig(_Weakrefable):
    # Holds the C++ KMS connection configuration; implemented in the .pyx.
    cdef shared_ptr[CKmsConnectionConfig] configuration
    cdef inline shared_ptr[CKmsConnectionConfig] unwrap(self) nogil
    @staticmethod
    cdef wrap(const CKmsConnectionConfig& config)
# Helpers that extract the underlying C++ handles from the Python wrapper
# objects above (implemented in the .pyx).
cdef shared_ptr[CCryptoFactory] pyarrow_unwrap_cryptofactory(object crypto_factory) except *
cdef shared_ptr[CKmsConnectionConfig] pyarrow_unwrap_kmsconnectionconfig(object kmsconnectionconfig) except *
cdef shared_ptr[CEncryptionConfiguration] pyarrow_unwrap_encryptionconfig(object encryptionconfig) except *
cdef shared_ptr[CDecryptionConfiguration] pyarrow_unwrap_decryptionconfig(object decryptionconfig) except *

View File

@@ -0,0 +1,502 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: profile=False
# distutils: language = c++
from datetime import timedelta
from cpython.bytes cimport PyBytes_FromStringAndSize
from cython.operator cimport dereference as deref
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport _Weakrefable
from pyarrow.lib import tobytes, frombytes
cdef ParquetCipher cipher_from_name(name):
    # Resolve a Parquet cipher name to its enum value.
    upper = name.upper()
    if upper == 'AES_GCM_V1':
        return ParquetCipher_AES_GCM_V1
    if upper == 'AES_GCM_CTR_V1':
        return ParquetCipher_AES_GCM_CTR_V1
    raise ValueError(f'Invalid cipher name: {upper!r}')
cdef cipher_to_name(ParquetCipher cipher):
    # Inverse of cipher_from_name: enum value -> cipher name.
    if cipher == ParquetCipher_AES_GCM_V1:
        return 'AES_GCM_V1'
    if cipher == ParquetCipher_AES_GCM_CTR_V1:
        return 'AES_GCM_CTR_V1'
    raise ValueError(f'Invalid cipher value: {cipher}')
cdef class EncryptionConfiguration(_Weakrefable):
    """Configuration of the encryption, such as which columns to encrypt"""
    # Avoid mistakenly creating attributes
    __slots__ = ()

    def __init__(self, footer_key, *, column_keys=None,
                 uniform_encryption=None,
                 encryption_algorithm=None,
                 plaintext_footer=None, double_wrapping=None,
                 cache_lifetime=None, internal_key_material=None,
                 data_key_length_bits=None):
        self.configuration.reset(
            new CEncryptionConfiguration(tobytes(footer_key)))
        # Each optional keyword routes through its property setter below,
        # which performs the conversion/validation.
        if column_keys is not None:
            self.column_keys = column_keys
        if uniform_encryption is not None:
            self.uniform_encryption = uniform_encryption
        if encryption_algorithm is not None:
            self.encryption_algorithm = encryption_algorithm
        if plaintext_footer is not None:
            self.plaintext_footer = plaintext_footer
        if double_wrapping is not None:
            self.double_wrapping = double_wrapping
        if cache_lifetime is not None:
            self.cache_lifetime = cache_lifetime
        if internal_key_material is not None:
            self.internal_key_material = internal_key_material
        if data_key_length_bits is not None:
            self.data_key_length_bits = data_key_length_bits

    @property
    def footer_key(self):
        """ID of the master key for footer encryption/signing"""
        return frombytes(self.configuration.get().footer_key)

    @property
    def column_keys(self):
        """
        List of columns to encrypt, with master key IDs.

        Returned as a dict mapping master key ID to a list of column names.
        """
        column_keys_str = frombytes(self.configuration.get().column_keys)
        # Convert from "masterKeyID:colName,colName;masterKeyID:colName..."
        # (see HIVE-21848) to dictionary of master key ID to column name lists
        column_keys_to_key_list_str = dict(subString.replace(" ", "").split(
            ":") for subString in column_keys_str.split(";"))
        column_keys_dict = {k: v.split(
            ",") for k, v in column_keys_to_key_list_str.items()}
        return column_keys_dict

    @column_keys.setter
    def column_keys(self, dict value):
        if value is not None:
            # convert a dictionary such as
            # {"key1": ["col1", "col2"], "key2": ["col3", "col4"]}
            # to the string defined by the spec:
            # 'key1: col1 , col2; key2: col3 , col4'
            column_keys = "; ".join(
                [f"{k}: {', '.join(v)}" for k, v in value.items()])
            self.configuration.get().column_keys = tobytes(column_keys)

    @property
    def uniform_encryption(self):
        """Whether to encrypt footer and all columns with the same encryption key.

        This cannot be used together with column_keys.
        """
        return self.configuration.get().uniform_encryption

    @uniform_encryption.setter
    def uniform_encryption(self, value):
        self.configuration.get().uniform_encryption = value

    @property
    def encryption_algorithm(self):
        """Parquet encryption algorithm.

        Can be "AES_GCM_V1" (default), or "AES_GCM_CTR_V1"."""
        return cipher_to_name(self.configuration.get().encryption_algorithm)

    @encryption_algorithm.setter
    def encryption_algorithm(self, value):
        cipher = cipher_from_name(value)
        self.configuration.get().encryption_algorithm = cipher

    @property
    def plaintext_footer(self):
        """Write files with plaintext footer."""
        return self.configuration.get().plaintext_footer

    @plaintext_footer.setter
    def plaintext_footer(self, value):
        self.configuration.get().plaintext_footer = value

    @property
    def double_wrapping(self):
        """Use double wrapping - where data encryption keys (DEKs) are
        encrypted with key encryption keys (KEKs), which in turn are
        encrypted with master keys.

        If set to false, use single wrapping - where DEKs are
        encrypted directly with master keys."""
        return self.configuration.get().double_wrapping

    @double_wrapping.setter
    def double_wrapping(self, value):
        self.configuration.get().double_wrapping = value

    @property
    def cache_lifetime(self):
        """Lifetime of cached entities (key encryption keys,
        local wrapping keys, KMS client objects)."""
        return timedelta(
            seconds=self.configuration.get().cache_lifetime_seconds)

    @cache_lifetime.setter
    def cache_lifetime(self, value):
        # Only timedelta is accepted (not plain seconds) to keep units explicit.
        if not isinstance(value, timedelta):
            raise TypeError("cache_lifetime should be a timedelta")
        self.configuration.get().cache_lifetime_seconds = value.total_seconds()

    @property
    def internal_key_material(self):
        """Store key material inside Parquet file footers; this mode doesn't
        produce additional files. If set to false, key material is stored in
        separate files in the same folder, which enables key rotation for
        immutable Parquet files."""
        return self.configuration.get().internal_key_material

    @internal_key_material.setter
    def internal_key_material(self, value):
        self.configuration.get().internal_key_material = value

    @property
    def data_key_length_bits(self):
        """Length of data encryption keys (DEKs), randomly generated by parquet key
        management tools. Can be 128, 192 or 256 bits."""
        return self.configuration.get().data_key_length_bits

    @data_key_length_bits.setter
    def data_key_length_bits(self, value):
        self.configuration.get().data_key_length_bits = value

    cdef inline shared_ptr[CEncryptionConfiguration] unwrap(self) nogil:
        # Hand back the underlying C++ configuration handle.
        return self.configuration
cdef class DecryptionConfiguration(_Weakrefable):
    """Configuration of the decryption, such as cache timeout.

    Parameters
    ----------
    cache_lifetime : datetime.timedelta, optional
        Lifetime of cached entities; leaves the C++ default when omitted.
    """
    # Avoid mistakenly creating attributes
    __slots__ = ()

    def __init__(self, *, cache_lifetime=None):
        self.configuration.reset(new CDecryptionConfiguration())
        # Bug fix: cache_lifetime was previously accepted but silently
        # ignored; route it through the validating setter below.
        if cache_lifetime is not None:
            self.cache_lifetime = cache_lifetime

    @property
    def cache_lifetime(self):
        """Lifetime of cached entities (key encryption keys,
        local wrapping keys, KMS client objects)."""
        return timedelta(
            seconds=self.configuration.get().cache_lifetime_seconds)

    @cache_lifetime.setter
    def cache_lifetime(self, value):
        # Same validation as EncryptionConfiguration.cache_lifetime: only
        # timedelta is accepted, keeping the units explicit.
        if not isinstance(value, timedelta):
            raise TypeError("cache_lifetime should be a timedelta")
        self.configuration.get().cache_lifetime_seconds = value.total_seconds()

    cdef inline shared_ptr[CDecryptionConfiguration] unwrap(self) nogil:
        # Hand back the underlying C++ configuration handle.
        return self.configuration
cdef class KmsConnectionConfig(_Weakrefable):
    """Configuration of the connection to the Key Management Service (KMS)"""
    # Avoid mistakenly creating attributes
    __slots__ = ()

    def __init__(self, *, kms_instance_id=None, kms_instance_url=None,
                 key_access_token=None, custom_kms_conf=None):
        self.configuration.reset(new CKmsConnectionConfig())
        if kms_instance_id is not None:
            self.kms_instance_id = kms_instance_id
        if kms_instance_url is not None:
            self.kms_instance_url = kms_instance_url
        # A token is always installed so refresh_key_access_token() has
        # something to replace.
        if key_access_token is None:
            self.key_access_token = b'DEFAULT'
        else:
            self.key_access_token = key_access_token
        if custom_kms_conf is not None:
            self.custom_kms_conf = custom_kms_conf

    @property
    def kms_instance_id(self):
        """ID of the KMS instance that will be used for encryption
        (if multiple KMS instances are available)."""
        return frombytes(self.configuration.get().kms_instance_id)

    @kms_instance_id.setter
    def kms_instance_id(self, value):
        self.configuration.get().kms_instance_id = tobytes(value)

    @property
    def kms_instance_url(self):
        """URL of the KMS instance."""
        return frombytes(self.configuration.get().kms_instance_url)

    @kms_instance_url.setter
    def kms_instance_url(self, value):
        self.configuration.get().kms_instance_url = tobytes(value)

    @property
    def key_access_token(self):
        """Authorization token that will be passed to KMS."""
        return frombytes(self.configuration.get()
                         .refreshable_key_access_token.get().value())

    @key_access_token.setter
    def key_access_token(self, value):
        # Setting the token is implemented as a refresh of the current one.
        self.refresh_key_access_token(value)

    @property
    def custom_kms_conf(self):
        """A dictionary with KMS-type-specific configuration"""
        custom_kms_conf = {
            frombytes(k): frombytes(v)
            for k, v in self.configuration.get().custom_kms_conf
        }
        return custom_kms_conf

    @custom_kms_conf.setter
    def custom_kms_conf(self, dict value):
        if value is not None:
            for k, v in value.items():
                if isinstance(k, str) and isinstance(v, str):
                    self.configuration.get().custom_kms_conf[tobytes(k)] = \
                        tobytes(v)
                else:
                    raise TypeError("Expected custom_kms_conf to be " +
                                    "a dictionary of strings")

    def refresh_key_access_token(self, value):
        """Replace the current key access token value in-place."""
        cdef:
            shared_ptr[CKeyAccessToken] c_key_access_token = \
                self.configuration.get().refreshable_key_access_token
        c_key_access_token.get().Refresh(tobytes(value))

    cdef inline shared_ptr[CKmsConnectionConfig] unwrap(self) nogil:
        # Hand back the underlying C++ configuration handle.
        return self.configuration

    @staticmethod
    cdef wrap(const CKmsConnectionConfig& config):
        # NOTE(review): std::move on a const reference degrades to a copy,
        # so this copies `config` into a freshly allocated shared_ptr.
        result = KmsConnectionConfig()
        result.configuration = make_shared[CKmsConnectionConfig](move(config))
        return result
# Callback definitions for CPyKmsClientVtable
cdef void _cb_wrap_key(
        handler, const CSecureString& key,
        const c_string& master_key_identifier, c_string* out) except *:
    # Bridge for CPyKmsClientVtable.wrap_key: copies the secure key bytes
    # into Python, delegates wrapping (encryption with the master key) to the
    # Python handler, and writes the wrapped key back through `out`.
    view = <cpp_string_view>key.as_view()
    key_bytes = <bytes>PyBytes_FromStringAndSize(view.data(), view.size())
    mkid_str = frombytes(master_key_identifier)
    wrapped_key = handler.wrap_key(key_bytes, mkid_str)
    out[0] = tobytes(wrapped_key)
cdef void _cb_unwrap_key(
        handler, const c_string& wrapped_key,
        const c_string& master_key_identifier, CSecureString* out) except *:
    # Bridge for CPyKmsClientVtable.unwrap_key: delegates unwrapping
    # (decryption with the master key) to the Python handler and moves the
    # resulting key into a CSecureString written through `out`.
    mkid_str = frombytes(master_key_identifier)
    wk_str = frombytes(wrapped_key)
    key = handler.unwrap_key(wk_str, mkid_str)
    cstr = <c_string>tobytes(key)
    out[0] = CSecureString(move(cstr))
cdef class KmsClient(_Weakrefable):
    """The abstract base class for KmsClient implementations.

    Subclasses must override wrap_key() and unwrap_key(); the C++ side calls
    back into those overrides through the vtable installed in init().
    """
    cdef:
        # Owned C++ client that forwards calls to this Python object.
        shared_ptr[CKmsClient] client

    def __init__(self):
        self.init()

    cdef init(self):
        # Install Python-call trampolines so the C++ side can invoke the
        # (overridden) wrap_key/unwrap_key methods of this instance.
        cdef:
            CPyKmsClientVtable vtable = CPyKmsClientVtable()
        vtable.wrap_key = _cb_wrap_key
        vtable.unwrap_key = _cb_unwrap_key
        self.client.reset(new CPyKmsClient(self, vtable))

    def wrap_key(self, key_bytes, master_key_identifier):
        """Wrap a key - encrypt it with the master key."""
        raise NotImplementedError()

    def unwrap_key(self, wrapped_key, master_key_identifier):
        """Unwrap a key - decrypt it with the master key."""
        raise NotImplementedError()

    # Expose the owned C++ KmsClient to other Cython code.
    cdef inline shared_ptr[CKmsClient] unwrap(self) nogil:
        return self.client
# Callback definition for CPyKmsClientFactoryVtable
cdef void _cb_create_kms_client(
        handler,
        const CKmsConnectionConfig& kms_connection_config,
        shared_ptr[CKmsClient]* out) except *:
    # Bridge for CPyKmsClientFactoryVtable: wraps the C++ connection config,
    # invokes the user's factory callable, validates the result type, and
    # returns the underlying C++ client through `out`.
    connection_config = KmsConnectionConfig.wrap(kms_connection_config)
    result = handler(connection_config)
    if not isinstance(result, KmsClient):
        raise TypeError(
            f"callable must return KmsClient instances, but got {type(result)}")
    out[0] = (<KmsClient> result).unwrap()
cdef class CryptoFactory(_Weakrefable):
    """ A factory that produces the low-level FileEncryptionProperties and
    FileDecryptionProperties objects, from the high-level parameters."""
    # Avoid mistakenly creating attributes
    __slots__ = ()

    def __init__(self, kms_client_factory):
        """Create CryptoFactory.

        Parameters
        ----------
        kms_client_factory : a callable that accepts KmsConnectionConfig
            and returns a KmsClient
        """
        self.factory.reset(new CPyCryptoFactory())

        if callable(kms_client_factory):
            self.init(kms_client_factory)
        else:
            raise TypeError("Parameter kms_client_factory must be a callable")

    cdef init(self, callable_client_factory):
        cdef:
            CPyKmsClientFactoryVtable vtable
            shared_ptr[CPyKmsClientFactory] kms_client_factory

        # Route C++ client-creation requests through the Python callable.
        vtable.create_kms_client = _cb_create_kms_client

        kms_client_factory.reset(
            new CPyKmsClientFactory(callable_client_factory, vtable))
        # A KmsClientFactory object must be registered
        # via this method before calling any of
        # file_encryption_properties()/file_decryption_properties() methods.
        self.factory.get().RegisterKmsClientFactory(
            static_pointer_cast[CKmsClientFactory, CPyKmsClientFactory](
                kms_client_factory))

    def file_encryption_properties(self,
                                   KmsConnectionConfig kms_connection_config,
                                   EncryptionConfiguration encryption_config):
        """Create file encryption properties.

        Parameters
        ----------
        kms_connection_config : KmsConnectionConfig
            Configuration of connection to KMS

        encryption_config : EncryptionConfiguration
            Configuration of the encryption, such as which columns to encrypt

        Returns
        -------
        file_encryption_properties : FileEncryptionProperties
            File encryption properties.
        """
        cdef:
            CResult[shared_ptr[CFileEncryptionProperties]] \
                file_encryption_properties_result
        # The "Safe" variant may call back into the registered Python KMS
        # client factory; run the C++ call without the GIL held here.
        with nogil:
            file_encryption_properties_result = \
                self.factory.get().SafeGetFileEncryptionProperties(
                    deref(kms_connection_config.unwrap().get()),
                    deref(encryption_config.unwrap().get()))
        file_encryption_properties = GetResultValue(
            file_encryption_properties_result)
        return FileEncryptionProperties.wrap(file_encryption_properties)

    def file_decryption_properties(
            self,
            KmsConnectionConfig kms_connection_config,
            DecryptionConfiguration decryption_config=None):
        """Create file decryption properties.

        Parameters
        ----------
        kms_connection_config : KmsConnectionConfig
            Configuration of connection to KMS

        decryption_config : DecryptionConfiguration, default None
            Configuration of the decryption, such as cache timeout.
            Can be None.

        Returns
        -------
        file_decryption_properties : FileDecryptionProperties
            File decryption properties.
        """
        cdef:
            CDecryptionConfiguration c_decryption_config
            CResult[shared_ptr[CFileDecryptionProperties]] \
                c_file_decryption_properties
        # None means "use the C++ defaults" for the decryption configuration.
        if decryption_config is None:
            c_decryption_config = CDecryptionConfiguration()
        else:
            c_decryption_config = deref(decryption_config.unwrap().get())
        with nogil:
            c_file_decryption_properties = \
                self.factory.get().SafeGetFileDecryptionProperties(
                    deref(kms_connection_config.unwrap().get()),
                    c_decryption_config)
        file_decryption_properties = GetResultValue(
            c_file_decryption_properties)
        return FileDecryptionProperties.wrap(file_decryption_properties)

    def remove_cache_entries_for_token(self, access_token):
        """Drop cached key material associated with one access token."""
        self.factory.get().RemoveCacheEntriesForToken(tobytes(access_token))

    def remove_cache_entries_for_all_tokens(self):
        """Drop all cached key material, for every access token."""
        self.factory.get().RemoveCacheEntriesForAllTokens()

    # Expose the owned C++ factory to other Cython code.
    cdef inline shared_ptr[CPyCryptoFactory] unwrap(self):
        return self.factory
cdef shared_ptr[CCryptoFactory] pyarrow_unwrap_cryptofactory(object crypto_factory) except *:
    # Guard clause: anything but a CryptoFactory wrapper is a caller error.
    if not isinstance(crypto_factory, CryptoFactory):
        raise TypeError("Expected CryptoFactory, got %s" % type(crypto_factory))
    # Upcast the Python-specific factory to the generic C++ interface.
    pycf = (<CryptoFactory> crypto_factory).unwrap()
    return static_pointer_cast[CCryptoFactory, CPyCryptoFactory](pycf)
cdef shared_ptr[CKmsConnectionConfig] pyarrow_unwrap_kmsconnectionconfig(object kmsconnectionconfig) except *:
    # Guard clause: reject anything that is not a KmsConnectionConfig wrapper.
    if not isinstance(kmsconnectionconfig, KmsConnectionConfig):
        raise TypeError("Expected KmsConnectionConfig, got %s"
                        % type(kmsconnectionconfig))
    return (<KmsConnectionConfig> kmsconnectionconfig).unwrap()
cdef shared_ptr[CEncryptionConfiguration] pyarrow_unwrap_encryptionconfig(object encryptionconfig) except *:
    # Guard clause: reject anything that is not an EncryptionConfiguration.
    if not isinstance(encryptionconfig, EncryptionConfiguration):
        raise TypeError("Expected EncryptionConfiguration, got %s"
                        % type(encryptionconfig))
    return (<EncryptionConfiguration> encryptionconfig).unwrap()
cdef shared_ptr[CDecryptionConfiguration] pyarrow_unwrap_decryptionconfig(object decryptionconfig) except *:
    # Guard clause: reject anything that is not a DecryptionConfiguration.
    if not isinstance(decryptionconfig, DecryptionConfiguration):
        raise TypeError("Expected DecryptionConfiguration, got %s"
                        % type(decryptionconfig))
    return (<DecryptionConfiguration> decryptionconfig).unwrap()

View File

@@ -0,0 +1,33 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# distutils: language = c++
# cython: language_level = 3
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport CStatus
# Signature of a C++ test function: no arguments, returns a CStatus.
ctypedef CStatus cb_test_func()

cdef extern from "arrow/python/python_test.h" namespace "arrow::py::testing" nogil:
    # Mirror of arrow::py::testing::TestCase — a named test plus its function.
    cdef cppclass CTestCase "arrow::py::testing::TestCase":
        c_string name
        cb_test_func func

    # Returns the full list of C++ test cases registered in python_test.h.
    vector[CTestCase] GetCppTestCases()

View File

@@ -0,0 +1,62 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: profile=False, binding=True
# distutils: language = c++
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport check_status
from pyarrow.lib import frombytes
cdef class CppTestCase:
    """
    A simple wrapper for a C++ test case.
    """
    cdef:
        # The wrapped C++ TestCase (name + function pointer), held by value.
        CTestCase c_case

    @staticmethod
    cdef wrap(CTestCase c_case):
        # Allocate without calling __init__ and copy the C++ struct in.
        cdef:
            CppTestCase obj
        obj = CppTestCase.__new__(CppTestCase)
        obj.c_case = c_case
        return obj

    @property
    def name(self):
        # Test name decoded from the C++ std::string.
        return frombytes(self.c_case.name)

    def __repr__(self):
        return f"<{self.__class__.__name__} {self.name!r}>"

    def __call__(self):
        # Run the C++ test function; raises if it returns a non-OK status.
        check_status(self.c_case.func())
def get_cpp_tests():
    """
    Get a list of C++ test cases.
    """
    # Wrap each C++ TestCase with a comprehension instead of an explicit
    # append loop; each iteration copies one struct into a CppTestCase.
    c_cases = GetCppTestCases()
    return [CppTestCase.wrap(c_case) for c_case in c_cases]

View File

@@ -0,0 +1,491 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from pyarrow.lib cimport (check_status, pyarrow_wrap_metadata,
pyarrow_unwrap_metadata)
from pyarrow.lib import frombytes, tobytes, KeyValueMetadata
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_fs cimport *
from pyarrow._fs cimport FileSystem
# Python-visible mirror of the C++ arrow::fs::S3LogLevel values, ordered
# from least to most verbose.
cpdef enum S3LogLevel:
    Off = <int8_t> CS3LogLevel_Off
    Fatal = <int8_t> CS3LogLevel_Fatal
    Error = <int8_t> CS3LogLevel_Error
    Warn = <int8_t> CS3LogLevel_Warn
    Info = <int8_t> CS3LogLevel_Info
    Debug = <int8_t> CS3LogLevel_Debug
    Trace = <int8_t> CS3LogLevel_Trace
def initialize_s3(S3LogLevel log_level=S3LogLevel.Fatal, int num_event_loop_threads=1):
    """
    Initialize S3 support

    Parameters
    ----------
    log_level : S3LogLevel
        level of logging
    num_event_loop_threads : int, default 1
        how many threads to use for the AWS SDK's I/O event loop

    Examples
    --------
    >>> fs.initialize_s3(fs.S3LogLevel.Error) # doctest: +SKIP
    """
    cdef CS3GlobalOptions options
    options.log_level = <CS3LogLevel> log_level
    options.num_event_loop_threads = num_event_loop_threads
    # Raises if the underlying C++ S3 initialization fails.
    check_status(CInitializeS3(options))
def ensure_s3_initialized():
    """
    Initialize S3 (with default options) if not already initialized
    """
    check_status(CEnsureS3Initialized())
def finalize_s3():
    """
    Finalize S3 support.
    """
    check_status(CFinalizeS3())
def ensure_s3_finalized():
    """
    Finalize S3 if already initialized
    """
    check_status(CEnsureS3Finalized())
def resolve_s3_region(bucket):
    """
    Resolve the S3 region of a bucket.

    Parameters
    ----------
    bucket : str
        A S3 bucket name

    Returns
    -------
    region : str
        A S3 region name

    Examples
    --------
    >>> fs.resolve_s3_region('voltrondata-labs-datasets')
    'us-east-2'
    """
    cdef:
        c_string c_bucket
        c_string c_region
    # The region lookup requires the S3 subsystem to be up.
    ensure_s3_initialized()
    c_bucket = tobytes(bucket)
    # Network round-trip; release the GIL while the C++ call runs.
    with nogil:
        c_region = GetResultValue(ResolveS3BucketRegion(c_bucket))
    return frombytes(c_region)
class S3RetryStrategy:
    """
    Base class for AWS retry strategies for use with S3.

    Parameters
    ----------
    max_attempts : int, default 3
        The maximum number of retry attempts to attempt before failing.
    """

    def __init__(self, max_attempts=3):
        # Retry budget; consumed by S3FileSystem when it builds the
        # corresponding C++ retry strategy.
        self.max_attempts = max_attempts
class AwsStandardS3RetryStrategy(S3RetryStrategy):
    """
    Represents an AWS Standard retry strategy for use with S3.

    Parameters
    ----------
    max_attempts : int, default 3
        The maximum number of retry attempts to attempt before failing.
    """
    # Marker subclass: S3FileSystem dispatches on the type to pick the
    # AWS "standard" retry strategy.
    pass
class AwsDefaultS3RetryStrategy(S3RetryStrategy):
    """
    Represents an AWS Default retry strategy for use with S3.

    Parameters
    ----------
    max_attempts : int, default 3
        The maximum number of retry attempts to attempt before failing.
    """
    # Marker subclass: S3FileSystem dispatches on the type to pick the
    # AWS "default" retry strategy.
    pass
cdef class S3FileSystem(FileSystem):
    """
    S3-backed FileSystem implementation

    AWS access_key and secret_key can be provided explicitly.

    If role_arn is provided instead of access_key and secret_key, temporary
    credentials will be fetched by issuing a request to STS to assume the
    specified role.

    If neither access_key nor secret_key are provided, and role_arn is also not
    provided, then attempts to establish the credentials automatically.
    S3FileSystem will try the following methods, in order:

    * ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``, and ``AWS_SESSION_TOKEN`` environment variables
    * configuration files such as ``~/.aws/credentials`` and ``~/.aws/config``
    * for nodes on Amazon EC2, the EC2 Instance Metadata Service

    Note: S3 buckets are special and the operations available on them may be
    limited or more expensive than desired.

    When S3FileSystem creates new buckets (assuming allow_bucket_creation is
    True), it does not pass any non-default settings. In AWS S3, the bucket and
    all objects will be not publicly visible, and will have no bucket policies
    and no resource tags. To have more control over how buckets are created,
    use a different API to create them.

    Parameters
    ----------
    access_key : str, default None
        AWS Access Key ID. Pass None to use the standard AWS environment
        variables and/or configuration file.
    secret_key : str, default None
        AWS Secret Access key. Pass None to use the standard AWS environment
        variables and/or configuration file.
    session_token : str, default None
        AWS Session Token. An optional session token, required if access_key
        and secret_key are temporary credentials from STS.
    anonymous : bool, default False
        Whether to connect anonymously if access_key and secret_key are None.
        If true, will not attempt to look up credentials using standard AWS
        configuration methods.
    role_arn : str, default None
        AWS Role ARN. If provided instead of access_key and secret_key,
        temporary credentials will be fetched by assuming this role.
    session_name : str, default None
        An optional identifier for the assumed role session.
    external_id : str, default None
        An optional unique identifier that might be required when you assume
        a role in another account.
    load_frequency : int, default 900
        The frequency (in seconds) with which temporary credentials from an
        assumed role session will be refreshed.
    region : str, default None
        AWS region to connect to. If not set, the AWS SDK will attempt to
        determine the region using heuristics such as environment variables,
        configuration profile, EC2 metadata, or default to 'us-east-1' when SDK
        version <1.8. One can also use :func:`pyarrow.fs.resolve_s3_region` to
        automatically resolve the region from a bucket name.
    request_timeout : double, default None
        Socket read timeouts on Windows and macOS, in seconds.
        If omitted, the AWS SDK default value is used (typically 3 seconds).
        This option is ignored on non-Windows, non-macOS systems.
    connect_timeout : double, default None
        Socket connection timeout, in seconds.
        If omitted, the AWS SDK default value is used (typically 1 second).
    scheme : str, default 'https'
        S3 connection transport scheme.
    endpoint_override : str, default None
        Override region with a connect string such as "localhost:9000"
    background_writes : bool, default True
        Whether file writes will be issued in the background, without
        blocking.
    default_metadata : mapping or pyarrow.KeyValueMetadata, default None
        Default metadata for open_output_stream. This will be ignored if
        non-empty metadata is passed to open_output_stream.
    proxy_options : dict or str, default None
        If a proxy is used, provide the options here. Supported options are:
        'scheme' (str: 'http' or 'https'; required), 'host' (str; required),
        'port' (int; required), 'username' (str; optional),
        'password' (str; optional).
        A proxy URI (str) can also be provided, in which case these options
        will be derived from the provided URI.
        The following are equivalent::

            S3FileSystem(proxy_options='http://username:password@localhost:8020')
            S3FileSystem(proxy_options={'scheme': 'http', 'host': 'localhost',
                                        'port': 8020, 'username': 'username',
                                        'password': 'password'})
    allow_delayed_open : bool, default False
        Whether to allow file-open methods to return before the actual open. This option
        may reduce latency as it decreases the number of round trips.
        The downside is failures such as opening a file in a non-existing bucket will
        only be reported when actual I/O is done (at worst, when attempting to close the
        file).
    allow_bucket_creation : bool, default False
        Whether to allow directory creation at the bucket-level. This option may also be
        passed in a URI query parameter.
    allow_bucket_deletion : bool, default False
        Whether to allow directory deletion at the bucket-level. This option may also be
        passed in a URI query parameter.
    check_directory_existence_before_creation : bool, default False
        Whether to check the directory existence before creating it.
        If false, when creating a directory the code will not check if it already
        exists or not. It's an optimization to try directory creation and catch the error,
        rather than issue two dependent I/O calls.
        If true, when creating a directory the code will only create the directory when necessary
        at the cost of extra I/O calls. This can be used for key/value cloud storage which has
        a hard rate limit to number of object mutation operations or scenarios such as
        the directories already exist and you do not have creation access.
    retry_strategy : S3RetryStrategy, default AwsStandardS3RetryStrategy(max_attempts=3)
        The retry strategy to use with S3; fail after max_attempts. Available
        strategies are AwsStandardS3RetryStrategy, AwsDefaultS3RetryStrategy.
    force_virtual_addressing : bool, default False
        Whether to use virtual addressing of buckets.
        If true, then virtual addressing is always enabled.
        If false, then virtual addressing is only enabled if `endpoint_override` is empty.
        This can be used for non-AWS backends that only support virtual hosted-style access.
    tls_ca_file_path : str, default None
        If set, this should be the path of a file containing TLS certificates
        in PEM format which will be used for TLS verification.

    Examples
    --------
    >>> from pyarrow import fs
    >>> s3 = fs.S3FileSystem(region='us-west-2')
    >>> s3.get_file_info(fs.FileSelector(
    ...     'power-analysis-ready-datastore/power_901_constants.zarr/FROCEAN', recursive=True
    ... )) # doctest: +SKIP
    [<FileInfo for 'power-analysis-ready-datastore/power_901_constants.zarr/FROCEAN/.zarray...

    For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`.
    """
    cdef:
        # Non-owning typed view of the wrapped C++ filesystem; the owning
        # shared_ptr lives in the FileSystem base class.
        CS3FileSystem* s3fs

    def __init__(self, *, access_key=None, secret_key=None, session_token=None,
                 bint anonymous=False, region=None, request_timeout=None,
                 connect_timeout=None, scheme=None, endpoint_override=None,
                 bint background_writes=True, default_metadata=None,
                 role_arn=None, session_name=None, external_id=None,
                 load_frequency=900, proxy_options=None,
                 allow_delayed_open=False,
                 allow_bucket_creation=False, allow_bucket_deletion=False,
                 check_directory_existence_before_creation=False,
                 retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy(
                     max_attempts=3),
                 force_virtual_addressing=False, tls_ca_file_path=None):
        cdef:
            optional[CS3Options] options
            shared_ptr[CS3FileSystem] wrapped

        # Need to do this before initializing `options` as the S3Options
        # constructor has a debug check against use after S3 finalization.
        ensure_s3_initialized()

        # Credential selection cascade: explicit keys > anonymous > assumed
        # role > SDK defaults. Invalid combinations are rejected first.
        if access_key is not None and secret_key is None:
            raise ValueError(
                'In order to initialize with explicit credentials both '
                'access_key and secret_key must be provided, '
                '`secret_key` is not set.'
            )
        elif access_key is None and secret_key is not None:
            raise ValueError(
                'In order to initialize with explicit credentials both '
                'access_key and secret_key must be provided, '
                '`access_key` is not set.'
            )
        elif session_token is not None and (access_key is None or
                                            secret_key is None):
            raise ValueError(
                'In order to initialize a session with temporary credentials, '
                'both secret_key and access_key must be provided in addition '
                'to session_token.'
            )
        elif (access_key is not None or secret_key is not None):
            if anonymous:
                raise ValueError(
                    'Cannot pass anonymous=True together with access_key '
                    'and secret_key.')
            if role_arn:
                raise ValueError(
                    'Cannot provide role_arn with access_key and secret_key')
            if session_token is None:
                session_token = ""
            options = CS3Options.FromAccessKey(
                tobytes(access_key),
                tobytes(secret_key),
                tobytes(session_token)
            )
        elif anonymous:
            if role_arn:
                raise ValueError(
                    'Cannot provide role_arn with anonymous=True')
            options = CS3Options.Anonymous()
        elif role_arn:
            if session_name is None:
                session_name = ''
            if external_id is None:
                external_id = ''
            options = CS3Options.FromAssumeRole(
                tobytes(role_arn),
                tobytes(session_name),
                tobytes(external_id),
                load_frequency
            )
        else:
            options = CS3Options.Defaults()

        # Optional overrides are only applied when explicitly provided so
        # the C++ defaults stay in effect otherwise.
        if region is not None:
            options.value().region = tobytes(region)
        if request_timeout is not None:
            options.value().request_timeout = request_timeout
        if connect_timeout is not None:
            options.value().connect_timeout = connect_timeout
        if scheme is not None:
            options.value().scheme = tobytes(scheme)
        if endpoint_override is not None:
            options.value().endpoint_override = tobytes(endpoint_override)
        if background_writes is not None:
            options.value().background_writes = background_writes
        if default_metadata is not None:
            if not isinstance(default_metadata, KeyValueMetadata):
                default_metadata = KeyValueMetadata(default_metadata)
            options.value().default_metadata = pyarrow_unwrap_metadata(
                default_metadata)

        if proxy_options is not None:
            if isinstance(proxy_options, dict):
                options.value().proxy_options.scheme = tobytes(
                    proxy_options["scheme"])
                options.value().proxy_options.host = tobytes(
                    proxy_options["host"])
                options.value().proxy_options.port = proxy_options["port"]
                proxy_username = proxy_options.get("username", None)
                if proxy_username:
                    options.value().proxy_options.username = tobytes(
                        proxy_username)
                proxy_password = proxy_options.get("password", None)
                if proxy_password:
                    options.value().proxy_options.password = tobytes(
                        proxy_password)
            elif isinstance(proxy_options, str):
                # Parse a proxy URI into the individual option fields.
                options.value().proxy_options = GetResultValue(
                    CS3ProxyOptions.FromUriString(tobytes(proxy_options)))
            else:
                raise TypeError(
                    "'proxy_options': expected 'dict' or 'str', "
                    f"got {type(proxy_options)} instead.")

        options.value().allow_delayed_open = allow_delayed_open
        options.value().allow_bucket_creation = allow_bucket_creation
        options.value().allow_bucket_deletion = allow_bucket_deletion
        options.value().check_directory_existence_before_creation = check_directory_existence_before_creation
        options.value().force_virtual_addressing = force_virtual_addressing

        # Dispatch on the Python retry-strategy marker class to build the
        # corresponding C++ strategy.
        if isinstance(retry_strategy, AwsStandardS3RetryStrategy):
            options.value().retry_strategy = CS3RetryStrategy.GetAwsStandardRetryStrategy(
                retry_strategy.max_attempts)
        elif isinstance(retry_strategy, AwsDefaultS3RetryStrategy):
            options.value().retry_strategy = CS3RetryStrategy.GetAwsDefaultRetryStrategy(
                retry_strategy.max_attempts)
        else:
            raise ValueError(f'Invalid retry_strategy {retry_strategy!r}')

        if tls_ca_file_path is not None:
            options.value().tls_ca_file_path = tobytes(tls_ca_file_path)

        with nogil:
            wrapped = GetResultValue(CS3FileSystem.Make(options.value()))

        self.init(<shared_ptr[CFileSystem]> wrapped)

    cdef init(self, const shared_ptr[CFileSystem]& wrapped):
        FileSystem.init(self, wrapped)
        # Cache a typed pointer for S3-specific accessors (options(), region()).
        self.s3fs = <CS3FileSystem*> wrapped.get()

    @staticmethod
    def _reconstruct(kwargs):
        # __reduce__ doesn't allow passing named arguments directly to the
        # reconstructor, hence this wrapper.
        return S3FileSystem(**kwargs)

    def __reduce__(self):
        # Pickle support: rebuild the filesystem from the options currently
        # held by the C++ instance.
        cdef CS3Options opts = self.s3fs.options()

        # if creds were explicitly provided, then use them
        # else obtain them as they were last time.
        if opts.credentials_kind == CS3CredentialsKind_Explicit:
            access_key = frombytes(opts.GetAccessKey())
            secret_key = frombytes(opts.GetSecretKey())
            session_token = frombytes(opts.GetSessionToken())
        else:
            access_key = None
            secret_key = None
            session_token = None

        return (
            S3FileSystem._reconstruct, (dict(
                access_key=access_key,
                secret_key=secret_key,
                session_token=session_token,
                anonymous=(opts.credentials_kind ==
                           CS3CredentialsKind_Anonymous),
                region=frombytes(opts.region),
                scheme=frombytes(opts.scheme),
                connect_timeout=opts.connect_timeout,
                request_timeout=opts.request_timeout,
                endpoint_override=frombytes(opts.endpoint_override),
                role_arn=frombytes(opts.role_arn),
                session_name=frombytes(opts.session_name),
                external_id=frombytes(opts.external_id),
                load_frequency=opts.load_frequency,
                background_writes=opts.background_writes,
                allow_delayed_open=opts.allow_delayed_open,
                allow_bucket_creation=opts.allow_bucket_creation,
                allow_bucket_deletion=opts.allow_bucket_deletion,
                check_directory_existence_before_creation=opts.check_directory_existence_before_creation,
                default_metadata=pyarrow_wrap_metadata(opts.default_metadata),
                proxy_options={'scheme': frombytes(opts.proxy_options.scheme),
                               'host': frombytes(opts.proxy_options.host),
                               'port': opts.proxy_options.port,
                               'username': frombytes(
                                   opts.proxy_options.username),
                               'password': frombytes(
                                   opts.proxy_options.password)},
                force_virtual_addressing=opts.force_virtual_addressing,
                tls_ca_file_path=frombytes(opts.tls_ca_file_path),
            ),)
        )

    @property
    def region(self):
        """
        The AWS region this filesystem connects to.
        """
        return frombytes(self.s3fs.region())

View File

@@ -0,0 +1,481 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from cython.operator cimport dereference as deref
from libcpp.vector cimport vector as std_vector
from pyarrow import Buffer, py_buffer
from pyarrow._compute cimport Expression
from pyarrow.lib import frombytes, tobytes
from pyarrow.lib cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_substrait cimport *
try:
import substrait as py_substrait
except ImportError:
py_substrait = None
else:
import substrait.proto # no-cython-lint
# TODO GH-37235: Fix exception handling
cdef CDeclaration _create_named_table_provider(
    dict named_args, const std_vector[c_string]& names, const CSchema& schema
) noexcept:
    # Adapter handed to the Substrait consumer: resolves a NamedTable
    # relation to a "table_source" declaration by calling the user-supplied
    # Python provider (stored under named_args["provider"] by run_query()).
    cdef:
        c_string c_name
        shared_ptr[CTable] c_in_table
        shared_ptr[CTableSourceNodeOptions] c_tablesourceopts
        shared_ptr[CExecNodeOptions] c_input_node_opts
        vector[CDeclaration.Input] no_c_inputs

    # Convert the C++ vector of names into a Python list of str.
    py_names = []
    for i in range(names.size()):
        c_name = names[i]
        py_names.append(frombytes(c_name))
    py_schema = pyarrow_wrap_schema(make_shared[CSchema](schema))

    # The provider returns a pyarrow.Table matching (names, schema).
    py_table = named_args["provider"](py_names, py_schema)
    c_in_table = pyarrow_unwrap_table(py_table)
    c_tablesourceopts = make_shared[CTableSourceNodeOptions](c_in_table)
    c_input_node_opts = static_pointer_cast[CExecNodeOptions, CTableSourceNodeOptions](
        c_tablesourceopts)
    # A table_source node has no inputs; it feeds the plan directly.
    return CDeclaration(tobytes("table_source"),
                        no_c_inputs, c_input_node_opts)
def run_query(plan, *, table_provider=None, use_threads=True):
    """
    Execute a Substrait plan and read the results as a RecordBatchReader.

    Parameters
    ----------
    plan : Union[Buffer, bytes]
        The serialized Substrait plan to execute.
    table_provider : object (optional)
        A function to resolve any NamedTable relation to a table.
        The function will receive two arguments which will be a list
        of strings representing the table name and a pyarrow.Schema representing
        the expected schema and should return a pyarrow.Table.
    use_threads : bool, default True
        If True then multiple threads will be used to run the query. If False then
        all CPU intensive work will be done on the calling thread.

    Returns
    -------
    RecordBatchReader
        A reader containing the result of the executed query

    Examples
    --------
    >>> import pyarrow as pa
    >>> from pyarrow.lib import tobytes
    >>> import pyarrow.substrait as substrait
    >>> test_table_1 = pa.Table.from_pydict({"x": [1, 2, 3]})
    >>> test_table_2 = pa.Table.from_pydict({"x": [4, 5, 6]})
    >>> def table_provider(names, schema):
    ...     if not names:
    ...         raise Exception("No names provided")
    ...     elif names[0] == "t1":
    ...         return test_table_1
    ...     elif names[1] == "t2":
    ...         return test_table_2
    ...     else:
    ...         raise Exception("Unrecognized table name")
    ...
    >>> substrait_query = '''
    ...     {
    ...         "relations": [
    ...         {"rel": {
    ...             "read": {
    ...             "base_schema": {
    ...                 "struct": {
    ...                 "types": [
    ...                             {"i64": {}}
    ...                         ]
    ...                 },
    ...                 "names": [
    ...                         "x"
    ...                         ]
    ...             },
    ...             "namedTable": {
    ...                     "names": ["t1"]
    ...             }
    ...             }
    ...         }}
    ...         ]
    ...     }
    ... '''
    >>> buf = pa._substrait._parse_json_plan(tobytes(substrait_query))
    >>> reader = pa.substrait.run_query(buf, table_provider=table_provider)
    >>> reader.read_all()
    pyarrow.Table
    x: int64
    ----
    x: [[1,2,3]]
    """
    cdef:
        CResult[shared_ptr[CRecordBatchReader]] c_res_reader
        shared_ptr[CRecordBatchReader] c_reader
        RecordBatchReader reader
        shared_ptr[CBuffer] c_buf_plan
        CConversionOptions c_conversion_options
        c_bool c_use_threads

    c_use_threads = use_threads
    # Accept either raw bytes-like input (copied into an Arrow buffer) or an
    # existing pyarrow Buffer (zero-copy unwrap).
    if isinstance(plan, (bytes, memoryview)):
        c_buf_plan = pyarrow_unwrap_buffer(py_buffer(plan))
    elif isinstance(plan, Buffer):
        c_buf_plan = pyarrow_unwrap_buffer(plan)
    else:
        raise TypeError(
            f"Expected 'pyarrow.Buffer' or bytes, got '{type(plan)}'")

    if table_provider is not None:
        # Stash the Python callable where _create_named_table_provider can
        # find it when the consumer resolves NamedTable relations.
        named_table_args = {
            "provider": table_provider
        }
        c_conversion_options.named_table_provider = BindFunction[CNamedTableProvider](
            &_create_named_table_provider, named_table_args)

    with nogil:
        c_res_reader = ExecuteSerializedPlan(
            deref(c_buf_plan), default_extension_id_registry(),
            GetFunctionRegistry(), c_conversion_options, c_use_threads)

    c_reader = GetResultValue(c_res_reader)

    reader = RecordBatchReader.__new__(RecordBatchReader)
    reader.reader = c_reader
    return reader
def _parse_json_plan(plan):
    """
    Parse a JSON plan into equivalent serialized Protobuf.

    Parameters
    ----------
    plan : bytes
        Substrait plan in JSON.

    Returns
    -------
    Buffer
        A buffer containing the serialized Protobuf plan.
    """
    cdef:
        CResult[shared_ptr[CBuffer]] c_res_buffer
        c_string c_str_plan
        shared_ptr[CBuffer] c_buf_plan

    # Copy the Python bytes into a C++ string for the Substrait JSON parser.
    c_str_plan = plan
    c_res_buffer = SerializeJsonPlan(c_str_plan)
    with nogil:
        # Unwrap the Result; a failed parse surfaces as a Python exception.
        c_buf_plan = GetResultValue(c_res_buffer)
    return pyarrow_wrap_buffer(c_buf_plan)
class SubstraitSchema:
    """A Schema encoded for Substrait usage.

    Holds the same schema in two serialized forms: a Substrait
    ``NamedStruct`` message (``schema``) and an ``ExtendedExpression``
    message (``expression``).

    The ``ExtendedExpression`` form exists for schemas whose types need
    Substrait extensions to be decoded: there, the schema is carried as
    the ``base_schema`` of the ``ExtendedExpression`` together with all
    required extensions.
    """

    def __init__(self, schema, expression):
        # Both attributes are stored as-is; they are serialized messages.
        self.schema = schema
        self.expression = expression

    def to_pysubstrait(self):
        """Convert the schema to a substrait-python ExtendedExpression object."""
        if py_substrait is None:
            raise ImportError("The 'substrait' package is required.")
        parsed = py_substrait.proto.ExtendedExpression.FromString(self.expression)
        return parsed
def serialize_schema(schema):
    """
    Serialize a schema into a SubstraitSchema object.

    Parameters
    ----------
    schema : Schema
        The schema to serialize

    Returns
    -------
    SubstraitSchema
        The schema stored in a SubstraitSchema object.
    """
    # The schema is encoded twice: as a plain NamedStruct message, and as an
    # ExtendedExpression with no expressions (empty exprs/names lists) whose
    # base_schema carries the schema plus any extensions required for
    # arrow-specific types (hence allow_arrow_extensions=True).
    return SubstraitSchema(
        schema=_serialize_namedstruct_schema(schema),
        expression=serialize_expressions([], [], schema, allow_arrow_extensions=True)
    )
def _serialize_namedstruct_schema(schema):
    """
    Serialize an Arrow Schema into a Substrait ``NamedStruct`` message.

    Parameters
    ----------
    schema : Schema
        The schema to serialize.

    Returns
    -------
    memoryview
        A view over the buffer containing the serialized message.
    """
    cdef:
        CResult[shared_ptr[CBuffer]] c_res_buffer
        shared_ptr[CBuffer] c_buffer
        # Default-constructed options/extension set; extensions collected
        # during serialization are discarded here (only the NamedStruct
        # bytes are returned).
        CConversionOptions c_conversion_options
        CExtensionSet c_extensions

    with nogil:
        c_res_buffer = SerializeSchema(deref((<Schema> schema).sp_schema), &c_extensions, c_conversion_options)
        c_buffer = GetResultValue(c_res_buffer)
    return memoryview(pyarrow_wrap_buffer(c_buffer))
def deserialize_schema(buf):
    """
    Deserialize a ``NamedStruct`` Substrait message
    or a SubstraitSchema object into an Arrow Schema object

    Parameters
    ----------
    buf : Buffer or bytes or SubstraitSchema
        The message to deserialize

    Returns
    -------
    Schema
        The deserialized schema
    """
    cdef:
        shared_ptr[CBuffer] c_buffer
        CResult[shared_ptr[CSchema]] c_res_schema
        shared_ptr[CSchema] c_schema
        CConversionOptions c_conversion_options
        CExtensionSet c_extensions

    if isinstance(buf, SubstraitSchema):
        # A SubstraitSchema carries an ExtendedExpression message; its schema
        # (including any required extensions) is recovered by deserializing
        # that message and taking the bound schema.
        return deserialize_expressions(buf.expression).schema

    # Normalize the input to a C++ buffer without copying when possible.
    if isinstance(buf, (bytes, memoryview)):
        c_buffer = pyarrow_unwrap_buffer(py_buffer(buf))
    elif isinstance(buf, Buffer):
        c_buffer = pyarrow_unwrap_buffer(buf)
    else:
        raise TypeError(
            f"Expected 'pyarrow.Buffer' or bytes, got '{type(buf)}'")

    with nogil:
        c_res_schema = DeserializeSchema(
            deref(c_buffer), c_extensions, c_conversion_options)
        c_schema = GetResultValue(c_res_schema)
    return pyarrow_wrap_schema(c_schema)
def serialize_expressions(exprs, names, schema, *, allow_arrow_extensions=False):
    """
    Serialize a collection of expressions into Substrait

    Substrait expressions must be bound to a schema. For example,
    the Substrait expression ``a:i32 + b:i32`` is different from the
    Substrait expression ``a:i64 + b:i64``. Pyarrow expressions are
    typically unbound. For example, both of the above expressions
    would be represented as ``a + b`` in pyarrow.

    This means a schema must be provided when serializing an expression.
    It also means that the serialization may fail if a matching function
    call cannot be found for the expression.

    Parameters
    ----------
    exprs : list of Expression
        The expressions to serialize
    names : list of str
        Names for the expressions
    schema : Schema
        The schema the expressions will be bound to
    allow_arrow_extensions : bool, default False
        If False then only functions that are part of the core Substrait function
        definitions will be allowed. Set this to True to allow pyarrow-specific functions
        and user defined functions but the result may not be accepted by other
        compute libraries.

    Returns
    -------
    Buffer
        An ExtendedExpression message containing the serialized expressions
    """
    cdef:
        CResult[shared_ptr[CBuffer]] c_res_buffer
        shared_ptr[CBuffer] c_buffer
        CNamedExpression c_named_expr
        CBoundExpressions c_bound_exprs
        CConversionOptions c_conversion_options

    # Every expression must be paired with a name in the output message.
    if len(exprs) != len(names):
        raise ValueError("exprs and names need to have the same length")
    for expr, name in zip(exprs, names):
        # Validate inputs eagerly so that errors point to the bad element
        # rather than failing inside the C++ serializer.
        if not isinstance(expr, Expression):
            raise TypeError(f"Expected Expression, got '{type(expr)}' in exprs")
        if not isinstance(name, str):
            raise TypeError(f"Expected str, got '{type(name)}' in names")
        c_named_expr.expression = (<Expression> expr).unwrap()
        c_named_expr.name = tobytes(<str> name)
        c_bound_exprs.named_expressions.push_back(c_named_expr)

    # All expressions are bound to the same schema.
    c_bound_exprs.schema = (<Schema> schema).sp_schema

    c_conversion_options.allow_arrow_extensions = allow_arrow_extensions

    with nogil:
        c_res_buffer = SerializeExpressions(c_bound_exprs, c_conversion_options)
        c_buffer = GetResultValue(c_res_buffer)
    return memoryview(pyarrow_wrap_buffer(c_buffer))
cdef class BoundExpressions(_Weakrefable):
    """
    A collection of named expressions and the schema they are bound to

    This is equivalent to the Substrait ExtendedExpression message
    """

    cdef:
        CBoundExpressions c_bound_exprs

    def __init__(self):
        # Instances are only created internally via BoundExpressions.wrap().
        msg = 'BoundExpressions is an abstract class thus cannot be initialized.'
        raise TypeError(msg)

    cdef void init(self, CBoundExpressions bound_expressions):
        # Store a copy of the C++ struct holding schema + named expressions.
        self.c_bound_exprs = bound_expressions

    @property
    def schema(self):
        """
        The common schema that all expressions are bound to
        """
        return pyarrow_wrap_schema(self.c_bound_exprs.schema)

    @property
    def expressions(self):
        """
        A dict from expression name to expression
        """
        expr_dict = {}
        for named_expr in self.c_bound_exprs.named_expressions:
            name = frombytes(named_expr.name)
            expr = Expression.wrap(named_expr.expression)
            expr_dict[name] = expr
        return expr_dict

    @staticmethod
    cdef wrap(const CBoundExpressions& bound_expressions):
        # Factory bypassing __init__ (which deliberately raises).
        cdef BoundExpressions self = BoundExpressions.__new__(BoundExpressions)
        self.init(bound_expressions)
        return self

    @classmethod
    def from_substrait(cls, message):
        """
        Convert a Substrait message into a BoundExpressions object

        Parameters
        ----------
        message : Buffer or bytes or protobuf Message
            The message to convert to a BoundExpressions object

        Returns
        -------
        BoundExpressions
            The converted expressions, their names, and the bound schema
        """
        if isinstance(message, (bytes, memoryview)):
            return deserialize_expressions(message)
        elif isinstance(message, Buffer):
            return deserialize_expressions(message)
        else:
            # Fall back to treating `message` as a protobuf Message and
            # serializing it to bytes; anything without SerializeToString
            # is rejected.
            try:
                return deserialize_expressions(message.SerializeToString())
            except AttributeError:
                raise TypeError(
                    f"Expected 'pyarrow.Buffer' or bytes or protobuf Message, got '{type(message)}'")
def deserialize_expressions(buf):
    """
    Deserialize an ExtendedExpression Substrait message into a BoundExpressions object

    Parameters
    ----------
    buf : Buffer or bytes
        The message to deserialize

    Returns
    -------
    BoundExpressions
        The deserialized expressions, their names, and the bound schema
    """
    cdef:
        shared_ptr[CBuffer] c_buffer
        CResult[CBoundExpressions] c_res_bound_exprs
        CBoundExpressions c_bound_exprs

    # Normalize the input to a C++ buffer.
    if isinstance(buf, (bytes, memoryview)):
        c_buffer = pyarrow_unwrap_buffer(py_buffer(buf))
    elif isinstance(buf, Buffer):
        c_buffer = pyarrow_unwrap_buffer(buf)
    else:
        raise TypeError(
            f"Expected 'pyarrow.Buffer' or bytes, got '{type(buf)}'")

    with nogil:
        c_res_bound_exprs = DeserializeExpressions(deref(c_buffer))
        c_bound_exprs = GetResultValue(c_res_bound_exprs)
    return BoundExpressions.wrap(c_bound_exprs)
def get_supported_functions():
    """
    Get a list of Substrait functions that the underlying
    engine currently supports.

    Returns
    -------
    list[str]
        A list of function ids encoded as '{uri}#{name}'
    """
    cdef:
        ExtensionIdRegistry* c_id_registry
        std_vector[c_string] c_ids

    c_id_registry = default_extension_id_registry()
    c_ids = c_id_registry.GetSupportedSubstraitFunctions()

    # Convert the C++ strings to Python str objects.
    functions_list = []
    for c_id in c_ids:
        functions_list.append(frombytes(c_id))
    return functions_list

View File

@@ -0,0 +1,418 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# ---------------------------------------------------------------------
# Implement Internal ExecPlan bindings
# cython: profile=False
# distutils: language = c++
# cython: language_level = 3
from pyarrow.lib import Table, RecordBatch, array
from pyarrow.compute import Expression, field
# Acero bindings are mandatory for this module: fail loudly if the pyarrow
# build does not ship them.
try:
    from pyarrow._acero import (  # noqa
        Declaration,
        ExecNodeOptions,
        TableSourceNodeOptions,
        FilterNodeOptions,
        ProjectNodeOptions,
        AggregateNodeOptions,
        OrderByNodeOptions,
        HashJoinNodeOptions,
        AsofJoinNodeOptions,
    )
except ImportError as exc:
    raise ImportError(
        f"The pyarrow installation is not built with support for 'acero' ({str(exc)})"
    ) from None

# The dataset module is optional. When it is unavailable, substitute a stub
# whose Dataset/InMemoryDataset attributes are classes nothing can be an
# instance of, so `isinstance(x, ds.Dataset)` checks below simply return False.
try:
    import pyarrow.dataset as ds
    from pyarrow._dataset import ScanNodeOptions
except ImportError:
    class DatasetModuleStub:
        class Dataset:
            pass

        class InMemoryDataset:
            pass
    ds = DatasetModuleStub
def _dataset_to_decl(dataset, use_threads=True, implicit_ordering=False):
    """
    Wrap a Dataset into an Acero "scan" Declaration.

    A "project" node is appended to expose only the dataset's own columns,
    and a "filter" node is appended when the dataset carries a scan filter
    (scan-level filters are only applied best-effort by the scan node).
    """
    decl = Declaration("scan", ScanNodeOptions(
        dataset, use_threads=use_threads,
        implicit_ordering=implicit_ordering))

    # Get rid of special dataset columns
    # "__fragment_index", "__batch_index", "__last_in_fragment", "__filename"
    projections = [field(f) for f in dataset.schema.names]
    decl = Declaration.from_sequence(
        [decl, Declaration("project", ProjectNodeOptions(projections))]
    )

    filter_expr = dataset._scan_options.get("filter")
    if filter_expr is not None:
        # Filters applied in CScanNodeOptions are "best effort" for the scan node itself
        # so we always need to inject an additional Filter node to apply them for real.
        decl = Declaration.from_sequence(
            [decl, Declaration("filter", FilterNodeOptions(filter_expr))]
        )

    return decl
def _perform_join(join_type, left_operand, left_keys,
                  right_operand, right_keys,
                  left_suffix=None, right_suffix=None,
                  use_threads=True, coalesce_keys=False,
                  output_type=Table, filter_expression=None):
    """
    Perform join of two tables or datasets.

    The result will be an output table with the result of the join operation

    Parameters
    ----------
    join_type : str
        One of supported join types.
    left_operand : Table or Dataset
        The left operand for the join operation.
    left_keys : str or list[str]
        The left key (or keys) on which the join operation should be performed.
    right_operand : Table or Dataset
        The right operand for the join operation.
    right_keys : str or list[str]
        The right key (or keys) on which the join operation should be performed.
    left_suffix : str, default None
        Which suffix to add to left column names. This prevents confusion
        when the columns in left and right operands have colliding names.
    right_suffix : str, default None
        Which suffix to add to the right column names. This prevents confusion
        when the columns in left and right operands have colliding names.
    use_threads : bool, default True
        Whether to use multithreading or not.
    coalesce_keys : bool, default False
        If the duplicated keys should be omitted from one of the sides
        in the join result.
    output_type: Table or InMemoryDataset
        The output type for the exec plan result.
    filter_expression : pyarrow.compute.Expression
        Residual filter which is applied to matching row.

    Returns
    -------
    result_table : Table or InMemoryDataset
    """
    if not isinstance(left_operand, (Table, ds.Dataset)):
        raise TypeError(f"Expected Table or Dataset, got {type(left_operand)}")
    if not isinstance(right_operand, (Table, ds.Dataset)):
        raise TypeError(f"Expected Table or Dataset, got {type(right_operand)}")

    # Prepare left and right tables Keys to send them to the C++ function.
    # The order maps (name -> position in the provided key list) are used
    # later to pair left keys with their corresponding right keys.
    left_keys_order = {}
    if not isinstance(left_keys, (tuple, list)):
        left_keys = [left_keys]
    for idx, key in enumerate(left_keys):
        left_keys_order[key] = idx

    right_keys_order = {}
    if not isinstance(right_keys, (list, tuple)):
        right_keys = [right_keys]
    for idx, key in enumerate(right_keys):
        right_keys_order[key] = idx

    # By default expose all columns on both left and right table
    left_columns = left_operand.schema.names
    right_columns = right_operand.schema.names

    # Pick the join type: semi/anti joins emit columns from one side only;
    # inner/outer joins drop the key columns from the non-preserved side.
    if join_type == "left semi" or join_type == "left anti":
        right_columns = []
    elif join_type == "right semi" or join_type == "right anti":
        left_columns = []
    elif join_type == "inner" or join_type == "left outer":
        right_columns = [
            col for col in right_columns if col not in right_keys_order
        ]
    elif join_type == "right outer":
        left_columns = [
            col for col in left_columns if col not in left_keys_order
        ]

    # Turn the columns to vectors of FieldRefs
    # and set aside indices of keys.
    left_column_keys_indices = {}
    for idx, colname in enumerate(left_columns):
        if colname in left_keys:
            left_column_keys_indices[colname] = idx
    right_column_keys_indices = {}
    for idx, colname in enumerate(right_columns):
        if colname in right_keys:
            right_column_keys_indices[colname] = idx

    # Add the join node to the execplan
    if isinstance(left_operand, ds.Dataset):
        left_source = _dataset_to_decl(left_operand, use_threads=use_threads)
    else:
        left_source = Declaration("table_source", TableSourceNodeOptions(left_operand))
    if isinstance(right_operand, ds.Dataset):
        right_source = _dataset_to_decl(right_operand, use_threads=use_threads)
    else:
        right_source = Declaration(
            "table_source", TableSourceNodeOptions(right_operand)
        )

    # When coalescing keys the explicit output column lists are passed so
    # the join emits everything needed for the later coalescing projection.
    if coalesce_keys:
        join_opts = HashJoinNodeOptions(
            join_type, left_keys, right_keys, left_columns, right_columns,
            output_suffix_for_left=left_suffix or "",
            output_suffix_for_right=right_suffix or "",
            filter_expression=filter_expression,
        )
    else:
        join_opts = HashJoinNodeOptions(
            join_type, left_keys, right_keys,
            output_suffix_for_left=left_suffix or "",
            output_suffix_for_right=right_suffix or "",
            filter_expression=filter_expression,
        )
    decl = Declaration(
        "hashjoin", options=join_opts, inputs=[left_source, right_source]
    )

    if coalesce_keys and join_type == "full outer":
        # In case of full outer joins, the join operation will output all columns
        # so that we can coalesce the keys and exclude duplicates in a subsequent
        # projection.
        left_columns_set = set(left_columns)
        right_columns_set = set(right_columns)
        # Where the right table columns start.
        right_operand_index = len(left_columns)
        projected_col_names = []
        projections = []
        for idx, col in enumerate(left_columns + right_columns):
            if idx < len(left_columns) and col in left_column_keys_indices:
                # Include keys only once and coalesce left+right table keys.
                projected_col_names.append(col)
                # Get the index of the right key that is being paired
                # with this left key. We do so by retrieving the name
                # of the right key that is in the same position in the provided keys
                # and then looking up the index for that name in the right table.
                right_key_index = right_column_keys_indices[
                    right_keys[left_keys_order[col]]]
                projections.append(
                    Expression._call("coalesce", [
                        Expression._field(idx), Expression._field(
                            right_operand_index+right_key_index)
                    ])
                )
            elif idx >= right_operand_index and col in right_column_keys_indices:
                # Do not include right table keys. As they would lead to duplicated keys
                continue
            else:
                # For all the other columns include them as they are.
                # Just recompute the suffixes that the join produced as the projection
                # would lose them otherwise.
                if (
                    left_suffix and idx < right_operand_index
                    and col in right_columns_set
                ):
                    col += left_suffix
                if (
                    right_suffix and idx >= right_operand_index
                    and col in left_columns_set
                ):
                    col += right_suffix
                projected_col_names.append(col)
                projections.append(
                    Expression._field(idx)
                )
        projection = Declaration(
            "project", ProjectNodeOptions(projections, projected_col_names)
        )
        decl = Declaration.from_sequence([decl, projection])

    result_table = decl.to_table(use_threads=use_threads)

    if output_type == Table:
        return result_table
    elif output_type == ds.InMemoryDataset:
        return ds.InMemoryDataset(result_table)
    else:
        raise TypeError("Unsupported output type")
def _perform_join_asof(left_operand, left_on, left_by,
                       right_operand, right_on, right_by,
                       tolerance, use_threads=True,
                       output_type=Table):
    """
    Perform asof join of two tables or datasets.

    The result will be an output table with the result of the join operation

    Parameters
    ----------
    left_operand : Table or Dataset
        The left operand for the join operation.
    left_on : str
        The left key (or keys) on which the join operation should be performed.
    left_by: str or list[str]
        The left key (or keys) on which the join operation should be performed.
    right_operand : Table or Dataset
        The right operand for the join operation.
    right_on : str or list[str]
        The right key (or keys) on which the join operation should be performed.
    right_by: str or list[str]
        The right key (or keys) on which the join operation should be performed.
    tolerance : int
        The tolerance to use for the asof join. The tolerance is interpreted in
        the same units as the "on" key.
    output_type: Table or InMemoryDataset
        The output type for the exec plan result.

    Returns
    -------
    result_table : Table or InMemoryDataset
    """
    if not isinstance(left_operand, (Table, ds.Dataset)):
        raise TypeError(f"Expected Table or Dataset, got {type(left_operand)}")
    if not isinstance(right_operand, (Table, ds.Dataset)):
        raise TypeError(f"Expected Table or Dataset, got {type(right_operand)}")

    # Normalize the "by" keys to lists.
    if not isinstance(left_by, (tuple, list)):
        left_by = [left_by]
    if not isinstance(right_by, (tuple, list)):
        right_by = [right_by]

    # AsofJoin does not return on or by columns for right_operand.
    right_columns = [
        col for col in right_operand.schema.names
        if col not in [right_on] + right_by
    ]
    # Any remaining name overlap would produce ambiguous output columns,
    # which the asof join node does not support.
    columns_collisions = set(left_operand.schema.names) & set(right_columns)
    if columns_collisions:
        raise ValueError(
            f"Columns {columns_collisions} present in both tables. "
            "AsofJoin does not support column collisions."
        )

    # Add the join node to the execplan. Dataset sources request implicit
    # ordering since asof join is order-sensitive.
    if isinstance(left_operand, ds.Dataset):
        left_source = _dataset_to_decl(
            left_operand,
            use_threads=use_threads,
            implicit_ordering=True)
    else:
        left_source = Declaration(
            "table_source", TableSourceNodeOptions(left_operand),
        )
    if isinstance(right_operand, ds.Dataset):
        right_source = _dataset_to_decl(
            right_operand, use_threads=use_threads,
            implicit_ordering=True)
    else:
        right_source = Declaration(
            "table_source", TableSourceNodeOptions(right_operand)
        )

    join_opts = AsofJoinNodeOptions(
        left_on, left_by, right_on, right_by, tolerance
    )
    decl = Declaration(
        "asofjoin", options=join_opts, inputs=[left_source, right_source]
    )

    result_table = decl.to_table(use_threads=use_threads)

    if output_type == Table:
        return result_table
    elif output_type == ds.InMemoryDataset:
        return ds.InMemoryDataset(result_table)
    else:
        raise TypeError("Unsupported output type")
def _filter_table(table, expression):
    """Filter rows of a table based on the provided expression.

    The result will be an output table with only the rows matching
    the provided expression.

    Parameters
    ----------
    table : Table or RecordBatch
        Table that should be filtered.
    expression : Expression
        The expression on which rows should be filtered.

    Returns
    -------
    Table or RecordBatch
        Same type as the input: a RecordBatch input yields a RecordBatch.
    """
    # The exec plan operates on tables; remember whether to convert back.
    is_batch = False
    if isinstance(table, RecordBatch):
        table = Table.from_batches([table])
        is_batch = True

    decl = Declaration.from_sequence([
        Declaration("table_source", options=TableSourceNodeOptions(table)),
        Declaration("filter", options=FilterNodeOptions(expression))
    ])
    result = decl.to_table(use_threads=True)

    if is_batch:
        if result.num_rows > 0:
            result = result.combine_chunks().to_batches()[0]
        else:
            # An empty table has no batches to take; build an empty batch
            # with the same schema instead.
            # NOTE: the loop variable is named `schema_field` (not `field`)
            # to avoid shadowing the `field` function imported from
            # pyarrow.compute at module level.
            arrays = [array([], type=schema_field.type)
                      for schema_field in result.schema]
            result = RecordBatch.from_arrays(arrays, schema=result.schema)
    return result
def _sort_source(table_or_dataset, sort_keys, output_type=Table, **kwargs):
    """Sort a Table or Dataset by the given sort keys.

    Extra keyword arguments are forwarded to OrderByNodeOptions. The result
    is returned either as a Table or wrapped in an InMemoryDataset,
    depending on ``output_type``.
    """
    if isinstance(table_or_dataset, ds.Dataset):
        source_decl = _dataset_to_decl(table_or_dataset, use_threads=True)
    else:
        source_decl = Declaration(
            "table_source", TableSourceNodeOptions(table_or_dataset))

    plan = Declaration.from_sequence([
        source_decl,
        Declaration("order_by", OrderByNodeOptions(sort_keys, **kwargs)),
    ])
    sorted_table = plan.to_table(use_threads=True)

    if output_type == Table:
        return sorted_table
    if output_type == ds.InMemoryDataset:
        return ds.InMemoryDataset(sorted_table)
    raise TypeError("Unsupported output type")
def _group_by(table, aggregates, keys, use_threads=True):
    """Run a hash aggregation over ``table``.

    ``aggregates`` and ``keys`` are passed through to AggregateNodeOptions;
    the aggregated result is returned as a Table.
    """
    source = Declaration("table_source", TableSourceNodeOptions(table))
    aggregate = Declaration(
        "aggregate", AggregateNodeOptions(aggregates, keys=keys))
    plan = Declaration.from_sequence([source, aggregate])
    return plan.to_table(use_threads=use_threads)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,20 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
def benchmark_PandasObjectIsNull(list obj):
    """
    Run the C++ PandasObjectIsNull benchmark over the given list.

    Parameters
    ----------
    obj : list
        The Python objects to feed to the benchmark; the return value
        of the C++ helper is discarded.
    """
    Benchmark_PandasObjectIsNull(obj)

View File

@@ -0,0 +1,21 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# flake8: noqa
from pyarrow.lib import benchmark_PandasObjectIsNull

View File

@@ -0,0 +1,150 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import math
cdef class StringBuilder(_Weakrefable):
    """
    Builder class for UTF8 strings.

    This class exposes facilities for incrementally adding string values and
    building the null bitmap for a pyarrow.Array (type='string').
    """
    cdef:
        unique_ptr[CStringBuilder] builder

    def __cinit__(self, MemoryPool memory_pool=None):
        # Allocate from the given pool (or the default pool when None).
        cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
        self.builder.reset(new CStringBuilder(pool))

    def append(self, value):
        """
        Append a single value to the builder.

        The value can either be a string/bytes object or a null value
        (np.nan or None).

        Parameters
        ----------
        value : string/bytes or np.nan/None
            The value to append to the string array builder.
        """
        if isinstance(value, (bytes, str)):
            self.builder.get().Append(tobytes(value))
        elif value is None or math.isnan(value):
            self.builder.get().AppendNull()
        else:
            # Only reached for non-NaN numeric values: math.isnan itself
            # raises TypeError for non-numeric, non-None input.
            raise TypeError('StringBuilder only accepts string objects')

    def append_values(self, values):
        """
        Append all the values from an iterable.

        Parameters
        ----------
        values : iterable of string/bytes or np.nan/None values
            The values to append to the string array builder.
        """
        for value in values:
            self.append(value)

    def finish(self):
        """
        Return result of builder as an Array object; also resets the builder.

        Returns
        -------
        array : pyarrow.Array
        """
        cdef shared_ptr[CArray] out
        with nogil:
            self.builder.get().Finish(&out)
        return pyarrow_wrap_array(out)

    @property
    def null_count(self):
        # Number of nulls appended so far.
        return self.builder.get().null_count()

    def __len__(self):
        # Total number of values (including nulls) appended so far.
        return self.builder.get().length()
cdef class StringViewBuilder(_Weakrefable):
    """
    Builder class for UTF8 string views.

    This class exposes facilities for incrementally adding string values and
    building the null bitmap for a pyarrow.Array (type='string_view').
    """
    cdef:
        unique_ptr[CStringViewBuilder] builder

    def __cinit__(self, MemoryPool memory_pool=None):
        # Allocate from the given pool (or the default pool when None).
        cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
        self.builder.reset(new CStringViewBuilder(pool))

    def append(self, value):
        """
        Append a single value to the builder.

        The value can either be a string/bytes object or a null value
        (np.nan or None).

        Parameters
        ----------
        value : string/bytes or np.nan/None
            The value to append to the string array builder.
        """
        if isinstance(value, (bytes, str)):
            self.builder.get().Append(tobytes(value))
        elif value is None or math.isnan(value):
            self.builder.get().AppendNull()
        else:
            # Only reached for non-NaN numeric values: math.isnan itself
            # raises TypeError for non-numeric, non-None input.
            raise TypeError('StringViewBuilder only accepts string objects')

    def append_values(self, values):
        """
        Append all the values from an iterable.

        Parameters
        ----------
        values : iterable of string/bytes or np.nan/None values
            The values to append to the string array builder.
        """
        for value in values:
            self.append(value)

    def finish(self):
        """
        Return result of builder as an Array object; also resets the builder.

        Returns
        -------
        array : pyarrow.Array
        """
        cdef shared_ptr[CArray] out
        with nogil:
            self.builder.get().Finish(&out)
        return pyarrow_wrap_array(out)

    @property
    def null_count(self):
        # Number of nulls appended so far.
        return self.builder.get().null_count()

    def __len__(self):
        # Total number of values (including nulls) appended so far.
        return self.builder.get().length()

View File

@@ -0,0 +1,81 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import absolute_import
import cffi
c_source = """
struct ArrowSchema {
// Array type description
const char* format;
const char* name;
const char* metadata;
int64_t flags;
int64_t n_children;
struct ArrowSchema** children;
struct ArrowSchema* dictionary;
// Release callback
void (*release)(struct ArrowSchema*);
// Opaque producer-specific data
void* private_data;
};
struct ArrowArray {
// Array data description
int64_t length;
int64_t null_count;
int64_t offset;
int64_t n_buffers;
int64_t n_children;
const void** buffers;
struct ArrowArray** children;
struct ArrowArray* dictionary;
// Release callback
void (*release)(struct ArrowArray*);
// Opaque producer-specific data
void* private_data;
};
struct ArrowArrayStream {
int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out);
int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out);
const char* (*get_last_error)(struct ArrowArrayStream*);
// Release callback
void (*release)(struct ArrowArrayStream*);
// Opaque producer-specific data
void* private_data;
};
typedef int32_t ArrowDeviceType;
struct ArrowDeviceArray {
struct ArrowArray array;
int64_t device_id;
ArrowDeviceType device_type;
void* sync_event;
int64_t reserved[3];
};
"""
# TODO use out-of-line mode for faster import and avoid C parsing
ffi = cffi.FFI()
ffi.cdef(c_source)

View File

@@ -0,0 +1,71 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
def encode_file_path(path):
    """
    Encode a file path to bytes for handing off to the Arrow C++ libraries.

    str inputs are UTF-8 encoded: POSIX systems can handle UTF-8 directly,
    while on Windows (whose file system requires UTF-16LE names) the Arrow
    C++ libraries convert the UTF-8 form to UTF-16 themselves. Any non-str
    input is assumed to be already encoded and is returned unchanged.
    """
    if not isinstance(path, str):
        return path
    return path.encode('utf-8')
# Starting with Python 3.7, dicts are guaranteed to be insertion-ordered.
# The alias is kept so existing call sites that ask for an ordered mapping
# keep working.
ordered_dict = dict

# Prefer cloudpickle when available; fall back to the stdlib pickle module.
# Either way the module is exposed under the name `pickle`.
try:
    import cloudpickle as pickle
except ImportError:
    import pickle
def tobytes(o):
    """
    Encode a unicode or bytes string to bytes.

    Parameters
    ----------
    o : str or bytes
        Input string. str values are UTF-8 encoded; anything else is
        returned untouched.
    """
    return o.encode('utf8') if isinstance(o, str) else o
def frombytes(o, *, safe=False):
    """
    Decode the given bytestring to unicode.

    Parameters
    ----------
    o : bytes-like
        Input object.
    safe : bool, default False
        If true, replace undecodable bytes with the Unicode replacement
        character (U+FFFD) instead of raising UnicodeDecodeError.
        (The previous docstring claimed the opposite: ``safe=True`` never
        raises on encoding errors.)
    """
    if safe:
        return o.decode('utf8', errors='replace')
    else:
        return o.decode('utf8')

View File

@@ -0,0 +1,764 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from pyarrow._compute import ( # noqa
Function,
FunctionOptions,
FunctionRegistry,
HashAggregateFunction,
HashAggregateKernel,
Kernel,
ScalarAggregateFunction,
ScalarAggregateKernel,
ScalarFunction,
ScalarKernel,
VectorFunction,
VectorKernel,
# Option classes
ArraySortOptions,
AssumeTimezoneOptions,
CastOptions,
CountOptions,
CumulativeOptions,
CumulativeSumOptions,
DayOfWeekOptions,
DictionaryEncodeOptions,
RunEndEncodeOptions,
ElementWiseAggregateOptions,
ExtractRegexOptions,
ExtractRegexSpanOptions,
FilterOptions,
IndexOptions,
JoinOptions,
ListSliceOptions,
ListFlattenOptions,
MakeStructOptions,
MapLookupOptions,
MatchSubstringOptions,
ModeOptions,
NullOptions,
PadOptions,
PairwiseOptions,
PartitionNthOptions,
PivotWiderOptions,
QuantileOptions,
RandomOptions,
RankOptions,
RankQuantileOptions,
ReplaceSliceOptions,
ReplaceSubstringOptions,
RoundBinaryOptions,
RoundOptions,
RoundTemporalOptions,
RoundToMultipleOptions,
ScalarAggregateOptions,
SelectKOptions,
SetLookupOptions,
SkewOptions,
SliceOptions,
SortOptions,
SplitOptions,
SplitPatternOptions,
StrftimeOptions,
StrptimeOptions,
StructFieldOptions,
TakeOptions,
TDigestOptions,
TrimOptions,
Utf8NormalizeOptions,
VarianceOptions,
WeekOptions,
WinsorizeOptions,
ZeroFillOptions,
# Functions
call_function,
function_registry,
get_function,
list_functions,
# Udf
call_tabular_function,
register_scalar_function,
register_tabular_function,
register_aggregate_function,
register_vector_function,
UdfContext,
# Expressions
Expression,
)
from collections import namedtuple
import inspect
from textwrap import dedent
import warnings
import pyarrow as pa
from pyarrow import _compute_docstrings
from pyarrow.vendored import docscrape
def _get_arg_names(func):
    # Names of the compute function's positional arguments, as reported
    # by the C++ function documentation object.
    return func._doc.arg_names
# Minimal container for the scraped numpydoc "Parameters" section of an
# options class docstring.
_OptionsClassDoc = namedtuple('_OptionsClassDoc', ('params',))
def _scrape_options_class_doc(options_class):
    # Parse the options class docstring (numpydoc format) and return its
    # parameter descriptions, or None if the class has no docstring.
    if not options_class.__doc__:
        return None
    doc = docscrape.NumpyDocString(options_class.__doc__)
    return _OptionsClassDoc(doc['Parameters'])
def _decorate_compute_function(wrapper, exposed_name, func, options_class):
    """
    Attach metadata and a generated numpydoc-style docstring to ``wrapper``.

    The docstring is assembled from the C++ function documentation
    (summary, description, argument names), the options class docstring
    (scraped via numpydoc), and optional hand-written additions from
    ``_compute_docstrings``.  Returns the decorated wrapper.
    """
    # Decorate the given compute function wrapper with useful metadata
    # and documentation.
    cpp_doc = func._doc
    wrapper.__arrow_compute_function__ = dict(
        name=func.name,
        arity=func.arity,
        options_class=cpp_doc.options_class,
        options_required=cpp_doc.options_required)
    wrapper.__name__ = exposed_name
    wrapper.__qualname__ = exposed_name
    doc_pieces = []
    # 1. One-line summary
    summary = cpp_doc.summary
    if not summary:
        arg_str = "arguments" if func.arity > 1 else "argument"
        summary = f"Call compute function {func.name!r} with the given {arg_str}"
    doc_pieces.append(f"{summary}.\n\n")
    # 2. Multi-line description
    description = cpp_doc.description
    if description:
        doc_pieces.append(f"{description}\n\n")
    doc_addition = _compute_docstrings.function_doc_additions.get(func.name)
    # 3. Parameter description
    doc_pieces.append(dedent("""\
        Parameters
        ----------
        """))
    # 3a. Compute function parameters
    arg_names = _get_arg_names(func)
    for arg_name in arg_names:
        if func.kind in ('vector', 'scalar_aggregate'):
            arg_type = 'Array-like'
        else:
            arg_type = 'Array-like or scalar-like'
        doc_pieces.append(f"{arg_name} : {arg_type}\n")
        doc_pieces.append("    Argument to compute function.\n")
    # 3b. Compute function option values
    if options_class is not None:
        options_class_doc = _scrape_options_class_doc(options_class)
        if options_class_doc:
            for p in options_class_doc.params:
                doc_pieces.append(f"{p.name} : {p.type}\n")
                for s in p.desc:
                    doc_pieces.append(f"    {s}\n")
        else:
            # No scraped docs: fall back to the constructor signature.
            warnings.warn(f"Options class {options_class.__name__} "
                          f"does not have a docstring", RuntimeWarning)
            options_sig = inspect.signature(options_class)
            for p in options_sig.parameters.values():
                doc_pieces.append(dedent(f"""\
                {p.name} : optional
                    Parameter for {options_class.__name__} constructor. Either `options`
                    or `{p.name}` can be passed, but not both at the same time.
                """))
        doc_pieces.append(dedent(f"""\
            options : pyarrow.compute.{options_class.__name__}, optional
                Alternative way of passing options.
            """))
    doc_pieces.append(dedent("""\
        memory_pool : pyarrow.MemoryPool, optional
            If not passed, will allocate memory from the default memory pool.
        """))
    # 4. Custom addition (e.g. examples)
    if doc_addition is not None:
        stripped = dedent(doc_addition).strip('\n')
        doc_pieces.append(f"\n{stripped}\n")
    wrapper.__doc__ = "".join(doc_pieces)
    return wrapper
def _get_options_class(func):
class_name = func._doc.options_class
if not class_name:
return None
try:
return globals()[class_name]
except KeyError:
warnings.warn(f"Python binding for {class_name} not exposed",
RuntimeWarning)
return None
def _handle_options(name, options_class, options, args, kwargs):
if args or kwargs:
if options is not None:
raise TypeError(
f"Function {name!r} called with both an 'options' argument "
f"and additional arguments")
return options_class(*args, **kwargs)
if options is not None:
if isinstance(options, dict):
return options_class(**options)
elif isinstance(options, options_class):
return options
raise TypeError(
f"Function {name!r} expected a {options_class} parameter, "
f"got {type(options)}")
return None
def _make_generic_wrapper(func_name, func, options_class, arity):
    # Build the Python-callable wrapper around a C++ compute function.
    # Two variants: without options support and with it.  An ``arity`` of
    # Ellipsis marks a varargs function (no fixed argument count check).
    if options_class is None:
        def wrapper(*args, memory_pool=None):
            if arity is not Ellipsis and len(args) != arity:
                raise TypeError(
                    f"{func_name} takes {arity} positional argument(s), "
                    f"but {len(args)} were given"
                )
            # Expression input builds a deferred expression instead of
            # executing eagerly.
            if args and isinstance(args[0], Expression):
                return Expression._call(func_name, list(args))
            return func.call(args, None, memory_pool)
    else:
        def wrapper(*args, memory_pool=None, options=None, **kwargs):
            if arity is not Ellipsis:
                if len(args) < arity:
                    raise TypeError(
                        f"{func_name} takes {arity} positional argument(s), "
                        f"but {len(args)} were given"
                    )
                # Positional arguments beyond the function arity are
                # treated as options-constructor arguments.
                option_args = args[arity:]
                args = args[:arity]
            else:
                option_args = ()
            options = _handle_options(func_name, options_class, options,
                                      option_args, kwargs)
            if args and isinstance(args[0], Expression):
                return Expression._call(func_name, list(args), options)
            return func.call(args, options, memory_pool)
    return wrapper
def _make_signature(arg_names, var_arg_names, options_class):
from inspect import Parameter
params = []
for name in arg_names:
params.append(Parameter(name, Parameter.POSITIONAL_ONLY))
for name in var_arg_names:
params.append(Parameter(name, Parameter.VAR_POSITIONAL))
if options_class is not None:
options_sig = inspect.signature(options_class)
for p in options_sig.parameters.values():
assert p.kind in (Parameter.POSITIONAL_OR_KEYWORD,
Parameter.KEYWORD_ONLY)
if var_arg_names:
# Cannot have a positional argument after a *args
p = p.replace(kind=Parameter.KEYWORD_ONLY)
params.append(p)
params.append(Parameter("options", Parameter.KEYWORD_ONLY,
default=None))
params.append(Parameter("memory_pool", Parameter.KEYWORD_ONLY,
default=None))
return inspect.Signature(params)
def _wrap_function(name, func):
    # Build a fully-documented module-level Python wrapper for the given
    # C++ compute function: resolve its options class, derive its Python
    # signature (a trailing '*name' marks varargs) and attach docs.
    options_class = _get_options_class(func)
    arg_names = _get_arg_names(func)
    has_vararg = arg_names and arg_names[-1].startswith('*')
    if has_vararg:
        var_arg_names = [arg_names.pop().lstrip('*')]
    else:
        var_arg_names = []
    wrapper = _make_generic_wrapper(
        name, func, options_class, arity=func.arity)
    wrapper.__signature__ = _make_signature(arg_names, var_arg_names,
                                            options_class)
    return _decorate_compute_function(wrapper, name, func, options_class)
def _make_global_functions():
    """
    Make global functions wrapping each compute function.
    Note that some of the automatically-generated wrappers may be overridden
    by custom versions below.
    """
    g = globals()
    reg = function_registry()
    # Avoid clashes with Python keywords
    rewrites = {'and': 'and_',
                'or': 'or_'}
    for cpp_name in reg.list_functions():
        name = rewrites.get(cpp_name, cpp_name)
        func = reg.get_function(cpp_name)
        if func.kind == "hash_aggregate":
            # Hash aggregate functions are not callable,
            # so let's not expose them at module level.
            continue
        if func.kind == "scalar_aggregate" and func.arity == 0:
            # Nullary scalar aggregate functions are not callable
            # directly so let's not expose them at module level.
            continue
        assert name not in g, name
        # Expose under both the C++ name and the rewritten Python name.
        g[cpp_name] = g[name] = _wrap_function(name, func)
_make_global_functions()
# Alias for consistency; globals() is needed to avoid Python lint errors
utf8_zfill = utf8_zero_fill = globals()["utf8_zero_fill"]
def cast(arr, target_type=None, safe=None, options=None, memory_pool=None):
    """
    Cast array values to another data type. Can also be invoked as an array
    instance method.

    Parameters
    ----------
    arr : Array-like
    target_type : DataType or str
        Type to cast to
    safe : bool, default True
        Check for overflows or other unsafe conversions
    options : CastOptions, default None
        Additional checks pass by CastOptions
    memory_pool : MemoryPool, optional
        memory pool to use for allocations during function execution.

    Examples
    --------
    >>> from datetime import datetime
    >>> import pyarrow as pa
    >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
    >>> arr.type
    TimestampType(timestamp[us])

    You can use ``pyarrow.DataType`` objects to specify the target type:

    >>> cast(arr, pa.timestamp('ms'))
    <pyarrow.lib.TimestampArray object at ...>
    [
      2010-01-01 00:00:00.000,
      2015-01-01 00:00:00.000
    ]
    >>> cast(arr, pa.timestamp('ms')).type
    TimestampType(timestamp[ms])

    Alternatively, it is also supported to use the string aliases for these
    types:

    >>> arr.cast('timestamp[ms]')
    <pyarrow.lib.TimestampArray object at ...>
    [
      2010-01-01 00:00:00.000,
      2015-01-01 00:00:00.000
    ]
    >>> arr.cast('timestamp[ms]').type
    TimestampType(timestamp[ms])

    Returns
    -------
    casted : Array
        The cast result as a new Array
    """
    # 'options' is mutually exclusive with 'target_type'/'safe'.
    if options is not None:
        if (target_type is not None) or (safe is not None):
            raise ValueError("Must either pass values for 'target_type' and 'safe'"
                             " or pass a value for 'options'")
    else:
        # safe defaults to True when unspecified.
        ty = pa.types.lib.ensure_type(target_type)
        options = (CastOptions.unsafe(ty) if safe is False
                   else CastOptions.safe(ty))
    return call_function("cast", [arr], options, memory_pool)
def index(data, value, start=None, end=None, *, memory_pool=None):
    """
    Find the index of the first occurrence of a given value.

    Parameters
    ----------
    data : Array-like
    value : Scalar-like object
        The value to search for.
    start : int, optional
    end : int, optional
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    index : int
        the index, or -1 if not found

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["Lorem", "ipsum", "dolor", "sit", "Lorem", "ipsum"])
    >>> pc.index(arr, "ipsum")
    <pyarrow.Int64Scalar: 1>
    >>> pc.index(arr, "ipsum", start=2)
    <pyarrow.Int64Scalar: 5>
    >>> pc.index(arr, "amet")
    <pyarrow.Int64Scalar: -1>
    """
    # Restrict the search window by slicing; remember the offset so the
    # result can be translated back to the original coordinates.
    if start is not None:
        data = data.slice(start, None if end is None else end - start)
    elif end is not None:
        data = data.slice(0, end)
    # Coerce the needle to a scalar of the data's type.
    if not isinstance(value, pa.Scalar):
        value = pa.scalar(value, type=data.type)
    elif data.type != value.type:
        value = pa.scalar(value.as_py(), type=data.type)
    result = call_function('index', [data],
                           IndexOptions(value=value), memory_pool)
    if start is not None and result.as_py() >= 0:
        # Found within the sliced window: shift back by the start offset.
        result = pa.scalar(result.as_py() + start, type=pa.int64())
    return result
def take(data, indices, *, boundscheck=True, memory_pool=None):
    """
    Select values (or records) from array- or table-like data given integer
    selection indices.

    The result will be of the same type(s) as the input, with elements taken
    from the input array (or record batch / table fields) at the given
    indices. If an index is null then the corresponding value in the output
    will be null.

    Parameters
    ----------
    data : Array, ChunkedArray, RecordBatch, or Table
    indices : Array, ChunkedArray
        Must be of integer type
    boundscheck : boolean, default True
        Whether to boundscheck the indices. If False and there is an out of
        bounds index, will likely cause the process to crash.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : depends on inputs
        Selected values for the given indices

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> indices = pa.array([0, None, 4, 3])
    >>> arr.take(indices)
    <pyarrow.lib.StringArray object at ...>
    [
      "a",
      null,
      "e",
      null
    ]
    """
    return call_function('take', [data, indices],
                         TakeOptions(boundscheck=boundscheck), memory_pool)
def fill_null(values, fill_value):
    """Replace each null element in values with a corresponding
    element from fill_value.

    If fill_value is scalar-like, then every null element in values
    will be replaced with fill_value. If fill_value is array-like,
    then the i-th element in values will be replaced with the i-th
    element in fill_value.

    The fill_value's type must be the same as that of values, or it
    must be able to be implicitly casted to the array's type.

    This is an alias for :func:`coalesce`.

    Parameters
    ----------
    values : Array, ChunkedArray, or Scalar-like object
        Each null element is replaced with the corresponding value
        from fill_value.
    fill_value : Array, ChunkedArray, or Scalar-like object
        If not same type as values, will attempt to cast.

    Returns
    -------
    result : depends on inputs
        Values with all null elements replaced

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array([1, 2, None, 3], type=pa.int8())
    >>> fill_value = pa.scalar(5, type=pa.int8())
    >>> arr.fill_null(fill_value)
    <pyarrow.lib.Int8Array object at ...>
    [
      1,
      2,
      5,
      3
    ]
    >>> arr = pa.array([1, 2, None, 4, None])
    >>> arr.fill_null(pa.array([10, 20, 30, 40, 50]))
    <pyarrow.lib.Int64Array object at ...>
    [
      1,
      2,
      30,
      4,
      50
    ]
    """
    # Coerce a plain Python value (or mismatched scalar) to the values' type.
    is_arrow_object = isinstance(
        fill_value, (pa.Array, pa.ChunkedArray, pa.Scalar))
    if not is_arrow_object:
        fill_value = pa.scalar(fill_value, type=values.type)
    elif values.type != fill_value.type:
        fill_value = pa.scalar(fill_value.as_py(), type=values.type)
    return call_function("coalesce", [values, fill_value])
def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
    """
    Select the indices of the top-k ordered elements from array- or table-like
    data.

    This is a specialization for :func:`select_k_unstable`. Output is not
    guaranteed to be stable.

    Parameters
    ----------
    values : Array, ChunkedArray, RecordBatch, or Table
        Data to sort and get top indices from.
    k : int
        The number of `k` elements to keep.
    sort_keys : List-like
        Column key names to order by when input is table-like data.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : Array
        Indices of the top-k ordered elements

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> pc.top_k_unstable(arr, k=3)
    <pyarrow.lib.UInt64Array object at ...>
    [
      5,
      4,
      2
    ]
    """
    if sort_keys is None:
        sort_keys = []
    if isinstance(values, (pa.Array, pa.ChunkedArray)):
        # Flat arrays have a single implicit sort column.
        sort_keys.append(("dummy", "descending"))
    else:
        sort_keys = [(key_name, "descending") for key_name in sort_keys]
    return call_function("select_k_unstable", [values],
                         SelectKOptions(k, sort_keys), memory_pool)
def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
    """
    Select the indices of the bottom-k ordered elements from
    array- or table-like data.

    This is a specialization for :func:`select_k_unstable`. Output is not
    guaranteed to be stable.

    Parameters
    ----------
    values : Array, ChunkedArray, RecordBatch, or Table
        Data to sort and get bottom indices from.
    k : int
        The number of `k` elements to keep.
    sort_keys : List-like
        Column key names to order by when input is table-like data.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : Array of indices
        Indices of the bottom-k ordered elements

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> pc.bottom_k_unstable(arr, k=3)
    <pyarrow.lib.UInt64Array object at ...>
    [
      0,
      1,
      2
    ]
    """
    if sort_keys is None:
        sort_keys = []
    if isinstance(values, (pa.Array, pa.ChunkedArray)):
        # Flat arrays have a single implicit sort column.
        sort_keys.append(("dummy", "ascending"))
    else:
        sort_keys = [(key_name, "ascending") for key_name in sort_keys]
    return call_function("select_k_unstable", [values],
                         SelectKOptions(k, sort_keys), memory_pool)
def random(n, *, initializer='system', options=None, memory_pool=None):
    """
    Generate numbers in the range [0, 1).

    Generated values are uniformly-distributed, double-precision
    in range [0, 1). Algorithm and seed can be changed via RandomOptions.

    Parameters
    ----------
    n : int
        Number of values to generate, must be greater than or equal to 0
    initializer : int or str
        How to initialize the underlying random generator.
        If an integer is given, it is used as a seed.
        If "system" is given, the random generator is initialized with
        a system-specific source of (hopefully true) randomness.
        Other values are invalid.
        Ignored when an explicit ``options`` value is passed.
    options : pyarrow.compute.RandomOptions, optional
        Alternative way of passing options.
    memory_pool : pyarrow.MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.
    """
    # BUG FIX: previously 'options' was unconditionally overwritten with
    # RandomOptions(initializer=initializer), silently ignoring a
    # user-supplied options object despite it being documented.
    if options is None:
        options = RandomOptions(initializer=initializer)
    elif isinstance(options, dict):
        options = RandomOptions(**options)
    return call_function("random", [], options, memory_pool, length=n)
def field(*name_or_index):
    """Reference a column of the dataset.

    Stores only the field's name. Type and other information is known only when
    the expression is bound to a dataset having an explicit scheme.

    Nested references are allowed by passing multiple names or a tuple of
    names. For example ``('foo', 'bar')`` references the field named "bar"
    inside the field named "foo".

    Parameters
    ----------
    *name_or_index : string, multiple strings, tuple or int
        The name or index of the (possibly nested) field the expression
        references to.

    Returns
    -------
    field_expr : Expression
        Reference to the given field

    Examples
    --------
    >>> import pyarrow.compute as pc
    >>> pc.field("a")
    <pyarrow.compute.Expression a>
    >>> pc.field(1)
    <pyarrow.compute.Expression FieldPath(1)>
    >>> pc.field(("a", "b"))
    <pyarrow.compute.Expression FieldRef.Nested(FieldRef.Name(a) ...
    >>> pc.field("a", "b")
    <pyarrow.compute.Expression FieldRef.Nested(FieldRef.Name(a) ...
    """
    if len(name_or_index) != 1:
        # Multiple strings not supplied in a tuple: treat as nested path.
        return Expression._nested_field(name_or_index)
    ref = name_or_index[0]
    if isinstance(ref, (str, int)):
        return Expression._field(ref)
    if isinstance(ref, tuple):
        return Expression._nested_field(ref)
    raise TypeError(
        "field reference should be str, multiple str, tuple or "
        f"integer, got {type(ref)}"
    )
def scalar(value):
    """Expression representing a scalar value.

    Creates an Expression object representing a scalar value that can be used
    in compute expressions and predicates.

    Parameters
    ----------
    value : bool, int, float or string
        Python value of the scalar. This function accepts any value that can be
        converted to a ``pyarrow.Scalar`` using ``pa.scalar()``.

    Notes
    -----
    This function differs from ``pyarrow.scalar()`` in the following way:

    * ``pyarrow.scalar()`` creates a ``pyarrow.Scalar`` object that represents
      a single value in Arrow's memory model.
    * ``pyarrow.compute.scalar()`` creates an ``Expression`` object representing
      a scalar value that can be used in compute expressions, predicates, and
      dataset filtering operations.

    Returns
    -------
    scalar_expr : Expression
        An Expression representing the scalar value
    """
    # Thin delegation: all conversion logic lives in Expression._scalar.
    return Expression._scalar(value)

View File

@@ -0,0 +1,116 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
cimport pyarrow.includes.libarrow as libarrow
cimport pyarrow.includes.libarrow_python as libarrow_python
from collections import namedtuple
import os
# (major, minor, patch) triple describing the Arrow C++ library version.
VersionInfo = namedtuple('VersionInfo', ('major', 'minor', 'patch'))
# SIMD level actually in use vs the best level detected on this CPU.
RuntimeInfo = namedtuple('RuntimeInfo',
                         ('simd_level', 'detected_simd_level'))
def runtime_info():
    """
    Get runtime information about the Arrow C++ library.

    Returns
    -------
    info : pyarrow.RuntimeInfo
        Named tuple with the SIMD level in use and the best SIMD level
        detected on this CPU.
    """
    cdef:
        CRuntimeInfo c_info
    c_info = GetRuntimeInfo()
    # frombytes converts the C++ std::string fields to Python unicode.
    return RuntimeInfo(
        simd_level=frombytes(c_info.simd_level),
        detected_simd_level=frombytes(c_info.detected_simd_level))
# Python-level build information (packaging layer around the C++ info).
BuildInfo = namedtuple(
    'BuildInfo',
    ('build_type', 'cpp_build_info'))
# Build information reported by the Arrow C++ library itself.
CppBuildInfo = namedtuple(
    'CppBuildInfo',
    ('version', 'version_info', 'so_version', 'full_so_version',
     'compiler_id', 'compiler_version', 'compiler_flags',
     'git_id', 'git_description', 'package_kind', 'build_type'))
def _build_info():
    """
    Get PyArrow build information.

    Returns
    -------
    info : pyarrow.BuildInfo
        Python-level build type plus the nested Arrow C++ build details.
    """
    cdef:
        const libarrow_python.CBuildInfo* c_info
        const libarrow.CCppBuildInfo* c_cpp_info
    c_info = &libarrow_python.GetBuildInfo()
    c_cpp_info = &libarrow.GetCppBuildInfo()
    # frombytes converts each C++ std::string field to Python unicode.
    cpp_build_info = CppBuildInfo(version=frombytes(c_cpp_info.version_string),
                                  version_info=VersionInfo(c_cpp_info.version_major,
                                                           c_cpp_info.version_minor,
                                                           c_cpp_info.version_patch),
                                  so_version=frombytes(c_cpp_info.so_version),
                                  full_so_version=frombytes(c_cpp_info.full_so_version),
                                  compiler_id=frombytes(c_cpp_info.compiler_id),
                                  compiler_version=frombytes(
                                      c_cpp_info.compiler_version),
                                  compiler_flags=frombytes(c_cpp_info.compiler_flags),
                                  git_id=frombytes(c_cpp_info.git_id),
                                  git_description=frombytes(c_cpp_info.git_description),
                                  package_kind=frombytes(c_cpp_info.package_kind),
                                  build_type=frombytes(c_cpp_info.build_type).lower(),
                                  )
    return BuildInfo(build_type=c_info.build_type.decode('utf-8').lower(),
                     cpp_build_info=cpp_build_info)
# Evaluated once at import time; re-exported as pyarrow.build_info etc.
build_info = _build_info()
cpp_build_info = build_info.cpp_build_info
cpp_version = build_info.cpp_build_info.version
cpp_version_info = build_info.cpp_build_info.version_info
def set_timezone_db_path(path):
    """
    Configure the path to text timezone database on Windows.

    Parameters
    ----------
    path : str
        Path to text timezone database.
    """
    cdef:
        CGlobalOptions options
    if path is not None:
        options.timezone_db_path = <c_string>tobytes(path)
    # Re-initialize Arrow C++ global options with the new path;
    # check_status raises on failure.
    check_status(Initialize(options))

View File

@@ -0,0 +1,386 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
import os
import pyarrow as pa
from pyarrow import Codec
from pyarrow import fs
from pyarrow.lib import is_threading_enabled
from pyarrow.tests.util import windows_has_tzdata
import sys
# All pytest marker groups recognized by this test suite; each can be
# enabled/disabled on the command line (see 'defaults' below for the
# per-group default state).
groups = [
    'acero',
    'azure',
    'brotli',
    'bz2',
    'cython',
    'dataset',
    'hypothesis',
    'fastparquet',
    'flight',
    'gandiva',
    'gcs',
    'gdb',
    'gzip',
    'hdfs',
    'large_memory',
    'lz4',
    'memory_leak',
    'nopandas',
    'nonumpy',
    'numpy',
    'orc',
    'pandas',
    'parquet',
    'parquet_encryption',
    'processes',
    'requires_testing_data',
    's3',
    'slow',
    'snappy',
    'sockets',
    'substrait',
    'threading',
    'timezone_data',
    'zstd',
]
# Default enablement for each test group.  Codec groups reflect whether
# the codec was compiled into the Arrow C++ build; optional-module groups
# start False and are flipped on by the import probes further below.
defaults = {
    'acero': False,
    'azure': False,
    'brotli': Codec.is_available('brotli'),
    'bz2': Codec.is_available('bz2'),
    'cython': False,
    'dataset': False,
    'fastparquet': False,
    'flight': False,
    'gandiva': False,
    'gcs': False,
    'gdb': True,
    'gzip': Codec.is_available('gzip'),
    'hdfs': False,
    'hypothesis': False,
    'large_memory': False,
    'lz4': Codec.is_available('lz4'),
    'memory_leak': False,
    'nopandas': False,
    'nonumpy': False,
    'numpy': False,
    'orc': False,
    'pandas': False,
    'parquet': False,
    'parquet_encryption': False,
    'processes': True,
    'requires_testing_data': True,
    's3': False,
    'slow': False,
    'snappy': Codec.is_available('snappy'),
    'sockets': True,
    'substrait': False,
    'threading': is_threading_enabled(),
    'timezone_data': True,
    'zstd': Codec.is_available('zstd'),
}
if sys.platform == "emscripten":
    # Emscripten doesn't support subprocess,
    # multiprocessing, gdb or socket based
    # networking
    defaults['gdb'] = False
    defaults['processes'] = False
    defaults['sockets'] = False
# Timezone database availability is platform-dependent.
if sys.platform == "win32":
    defaults['timezone_data'] = windows_has_tzdata()
elif sys.platform == "emscripten":
    defaults['timezone_data'] = os.path.exists("/usr/share/zoneinfo")
# Feature detection: flip each default to True when the optional
# dependency (or optional pyarrow submodule) imports successfully.
try:
    import cython  # noqa
    defaults['cython'] = True
except ImportError:
    pass
try:
    import fastparquet  # noqa
    defaults['fastparquet'] = True
except ImportError:
    pass
try:
    import pyarrow.gandiva  # noqa
    defaults['gandiva'] = True
except ImportError:
    pass
try:
    import pyarrow.acero  # noqa
    defaults['acero'] = True
except ImportError:
    pass
try:
    import pyarrow.dataset  # noqa
    defaults['dataset'] = True
except ImportError:
    pass
try:
    import pyarrow.orc  # noqa
    if sys.platform == "win32":
        defaults['orc'] = True
    else:
        # orc tests on non-Windows platforms only work
        # if timezone data exists, so skip them if
        # not.
        defaults['orc'] = defaults['timezone_data']
except ImportError:
    pass
# pandas/numpy are special: their absence enables the 'no*' groups.
try:
    import pandas  # noqa
    defaults['pandas'] = True
except ImportError:
    defaults['nopandas'] = True
try:
    import numpy  # noqa
    defaults['numpy'] = True
except ImportError:
    defaults['nonumpy'] = True
try:
    import pyarrow.parquet  # noqa
    defaults['parquet'] = True
except ImportError:
    pass
try:
    import pyarrow.parquet.encryption  # noqa
    defaults['parquet_encryption'] = True
except ImportError:
    pass
try:
    import pyarrow.flight  # noqa
    defaults['flight'] = True
except ImportError:
    pass
# Filesystem implementations are probed individually: each may be
# excluded from the build independently.
try:
    from pyarrow.fs import AzureFileSystem  # noqa
    defaults['azure'] = True
except ImportError:
    pass
try:
    from pyarrow.fs import GcsFileSystem  # noqa
    defaults['gcs'] = True
except ImportError:
    pass
try:
    from pyarrow.fs import S3FileSystem  # noqa
    defaults['s3'] = True
except ImportError:
    pass
try:
    from pyarrow.fs import HadoopFileSystem  # noqa
    defaults['hdfs'] = True
except ImportError:
    pass
try:
    import pyarrow.substrait  # noqa
    defaults['substrait'] = True
except ImportError:
    pass
# Doctest should ignore files for the modules that are not built
def pytest_ignore_collect(collection_path, config):
    """
    Tell pytest which files to skip during doctest collection.

    Skips the test-suite directory itself and the modules whose optional
    dependencies (dataset, orc, parquet, flight, substrait, cuda, S3)
    are not available in this build.
    """
    path_str = str(collection_path)
    if config.option.doctestmodules:
        # don't try to run doctests on the /tests directory
        if "/pyarrow/tests/" in path_str:
            return True
        # Skip doctests of optional submodules that are not enabled.
        for module in ('dataset', 'orc', 'parquet', 'flight', 'substrait'):
            if f'pyarrow/{module}' in path_str and not defaults[module]:
                return True
        if ('pyarrow/parquet/encryption' in path_str
                and not defaults['parquet_encryption']):
            return True
        if 'pyarrow/cuda' in path_str:
            try:
                import pyarrow.cuda  # noqa
            except ImportError:
                return True
            return False
        if 'pyarrow/fs' in path_str:
            try:
                from pyarrow.fs import S3FileSystem  # noqa
            except ImportError:
                return True
            return False
    if getattr(config.option, "doctest_cython", False):
        if "/pyarrow/tests/" in path_str:
            return True
        if "/pyarrow/_parquet_encryption" in path_str:
            return True
    return False
# Save output files from doctest examples into temp dir
@pytest.fixture(autouse=True)
def _docdir(request):
    """
    Run each doctest inside a temporary working directory so that any
    files written by examples do not pollute the source tree.
    """
    opts = request.config.option
    running_doctests = (opts.doctestmodules
                        or getattr(opts, "doctest_cython", False))
    if not running_doctests:
        yield
        return
    # Get the fixture dynamically by its name.
    tmpdir = request.getfixturevalue('tmpdir')
    # Chdir only for the duration of the test.
    with tmpdir.as_cwd():
        yield
# Define doctest_namespace for fs module docstring import
@pytest.fixture(autouse=True)
def add_fs(doctest_namespace, request, tmp_path):
    """
    Populate the doctest namespace with filesystem helpers: the ``fs``
    module, a LocalFileSystem instance, and a sample data file path.
    """
    opts = request.config.option
    if opts.doctestmodules or getattr(opts, "doctest_cython", False):
        # Create a local filesystem object and a small example file.
        local = fs.LocalFileSystem()
        sample = tmp_path / 'pyarrow-fs-example.dat'
        with local.open_output_stream(str(sample)) as stream:
            stream.write(b'data')
        doctest_namespace["fs"] = fs
        doctest_namespace["local"] = local
        doctest_namespace["local_path"] = str(tmp_path)
        doctest_namespace["path"] = str(sample)
    yield
# Define udf fixture for test_udf.py and test_substrait.py
@pytest.fixture(scope="session")
def unary_func_fixture():
    """
    Register a unary scalar UDF (adds 1 to each element) and return
    the Python function together with its registered name.
    """
    from pyarrow import compute as pc
    def unary_function(ctx, x):
        # Allocate from the UDF context's memory pool, as required
        # for scalar UDF implementations.
        return pc.call_function("add", [x, 1],
                                memory_pool=ctx.memory_pool)
    func_name = "y=x+1"
    unary_doc = {"summary": "add function",
                 "description": "test add function"}
    pc.register_scalar_function(unary_function,
                                func_name,
                                unary_doc,
                                {"array": pa.int64()},
                                pa.int64())
    return unary_function, func_name
@pytest.fixture(scope="session")
def unary_agg_func_fixture():
    """
    Register a unary aggregate UDF (NaN-ignoring mean) and return the
    Python function together with its registered name.
    """
    from pyarrow import compute as pc
    import numpy as np
    def func(ctx, x):
        # Aggregate result is returned as a single scalar.
        return pa.scalar(np.nanmean(x))
    func_name = "mean_udf"
    func_doc = {"summary": "y=avg(x)",
                "description": "find mean of x"}
    pc.register_aggregate_function(func,
                                   func_name,
                                   func_doc,
                                   {
                                       "x": pa.float64(),
                                   },
                                   pa.float64()
                                   )
    return func, func_name
@pytest.fixture(scope="session")
def varargs_agg_func_fixture():
    """
    Register a varargs aggregate UDF (sum of the per-column NaN-ignoring
    means) and return the Python function with its registered name.
    """
    from pyarrow import compute as pc
    import numpy as np
    def func(ctx, *args):
        # Sum the mean of every argument column.
        sum = 0.0
        for arg in args:
            sum += np.nanmean(arg)
        return pa.scalar(sum)
    func_name = "sum_mean"
    func_doc = {"summary": "Varargs aggregate",
                "description": "Varargs aggregate"}
    pc.register_aggregate_function(func,
                                   func_name,
                                   func_doc,
                                   {
                                       "x": pa.int64(),
                                       "y": pa.float64()
                                   },
                                   pa.float64()
                                   )
    return func, func_name

View File

@@ -0,0 +1,22 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from pyarrow._csv import ( # noqa
ReadOptions, ParseOptions, ConvertOptions, ISO8601,
open_csv, read_csv, CSVStreamingReader, write_csv,
WriteOptions, CSVWriter, InvalidRow)

View File

@@ -0,0 +1,25 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# flake8: noqa
from pyarrow._cuda import (Context, IpcMemHandle, CudaBuffer,
HostBuffer, BufferReader, BufferWriter,
new_host_buffer,
serialize_record_batch, read_message,
read_record_batch)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,167 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: profile=False
# distutils: language = c++
# cython: embedsignature = True
# Python-visible mirror of Arrow's C++ DeviceAllocationType enum.
# Each member is cast through <char> so the Python enum value equals the
# underlying C enum's integral value.
cpdef enum DeviceAllocationType:
    CPU = <char> CDeviceAllocationType_kCPU
    CUDA = <char> CDeviceAllocationType_kCUDA
    CUDA_HOST = <char> CDeviceAllocationType_kCUDA_HOST
    OPENCL = <char> CDeviceAllocationType_kOPENCL
    VULKAN = <char> CDeviceAllocationType_kVULKAN
    METAL = <char> CDeviceAllocationType_kMETAL
    VPI = <char> CDeviceAllocationType_kVPI
    ROCM = <char> CDeviceAllocationType_kROCM
    ROCM_HOST = <char> CDeviceAllocationType_kROCM_HOST
    EXT_DEV = <char> CDeviceAllocationType_kEXT_DEV
    CUDA_MANAGED = <char> CDeviceAllocationType_kCUDA_MANAGED
    ONEAPI = <char> CDeviceAllocationType_kONEAPI
    WEBGPU = <char> CDeviceAllocationType_kWEBGPU
    HEXAGON = <char> CDeviceAllocationType_kHEXAGON
cdef object _wrap_device_allocation_type(CDeviceAllocationType device_type):
    # Convert the C-level enum into the Python-level DeviceAllocationType.
    return DeviceAllocationType(<char> device_type)
cdef class Device(_Weakrefable):
    """
    Abstract interface for hardware devices
    This object represents a device with access to some memory spaces.
    When handling a Buffer or raw memory address, it allows deciding in which
    context the raw memory address should be interpreted
    (e.g. CPU-accessible memory, or embedded memory on some particular GPU).
    """

    def __init__(self):
        # Instances are only created through Device.wrap(); direct
        # construction would leave the underlying shared_ptr unset.
        raise TypeError("Do not call Device's constructor directly, "
                        "use the device attribute of the MemoryManager instead.")

    cdef void init(self, const shared_ptr[CDevice]& device):
        # Attach the C++ device handle; called only from wrap().
        self.device = device

    @staticmethod
    cdef wrap(const shared_ptr[CDevice]& device):
        # Build a Python Device around an existing C++ CDevice, bypassing
        # __init__ (which deliberately raises).
        cdef Device self = Device.__new__(Device)
        self.init(device)
        return self

    cdef inline shared_ptr[CDevice] unwrap(self) nogil:
        # Expose the underlying shared_ptr for C-level callers.
        return self.device

    def __eq__(self, other):
        if not isinstance(other, Device):
            return False
        # Delegate equality to the C++ Device::Equals implementation.
        return self.device.get().Equals(deref((<Device>other).device.get()))

    def __repr__(self):
        return f"<pyarrow.Device: {frombytes(self.device.get().ToString())}>"

    @property
    def type_name(self):
        """
        A shorthand for this device's type.
        """
        return frombytes(self.device.get().type_name())

    @property
    def device_id(self):
        """
        A device ID to identify this device if there are multiple of this type.
        If there is no "device_id" equivalent (such as for the main CPU device on
        non-numa systems) returns -1.
        """
        return self.device.get().device_id()

    @property
    def is_cpu(self):
        """
        Whether this device is the main CPU device.
        This shorthand method is very useful when deciding whether a memory address
        is CPU-accessible.
        """
        return self.device.get().is_cpu()

    @property
    def device_type(self):
        """
        Return the DeviceAllocationType of this device.
        """
        return _wrap_device_allocation_type(self.device.get().device_type())
cdef class MemoryManager(_Weakrefable):
    """
    An object that provides memory management primitives.
    A MemoryManager is always tied to a particular Device instance.
    It can also have additional parameters (such as a MemoryPool to
    allocate CPU memory).
    """

    def __init__(self):
        # Instances are only created through MemoryManager.wrap(); direct
        # construction would leave the underlying shared_ptr unset.
        raise TypeError("Do not call MemoryManager's constructor directly, "
                        "use pyarrow.default_cpu_memory_manager() instead.")

    cdef void init(self, const shared_ptr[CMemoryManager]& mm):
        # Attach the C++ memory manager handle; called only from wrap().
        self.memory_manager = mm

    @staticmethod
    cdef wrap(const shared_ptr[CMemoryManager]& mm):
        # Build a Python MemoryManager around an existing C++ CMemoryManager,
        # bypassing __init__ (which deliberately raises).
        cdef MemoryManager self = MemoryManager.__new__(MemoryManager)
        self.init(mm)
        return self

    cdef inline shared_ptr[CMemoryManager] unwrap(self) nogil:
        # Expose the underlying shared_ptr for C-level callers.
        return self.memory_manager

    def __repr__(self):
        device_str = frombytes(self.memory_manager.get().device().get().ToString())
        return f"<pyarrow.MemoryManager device: {device_str}>"

    @property
    def device(self):
        """
        The device this MemoryManager is tied to.
        """
        return Device.wrap(self.memory_manager.get().device())

    @property
    def is_cpu(self):
        """
        Whether this MemoryManager is tied to the main CPU device.
        This shorthand method is very useful when deciding whether a memory
        address is CPU-accessible.
        """
        return self.memory_manager.get().is_cpu()
def default_cpu_memory_manager():
    """
    Return the default CPU MemoryManager instance.
    The returned singleton instance uses the default MemoryPool.
    """
    # Wraps the process-wide C++ singleton; repeated calls return wrappers
    # around the same underlying CMemoryManager.
    return MemoryManager.wrap(c_default_cpu_memory_manager())

View File

@@ -0,0 +1,274 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetInterrupt
from pyarrow.includes.libarrow cimport CStatus
from pyarrow.includes.libarrow_python cimport IsPyError, RestorePyError
from pyarrow.includes.common cimport c_string
from contextlib import contextmanager
import os
import signal
import threading
from pyarrow.lib import is_threading_enabled
from pyarrow.util import _break_traceback_cycle_from_frame
class ArrowException(Exception):
    """Root of the Arrow exception hierarchy."""


class ArrowInvalid(ValueError, ArrowException):
    """An argument or value was invalid."""


class ArrowMemoryError(MemoryError, ArrowException):
    """An Arrow memory allocation failed."""


class ArrowKeyError(KeyError, ArrowException):
    """A lookup key was not found."""

    def __str__(self):
        # Override KeyError.__str__, as it uses the repr() of the key
        return ArrowException.__str__(self)


class ArrowTypeError(TypeError, ArrowException):
    """A value had an unexpected type."""


class ArrowNotImplementedError(NotImplementedError, ArrowException):
    """The requested operation is not implemented."""


class ArrowCapacityError(ArrowException):
    """A builder or container exceeded its capacity."""


class ArrowIndexError(IndexError, ArrowException):
    """An index was out of bounds."""


class ArrowSerializationError(ArrowException):
    """(De)serialization of data failed."""


class ArrowCancelled(ArrowException):
    """An operation was cancelled, typically by a signal.

    ``signum`` records which signal triggered the cancellation, if any.
    """

    def __init__(self, message, signum=None):
        super().__init__(message)
        self.signum = signum


# Compatibility alias
ArrowIOError = IOError
# check_status() and convert_status() could be written directly in C++
# if we didn't define Arrow-specific subclasses (ArrowInvalid etc.)
# Translate a non-OK CStatus into a raised Python exception.
# Declared nogil so the OK fast-path costs nothing; the GIL is only
# acquired when an error actually has to be raised.
cdef int check_status(const CStatus& status) except -1 nogil:
    if status.ok():
        return 0
    with gil:
        # If the status wraps an exception originally raised in Python
        # (e.g. inside a user callback), re-raise that exact exception.
        if IsPyError(status):
            RestorePyError(status)
            return -1
        raise convert_status(status)
cdef object convert_status(const CStatus& status):
    # Map a non-OK C++ Status to the matching Python exception *instance*
    # (without raising it); check_status() raises the returned object.
    if IsPyError(status):
        # The status wraps a Python exception; restore and return it.
        try:
            RestorePyError(status)
        except BaseException as e:
            return e
    # We don't use Status::ToString() as it would redundantly include
    # the C++ class name.
    message = frombytes(status.message(), safe=True)
    detail = status.detail()
    if detail != nullptr:
        message += ". Detail: " + frombytes(detail.get().ToString(),
                                            safe=True)
    if status.IsInvalid():
        return ArrowInvalid(message)
    elif status.IsIOError():
        # Note: OSError constructor is
        #   OSError(message)
        # or
        #   OSError(errno, message, filename=None)
        # or (on Windows)
        #   OSError(errno, message, filename, winerror)
        errno = ErrnoFromStatus(status)
        winerror = WinErrorFromStatus(status)
        if winerror != 0:
            return IOError(errno, message, None, winerror)
        elif errno != 0:
            return IOError(errno, message)
        else:
            return IOError(message)
    elif status.IsOutOfMemory():
        return ArrowMemoryError(message)
    elif status.IsKeyError():
        return ArrowKeyError(message)
    elif status.IsNotImplemented():
        return ArrowNotImplementedError(message)
    elif status.IsTypeError():
        return ArrowTypeError(message)
    elif status.IsCapacityError():
        return ArrowCapacityError(message)
    elif status.IsIndexError():
        return ArrowIndexError(message)
    elif status.IsSerializationError():
        return ArrowSerializationError(message)
    elif status.IsCancelled():
        # Preserve the triggering signal number when one was recorded.
        signum = SignalFromStatus(status)
        if signum > 0:
            return ArrowCancelled(message, signum)
        else:
            return ArrowCancelled(message)
    else:
        # Unknown status category: fall back to the full status string.
        message = frombytes(status.ToString(), safe=True)
        return ArrowException(message)
# These are API functions for C++ PyArrow
# (exported via "cdef api" so libarrow_python can raise Python exceptions
# from C++ code paths).
cdef api int pyarrow_internal_check_status(const CStatus& status) \
        except -1 nogil:
    return check_status(status)

cdef api object pyarrow_internal_convert_status(const CStatus& status):
    return convert_status(status)
cdef class StopToken:
    """Thin wrapper owning a C++ CStopToken."""
    cdef void init(self, CStopToken stop_token):
        # Take ownership of the C++ token by moving it in.
        self.stop_token = move(stop_token)
# Module-level switch consulted by SignalStopHandler._init_signals().
cdef c_bool signal_handlers_enabled = True

def enable_signal_handlers(c_bool enable):
    """
    Enable or disable interruption of long-running operations.
    By default, certain long running operations will detect user
    interruptions, such as by pressing Ctrl-C.  This detection relies
    on setting a signal handler for the duration of the long-running
    operation, and may therefore interfere with other frameworks or
    libraries (such as an event loop).
    Parameters
    ----------
    enable : bool
        Whether to enable user interruption by setting a temporary
        signal handler.
    """
    global signal_handlers_enabled
    signal_handlers_enabled = enable
# For internal use
# Whether we need a workaround for https://bugs.python.org/issue42248
# (signal handling leaks a reference cycle through the interrupted frame
# on the affected CPython versions listed below).
have_signal_refcycle = (sys.version_info < (3, 8, 10) or
                        (3, 9) <= sys.version_info < (3, 9, 5) or
                        sys.version_info[:2] == (3, 10))
cdef class SignalStopHandler:
    """
    Context manager that converts incoming signals (SIGINT/SIGTERM) into
    a StopToken cancellation while a long-running operation executes.
    Only active when signal handlers are enabled, we are on the main
    thread, and threading is available.
    """
    cdef:
        StopToken _stop_token
        vector[int] _signals
        c_bool _enabled

    def __cinit__(self):
        self._enabled = False
        self._init_signals()
        if have_signal_refcycle:
            # Break the frame reference cycle created by signal handling
            # on affected CPython versions (bpo-42248).
            _break_traceback_cycle_from_frame(sys._getframe(0))
        self._stop_token = StopToken()
        if not self._signals.empty():
            maybe_source = SetSignalStopSource()
            if not maybe_source.ok():
                # See ARROW-11841 / ARROW-17173: in complex interaction
                # scenarios (such as R calling into Python), SetSignalStopSource()
                # may have already activated a signal-receiving StopSource.
                # Just warn instead of erroring out.
                maybe_source.status().Warn()
            else:
                self._stop_token.init(deref(maybe_source).token())
                # signals don't work on Emscripten without threads.
                # and possibly other single-thread environments.
                self._enabled = is_threading_enabled()

    def _init_signals(self):
        # Only intercept signals that have a user-installed Python handler;
        # default/ignored signals are left alone.
        if (signal_handlers_enabled and
                threading.current_thread() is threading.main_thread()):
            self._signals = [
                sig for sig in (signal.SIGINT, signal.SIGTERM)
                if signal.getsignal(sig) not in (signal.SIG_DFL,
                                                 signal.SIG_IGN, None)]

    def __enter__(self):
        if self._enabled:
            check_status(RegisterCancellingSignalHandler(self._signals))
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        if self._enabled:
            UnregisterCancellingSignalHandler()
        if exc_value is None:
            # Make sure we didn't lose a signal
            try:
                check_status(self._stop_token.stop_token.Poll())
            except ArrowCancelled as e:
                exc_value = e
        if isinstance(exc_value, ArrowCancelled):
            if exc_value.signum:
                # Re-emit the exact same signal. We restored the Python signal
                # handler above, so it should receive it.
                if os.name == 'nt':
                    SendSignal(exc_value.signum)
                else:
                    SendSignalToThread(exc_value.signum,
                                       threading.main_thread().ident)
            else:
                # Simulate Python receiving a SIGINT
                # (see https://bugs.python.org/issue43356 for why we can't
                # simulate the exact signal number)
                PyErr_SetInterrupt()
            # Maximize chances of the Python signal handler being executed now.
            # Otherwise a potential KeyboardInterrupt might be missed by an
            # immediately enclosing try/except block.
            PyErr_CheckSignals()
            # ArrowCancelled will be re-raised if PyErr_CheckSignals()
            # returned successfully.

    def __dealloc__(self):
        if self._enabled:
            # Deactivate the signal-receiving StopSource we installed.
            ResetSignalStopSource()

    @property
    def stop_token(self):
        return self._stop_token

View File

@@ -0,0 +1,279 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from collections.abc import Sequence
import os
from pyarrow.pandas_compat import _pandas_api # noqa
from pyarrow.lib import (Codec, Table, # noqa
concat_tables, schema)
import pyarrow.lib as ext
from pyarrow import _feather
from pyarrow._feather import FeatherError # noqa: F401
class FeatherDataset:
    """
    Encapsulates details of reading a list of Feather files.
    Parameters
    ----------
    path_or_paths : List[str]
        A list of file names
    validate_schema : bool, default True
        Check that individual file schemas are all the same / compatible
    """

    def __init__(self, path_or_paths, validate_schema=True):
        # paths: the file list; schema/_tables are populated by read_table().
        self.paths = path_or_paths
        self.validate_schema = validate_schema

    def read_table(self, columns=None):
        """
        Read multiple feather files as a single pyarrow.Table
        Parameters
        ----------
        columns : List[str]
            Names of columns to read from the file
        Returns
        -------
        pyarrow.Table
            Content of the file as a table (of columns)
        """
        # The first file establishes the reference schema the others are
        # validated against.
        _fil = read_table(self.paths[0], columns=columns)
        self._tables = [_fil]
        self.schema = _fil.schema
        for path in self.paths[1:]:
            table = read_table(path, columns=columns)
            if self.validate_schema:
                self.validate_schemas(path, table)
            self._tables.append(table)
        return concat_tables(self._tables)

    def validate_schemas(self, piece, table):
        # Raise if `table` (read from `piece`) disagrees with the schema of
        # the first file.
        if not self.schema.equals(table.schema):
            raise ValueError(f'Schema in {piece} was different. \n'
                             f'{self.schema}\n\nvs\n\n{table.schema}')

    def read_pandas(self, columns=None, use_threads=True):
        """
        Read multiple Feather files as a single pandas DataFrame
        Parameters
        ----------
        columns : List[str]
            Names of columns to read from the file
        use_threads : bool, default True
            Use multiple threads when converting to pandas
        Returns
        -------
        pandas.DataFrame
            Content of the file as a pandas DataFrame (of columns)
        """
        return self.read_table(columns=columns).to_pandas(
            use_threads=use_threads)
def check_chunked_overflow(name, col):
    """Raise ValueError if `col` was chunked and so cannot go into Feather V1."""
    if col.num_chunks == 1:
        # A single chunk always fits the V1 format.
        return
    if col.type in (ext.binary(), ext.string()):
        raise ValueError(f"Column '{name}' exceeds 2GB maximum capacity of "
                         "a Feather binary column. This restriction may be "
                         "lifted in the future")
    # TODO(wesm): Not sure when else this might be reached
    raise ValueError(
        f"Column '{name}' of type {col.type} was chunked on conversion to Arrow "
        "and cannot be currently written to Feather format"
    )
# Codec names accepted by the V2 (IPC-based) writer.
_FEATHER_SUPPORTED_CODECS = {'lz4', 'zstd', 'uncompressed'}

def write_feather(df, dest, compression=None, compression_level=None,
                  chunksize=None, version=2):
    """
    Write a pandas.DataFrame to Feather format.
    Parameters
    ----------
    df : pandas.DataFrame or pyarrow.Table
        Data to write out as Feather format.
    dest : str
        Local destination path.
    compression : string, default None
        Can be one of {"zstd", "lz4", "uncompressed"}. The default of None uses
        LZ4 for V2 files if it is available, otherwise uncompressed.
    compression_level : int, default None
        Use a compression level particular to the chosen compressor. If None
        use the default compression level
    chunksize : int, default None
        For V2 files, the internal maximum size of Arrow RecordBatch chunks
        when writing the Arrow IPC file format. None means use the default,
        which is currently 64K
    version : int, default 2
        Feather file version. Version 2 is the current. Version 1 is the more
        limited legacy format
    """
    if _pandas_api.have_pandas:
        # Legacy pandas SparseDataFrame must be densified before conversion.
        if (_pandas_api.has_sparse and
                isinstance(df, _pandas_api.pd.SparseDataFrame)):
            df = df.to_dense()
        if _pandas_api.is_data_frame(df):
            # Feather v1 creates a new column in the resultant Table to
            # store index information if index type is not RangeIndex
            if version == 1:
                preserve_index = False
            elif version == 2:
                preserve_index = None
            else:
                raise ValueError("Version value should either be 1 or 2")
            table = Table.from_pandas(df, preserve_index=preserve_index)
            if version == 1:
                # Version 1 does not support chunked columns
                for i, name in enumerate(table.schema.names):
                    col = table[i]
                    check_chunked_overflow(name, col)
        else:
            table = df
    if version == 1:
        # V1 restrictions: unique column names, no compression, no chunksize.
        if len(table.column_names) > len(set(table.column_names)):
            raise ValueError("cannot serialize duplicate column names")
        if compression is not None:
            raise ValueError("Feather V1 files do not support compression "
                             "option")
        if chunksize is not None:
            raise ValueError("Feather V1 files do not support chunksize "
                             "option")
    else:
        # Default to LZ4 only when the codec was actually compiled in.
        if compression is None and Codec.is_available('lz4_frame'):
            compression = 'lz4'
        elif (compression is not None and
              compression not in _FEATHER_SUPPORTED_CODECS):
            raise ValueError(f'compression="{compression}" not supported, must be '
                             f'one of {_FEATHER_SUPPORTED_CODECS}')
    try:
        _feather.write_feather(table, dest, compression=compression,
                               compression_level=compression_level,
                               chunksize=chunksize, version=version)
    except Exception:
        # Best-effort cleanup of a partially-written destination file.
        if isinstance(dest, str):
            try:
                os.remove(dest)
            except os.error:
                pass
        raise
def read_feather(source, columns=None, use_threads=True,
                 memory_map=False, **kwargs):
    """
    Read a pandas.DataFrame from Feather format. To read as pyarrow.Table use
    feather.read_table.
    Parameters
    ----------
    source : str file path, or file-like object
        You can use MemoryMappedFile as source, for explicitly use memory map.
    columns : sequence, optional
        Only read a specific set of columns. If not provided, all columns are
        read.
    use_threads : bool, default True
        Whether to parallelize reading using multiple threads. If false the
        restriction is used in the conversion to Pandas as well as in the
        reading from Feather format.
    memory_map : boolean, default False
        Use memory mapping when opening file on disk, when source is a str.
    **kwargs
        Additional keyword arguments passed on to `pyarrow.Table.to_pandas`.
    Returns
    -------
    df : pandas.DataFrame
        The contents of the Feather file as a pandas.DataFrame
    """
    # Read as an Arrow table first, then convert; `use_threads` applies to
    # both the file read and the pandas conversion.
    table = read_table(source, columns=columns, memory_map=memory_map,
                       use_threads=use_threads)
    return table.to_pandas(use_threads=use_threads, **kwargs)
def read_table(source, columns=None, memory_map=False, use_threads=True):
    """
    Read a pyarrow.Table from Feather format
    Parameters
    ----------
    source : str file path, or file-like object
        You can use MemoryMappedFile as source, for explicitly use memory map.
    columns : sequence, optional
        Only read a specific set of columns. If not provided, all columns are
        read.
    memory_map : boolean, default False
        Use memory mapping when opening file on disk, when source is a str
    use_threads : bool, default True
        Whether to parallelize reading using multiple threads.
    Returns
    -------
    table : pyarrow.Table
        The contents of the Feather file as a pyarrow.Table
    """
    reader = _feather.FeatherReader(
        source, use_memory_map=memory_map, use_threads=use_threads)
    if columns is None:
        return reader.read()
    if not isinstance(columns, Sequence):
        # Fixed message grammar (was "must be a sequence but, got").
        raise TypeError("Columns must be a sequence, got {}"
                        .format(type(columns).__name__))
    column_types = [type(column) for column in columns]
    if all(t == int for t in column_types):
        table = reader.read_indices(columns)
    elif all(t == str for t in column_types):
        table = reader.read_names(columns)
    else:
        column_type_names = [t.__name__ for t in column_types]
        raise TypeError("Columns must be indices or names. "
                        f"Got columns {columns} of types {column_type_names}")
    # Feather v1 already respects the column selection
    if reader.version < 3:
        return table
    # Feather v2 reads with sorted / deduplicated selection; if the request
    # was already sorted and unique, the table is in the right order.
    # (list() makes the comparison correct for non-list sequences such as
    # tuples, which previously always fell through to the select() below.)
    elif sorted(set(columns)) == list(columns):
        return table
    else:
        # follow exact order / selection of names
        return table.select(columns)

View File

@@ -0,0 +1,69 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
try:
from pyarrow._flight import ( # noqa:F401
connect,
Action,
ActionType,
BasicAuth,
CallInfo,
CertKeyPair,
ClientAuthHandler,
ClientMiddleware,
ClientMiddlewareFactory,
DescriptorType,
FlightCallOptions,
FlightCancelledError,
FlightClient,
FlightDataStream,
FlightDescriptor,
FlightEndpoint,
FlightError,
FlightInfo,
FlightInternalError,
FlightMetadataReader,
FlightMetadataWriter,
FlightMethod,
FlightServerBase,
FlightServerError,
FlightStreamChunk,
FlightStreamReader,
FlightStreamWriter,
FlightTimedOutError,
FlightUnauthenticatedError,
FlightUnauthorizedError,
FlightUnavailableError,
FlightWriteSizeExceededError,
GeneratorStream,
Location,
MetadataRecordBatchReader,
MetadataRecordBatchWriter,
RecordBatchStream,
Result,
SchemaResult,
ServerAuthHandler,
ServerCallContext,
ServerMiddleware,
ServerMiddlewareFactory,
Ticket,
TracingServerMiddlewareFactory,
)
except ImportError as exc:
raise ImportError(
f"The pyarrow installation is not built with support for 'flight' ({str(exc)})"
) from None

View File

@@ -0,0 +1,428 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
FileSystem abstraction to interact with various local and remote filesystems.
"""
from pyarrow.util import _is_path_like, _stringify_path
from pyarrow._fs import ( # noqa
FileSelector,
FileType,
FileInfo,
FileSystem,
LocalFileSystem,
SubTreeFileSystem,
_MockFileSystem,
FileSystemHandler,
PyFileSystem,
_copy_files,
_copy_files_selector,
)
# For backward compatibility.
FileStats = FileInfo
_not_imported = []
try:
from pyarrow._azurefs import AzureFileSystem # noqa
except ImportError:
_not_imported.append("AzureFileSystem")
try:
from pyarrow._hdfs import HadoopFileSystem # noqa
except ImportError:
_not_imported.append("HadoopFileSystem")
try:
from pyarrow._gcsfs import GcsFileSystem # noqa
except ImportError:
_not_imported.append("GcsFileSystem")
try:
from pyarrow._s3fs import ( # noqa
AwsDefaultS3RetryStrategy, AwsStandardS3RetryStrategy,
S3FileSystem, S3LogLevel, S3RetryStrategy, ensure_s3_initialized,
finalize_s3, ensure_s3_finalized, initialize_s3, resolve_s3_region)
except ImportError:
_not_imported.append("S3FileSystem")
else:
# GH-38364: we don't initialize S3 eagerly as that could lead
# to crashes at shutdown even when S3 isn't used.
# Instead, S3 is initialized lazily using `ensure_s3_initialized`
# in assorted places.
import atexit
atexit.register(ensure_s3_finalized)
def __getattr__(name):
    """Module attribute hook: explain why an optional filesystem is missing."""
    if name not in _not_imported:
        raise AttributeError(
            f"module 'pyarrow.fs' has no attribute '{name}'"
        )
    raise ImportError(
        "The pyarrow installation is not built with support for "
        f"'{name}'"
    )
def _ensure_filesystem(filesystem, *, use_mmap=False):
    """
    Coerce `filesystem` into a FileSystem instance.
    Accepts an existing FileSystem (returned unchanged), a filesystem URI
    string, or an fsspec-compatible filesystem object; raises TypeError
    for anything else.
    """
    if isinstance(filesystem, FileSystem):
        return filesystem
    elif isinstance(filesystem, str):
        # create a filesystem from a URI string, note that the `path` part of the URI
        # is treated as a prefix if specified, so the filesystem is wrapped in a
        # SubTreeFileSystem
        if use_mmap:
            raise ValueError(
                "Specifying to use memory mapping not supported for "
                "filesystem specified as an URI string"
            )
        fs, path = FileSystem.from_uri(filesystem)
        prefix = fs.normalize_path(path)
        if prefix:
            # validate that the prefix is pointing to a directory
            prefix_info = fs.get_file_info([prefix])[0]
            if prefix_info.type != FileType.Directory:
                raise ValueError(
                    "The path component of the filesystem URI must point to a "
                    f"directory but it has a type: `{prefix_info.type.name}`. The path "
                    f"component is `{prefix_info.path}` and the given filesystem URI "
                    f"is `{filesystem}`"
                )
            fs = SubTreeFileSystem(prefix, fs)
        return fs
    else:
        # handle fsspec-compatible filesystems
        try:
            import fsspec
        except ImportError:
            # fsspec not installed: fall through to the TypeError below.
            pass
        else:
            if isinstance(filesystem, fsspec.AbstractFileSystem):
                if type(filesystem).__name__ == 'LocalFileSystem':
                    # In case its a simple LocalFileSystem, use native arrow one
                    return LocalFileSystem(use_mmap=use_mmap)
                return PyFileSystem(FSSpecHandler(filesystem))
        raise TypeError(
            f"Unrecognized filesystem: {type(filesystem)}. `filesystem` argument must "
            "be a FileSystem instance or a valid file system URI"
        )
def _resolve_filesystem_and_path(path, filesystem=None, *, memory_map=False):
    """
    Return filesystem/path from path which could be an URI or a plain
    filesystem path or a combination of fsspec protocol and URI.
    Returns (None, path) unchanged when `path` is a file-like object.
    """
    if not _is_path_like(path):
        # File-like object: there is no path to resolve against a filesystem.
        if filesystem is not None:
            raise ValueError(
                "'filesystem' passed but the specified path is file-like, so"
                " there is nothing to open with 'filesystem'."
            )
        return filesystem, path
    if filesystem is not None:
        # Explicit filesystem given: only normalize the path against it.
        filesystem = _ensure_filesystem(filesystem, use_mmap=memory_map)
        if isinstance(filesystem, LocalFileSystem):
            path = _stringify_path(path)
        elif not isinstance(path, str):
            raise TypeError(
                "Expected string path; path-like objects are only allowed "
                "with a local filesystem"
            )
        path = filesystem.normalize_path(path)
        return filesystem, path
    path = _stringify_path(path)
    # if filesystem is not given, try to automatically determine one
    # first check if the file exists as a local (relative) file path
    # if not then try to parse the path as an URI
    filesystem = LocalFileSystem(use_mmap=memory_map)
    try:
        file_info = filesystem.get_file_info(path)
    except ValueError:  # ValueError means path is likely an URI
        file_info = None
        exists_locally = False
    else:
        exists_locally = (file_info.type != FileType.NotFound)
    # if the file or directory doesn't exists locally, then assume that
    # the path is an URI describing the file system as well
    if not exists_locally:
        try:
            filesystem, path = FileSystem.from_uri(path)
        except ValueError as e:
            msg = str(e)
            if "empty scheme" in msg or "Cannot parse URI" in msg:
                # neither an URI nor a locally existing path, so assume that
                # local path was given and propagate a nicer file not found
                # error instead of a more confusing scheme parsing error
                pass
            else:
                raise e
        else:
            path = filesystem.normalize_path(path)
    return filesystem, path
def copy_files(source, destination,
               source_filesystem=None, destination_filesystem=None,
               *, chunk_size=1024*1024, use_threads=True):
    """
    Copy files between FileSystems.
    This functions allows you to recursively copy directories of files from
    one file system to another, such as from S3 to your local machine.
    Parameters
    ----------
    source : string
        Source file path or URI to a single file or directory.
        If a directory, files will be copied recursively from this path.
    destination : string
        Destination file path or URI. If `source` is a file, `destination`
        is also interpreted as the destination file (not directory).
        Directories will be created as necessary.
    source_filesystem : FileSystem, optional
        Source filesystem, needs to be specified if `source` is not a URI,
        otherwise inferred.
    destination_filesystem : FileSystem, optional
        Destination filesystem, needs to be specified if `destination` is not
        a URI, otherwise inferred.
    chunk_size : int, default 1MB
        The maximum size of block to read before flushing to the
        destination file. A larger chunk_size will use more memory while
        copying but may help accommodate high latency FileSystems.
    use_threads : bool, default True
        Whether to use multiple threads to accelerate copying.
    Examples
    --------
    Inspect an S3 bucket's files:
    >>> s3, path = fs.FileSystem.from_uri(
    ...     "s3://registry.opendata.aws/roda/ndjson/")
    >>> selector = fs.FileSelector(path)
    >>> s3.get_file_info(selector)
    [<FileInfo for 'registry.opendata.aws/roda/ndjson/index.ndjson':...]
    Copy one file from S3 bucket to a local directory:
    >>> fs.copy_files("s3://registry.opendata.aws/roda/ndjson/index.ndjson",
    ...               f"file:///{local_path}/index_copy.ndjson")
    >>> fs.LocalFileSystem().get_file_info(str(local_path)+
    ...                                    '/index_copy.ndjson')
    <FileInfo for '.../index_copy.ndjson': type=FileType.File, size=...>
    Copy file using a FileSystem object:
    >>> fs.copy_files("registry.opendata.aws/roda/ndjson/index.ndjson",
    ...               f"file:///{local_path}/index_copy.ndjson",
    ...               source_filesystem=fs.S3FileSystem())
    """
    source_fs, source_path = _resolve_filesystem_and_path(
        source, source_filesystem
    )
    destination_fs, destination_path = _resolve_filesystem_and_path(
        destination, destination_filesystem
    )
    file_info = source_fs.get_file_info(source_path)
    if file_info.type == FileType.Directory:
        # Directory source: recursively copy everything under it.
        source_sel = FileSelector(source_path, recursive=True)
        _copy_files_selector(source_fs, source_sel,
                             destination_fs, destination_path,
                             chunk_size, use_threads)
    else:
        # Single-file source: destination names the target file itself.
        _copy_files(source_fs, source_path,
                    destination_fs, destination_path,
                    chunk_size, use_threads)
class FSSpecHandler(FileSystemHandler):
"""
Handler for fsspec-based Python filesystems.
https://filesystem-spec.readthedocs.io/en/latest/index.html
Parameters
----------
fs : FSSpec-compliant filesystem instance
Examples
--------
>>> PyFileSystem(FSSpecHandler(fsspec_fs)) # doctest: +SKIP
"""
    def __init__(self, fs):
        # Store the wrapped fsspec filesystem instance.
        self.fs = fs
    def __eq__(self, other):
        # Two handlers compare equal when their wrapped fsspec filesystems do.
        if isinstance(other, FSSpecHandler):
            return self.fs == other.fs
        return NotImplemented

    def __ne__(self, other):
        if isinstance(other, FSSpecHandler):
            return self.fs != other.fs
        return NotImplemented
def get_type_name(self):
protocol = self.fs.protocol
if isinstance(protocol, list):
protocol = protocol[0]
return f"fsspec+{protocol}"
    def normalize_path(self, path):
        # fsspec paths are used verbatim; no normalization is applied.
        return path

    @staticmethod
    def _create_file_info(path, info):
        # Translate one fsspec `info()` dict into a pyarrow FileInfo.
        size = info["size"]
        if info["type"] == "file":
            ftype = FileType.File
        elif info["type"] == "directory":
            ftype = FileType.Directory
            # some fsspec filesystems include a file size for directories
            size = None
        else:
            ftype = FileType.Unknown
        return FileInfo(path, ftype, size=size, mtime=info.get("mtime", None))
def get_file_info(self, paths):
infos = []
for path in paths:
try:
info = self.fs.info(path)
except FileNotFoundError:
infos.append(FileInfo(path, FileType.NotFound))
else:
infos.append(self._create_file_info(path, info))
return infos
def get_file_info_selector(self, selector):
if not self.fs.isdir(selector.base_dir):
if self.fs.exists(selector.base_dir):
raise NotADirectoryError(selector.base_dir)
else:
if selector.allow_not_found:
return []
else:
raise FileNotFoundError(selector.base_dir)
if selector.recursive:
maxdepth = None
else:
maxdepth = 1
infos = []
selected_files = self.fs.find(
selector.base_dir, maxdepth=maxdepth, withdirs=True, detail=True
)
for path, info in selected_files.items():
_path = path.strip("/")
base_dir = selector.base_dir.strip("/")
# Need to exclude base directory from selected files if present
# (fsspec filesystems, see GH-37555)
if _path != base_dir:
infos.append(self._create_file_info(path, info))
return infos
def create_dir(self, path, recursive):
# mkdir also raises FileNotFoundError when base directory is not found
try:
self.fs.mkdir(path, create_parents=recursive)
except FileExistsError:
pass
def delete_dir(self, path):
self.fs.rm(path, recursive=True)
def _delete_dir_contents(self, path, missing_dir_ok):
try:
subpaths = self.fs.listdir(path, detail=False)
except FileNotFoundError:
if missing_dir_ok:
return
raise
for subpath in subpaths:
if self.fs.isdir(subpath):
self.fs.rm(subpath, recursive=True)
elif self.fs.isfile(subpath):
self.fs.rm(subpath)
def delete_dir_contents(self, path, missing_dir_ok):
if path.strip("/") == "":
raise ValueError(
"delete_dir_contents called on path '", path, "'")
self._delete_dir_contents(path, missing_dir_ok)
def delete_root_dir_contents(self):
self._delete_dir_contents("/")
def delete_file(self, path):
# fs.rm correctly raises IsADirectoryError when `path` is a directory
# instead of a file and `recursive` is not set to True
if not self.fs.exists(path):
raise FileNotFoundError(path)
self.fs.rm(path)
def move(self, src, dest):
self.fs.mv(src, dest, recursive=True)
def copy_file(self, src, dest):
# fs.copy correctly raises IsADirectoryError when `src` is a directory
# instead of a file
self.fs.copy(src, dest)
# TODO can we read/pass metadata (e.g. Content-Type) in the methods below?
def open_input_stream(self, path):
from pyarrow import PythonFile
if not self.fs.isfile(path):
raise FileNotFoundError(path)
return PythonFile(self.fs.open(path, mode="rb"), mode="r")
def open_input_file(self, path):
from pyarrow import PythonFile
if not self.fs.isfile(path):
raise FileNotFoundError(path)
return PythonFile(self.fs.open(path, mode="rb"), mode="r")
def open_output_stream(self, path, metadata):
from pyarrow import PythonFile
return PythonFile(self.fs.open(path, mode="wb"), mode="w")
def open_append_stream(self, path, metadata):
from pyarrow import PythonFile
return PythonFile(self.fs.open(path, mode="ab"), mode="w")

View File

@@ -0,0 +1,756 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: profile=False
# distutils: language = c++
# cython: language_level = 3
from libcpp.memory cimport shared_ptr
from libcpp.string cimport string as c_string
from libcpp.vector cimport vector as c_vector
from libcpp.unordered_set cimport unordered_set as c_unordered_set
from libc.stdint cimport int64_t, int32_t
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport (DataType, Field, MemoryPool, RecordBatch,
Schema, check_status, pyarrow_wrap_array,
pyarrow_wrap_data_type, ensure_type, _Weakrefable,
pyarrow_wrap_field)
from pyarrow.includes.libgandiva cimport (
CCondition, CGandivaExpression,
CNode, CProjector, CFilter,
CSelectionVector,
_ensure_selection_mode,
CConfiguration,
CConfigurationBuilder,
TreeExprBuilder_MakeExpression,
TreeExprBuilder_MakeFunction,
TreeExprBuilder_MakeBoolLiteral,
TreeExprBuilder_MakeUInt8Literal,
TreeExprBuilder_MakeUInt16Literal,
TreeExprBuilder_MakeUInt32Literal,
TreeExprBuilder_MakeUInt64Literal,
TreeExprBuilder_MakeInt8Literal,
TreeExprBuilder_MakeInt16Literal,
TreeExprBuilder_MakeInt32Literal,
TreeExprBuilder_MakeInt64Literal,
TreeExprBuilder_MakeFloatLiteral,
TreeExprBuilder_MakeDoubleLiteral,
TreeExprBuilder_MakeStringLiteral,
TreeExprBuilder_MakeBinaryLiteral,
TreeExprBuilder_MakeField,
TreeExprBuilder_MakeIf,
TreeExprBuilder_MakeAnd,
TreeExprBuilder_MakeOr,
TreeExprBuilder_MakeCondition,
TreeExprBuilder_MakeInExpressionInt32,
TreeExprBuilder_MakeInExpressionInt64,
TreeExprBuilder_MakeInExpressionTime32,
TreeExprBuilder_MakeInExpressionTime64,
TreeExprBuilder_MakeInExpressionDate32,
TreeExprBuilder_MakeInExpressionDate64,
TreeExprBuilder_MakeInExpressionTimeStamp,
TreeExprBuilder_MakeInExpressionString,
SelectionVector_MakeInt16,
SelectionVector_MakeInt32,
SelectionVector_MakeInt64,
Projector_Make,
Filter_Make,
CFunctionSignature,
GetRegisteredFunctionSignatures)
cdef class Node(_Weakrefable):
    """A node of a Gandiva expression tree; wraps a gandiva::Node."""
    cdef:
        shared_ptr[CNode] node
    def __init__(self):
        raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly, use the "
                        "TreeExprBuilder API directly")
    @staticmethod
    cdef create(shared_ptr[CNode] node):
        # Internal factory: __init__ always raises, so build via __new__.
        cdef Node self = Node.__new__(Node)
        self.node = node
        return self
    def __str__(self):
        return self.node.get().ToString().decode()
    def __repr__(self):
        type_format = object.__repr__(self)
        return f"{type_format}\n{self}"
    def return_type(self):
        # DataType produced when this node is evaluated.
        return pyarrow_wrap_data_type(self.node.get().return_type())
cdef class Expression(_Weakrefable):
    """A Gandiva projection expression: a root Node plus a result Field."""
    cdef:
        shared_ptr[CGandivaExpression] expression
    cdef void init(self, shared_ptr[CGandivaExpression] expression):
        # Two-phase init used by TreeExprBuilder.make_expression.
        self.expression = expression
    def __str__(self):
        return self.expression.get().ToString().decode()
    def __repr__(self):
        type_format = object.__repr__(self)
        return f"{type_format}\n{self}"
    def root(self):
        # Root node of the expression tree.
        return Node.create(self.expression.get().root())
    def result(self):
        # Field describing the expression's output column.
        return pyarrow_wrap_field(self.expression.get().result())
cdef class Condition(_Weakrefable):
    """A boolean filter condition; wraps a gandiva::Condition."""
    cdef:
        shared_ptr[CCondition] condition
    def __init__(self):
        raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly, use the "
                        "TreeExprBuilder API instead")
    @staticmethod
    cdef create(shared_ptr[CCondition] condition):
        # Internal factory: __init__ always raises, so build via __new__.
        cdef Condition self = Condition.__new__(Condition)
        self.condition = condition
        return self
    def __str__(self):
        return self.condition.get().ToString().decode()
    def __repr__(self):
        type_format = object.__repr__(self)
        return f"{type_format}\n{self}"
    def root(self):
        # Root node of the condition's expression tree.
        return Node.create(self.condition.get().root())
    def result(self):
        # Field describing the (boolean) result of the condition.
        return pyarrow_wrap_field(self.condition.get().result())
cdef class SelectionVector(_Weakrefable):
    """Indices of rows selected by a Filter; wraps a gandiva::SelectionVector."""
    cdef:
        shared_ptr[CSelectionVector] selection_vector
    def __init__(self):
        raise TypeError(
            f"Do not call {self.__class__.__name__}'s constructor directly.")
    @staticmethod
    cdef create(shared_ptr[CSelectionVector] selection_vector):
        # Internal factory: __init__ always raises, so build via __new__.
        cdef SelectionVector self = SelectionVector.__new__(SelectionVector)
        self.selection_vector = selection_vector
        return self
    def to_array(self):
        # Materialize the selected row indices as a pyarrow integer Array.
        cdef shared_ptr[CArray] result = self.selection_vector.get().ToArray()
        return pyarrow_wrap_array(result)
cdef class Projector(_Weakrefable):
    """A compiled projection; evaluates expressions over record batches.
    Build instances with make_projector, not the constructor.
    """
    cdef:
        shared_ptr[CProjector] projector
        MemoryPool pool
    def __init__(self):
        raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly, use "
                        "make_projector instead")
    @staticmethod
    cdef create(shared_ptr[CProjector] projector, MemoryPool pool):
        # Internal factory: __init__ always raises, so build via __new__.
        cdef Projector self = Projector.__new__(Projector)
        self.projector = projector
        self.pool = pool
        return self
    @property
    def llvm_ir(self):
        # LLVM IR generated for the compiled expressions (debugging aid).
        return self.projector.get().DumpIR().decode()
    def evaluate(self, RecordBatch batch, SelectionVector selection=None):
        """
        Evaluate the specified record batch and return the arrays at the
        filtered positions.
        Parameters
        ----------
        batch : pyarrow.RecordBatch
        selection : pyarrow.gandiva.SelectionVector
        Returns
        -------
        list[pyarrow.Array]
        """
        cdef vector[shared_ptr[CArray]] results
        # Without a selection vector, evaluate over every row of the batch;
        # otherwise only at the positions listed in `selection`.
        if selection is None:
            check_status(self.projector.get().Evaluate(
                batch.sp_batch.get()[0], self.pool.pool, &results))
        else:
            check_status(
                self.projector.get().Evaluate(
                    batch.sp_batch.get()[0], selection.selection_vector.get(),
                    self.pool.pool, &results))
        cdef shared_ptr[CArray] result
        arrays = []
        for result in results:
            arrays.append(pyarrow_wrap_array(result))
        return arrays
cdef class Filter(_Weakrefable):
    """A compiled filter; evaluates a Condition over record batches.
    Build instances with make_filter, not the constructor.
    """
    cdef:
        shared_ptr[CFilter] filter
    def __init__(self):
        raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly, use "
                        "make_filter instead")
    @staticmethod
    cdef create(shared_ptr[CFilter] filter):
        # Internal factory: __init__ always raises, so build via __new__.
        cdef Filter self = Filter.__new__(Filter)
        self.filter = filter
        return self
    @property
    def llvm_ir(self):
        # LLVM IR generated for the compiled condition (debugging aid).
        return self.filter.get().DumpIR().decode()
    def evaluate(self, RecordBatch batch, MemoryPool pool, dtype='int32'):
        """
        Evaluate the specified record batch and return a selection vector.
        Parameters
        ----------
        batch : pyarrow.RecordBatch
        pool : MemoryPool
        dtype : DataType or str, default int32
        Returns
        -------
        pyarrow.gandiva.SelectionVector
        """
        cdef:
            DataType type = ensure_type(dtype)
            shared_ptr[CSelectionVector] selection
        # The selection vector's index width must be chosen up front;
        # num_rows bounds the maximum number of selected positions.
        if type.id == _Type_INT16:
            check_status(SelectionVector_MakeInt16(
                batch.num_rows, pool.pool, &selection))
        elif type.id == _Type_INT32:
            check_status(SelectionVector_MakeInt32(
                batch.num_rows, pool.pool, &selection))
        elif type.id == _Type_INT64:
            check_status(SelectionVector_MakeInt64(
                batch.num_rows, pool.pool, &selection))
        else:
            raise ValueError("'dtype' of the selection vector should be "
                             "one of 'int16', 'int32' and 'int64'.")
        check_status(self.filter.get().Evaluate(
            batch.sp_batch.get()[0], selection))
        return SelectionVector.create(selection)
cdef class TreeExprBuilder(_Weakrefable):
    """Builder API for constructing Gandiva expression trees.
    All factory methods return Node / Expression / Condition wrappers;
    the builder itself holds no state.
    """
    def make_literal(self, value, dtype):
        """
        Create a node on a literal.
        Parameters
        ----------
        value : a literal value
        dtype : DataType
        Returns
        -------
        pyarrow.gandiva.Node
        """
        cdef:
            DataType type = ensure_type(dtype)
            shared_ptr[CNode] r
        # Dispatch on the Arrow type id to the matching typed literal factory.
        if type.id == _Type_BOOL:
            r = TreeExprBuilder_MakeBoolLiteral(value)
        elif type.id == _Type_UINT8:
            r = TreeExprBuilder_MakeUInt8Literal(value)
        elif type.id == _Type_UINT16:
            r = TreeExprBuilder_MakeUInt16Literal(value)
        elif type.id == _Type_UINT32:
            r = TreeExprBuilder_MakeUInt32Literal(value)
        elif type.id == _Type_UINT64:
            r = TreeExprBuilder_MakeUInt64Literal(value)
        elif type.id == _Type_INT8:
            r = TreeExprBuilder_MakeInt8Literal(value)
        elif type.id == _Type_INT16:
            r = TreeExprBuilder_MakeInt16Literal(value)
        elif type.id == _Type_INT32:
            r = TreeExprBuilder_MakeInt32Literal(value)
        elif type.id == _Type_INT64:
            r = TreeExprBuilder_MakeInt64Literal(value)
        elif type.id == _Type_FLOAT:
            r = TreeExprBuilder_MakeFloatLiteral(value)
        elif type.id == _Type_DOUBLE:
            r = TreeExprBuilder_MakeDoubleLiteral(value)
        elif type.id == _Type_STRING:
            # String literals cross into C++ as UTF-8 bytes.
            r = TreeExprBuilder_MakeStringLiteral(value.encode('UTF-8'))
        elif type.id == _Type_BINARY:
            r = TreeExprBuilder_MakeBinaryLiteral(value)
        else:
            raise TypeError("Didn't recognize dtype " + str(dtype))
        return Node.create(r)
    def make_expression(self, Node root_node not None,
                        Field return_field not None):
        """
        Create an expression with the specified root_node,
        and the result written to result_field.
        Parameters
        ----------
        root_node : pyarrow.gandiva.Node
        return_field : pyarrow.Field
        Returns
        -------
        pyarrow.gandiva.Expression
        """
        cdef shared_ptr[CGandivaExpression] r = TreeExprBuilder_MakeExpression(
            root_node.node, return_field.sp_field)
        cdef Expression expression = Expression()
        expression.init(r)
        return expression
    def make_function(self, name, children, DataType return_type):
        """
        Create a node with a function.
        Parameters
        ----------
        name : str
        children : pyarrow.gandiva.NodeVector
        return_type : DataType
        Returns
        -------
        pyarrow.gandiva.Node
        """
        cdef c_vector[shared_ptr[CNode]] c_children
        cdef Node child
        for child in children:
            if child is None:
                raise TypeError("Child nodes must not be None")
            c_children.push_back(child.node)
        cdef shared_ptr[CNode] r = TreeExprBuilder_MakeFunction(
            name.encode(), c_children, return_type.sp_type)
        return Node.create(r)
    def make_field(self, Field field not None):
        """
        Create a node with an Arrow field.
        Parameters
        ----------
        field : pyarrow.Field
        Returns
        -------
        pyarrow.gandiva.Node
        """
        cdef shared_ptr[CNode] r = TreeExprBuilder_MakeField(field.sp_field)
        return Node.create(r)
    def make_if(self, Node condition not None, Node this_node not None,
                Node else_node not None, DataType return_type not None):
        """
        Create a node with an if-else expression.
        Parameters
        ----------
        condition : pyarrow.gandiva.Node
        this_node : pyarrow.gandiva.Node
        else_node : pyarrow.gandiva.Node
        return_type : DataType
        Returns
        -------
        pyarrow.gandiva.Node
        """
        cdef shared_ptr[CNode] r = TreeExprBuilder_MakeIf(
            condition.node, this_node.node, else_node.node,
            return_type.sp_type)
        return Node.create(r)
    def make_and(self, children):
        """
        Create a Node with a boolean AND expression.
        Parameters
        ----------
        children : list[pyarrow.gandiva.Node]
        Returns
        -------
        pyarrow.gandiva.Node
        """
        cdef c_vector[shared_ptr[CNode]] c_children
        cdef Node child
        for child in children:
            if child is None:
                raise TypeError("Child nodes must not be None")
            c_children.push_back(child.node)
        cdef shared_ptr[CNode] r = TreeExprBuilder_MakeAnd(c_children)
        return Node.create(r)
    def make_or(self, children):
        """
        Create a Node with a boolean OR expression.
        Parameters
        ----------
        children : list[pyarrow.gandiva.Node]
        Returns
        -------
        pyarrow.gandiva.Node
        """
        cdef c_vector[shared_ptr[CNode]] c_children
        cdef Node child
        for child in children:
            if child is None:
                raise TypeError("Child nodes must not be None")
            c_children.push_back(child.node)
        cdef shared_ptr[CNode] r = TreeExprBuilder_MakeOr(c_children)
        return Node.create(r)
    # The typed _make_in_expression_* helpers below copy the Python values
    # into a C++ unordered_set of the matching width before delegating to
    # the corresponding Gandiva builder. They are dispatched to from
    # make_in_expression based on dtype.
    def _make_in_expression_int32(self, Node node not None, values):
        cdef shared_ptr[CNode] r
        cdef c_unordered_set[int32_t] c_values
        cdef int32_t v
        for v in values:
            c_values.insert(v)
        r = TreeExprBuilder_MakeInExpressionInt32(node.node, c_values)
        return Node.create(r)
    def _make_in_expression_int64(self, Node node not None, values):
        cdef shared_ptr[CNode] r
        cdef c_unordered_set[int64_t] c_values
        cdef int64_t v
        for v in values:
            c_values.insert(v)
        r = TreeExprBuilder_MakeInExpressionInt64(node.node, c_values)
        return Node.create(r)
    def _make_in_expression_time32(self, Node node not None, values):
        cdef shared_ptr[CNode] r
        cdef c_unordered_set[int32_t] c_values
        cdef int32_t v
        for v in values:
            c_values.insert(v)
        r = TreeExprBuilder_MakeInExpressionTime32(node.node, c_values)
        return Node.create(r)
    def _make_in_expression_time64(self, Node node not None, values):
        cdef shared_ptr[CNode] r
        cdef c_unordered_set[int64_t] c_values
        cdef int64_t v
        for v in values:
            c_values.insert(v)
        r = TreeExprBuilder_MakeInExpressionTime64(node.node, c_values)
        return Node.create(r)
    def _make_in_expression_date32(self, Node node not None, values):
        cdef shared_ptr[CNode] r
        cdef c_unordered_set[int32_t] c_values
        cdef int32_t v
        for v in values:
            c_values.insert(v)
        r = TreeExprBuilder_MakeInExpressionDate32(node.node, c_values)
        return Node.create(r)
    def _make_in_expression_date64(self, Node node not None, values):
        cdef shared_ptr[CNode] r
        cdef c_unordered_set[int64_t] c_values
        cdef int64_t v
        for v in values:
            c_values.insert(v)
        r = TreeExprBuilder_MakeInExpressionDate64(node.node, c_values)
        return Node.create(r)
    def _make_in_expression_timestamp(self, Node node not None, values):
        cdef shared_ptr[CNode] r
        cdef c_unordered_set[int64_t] c_values
        cdef int64_t v
        for v in values:
            c_values.insert(v)
        r = TreeExprBuilder_MakeInExpressionTimeStamp(node.node, c_values)
        return Node.create(r)
    def _make_in_expression_binary(self, Node node not None, values):
        # Binary values are already bytes; no encoding step needed.
        cdef shared_ptr[CNode] r
        cdef c_unordered_set[c_string] c_values
        cdef c_string v
        for v in values:
            c_values.insert(v)
        r = TreeExprBuilder_MakeInExpressionString(node.node, c_values)
        return Node.create(r)
    def _make_in_expression_string(self, Node node not None, values):
        # Python str values are encoded to UTF-8 before insertion.
        cdef shared_ptr[CNode] r
        cdef c_unordered_set[c_string] c_values
        cdef c_string _v
        for v in values:
            _v = v.encode('UTF-8')
            c_values.insert(_v)
        r = TreeExprBuilder_MakeInExpressionString(node.node, c_values)
        return Node.create(r)
    def make_in_expression(self, Node node not None, values, dtype):
        """
        Create a Node with an IN expression.
        Parameters
        ----------
        node : pyarrow.gandiva.Node
        values : iterable
        dtype : DataType
        Returns
        -------
        pyarrow.gandiva.Node
        """
        cdef DataType type = ensure_type(dtype)
        if type.id == _Type_INT32:
            return self._make_in_expression_int32(node, values)
        elif type.id == _Type_INT64:
            return self._make_in_expression_int64(node, values)
        elif type.id == _Type_TIME32:
            return self._make_in_expression_time32(node, values)
        elif type.id == _Type_TIME64:
            return self._make_in_expression_time64(node, values)
        elif type.id == _Type_TIMESTAMP:
            return self._make_in_expression_timestamp(node, values)
        elif type.id == _Type_DATE32:
            return self._make_in_expression_date32(node, values)
        elif type.id == _Type_DATE64:
            return self._make_in_expression_date64(node, values)
        elif type.id == _Type_BINARY:
            return self._make_in_expression_binary(node, values)
        elif type.id == _Type_STRING:
            return self._make_in_expression_string(node, values)
        else:
            raise TypeError("Data type " + str(dtype) + " not supported.")
    def make_condition(self, Node condition not None):
        """
        Create a condition with the specified node.
        Parameters
        ----------
        condition : pyarrow.gandiva.Node
        Returns
        -------
        pyarrow.gandiva.Condition
        """
        cdef shared_ptr[CCondition] r = TreeExprBuilder_MakeCondition(
            condition.node)
        return Condition.create(r)
cdef class Configuration(_Weakrefable):
    """Compilation options for Gandiva projectors and filters."""
    cdef:
        shared_ptr[CConfiguration] configuration
    def __cinit__(self, bint optimize=True, bint dump_ir=False):
        """
        Initialize the configuration with specified options.
        Parameters
        ----------
        optimize : bool, default True
            Whether to enable optimizations.
        dump_ir : bool, default False
            Whether to dump LLVM IR.
        """
        # Start from the builder's defaults, then apply the requested flags.
        self.configuration = CConfigurationBuilder().build()
        self.configuration.get().set_optimize(optimize)
        self.configuration.get().set_dump_ir(dump_ir)
    @staticmethod
    cdef create(shared_ptr[CConfiguration] configuration):
        """
        Create a Configuration instance from an existing CConfiguration pointer.
        Parameters
        ----------
        configuration : shared_ptr[CConfiguration]
            Existing CConfiguration pointer.
        Returns
        -------
        Configuration instance
        """
        cdef Configuration self = Configuration.__new__(Configuration)
        self.configuration = configuration
        return self
cpdef make_projector(Schema schema, children, MemoryPool pool,
                     str selection_mode="NONE",
                     Configuration configuration=None):
    """
    Construct a projection using expressions.
    A projector is built for a specific schema and vector of expressions.
    Once the projector is built, it can be used to evaluate many row batches.
    Parameters
    ----------
    schema : pyarrow.Schema
        Schema for the record batches, and the expressions.
    children : list[pyarrow.gandiva.Expression]
        List of projectable expression objects.
    pool : pyarrow.MemoryPool
        Memory pool used to allocate output arrays.
    selection_mode : str, default "NONE"
        Possible values are NONE, UINT16, UINT32, UINT64.
    configuration : pyarrow.gandiva.Configuration, default None
        Configuration for the projector.
    Returns
    -------
    Projector instance
    """
    cdef:
        Expression child
        c_vector[shared_ptr[CGandivaExpression]] c_children
        shared_ptr[CProjector] result
    # Fall back to default compilation options when none are supplied.
    if configuration is None:
        configuration = Configuration()
    for child in children:
        if child is None:
            raise TypeError("Expressions must not be None")
        c_children.push_back(child.expression)
    check_status(
        Projector_Make(schema.sp_schema, c_children,
                       _ensure_selection_mode(selection_mode),
                       configuration.configuration,
                       &result))
    return Projector.create(result, pool)
cpdef make_filter(Schema schema, Condition condition,
                  Configuration configuration=None):
    """
    Construct a filter based on a condition.
    A filter is built for a specific schema and condition. Once the filter is
    built, it can be used to evaluate many row batches.
    Parameters
    ----------
    schema : pyarrow.Schema
        Schema for the record batches, and the condition.
    condition : pyarrow.gandiva.Condition
        Filter condition.
    configuration : pyarrow.gandiva.Configuration, default None
        Configuration for the filter.
    Returns
    -------
    Filter instance
    """
    cdef shared_ptr[CFilter] result
    if condition is None:
        raise TypeError("Condition must not be None")
    # Fall back to default compilation options when none are supplied.
    if configuration is None:
        configuration = Configuration()
    check_status(
        Filter_Make(schema.sp_schema, condition.condition, configuration.configuration, &result))
    return Filter.create(result)
cdef class FunctionSignature(_Weakrefable):
    """
    Signature of a Gandiva function including name, parameter types
    and return type.
    """
    cdef:
        shared_ptr[CFunctionSignature] signature
    def __init__(self):
        raise TypeError(
            f"Do not call {self.__class__.__name__}'s constructor directly.")
    @staticmethod
    cdef create(shared_ptr[CFunctionSignature] signature):
        # Internal factory: __init__ always raises, so build via __new__.
        cdef FunctionSignature self = FunctionSignature.__new__(
            FunctionSignature)
        self.signature = signature
        return self
    def return_type(self):
        # DataType returned by the function.
        return pyarrow_wrap_data_type(self.signature.get().ret_type())
    def param_types(self):
        # List of DataType objects, one per function parameter.
        result = []
        cdef vector[shared_ptr[CDataType]] types = \
            self.signature.get().param_types()
        for t in types:
            result.append(pyarrow_wrap_data_type(t))
        return result
    def name(self):
        # Base name of the function (without type decoration).
        return self.signature.get().base_name().decode()
    def __repr__(self):
        signature = self.signature.get().ToString().decode()
        return "FunctionSignature(" + signature + ")"
def get_registered_function_signatures():
    """
    Return the signatures of the functions registered in Gandiva's
    ExpressionRegistry.
    Returns
    -------
    registry: a list of registered function signatures
    """
    results = []
    cdef vector[shared_ptr[CFunctionSignature]] signatures = \
        GetRegisteredFunctionSignatures()
    for signature in signatures:
        results.append(FunctionSignature.create(signature))
    return results

View File

@@ -0,0 +1,162 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <functional>
#include <memory>
#include <optional>
#include <vector>
#include "arrow/acero/visibility.h"
#include "arrow/compute/exec.h"
#include "arrow/result.h"
namespace arrow {
namespace acero {
namespace util {
using arrow::compute::ExecBatch;
/// \brief A container that accumulates batches until they are ready to
/// be processed.
class ARROW_ACERO_EXPORT AccumulationQueue {
 public:
  AccumulationQueue() : row_count_(0) {}
  ~AccumulationQueue() = default;
  // We should never be copying ExecBatch around
  AccumulationQueue(const AccumulationQueue&) = delete;
  AccumulationQueue& operator=(const AccumulationQueue&) = delete;
  AccumulationQueue(AccumulationQueue&& that);
  AccumulationQueue& operator=(AccumulationQueue&& that);
  /// Append all batches of `that` to this queue (implementation elsewhere).
  void Concatenate(AccumulationQueue&& that);
  void InsertBatch(ExecBatch batch);
  /// Total number of rows across all accumulated batches.
  int64_t row_count() { return row_count_; }
  size_t batch_count() { return batches_.size(); }
  bool empty() const { return batches_.empty(); }
  void Clear();
  /// Unchecked access to the i-th accumulated batch.
  ExecBatch& operator[](size_t i);
 private:
  // Running sum kept in step with batches_ by InsertBatch/Clear.
  int64_t row_count_;
  std::vector<ExecBatch> batches_;
};
/// A queue that sequences incoming batches
///
/// This can be used when a node needs to do some kind of ordered processing on
/// the stream.
///
/// Batches can be inserted in any order. The process_callback will be called on
/// the batches, in order, without reentrant calls. For this reason the callback
/// should be quick.
///
/// For example, in a top-n node, the process callback should determine how many
/// rows need to be delivered for the given batch, and then return a task to actually
/// deliver those rows.
class ARROW_ACERO_EXPORT SequencingQueue {
 public:
  using Task = std::function<Status()>;
  /// Strategy that describes how to handle items
  class Processor {
   public:
    // Processors are used polymorphically (via Processor*); a virtual
    // destructor is required for safe deletion through the base pointer.
    virtual ~Processor() = default;
    /// Process the batch, potentially generating a task
    ///
    /// This method will be called on each batch in order. Calls to this method
    /// will be serialized and it will not be called reentrantly. This makes it
    /// safe to do things that rely on order but minimal time should be spent here
    /// to avoid becoming a bottleneck.
    ///
    /// \return a follow-up task that will be scheduled. The follow-up task(s)
    ///         are not guaranteed to run in any particular order. If nullopt is
    ///         returned then nothing will be scheduled.
    virtual Result<std::optional<Task>> Process(ExecBatch batch) = 0;
    /// Schedule a task
    virtual void Schedule(Task task) = 0;
  };
  virtual ~SequencingQueue() = default;
  /// Insert a batch into the queue
  ///
  /// This will insert the batch into the queue. If this batch was the next batch
  /// to deliver then this will trigger 1+ calls to the process callback to generate
  /// 1+ tasks.
  ///
  /// The task generated by this call will be executed immediately. The remaining
  /// tasks will be scheduled using the schedule callback.
  ///
  /// From a data pipeline perspective the sequencing queue is a "sometimes" breaker. If
  /// a task arrives in order then this call will usually execute the downstream pipeline.
  /// If this task arrives early then this call will only queue the data.
  virtual Status InsertBatch(ExecBatch batch) = 0;
  /// Create a queue
  /// \param processor describes how to process the batches, must outlive the queue
  static std::unique_ptr<SequencingQueue> Make(Processor* processor);
};
/// A queue that sequences incoming batches
///
/// Unlike SequencingQueue the Process method is not expected to schedule new tasks.
///
/// If a batch arrives and another thread is currently processing then the batch
/// will be queued and control will return. In other words, delivery of batches will
/// not block on the Process method.
///
/// It can be helpful to think of this as if a dedicated thread is running Process as
/// batches arrive
class ARROW_ACERO_EXPORT SerialSequencingQueue {
 public:
  /// Strategy that describes how to handle items
  class Processor {
   public:
    virtual ~Processor() = default;
    /// Process the batch
    ///
    /// This method will be called on each batch in order. Calls to this method
    /// will be serialized and it will not be called reentrantly. This makes it
    /// safe to do things that rely on order.
    ///
    /// If this falls behind then data may accumulate
    ///
    /// TODO: Could add backpressure if needed but right now all uses of this should
    /// be pretty fast and so are unlikely to block.
    virtual Status Process(ExecBatch batch) = 0;
  };
  virtual ~SerialSequencingQueue() = default;
  /// Insert a batch into the queue
  ///
  /// This will insert the batch into the queue. If this batch was the next batch
  /// to deliver then this may trigger calls to the processor which will be run
  /// as part of this call.
  ///
  /// NOTE(review): batches appear to be identified by their sequence position;
  /// the exact ordering key is determined by the implementation in the .cc file.
  virtual Status InsertBatch(ExecBatch batch) = 0;
  /// Create a queue
  /// \param processor describes how to process the batches, must outlive the queue
  static std::unique_ptr<SerialSequencingQueue> Make(Processor* processor);
};
} // namespace util
} // namespace acero
} // namespace arrow

Some files were not shown because too many files have changed in this diff Show More