Initial commit
venv/lib/python3.10/site-packages/pyarrow/interchange/__init__.py (new file, 20 lines)
@@ -0,0 +1,20 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# flake8: noqa

from .from_dataframe import from_dataframe
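
A minimal usage sketch of the public entry point this ``__init__`` re-exports
(``pandas`` and the ``pdf`` frame below are illustrative assumptions, not part
of the commit):

    import pandas as pd
    import pyarrow.interchange as pi

    # Any object implementing __dataframe__ can be converted to a pa.Table.
    pdf = pd.DataFrame({"x": [1, 2, 3]})
    table = pi.from_dataframe(pdf)
    print(table.column_names)  # ['x']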
venv/lib/python3.10/site-packages/pyarrow/interchange/buffer.py (new file, 107 lines)
@@ -0,0 +1,107 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import annotations

import enum

import pyarrow as pa


class DlpackDeviceType(enum.IntEnum):
    """Integer enum for device type codes matching DLPack."""

    CPU = 1
    CUDA = 2
    CPU_PINNED = 3
    OPENCL = 4
    VULKAN = 7
    METAL = 8
    VPI = 9
    ROCM = 10


class _PyArrowBuffer:
    """
    Data in the buffer is guaranteed to be contiguous in memory.

    Note that there is no dtype attribute present, a buffer can be thought of
    as simply a block of memory. However, if the column that the buffer is
    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
    implemented, then that dtype information will be contained in the return
    value from ``__dlpack__``.

    This distinction is useful to support both (a) data exchange via DLPack
    on a buffer and (b) dtypes like variable-length strings which do not have
    a fixed number of bytes per element.
    """

    def __init__(self, x: pa.Buffer, allow_copy: bool = True) -> None:
        """
        Handle PyArrow Buffers.
        """
        self._x = x

    @property
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """
        return self._x.size

    @property
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.
        """
        return self._x.address

    def __dlpack__(self):
        """
        Produce DLPack capsule (see array API standard).

        Raises:
            - TypeError : if the buffer contains unsupported dtypes.
            - NotImplementedError : if DLPack support is not implemented

        Useful to have to connect to array libraries. Support optional because
        it's not completely trivial to implement for a Python-only library.
        """
        raise NotImplementedError("__dlpack__")

    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        """
        Device type and device ID for where the data in the buffer resides.
        Uses device type codes matching DLPack.
        Note: must be implemented even if ``__dlpack__`` is not.
        """
        if self._x.is_cpu:
            return (DlpackDeviceType.CPU, None)
        else:
            raise NotImplementedError("__dlpack_device__")

    def __repr__(self) -> str:
        return (
            "PyArrowBuffer(" +
            str(
                {
                    "bufsize": self.bufsize,
                    "ptr": self.ptr,
                    "device": self.__dlpack_device__()[0].name,
                }
            ) +
            ")"
        )
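
A minimal sketch of the buffer wrapper above in use, assuming a CPU-resident
pyarrow buffer (the input bytes are illustrative):

    import pyarrow as pa
    from pyarrow.interchange.buffer import _PyArrowBuffer

    buf = _PyArrowBuffer(pa.py_buffer(b"\x01\x02\x03\x04"))
    print(buf.bufsize)              # 4
    print(hex(buf.ptr))             # address of the underlying memory
    print(buf.__dlpack_device__())  # (<DlpackDeviceType.CPU: 1>, None)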
venv/lib/python3.10/site-packages/pyarrow/interchange/column.py (new file, 529 lines)
@@ -0,0 +1,529 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import annotations

import enum
from typing import (
    Any,
    Dict,
    Iterable,
    Optional,
    Tuple,
)

import sys
if sys.version_info >= (3, 8):
    from typing import TypedDict
else:
    from typing_extensions import TypedDict

import pyarrow as pa
import pyarrow.compute as pc
from pyarrow.interchange.buffer import _PyArrowBuffer


class DtypeKind(enum.IntEnum):
    """
    Integer enum for data types.

    Attributes
    ----------
    INT : int
        Matches to signed integer data type.
    UINT : int
        Matches to unsigned integer data type.
    FLOAT : int
        Matches to floating point data type.
    BOOL : int
        Matches to boolean data type.
    STRING : int
        Matches to string data type (UTF-8 encoded).
    DATETIME : int
        Matches to datetime data type.
    CATEGORICAL : int
        Matches to categorical data type.
    """

    INT = 0
    UINT = 1
    FLOAT = 2
    BOOL = 20
    STRING = 21  # UTF-8
    DATETIME = 22
    CATEGORICAL = 23


Dtype = Tuple[DtypeKind, int, str, str]  # see Column.dtype


_PYARROW_KINDS = {
    pa.int8(): (DtypeKind.INT, "c"),
    pa.int16(): (DtypeKind.INT, "s"),
    pa.int32(): (DtypeKind.INT, "i"),
    pa.int64(): (DtypeKind.INT, "l"),
    pa.uint8(): (DtypeKind.UINT, "C"),
    pa.uint16(): (DtypeKind.UINT, "S"),
    pa.uint32(): (DtypeKind.UINT, "I"),
    pa.uint64(): (DtypeKind.UINT, "L"),
    pa.float16(): (DtypeKind.FLOAT, "e"),
    pa.float32(): (DtypeKind.FLOAT, "f"),
    pa.float64(): (DtypeKind.FLOAT, "g"),
    pa.bool_(): (DtypeKind.BOOL, "b"),
    pa.string(): (DtypeKind.STRING, "u"),
    pa.large_string(): (DtypeKind.STRING, "U"),
}
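
# For illustration (derived from the table above): the second element of each
# entry is the Apache Arrow C data interface format character for that type,
# e.g. _PYARROW_KINDS[pa.int64()] == (DtypeKind.INT, "l") and
# _PYARROW_KINDS[pa.string()] == (DtypeKind.STRING, "u").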


class ColumnNullType(enum.IntEnum):
    """
    Integer enum for null type representation.

    Attributes
    ----------
    NON_NULLABLE : int
        Non-nullable column.
    USE_NAN : int
        Use explicit float NaN value.
    USE_SENTINEL : int
        Sentinel value besides NaN.
    USE_BITMASK : int
        The bit is set/unset representing a null on a certain position.
    USE_BYTEMASK : int
        The byte is set/unset representing a null on a certain position.
    """

    NON_NULLABLE = 0
    USE_NAN = 1
    USE_SENTINEL = 2
    USE_BITMASK = 3
    USE_BYTEMASK = 4


class ColumnBuffers(TypedDict):
    # first element is a buffer containing the column data;
    # second element is the data buffer's associated dtype
    data: Tuple[_PyArrowBuffer, Dtype]

    # first element is a buffer containing mask values indicating missing data;
    # second element is the mask value buffer's associated dtype.
    # None if the null representation is not a bit or byte mask
    validity: Optional[Tuple[_PyArrowBuffer, Dtype]]

    # first element is a buffer containing the offset values for
    # variable-size binary data (e.g., variable-length strings);
    # second element is the offsets buffer's associated dtype.
    # None if the data buffer does not have an associated offsets buffer
    offsets: Optional[Tuple[_PyArrowBuffer, Dtype]]


class CategoricalDescription(TypedDict):
    # whether the ordering of dictionary indices is semantically meaningful
    is_ordered: bool
    # whether a dictionary-style mapping of categorical values to other objects
    # exists
    is_dictionary: bool
    # Python-level only (e.g. ``{int: str}``).
    # None if not a dictionary-style categorical.
    categories: Optional[_PyArrowColumn]


class Endianness:
    """Enum indicating the byte-order of a data-type."""

    LITTLE = "<"
    BIG = ">"
    NATIVE = "="
    NA = "|"


class NoBufferPresent(Exception):
    """Exception to signal that there is no requested buffer."""


class _PyArrowColumn:
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.

    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).

    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
         Instead, it seems to use "children" for both columns with a bit mask,
         and for nested dtypes. Unclear whether this is elegant or confusing.
         This design requires checking the null representation explicitly.

         The Arrow design requires checking:
         1. the ARROW_FLAG_NULLABLE (for sentinel values)
         2. if a column has two children, combined with one of those children
            having a null dtype.

         Making the mask concept explicit seems useful. One null dtype would
         not be enough to cover both bit and byte masks, so that would mean
         even more checking if we did it the Arrow way.

    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
         multiple buffers per array (= column here). Semantically it may make
         sense to have both: chunks were meant for example for lazy evaluation
         of data which doesn't fit in memory, while multiple buffers per column
         could also come from doing a selection operation on a single
         contiguous buffer.

         Given these concepts, one would expect chunks to be all of the same
         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
         while multiple buffers could have data-dependent lengths. Not an issue
         in pandas if one column is backed by a single NumPy array, but in
         Arrow it seems possible.
         Are multiple chunks *and* multiple buffers per column necessary for
         the purposes of this interchange protocol, or must producers either
         reuse the chunk concept for this or copy the data?

    Note: this Column object can only be produced by ``__dataframe__``, so
          doesn't need its own version or ``__column__`` protocol.
    """

    def __init__(
        self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True
    ) -> None:
        """
        Handles PyArrow Arrays and ChunkedArrays.
        """
        # Store the column as a private attribute
        if isinstance(column, pa.ChunkedArray):
            if column.num_chunks == 1:
                column = column.chunk(0)
            else:
                if not allow_copy:
                    raise RuntimeError(
                        "Chunks will be combined and a copy is required which "
                        "is forbidden by allow_copy=False"
                    )
                column = column.combine_chunks()

        self._allow_copy = allow_copy

        if pa.types.is_boolean(column.type):
            if not allow_copy:
                raise RuntimeError(
                    "Boolean column will be casted to uint8 and a copy "
                    "is required which is forbidden by allow_copy=False"
                )
            self._dtype = self._dtype_from_arrowdtype(column.type, 8)
            self._col = pc.cast(column, pa.uint8())
        else:
            self._col = column
            dtype = self._col.type
            try:
                bit_width = dtype.bit_width
            except ValueError:
                # in case of variable-length strings, considered as an array
                # of bytes (8 bits)
                bit_width = 8
            self._dtype = self._dtype_from_arrowdtype(dtype, bit_width)

    def size(self) -> int:
        """
        Size of the column, in elements.

        Corresponds to DataFrame.num_rows() if column is a single chunk;
        equal to size of this current chunk otherwise.

        Is a method rather than a property because it may cause a (potentially
        expensive) computation for some dataframe implementations.
        """
        return len(self._col)

    @property
    def offset(self) -> int:
        """
        Offset of first element.

        May be > 0 if using chunks; for example for a column with N chunks of
        equal size M (only the last chunk may be shorter),
        ``offset = n * M``, ``n = 0 .. N-1``.
        """
        return self._col.offset

    @property
    def dtype(self) -> Tuple[DtypeKind, int, str, str]:
        """
        Dtype description as a tuple ``(kind, bit-width, format string,
        endianness)``.

        Bit-width : the number of bits as an integer
        Format string : data type description format string in Apache Arrow C
                        Data Interface format.
        Endianness : currently only native endianness (``=``) is supported

        Notes:
            - Kind specifiers are aligned with DLPack where possible (hence the
              jump to 20, leaving enough room for future extension)
            - Masks must be specified as boolean with either bit width 1 (for
              bit masks) or 8 (for byte masks).
            - Dtype width in bits was preferred over bytes
            - Endianness isn't too useful, but included now in case in the
              future we need to support non-native endianness
            - Went with Apache Arrow format strings over NumPy format strings
              because they're more complete from a dataframe perspective
            - Format strings are mostly useful for datetime specification, and
              for categoricals.
            - For categoricals, the format string describes the type of the
              categorical in the data buffer. In case of a separate encoding of
              the categorical (e.g. an integer to string mapping), this can
              be derived from ``self.describe_categorical``.
            - Data types not included: complex, Arrow-style null, binary,
              decimal, and nested (list, struct, map, union) dtypes.
        """
        return self._dtype

    def _dtype_from_arrowdtype(
        self, dtype: pa.DataType, bit_width: int
    ) -> Tuple[DtypeKind, int, str, str]:
        """
        See `self.dtype` for details.
        """
        # Note: 'c' (complex) not handled yet (not in array spec v1).
        # 'b', 'B' (bytes), 'S', 'a' (old-style string) and 'V' (void) are
        # not handled. Datetime and timedelta both map to datetime
        # (is timedelta handled?)

        if pa.types.is_timestamp(dtype):
            kind = DtypeKind.DATETIME
            ts = dtype.unit[0]
            tz = dtype.tz if dtype.tz else ""
            f_string = f"ts{ts}:{tz}"
            return kind, bit_width, f_string, Endianness.NATIVE
        elif pa.types.is_dictionary(dtype):
            kind = DtypeKind.CATEGORICAL
            arr = self._col
            indices_dtype = arr.indices.type
            _, f_string = _PYARROW_KINDS.get(indices_dtype)
            return kind, bit_width, f_string, Endianness.NATIVE
        else:
            kind, f_string = _PYARROW_KINDS.get(dtype, (None, None))
            if kind is None:
                raise ValueError(
                    f"Data type {dtype} not supported by interchange protocol")

            return kind, bit_width, f_string, Endianness.NATIVE
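
    # Example (derived from the mapping above): a timestamp column with
    # microsecond unit and UTC zone yields
    # (DtypeKind.DATETIME, 64, "tsu:UTC", "="), while a plain int32 column
    # yields (DtypeKind.INT, 32, "i", "=").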

    @property
    def describe_categorical(self) -> CategoricalDescription:
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding categorical
          values.

        Raises TypeError if the dtype is not categorical

        Returns the dictionary with description on how to interpret the
        data buffer:
        - "is_ordered" : bool, whether the ordering of dictionary indices
                         is semantically meaningful.
        - "is_dictionary" : bool, whether a mapping of
                            categorical values to other objects exists
        - "categories" : Column representing the (implicit) mapping of
                         indices to category values (e.g. an array of
                         cat1, cat2, ...). None if not a dictionary-style
                         categorical.

        TBD: are there any other in-memory representations that are needed?
        """
        arr = self._col
        if not pa.types.is_dictionary(arr.type):
            raise TypeError(
                "describe_categorical only works on a column with "
                "categorical dtype!"
            )

        return {
            "is_ordered": self._col.type.ordered,
            "is_dictionary": True,
            "categories": _PyArrowColumn(arr.dictionary),
        }

    @property
    def describe_null(self) -> Tuple[ColumnNullType, Any]:
        """
        Return the missing value (or "null") representation the column dtype
        uses, as a tuple ``(kind, value)``.

        Value : if kind is "sentinel value", the actual value. If kind is a bit
        mask or a byte mask, the value (0 or 1) indicating a missing value.
        None otherwise.
        """
        # In case of no missing values, we need to set ColumnNullType to
        # non-nullable, as in the current __dataframe__ protocol bit/byte
        # masks cannot be None
        if self.null_count == 0:
            return ColumnNullType.NON_NULLABLE, None
        else:
            return ColumnNullType.USE_BITMASK, 0

    @property
    def null_count(self) -> int:
        """
        Number of null elements, if known.

        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
        """
        arrow_null_count = self._col.null_count
        n = arrow_null_count if arrow_null_count != -1 else None
        return n

    @property
    def metadata(self) -> Dict[str, Any]:
        """
        The metadata for the column. See `DataFrame.metadata` for more details.
        """
        pass

    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """
        return 1

    def get_chunks(
        self, n_chunks: Optional[int] = None
    ) -> Iterable[_PyArrowColumn]:
        """
        Return an iterator yielding the chunks.

        See `DataFrame.get_chunks` for details on ``n_chunks``.
        """
        if n_chunks and n_chunks > 1:
            chunk_size = self.size() // n_chunks
            if self.size() % n_chunks != 0:
                chunk_size += 1

            array = self._col
            for start in range(0, chunk_size * n_chunks, chunk_size):
                yield _PyArrowColumn(
                    array.slice(start, chunk_size), self._allow_copy
                )
        else:
            yield self

    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.

        The returned dictionary has the following contents:

        - "data": a two-element tuple whose first element is a buffer
                  containing the data and whose second element is the data
                  buffer's associated dtype.
        - "validity": a two-element tuple whose first element is a buffer
                      containing mask values indicating missing data and
                      whose second element is the mask value buffer's
                      associated dtype. None if the null representation is
                      not a bit or byte mask.
        - "offsets": a two-element tuple whose first element is a buffer
                     containing the offset values for variable-size binary
                     data (e.g., variable-length strings) and whose second
                     element is the offsets buffer's associated dtype. None
                     if the data buffer does not have an associated offsets
                     buffer.
        """
        buffers: ColumnBuffers = {
            "data": self._get_data_buffer(),
            "validity": None,
            "offsets": None,
        }

        try:
            buffers["validity"] = self._get_validity_buffer()
        except NoBufferPresent:
            pass

        try:
            buffers["offsets"] = self._get_offsets_buffer()
        except NoBufferPresent:
            pass

        return buffers

    def _get_data_buffer(
        self,
    ) -> Tuple[_PyArrowBuffer, Any]:  # Any is for self.dtype tuple
        """
        Return the buffer containing the data and the buffer's
        associated dtype.
        """
        array = self._col
        dtype = self.dtype

        # In case of dictionary arrays, use the indices
        # to define a buffer; the categories are transferred through
        # describe_categorical()
        if pa.types.is_dictionary(array.type):
            array = array.indices
            dtype = _PyArrowColumn(array).dtype

        n = len(array.buffers())
        if n == 2:
            return _PyArrowBuffer(array.buffers()[1]), dtype
        elif n == 3:
            return _PyArrowBuffer(array.buffers()[2]), dtype

    def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
        """
        Return the buffer containing the mask values indicating missing data
        and the buffer's associated dtype.
        Raises NoBufferPresent if null representation is not a bit or byte
        mask.
        """
        # Define the dtype of the returned buffer
        dtype = (DtypeKind.BOOL, 1, "b", Endianness.NATIVE)
        array = self._col
        buff = array.buffers()[0]
        if buff:
            return _PyArrowBuffer(buff), dtype
        else:
            raise NoBufferPresent(
                "There are no missing values so "
                "the column does not have a separate mask")

    def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
        """
        Return the buffer containing the offset values for variable-size binary
        data (e.g., variable-length strings) and the buffer's associated dtype.
        Raises NoBufferPresent if the data buffer does not have an associated
        offsets buffer.
        """
        array = self._col
        n = len(array.buffers())
        if n == 2:
            raise NoBufferPresent(
                "This column has a fixed-length dtype so "
                "it does not have an offsets buffer"
            )
        elif n == 3:
            # Define the dtype of the returned buffer
            dtype = self._col.type
            if pa.types.is_large_string(dtype):
                dtype = (DtypeKind.INT, 64, "l", Endianness.NATIVE)
            else:
                dtype = (DtypeKind.INT, 32, "i", Endianness.NATIVE)
            return _PyArrowBuffer(array.buffers()[1]), dtype
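
A minimal sketch of the column wrapper above in use (the printed values follow
from the code; the input array is an illustrative assumption):

    import pyarrow as pa
    from pyarrow.interchange.column import _PyArrowColumn

    col = _PyArrowColumn(pa.chunked_array([[1, 2, None]]))
    print(col.size())         # 3
    print(col.dtype)          # (<DtypeKind.INT: 0>, 64, 'l', '=')
    print(col.describe_null)  # (<ColumnNullType.USE_BITMASK: 3>, 0)
    buffers = col.get_buffers()
    print(buffers["data"][0].bufsize)  # size of the values buffer in bytes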
venv/lib/python3.10/site-packages/pyarrow/interchange/dataframe.py (new file, 217 lines)
@@ -0,0 +1,217 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import annotations
from typing import (
    Any,
    Iterable,
    Optional,
    Sequence,
)

import pyarrow as pa

from pyarrow.interchange.column import _PyArrowColumn


class _PyArrowDataFrame:
    """
    A data frame class, with only the methods required by the interchange
    protocol defined.

    A "data frame" represents an ordered collection of named columns.
    A column's "name" must be a unique string.
    Columns may be accessed by name or by position.

    This could be a public data frame class, or an object with the methods and
    attributes defined on this DataFrame class could be returned from the
    ``__dataframe__`` method of a public data frame class in a library adhering
    to the dataframe interchange protocol specification.
    """

    def __init__(
        self, df: pa.Table | pa.RecordBatch,
        nan_as_null: bool = False,
        allow_copy: bool = True
    ) -> None:
        """
        Constructor - an instance of this (private) class is returned from
        `pa.Table.__dataframe__` or `pa.RecordBatch.__dataframe__`.
        """
        self._df = df
        # ``nan_as_null`` is a keyword intended for the consumer to tell the
        # producer to overwrite null values in the data with ``NaN`` (or
        # ``NaT``).
        if nan_as_null is True:
            raise RuntimeError(
                "nan_as_null=True currently has no effect, "
                "use the default nan_as_null=False"
            )
        self._nan_as_null = nan_as_null
        self._allow_copy = allow_copy

    def __dataframe__(
        self, nan_as_null: bool = False, allow_copy: bool = True
    ) -> _PyArrowDataFrame:
        """
        Construct a new exchange object, potentially changing the parameters.
        ``nan_as_null`` is a keyword intended for the consumer to tell the
        producer to overwrite null values in the data with ``NaN``.
        It is intended for cases where the consumer does not support the bit
        mask or byte mask that is the producer's native representation.
        ``allow_copy`` is a keyword that defines whether or not the library is
        allowed to make a copy of the data. For example, copying data would be
        necessary if a library supports strided buffers, given that this
        protocol specifies contiguous buffers.
        """
        return _PyArrowDataFrame(self._df, nan_as_null, allow_copy)

    @property
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the data frame, as a dictionary with string keys. The
        contents of `metadata` may be anything, they are meant for a library
        to store information that it needs to, e.g., roundtrip losslessly or
        for two implementations to share data that is not (yet) part of the
        interchange protocol specification. To avoid collisions with other
        entries, please name the keys with the name of the library
        followed by a period and the desired name, e.g., ``pandas.indexcol``.
        """
        # The metadata for the data frame, as a dictionary with string keys.
        # Add schema metadata here (pandas metadata or custom metadata)
        if self._df.schema.metadata:
            schema_metadata = {"pyarrow." + k.decode('utf8'): v.decode('utf8')
                               for k, v in self._df.schema.metadata.items()}
            return schema_metadata
        else:
            return {}
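
    # Example (hypothetical input, behavior derived from the code above):
    # a table built with
    #     pa.table({"x": [1]}).replace_schema_metadata({"foo": "bar"})
    # would be reported here as {"pyarrow.foo": "bar"}.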

    def num_columns(self) -> int:
        """
        Return the number of columns in the DataFrame.
        """
        return self._df.num_columns

    def num_rows(self) -> int:
        """
        Return the number of rows in the DataFrame, if available.
        """
        return self._df.num_rows

    def num_chunks(self) -> int:
        """
        Return the number of chunks the DataFrame consists of.
        """
        if isinstance(self._df, pa.RecordBatch):
            return 1
        else:
            # pyarrow.Table can have columns with different numbers
            # of chunks, so we take the number of chunks that
            # .to_batches() returns, as it uses the minimum chunk size
            # across all of the columns (to_batches is a zero-copy method)
            batches = self._df.to_batches()
            return len(batches)

    def column_names(self) -> Iterable[str]:
        """
        Return an iterator yielding the column names.
        """
        return self._df.schema.names

    def get_column(self, i: int) -> _PyArrowColumn:
        """
        Return the column at the indicated position.
        """
        return _PyArrowColumn(self._df.column(i),
                              allow_copy=self._allow_copy)

    def get_column_by_name(self, name: str) -> _PyArrowColumn:
        """
        Return the column whose name is the indicated name.
        """
        return _PyArrowColumn(self._df.column(name),
                              allow_copy=self._allow_copy)

    def get_columns(self) -> Iterable[_PyArrowColumn]:
        """
        Return an iterator yielding the columns.
        """
        return [
            _PyArrowColumn(col, allow_copy=self._allow_copy)
            for col in self._df.columns
        ]

    def select_columns(self, indices: Sequence[int]) -> _PyArrowDataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by index.
        """
        return _PyArrowDataFrame(
            self._df.select(list(indices)), self._nan_as_null, self._allow_copy
        )

    def select_columns_by_name(
        self, names: Sequence[str]
    ) -> _PyArrowDataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by name.
        """
        return _PyArrowDataFrame(
            self._df.select(list(names)), self._nan_as_null, self._allow_copy
        )

    def get_chunks(
        self, n_chunks: Optional[int] = None
    ) -> Iterable[_PyArrowDataFrame]:
        """
        Return an iterator yielding the chunks.

        By default (None), yields the chunks that the data is stored as by the
        producer. If given, ``n_chunks`` must be a multiple of
        ``self.num_chunks()``, meaning the producer must subdivide each chunk
        before yielding it.

        Note that the producer must ensure that all columns are chunked the
        same way.
        """
        # Subdivide chunks
        if n_chunks and n_chunks > 1:
            chunk_size = self.num_rows() // n_chunks
            if self.num_rows() % n_chunks != 0:
                chunk_size += 1
            if isinstance(self._df, pa.Table):
                batches = self._df.to_batches(max_chunksize=chunk_size)
            else:
                batches = []
                for start in range(0, chunk_size * n_chunks, chunk_size):
                    batches.append(self._df.slice(start, chunk_size))
                # In case the chunk size is such that the resulting list is
                # one chunk short of n_chunks, append an empty chunk
                if len(batches) == n_chunks - 1:
                    batches.append(pa.record_batch([[]],
                                                   schema=self._df.schema))
        # yield the chunks that the data is stored as
        else:
            if isinstance(self._df, pa.Table):
                batches = self._df.to_batches()
            else:
                batches = [self._df]

        # Create an iterator of RecordBatches
        iterator = [_PyArrowDataFrame(batch,
                                      self._nan_as_null,
                                      self._allow_copy)
                    for batch in batches]
        return iterator
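
A minimal sketch of the dataframe wrapper above in use (the import path
assumes the file label inferred above; the input table is illustrative):

    import pyarrow as pa
    from pyarrow.interchange.dataframe import _PyArrowDataFrame

    df = _PyArrowDataFrame(pa.table({"x": [1, 2, 3, 4]}))
    print(df.num_columns(), df.num_rows())  # 1 4
    print(list(df.column_names()))          # ['x']
    chunks = list(df.get_chunks(n_chunks=2))
    print([c.num_rows() for c in chunks])   # [2, 2]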
venv/lib/python3.10/site-packages/pyarrow/interchange/from_dataframe.py (new file, 614 lines)
@@ -0,0 +1,614 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import annotations

from typing import (
    Any,
    Tuple,
)

from pyarrow.interchange.column import (
    DtypeKind,
    ColumnBuffers,
    ColumnNullType,
)

import pyarrow as pa
import re

import pyarrow.compute as pc
from pyarrow.interchange.column import Dtype


# A typing protocol could be added later to let Mypy validate code using
# `from_dataframe` better.
DataFrameObject = Any
ColumnObject = Any
BufferObject = Any


_PYARROW_DTYPES: dict[DtypeKind, dict[int, Any]] = {
    DtypeKind.INT: {8: pa.int8(),
                    16: pa.int16(),
                    32: pa.int32(),
                    64: pa.int64()},
    DtypeKind.UINT: {8: pa.uint8(),
                     16: pa.uint16(),
                     32: pa.uint32(),
                     64: pa.uint64()},
    DtypeKind.FLOAT: {16: pa.float16(),
                      32: pa.float32(),
                      64: pa.float64()},
    DtypeKind.BOOL: {1: pa.bool_(),
                     8: pa.uint8()},
    DtypeKind.STRING: {8: pa.string()},
}


def from_dataframe(df: DataFrameObject, allow_copy=True) -> pa.Table:
    """
    Build a ``pa.Table`` from any DataFrame supporting the interchange protocol.

    Parameters
    ----------
    df : DataFrameObject
        Object supporting the interchange protocol, i.e. a `__dataframe__`
        method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pa.Table

    Examples
    --------
    >>> import pyarrow
    >>> from pyarrow.interchange import from_dataframe

    Convert a pandas dataframe to a pyarrow table:

    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     "n_attendees": [100, 10, 1],
    ...     "country": ["Italy", "Spain", "Slovenia"],
    ... })
    >>> df
       n_attendees   country
    0          100     Italy
    1           10     Spain
    2            1  Slovenia
    >>> from_dataframe(df)
    pyarrow.Table
    n_attendees: int64
    country: large_string
    ----
    n_attendees: [[100,10,1]]
    country: [["Italy","Spain","Slovenia"]]
    """
    if isinstance(df, pa.Table):
        return df
    elif isinstance(df, pa.RecordBatch):
        return pa.Table.from_batches([df])

    if not hasattr(df, "__dataframe__"):
        raise ValueError("`df` does not support __dataframe__")

    return _from_dataframe(df.__dataframe__(allow_copy=allow_copy),
                           allow_copy=allow_copy)


def _from_dataframe(df: DataFrameObject, allow_copy=True):
    """
    Build a ``pa.Table`` from the DataFrame interchange object.

    Parameters
    ----------
    df : DataFrameObject
        Object supporting the interchange protocol, i.e. a `__dataframe__`
        method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pa.Table
    """
    batches = []
    for chunk in df.get_chunks():
        batch = protocol_df_chunk_to_pyarrow(chunk, allow_copy)
        batches.append(batch)

    if not batches:
        batch = protocol_df_chunk_to_pyarrow(df, allow_copy)
        batches.append(batch)

    return pa.Table.from_batches(batches)


def protocol_df_chunk_to_pyarrow(
    df: DataFrameObject,
    allow_copy: bool = True
) -> pa.RecordBatch:
    """
    Convert an interchange protocol chunk to a ``pa.RecordBatch``.

    Parameters
    ----------
    df : DataFrameObject
        Object supporting the interchange protocol, i.e. a `__dataframe__`
        method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pa.RecordBatch
    """
    # We need a dict of columns here, with each column being a pa.Array
    columns: dict[str, pa.Array] = {}
    for name in df.column_names():
        if not isinstance(name, str):
            raise ValueError(f"Column {name} is not a string")
        if name in columns:
            raise ValueError(f"Column {name} is not unique")
        col = df.get_column_by_name(name)
        dtype = col.dtype[0]
        if dtype in (
            DtypeKind.INT,
            DtypeKind.UINT,
            DtypeKind.FLOAT,
            DtypeKind.STRING,
            DtypeKind.DATETIME,
        ):
            columns[name] = column_to_array(col, allow_copy)
        elif dtype == DtypeKind.BOOL:
            columns[name] = bool_column_to_array(col, allow_copy)
        elif dtype == DtypeKind.CATEGORICAL:
            columns[name] = categorical_column_to_dictionary(col, allow_copy)
        else:
            raise NotImplementedError(f"Data type {dtype} not handled yet")

    return pa.RecordBatch.from_pydict(columns)


def column_to_array(
    col: ColumnObject,
    allow_copy: bool = True,
) -> pa.Array:
    """
    Convert a column holding one of the primitive dtypes to a PyArrow array.
    A primitive type is one of: int, uint, float, bool (1 bit).

    Parameters
    ----------
    col : ColumnObject
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pa.Array
    """
    buffers = col.get_buffers()
    data_type = col.dtype
    data = buffers_to_array(buffers, data_type,
                            col.size(),
                            col.describe_null,
                            col.offset,
                            allow_copy)
    return data


def bool_column_to_array(
    col: ColumnObject,
    allow_copy: bool = True,
) -> pa.Array:
    """
    Convert a column holding boolean dtype to a PyArrow array.

    Parameters
    ----------
    col : ColumnObject
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pa.Array
    """
    buffers = col.get_buffers()
    size = buffers["data"][1][1]

    # If booleans are byte-packed, a copy to bit-packed will be made
    if size == 8 and not allow_copy:
        raise RuntimeError(
            "Boolean column will be casted from uint8 and a copy "
            "is required which is forbidden by allow_copy=False"
        )

    data_type = col.dtype
    data = buffers_to_array(buffers, data_type,
                            col.size(),
                            col.describe_null,
                            col.offset)
    if size == 8:
        data = pc.cast(data, pa.bool_())

    return data
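
# Note (derived from bool_column_to_array above): byte-packed booleans, as
# produced by NumPy-backed dataframes, arrive with bit width 8 and are cast
# back to pyarrow's bit-packed bool, e.g. uint8 [1, 0, 1] -> [True, False, True].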


def categorical_column_to_dictionary(
    col: ColumnObject,
    allow_copy: bool = True,
) -> pa.DictionaryArray:
    """
    Convert a column holding categorical data to a pa.DictionaryArray.

    Parameters
    ----------
    col : ColumnObject
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pa.DictionaryArray
    """
    if not allow_copy:
        raise RuntimeError(
            "Categorical column will be casted from uint8 and a copy "
            "is required which is forbidden by allow_copy=False"
        )

    categorical = col.describe_categorical

    if not categorical["is_dictionary"]:
        raise NotImplementedError(
            "Non-dictionary categoricals not supported yet")

    # We need to first convert the dictionary column
    cat_column = categorical["categories"]
    dictionary = column_to_array(cat_column)
    # Then we need to convert the indices
    # Here we need to use the buffer data type!
    buffers = col.get_buffers()
    _, data_type = buffers["data"]
    indices = buffers_to_array(buffers, data_type,
                               col.size(),
                               col.describe_null,
                               col.offset)

    # Construct a pa.DictionaryArray
    dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)

    return dict_array


def parse_datetime_format_str(format_str):
    """Parse a datetime `format_str` to interpret the `data`."""

    # timestamp 'ts{unit}:tz'
    timestamp_meta = re.match(r"ts([smun]):(.*)", format_str)
    if timestamp_meta:
        unit, tz = timestamp_meta.group(1), timestamp_meta.group(2)
        if unit != "s":
            # the format string describes only the first letter of the unit,
            # so add one extra letter to convert the unit to numpy-style:
            # 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns'
            unit += "s"

        return unit, tz

    raise NotImplementedError(f"DateTime kind is not supported: {format_str}")
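
# Example (derived from the regex above): parse_datetime_format_str("tsu:Europe/Paris")
# returns ("us", "Europe/Paris"), and parse_datetime_format_str("tss:")
# returns ("s", "").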


def map_date_type(data_type):
    """Map a column data type to a pyarrow data type."""
    kind, bit_width, f_string, _ = data_type

    if kind == DtypeKind.DATETIME:
        unit, tz = parse_datetime_format_str(f_string)
        return pa.timestamp(unit, tz=tz)
    else:
        pa_dtype = _PYARROW_DTYPES.get(kind, {}).get(bit_width, None)

        # Error if the dtype is not supported
        if pa_dtype:
            return pa_dtype
        else:
            raise NotImplementedError(
                f"Conversion for {data_type} is not yet supported.")


def buffers_to_array(
    buffers: ColumnBuffers,
    data_type: Tuple[DtypeKind, int, str, str],
    length: int,
    describe_null: ColumnNullType,
    offset: int = 0,
    allow_copy: bool = True,
) -> pa.Array:
    """
    Build a PyArrow array from the passed buffers.

    Parameters
    ----------
    buffers : ColumnBuffers
        Dictionary containing tuples of underlying buffers and
        their associated dtype.
    data_type : Tuple[DtypeKind, int, str, str]
        Dtype description of the column as a tuple ``(kind, bit-width,
        format string, endianness)``.
    length : int
        The number of values in the array.
    describe_null : ColumnNullType
        Null representation the column dtype uses,
        as a tuple ``(kind, value)``
    offset : int, default: 0
        Number of elements to offset from the start of the buffer.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pa.Array

    Notes
    -----
    The returned array doesn't own the memory. The caller of this function
    is responsible for keeping the memory owner object alive as long as
    the returned PyArrow array is being used.
    """
    data_buff, _ = buffers["data"]
    try:
        validity_buff, validity_dtype = buffers["validity"]
    except TypeError:
        validity_buff = None
    try:
        offset_buff, offset_dtype = buffers["offsets"]
    except TypeError:
        offset_buff = None

    # Construct a pyarrow Buffer
    data_pa_buffer = pa.foreign_buffer(data_buff.ptr, data_buff.bufsize,
                                       base=data_buff)

    # Construct a validity pyarrow Buffer, if applicable
    if validity_buff:
        validity_pa_buff = validity_buffer_from_mask(validity_buff,
                                                     validity_dtype,
                                                     describe_null,
                                                     length,
                                                     offset,
                                                     allow_copy)
    else:
        validity_pa_buff = validity_buffer_nan_sentinel(data_pa_buffer,
                                                        data_type,
                                                        describe_null,
                                                        length,
                                                        offset,
                                                        allow_copy)

    # Construct a pyarrow Array from buffers
    data_dtype = map_date_type(data_type)

    if offset_buff:
        _, offset_bit_width, _, _ = offset_dtype
        # If an offset buffer exists, construct an offset pyarrow Buffer
        # and add it to the construction of an array
        offset_pa_buffer = pa.foreign_buffer(offset_buff.ptr,
                                             offset_buff.bufsize,
                                             base=offset_buff)

        if data_type[2] == 'U':
            string_type = pa.large_string()
        else:
            if offset_bit_width == 64:
                string_type = pa.large_string()
            else:
                string_type = pa.string()
        array = pa.Array.from_buffers(
            string_type,
            length,
            [validity_pa_buff, offset_pa_buffer, data_pa_buffer],
            offset=offset,
        )
    else:
        array = pa.Array.from_buffers(
            data_dtype,
            length,
            [validity_pa_buff, data_pa_buffer],
            offset=offset,
        )

    return array


def validity_buffer_from_mask(
    validity_buff: BufferObject,
    validity_dtype: Dtype,
    describe_null: ColumnNullType,
    length: int,
    offset: int = 0,
    allow_copy: bool = True,
) -> pa.Buffer:
    """
    Build a PyArrow buffer from the passed mask buffer.

    Parameters
    ----------
    validity_buff : BufferObject
        The underlying validity buffer.
    validity_dtype : Dtype
        Dtype description as a tuple ``(kind, bit-width, format string,
        endianness)``.
    describe_null : ColumnNullType
        Null representation the column dtype uses,
        as a tuple ``(kind, value)``
    length : int
        The number of values in the array.
    offset : int, default: 0
        Number of elements to offset from the start of the buffer.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pa.Buffer
    """
    null_kind, sentinel_val = describe_null
    validity_kind, _, _, _ = validity_dtype
    assert validity_kind == DtypeKind.BOOL

    if null_kind == ColumnNullType.NON_NULLABLE:
        # A sliced array can have a NON_NULLABLE ColumnNullType because there
        # are no missing values in that slice of the array even though the
        # bitmask exists; the validity buffer must be set to None in this case
        return None

    elif null_kind == ColumnNullType.USE_BYTEMASK or (
        null_kind == ColumnNullType.USE_BITMASK and sentinel_val == 1
    ):
        buff = pa.foreign_buffer(validity_buff.ptr,
                                 validity_buff.bufsize,
                                 base=validity_buff)

        if null_kind == ColumnNullType.USE_BYTEMASK:
            if not allow_copy:
                raise RuntimeError(
                    "To create a bitmask a copy of the data is "
                    "required which is forbidden by allow_copy=False"
                )
            mask = pa.Array.from_buffers(pa.int8(), length,
                                         [None, buff],
                                         offset=offset)
            mask_bool = pc.cast(mask, pa.bool_())
        else:
            mask_bool = pa.Array.from_buffers(pa.bool_(), length,
                                              [None, buff],
                                              offset=offset)

        if sentinel_val == 1:
            mask_bool = pc.invert(mask_bool)

        return mask_bool.buffers()[1]

    elif null_kind == ColumnNullType.USE_BITMASK and sentinel_val == 0:
        return pa.foreign_buffer(validity_buff.ptr,
                                 validity_buff.bufsize,
                                 base=validity_buff)
    else:
        raise NotImplementedError(
            f"{describe_null} null representation is not yet supported.")


def validity_buffer_nan_sentinel(
    data_pa_buffer: BufferObject,
    data_type: Dtype,
    describe_null: ColumnNullType,
    length: int,
    offset: int = 0,
    allow_copy: bool = True,
) -> pa.Buffer:
    """
    Build a PyArrow buffer from NaN or sentinel values.

    Parameters
    ----------
    data_pa_buffer : pa.Buffer
        PyArrow buffer for the column data.
    data_type : Dtype
        Dtype description as a tuple ``(kind, bit-width, format string,
        endianness)``.
    describe_null : ColumnNullType
        Null representation the column dtype uses,
        as a tuple ``(kind, value)``
    length : int
        The number of values in the array.
    offset : int, default: 0
        Number of elements to offset from the start of the buffer.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pa.Buffer
    """
    kind, bit_width, _, _ = data_type
    data_dtype = map_date_type(data_type)
    null_kind, sentinel_val = describe_null

    # Check for float NaN values
    if null_kind == ColumnNullType.USE_NAN:
        if not allow_copy:
            raise RuntimeError(
                "To create a bitmask a copy of the data is "
                "required which is forbidden by allow_copy=False"
            )

        if kind == DtypeKind.FLOAT and bit_width == 16:
            # 'pyarrow.compute.is_nan' kernel not yet implemented
            # for float16
            raise NotImplementedError(
                f"{data_type} with {null_kind} is not yet supported.")
        else:
            pyarrow_data = pa.Array.from_buffers(
                data_dtype,
                length,
                [None, data_pa_buffer],
                offset=offset,
            )
            mask = pc.is_nan(pyarrow_data)
            mask = pc.invert(mask)
            return mask.buffers()[1]

    # Check for sentinel values
    elif null_kind == ColumnNullType.USE_SENTINEL:
        if not allow_copy:
            raise RuntimeError(
                "To create a bitmask a copy of the data is "
                "required which is forbidden by allow_copy=False"
            )

        if kind == DtypeKind.DATETIME:
            sentinel_dtype = pa.int64()
        else:
            sentinel_dtype = data_dtype
        pyarrow_data = pa.Array.from_buffers(sentinel_dtype,
                                             length,
                                             [None, data_pa_buffer],
                                             offset=offset)
        sentinel_arr = pc.equal(pyarrow_data, sentinel_val)
        mask_bool = pc.invert(sentinel_arr)
        return mask_bool.buffers()[1]

    elif null_kind == ColumnNullType.NON_NULLABLE:
        pass
    else:
        raise NotImplementedError(
            f"{describe_null} null representation is not yet supported.")
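
A minimal end-to-end sketch of the converter above (the pandas input is an
illustrative assumption; the zero-copy failure is expected because a NaN-based
null representation requires building a bitmask, which copies):

    import pandas as pd
    from pyarrow.interchange.from_dataframe import from_dataframe

    pdf = pd.DataFrame({"ints": [1.0, None, 3.0], "strs": ["a", "b", "c"]})
    table = from_dataframe(pdf)  # copies only where unavoidable
    print(table.schema)
    try:
        from_dataframe(pdf, allow_copy=False)  # may raise RuntimeError
    except RuntimeError as exc:
        print(exc)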