Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,20 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# flake8: noqa
from .from_dataframe import from_dataframe

View File

@@ -0,0 +1,107 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations
import enum
import pyarrow as pa
class DlpackDeviceType(enum.IntEnum):
    """Device type codes, numbered to match the ones used by DLPack."""

    CPU = 1
    CUDA = 2
    CPU_PINNED = 3
    OPENCL = 4
    # The codes are not contiguous: 5 and 6 are not defined here.
    VULKAN = 7
    METAL = 8
    VPI = 9
    ROCM = 10
class _PyArrowBuffer:
    """
    Interchange-protocol view over a single PyArrow buffer.

    Data in the buffer is guaranteed to be contiguous in memory.

    Note that there is no dtype attribute present, a buffer can be thought of
    as simply a block of memory. However, if the column that the buffer is
    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
    implemented, then that dtype information will be contained in the return
    value from ``__dlpack__``.

    This distinction is useful to support both (a) data exchange via DLPack
    on a buffer and (b) dtypes like variable-length strings which do not have
    a fixed number of bytes per element.
    """

    def __init__(self, x: pa.Buffer, allow_copy: bool = True) -> None:
        """
        Handle PyArrow Buffers.

        ``allow_copy`` is accepted but not used by this class.
        """
        self._x = x

    @property
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """
        return self._x.size

    @property
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.
        """
        return self._x.address

    def __dlpack__(self):
        """
        Produce DLPack capsule (see array API standard).

        Raises:
            - TypeError : if the buffer contains unsupported dtypes.
            - NotImplementedError : if DLPack support is not implemented

        Useful to have to connect to array libraries. Support optional because
        it's not completely trivial to implement for a Python-only library.

        This implementation always raises NotImplementedError.
        """
        raise NotImplementedError("__dlpack__")

    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        """
        Device type and device ID for where the data in the buffer resides.

        Uses device type codes matching DLPack.
        Note: must be implemented even if ``__dlpack__`` is not.

        Only CPU buffers are handled; any other buffer raises
        NotImplementedError.
        """
        if not self._x.is_cpu:
            raise NotImplementedError("__dlpack_device__")
        return (DlpackDeviceType.CPU, None)

    def __repr__(self) -> str:
        summary = {
            "bufsize": self.bufsize,
            "ptr": self.ptr,
            "device": self.__dlpack_device__()[0].name,
        }
        return f"PyArrowBuffer({summary})"

View File

@@ -0,0 +1,529 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations
import enum
from typing import (
Any,
Dict,
Iterable,
Optional,
Tuple,
)
import sys
if sys.version_info >= (3, 8):
from typing import TypedDict
else:
from typing_extensions import TypedDict
import pyarrow as pa
import pyarrow.compute as pc
from pyarrow.interchange.buffer import _PyArrowBuffer
class DtypeKind(enum.IntEnum):
    """
    Integer enum for data types.

    Attributes
    ----------
    INT : int
        Matches to signed integer data type.
    UINT : int
        Matches to unsigned integer data type.
    FLOAT : int
        Matches to floating point data type.
    BOOL : int
        Matches to boolean data type.
    STRING : int
        Matches to string data type (UTF-8 encoded).
    DATETIME : int
        Matches to datetime data type.
    CATEGORICAL : int
        Matches to categorical data type.
    """

    INT = 0
    UINT = 1
    FLOAT = 2
    # The numbering jumps to 20 from here on.
    BOOL = 20
    STRING = 21  # UTF-8
    DATETIME = 22
    CATEGORICAL = 23
# Dtype description tuple: (kind, bit-width, format string, endianness).
Dtype = Tuple[DtypeKind, int, str, str]  # see Column.dtype
# Lookup from a PyArrow data type to its (DtypeKind, format string) pair.
# Types that are missing from this table are rejected by
# ``_PyArrowColumn._dtype_from_arrowdtype`` (timestamps and dictionaries are
# handled there separately and therefore do not appear here).
_PYARROW_KINDS = {
    pa.int8(): (DtypeKind.INT, "c"),
    pa.int16(): (DtypeKind.INT, "s"),
    pa.int32(): (DtypeKind.INT, "i"),
    pa.int64(): (DtypeKind.INT, "l"),
    pa.uint8(): (DtypeKind.UINT, "C"),
    pa.uint16(): (DtypeKind.UINT, "S"),
    pa.uint32(): (DtypeKind.UINT, "I"),
    pa.uint64(): (DtypeKind.UINT, "L"),
    pa.float16(): (DtypeKind.FLOAT, "e"),
    pa.float32(): (DtypeKind.FLOAT, "f"),
    pa.float64(): (DtypeKind.FLOAT, "g"),
    pa.bool_(): (DtypeKind.BOOL, "b"),
    pa.string(): (DtypeKind.STRING, "u"),
    pa.large_string(): (DtypeKind.STRING, "U"),
}
class ColumnNullType(enum.IntEnum):
    """
    Integer enum for null type representation.

    Attributes
    ----------
    NON_NULLABLE : int
        Non-nullable column.
    USE_NAN : int
        Use explicit float NaN value.
    USE_SENTINEL : int
        Sentinel value besides NaN.
    USE_BITMASK : int
        The bit is set/unset representing a null on a certain position.
    USE_BYTEMASK : int
        The byte is set/unset representing a null on a certain position.
    """

    NON_NULLABLE = 0
    USE_NAN = 1
    USE_SENTINEL = 2
    USE_BITMASK = 3
    USE_BYTEMASK = 4
class ColumnBuffers(TypedDict):
    # Typed shape of the dictionary built by ``_PyArrowColumn.get_buffers``:
    # every entry is a ``(buffer, dtype)`` pair, and the two optional entries
    # are None when the column has no such buffer.

    # first element is a buffer containing the column data;
    # second element is the data buffer's associated dtype
    data: Tuple[_PyArrowBuffer, Dtype]
    # first element is a buffer containing mask values indicating missing data;
    # second element is the mask value buffer's associated dtype.
    # None if the null representation is not a bit or byte mask
    validity: Optional[Tuple[_PyArrowBuffer, Dtype]]
    # first element is a buffer containing the offset values for
    # variable-size binary data (e.g., variable-length strings);
    # second element is the offsets buffer's associated dtype.
    # None if the data buffer does not have an associated offsets buffer
    offsets: Optional[Tuple[_PyArrowBuffer, Dtype]]
class CategoricalDescription(TypedDict):
    # Typed shape of the dictionary returned by
    # ``_PyArrowColumn.describe_categorical``.

    # whether the ordering of dictionary indices is semantically meaningful
    is_ordered: bool
    # whether a dictionary-style mapping of categorical values to other objects
    # exists
    is_dictionary: bool
    # Column holding the category values that the indices refer to.
    # None if not a dictionary-style categorical.
    categories: Optional[_PyArrowColumn]
class Endianness:
    """Enum indicating the byte-order of a data-type."""

    # Plain string constants; this is a namespace class, not an enum.Enum.
    LITTLE = "<"
    BIG = ">"
    NATIVE = "="
    NA = "|"
class NoBufferPresent(Exception):
    """Raised when a requested buffer does not exist for the column."""
class _PyArrowColumn:
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.

    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).

    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
         Instead, it seems to use "children" for both columns with a bit mask,
         and for nested dtypes. Unclear whether this is elegant or confusing.
         This design requires checking the null representation explicitly.

         The Arrow design requires checking:
         1. the ARROW_FLAG_NULLABLE (for sentinel values)
         2. if a column has two children, combined with one of those children
            having a null dtype.

         Making the mask concept explicit seems useful. One null dtype would
         not be enough to cover both bit and byte masks, so that would mean
         even more checking if we did it the Arrow way.

    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
         multiple buffers per array (= column here). Semantically it may make
         sense to have both: chunks were meant for example for lazy evaluation
         of data which doesn't fit in memory, while multiple buffers per column
         could also come from doing a selection operation on a single
         contiguous buffer.

         Given these concepts, one would expect chunks to be all of the same
         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
         while multiple buffers could have data-dependent lengths. Not an issue
         in pandas if one column is backed by a single NumPy array, but in
         Arrow it seems possible.
         Are multiple chunks *and* multiple buffers per column necessary for
         the purposes of this interchange protocol, or must producers either
         reuse the chunk concept for this or copy the data?

    Note: this Column object can only be produced by ``__dataframe__``, so
          doesn't need its own version or ``__column__`` protocol.
    """

    def __init__(
        self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True
    ) -> None:
        """
        Handles PyArrow Arrays and ChunkedArrays.

        A ChunkedArray is reduced to a single Array (which copies when it has
        more than one chunk) and boolean data is cast to uint8 (which always
        copies). Both copies raise RuntimeError when ``allow_copy`` is False.
        """
        # Store the column as a private attribute
        if isinstance(column, pa.ChunkedArray):
            if column.num_chunks == 1:
                column = column.chunk(0)
            else:
                if not allow_copy:
                    raise RuntimeError(
                        "Chunks will be combined and a copy is required which "
                        "is forbidden by allow_copy=False"
                    )
                column = column.combine_chunks()

        self._allow_copy = allow_copy

        if pa.types.is_boolean(column.type):
            if not allow_copy:
                raise RuntimeError(
                    "Boolean column will be casted to uint8 and a copy "
                    "is required which is forbidden by allow_copy=False"
                )
            # The dtype is derived from the original boolean type (reported
            # with a bit width of 8) while the stored data is the uint8 cast.
            self._dtype = self._dtype_from_arrowdtype(column.type, 8)
            self._col = pc.cast(column, pa.uint8())
        else:
            # ``_col`` must be set first: the dictionary branch of
            # ``_dtype_from_arrowdtype`` reads it.
            self._col = column
            dtype = self._col.type
            try:
                bit_width = dtype.bit_width
            except ValueError:
                # in case of a variable-length strings, considered as array
                # of bytes (8 bits)
                bit_width = 8
            self._dtype = self._dtype_from_arrowdtype(dtype, bit_width)

    def size(self) -> int:
        """
        Size of the column, in elements.

        Corresponds to DataFrame.num_rows() if column is a single chunk;
        equal to size of this current chunk otherwise.

        Is a method rather than a property because it may cause a (potentially
        expensive) computation for some dataframe implementations.
        """
        return len(self._col)

    @property
    def offset(self) -> int:
        """
        Offset of first element.

        May be > 0 if using chunks; for example for a column with N chunks of
        equal size M (only the last chunk may be shorter),
        ``offset = n * M``, ``n = 0 .. N-1``.
        """
        return self._col.offset

    @property
    def dtype(self) -> Tuple[DtypeKind, int, str, str]:
        """
        Dtype description as a tuple ``(kind, bit-width, format string,
        endianness)``.

        Bit-width : the number of bits as an integer
        Format string : data type description format string in Apache Arrow C
                        Data Interface format.
        Endianness : current only native endianness (``=``) is supported

        Notes:
            - Kind specifiers are aligned with DLPack where possible (hence the
              jump to 20, leave enough room for future extension)
            - Masks must be specified as boolean with either bit width 1 (for
              bit masks) or 8 (for byte masks).
            - Dtype width in bits was preferred over bytes
            - Endianness isn't too useful, but included now in case in the
              future we need to support non-native endianness
            - Went with Apache Arrow format strings over NumPy format strings
              because they're more complete from a dataframe perspective
            - Format strings are mostly useful for datetime specification, and
              for categoricals.
            - For categoricals, the format string describes the type of the
              categorical in the data buffer. In case of a separate encoding of
              the categorical (e.g. an integer to string mapping), this can
              be derived from ``self.describe_categorical``.
            - Data types not included: complex, Arrow-style null, binary,
              decimal, and nested (list, struct, map, union) dtypes.
        """
        return self._dtype

    def _dtype_from_arrowdtype(
        self, dtype: pa.DataType, bit_width: int
    ) -> Tuple[DtypeKind, int, str, str]:
        """
        See `self.dtype` for details.

        Raises ValueError for data types that are not a timestamp, not a
        dictionary and not listed in ``_PYARROW_KINDS``.
        """
        # Note: 'c' (complex) not handled yet (not in array spec v1).
        #       'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void)
        #       not handled datetime and timedelta both map to datetime
        #       (is timedelta handled?)

        if pa.types.is_timestamp(dtype):
            kind = DtypeKind.DATETIME
            # Only the first letter of the unit goes into the format string
            ts = dtype.unit[0]
            tz = dtype.tz if dtype.tz else ""
            f_string = f"ts{ts}:{tz}"
            return kind, bit_width, f_string, Endianness.NATIVE
        elif pa.types.is_dictionary(dtype):
            kind = DtypeKind.CATEGORICAL
            # The format string describes the indices, not the categories
            arr = self._col
            indices_dtype = arr.indices.type
            _, f_string = _PYARROW_KINDS.get(indices_dtype)
            return kind, bit_width, f_string, Endianness.NATIVE
        else:
            kind, f_string = _PYARROW_KINDS.get(dtype, (None, None))
            if kind is None:
                raise ValueError(
                    f"Data type {dtype} not supported by interchange protocol")

            return kind, bit_width, f_string, Endianness.NATIVE

    @property
    def describe_categorical(self) -> CategoricalDescription:
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding categorical
          values.

        Raises TypeError if the dtype is not categorical

        Returns the dictionary with description on how to interpret the
        data buffer:
            - "is_ordered" : bool, whether the ordering of dictionary indices
                             is semantically meaningful.
            - "is_dictionary" : bool, whether a mapping of
                                categorical values to other objects exists
            - "categories" : Column representing the (implicit) mapping of
                             indices to category values (e.g. an array of
                             cat1, cat2, ...). None if not a dictionary-style
                             categorical.

        TBD: are there any other in-memory representations that are needed?
        """
        arr = self._col
        if not pa.types.is_dictionary(arr.type):
            raise TypeError(
                "describe_categorical only works on a column with "
                "categorical dtype!"
            )

        return {
            "is_ordered": self._col.type.ordered,
            "is_dictionary": True,
            "categories": _PyArrowColumn(arr.dictionary),
        }

    @property
    def describe_null(self) -> Tuple[ColumnNullType, Any]:
        """
        Return the missing value (or "null") representation the column dtype
        uses, as a tuple ``(kind, value)``.

        Value : if kind is "sentinel value", the actual value. If kind is a bit
        mask or a byte mask, the value (0 or 1) indicating a missing value.
        None otherwise.
        """
        # In case of no missing values, we need to set ColumnNullType to
        # non nullable as in the current __dataframe__ protocol bit/byte masks
        # cannot be None
        if self.null_count == 0:
            return ColumnNullType.NON_NULLABLE, None
        else:
            return ColumnNullType.USE_BITMASK, 0

    @property
    def null_count(self) -> Optional[int]:
        """
        Number of null elements, if known.

        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
        """
        arrow_null_count = self._col.null_count
        n = arrow_null_count if arrow_null_count != -1 else None
        return n

    @property
    def metadata(self) -> Dict[str, Any]:
        """
        The metadata for the column. See `DataFrame.metadata` for more details.

        NOTE(review): no column metadata is produced here; despite the
        annotation this property currently evaluates to None.
        """
        return None

    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.

        Always 1: the constructor reduces the column to a single array.
        """
        return 1

    def get_chunks(
        self, n_chunks: Optional[int] = None
    ) -> Iterable[_PyArrowColumn]:
        """
        Return an iterator yielding the chunks.

        See `DataFrame.get_chunks` for details on ``n_chunks``.

        With ``n_chunks`` greater than 1, exactly ``n_chunks`` columns are
        yielded; trailing ones are empty when the rows run out. Otherwise the
        column itself is yielded as its only chunk.
        """
        if n_chunks and n_chunks > 1:
            # Rows per chunk, rounded up so that n_chunks slices cover all rows
            chunk_size, remainder = divmod(self.size(), n_chunks)
            if remainder != 0:
                chunk_size += 1

            array = self._col
            # Iterate over the chunk index rather than over a row range with
            # step ``chunk_size``: an empty column has chunk_size == 0 and
            # ``range`` rejects a zero step.
            for chunk_index in range(n_chunks):
                chunk = _PyArrowColumn(
                    array.slice(chunk_index * chunk_size, chunk_size),
                    self._allow_copy
                )
                # A chunk shares the dtype of its parent. This matters for
                # boolean columns: ``_col`` holds the uint8 cast, so a dtype
                # recomputed from the slice would report UINT, not BOOL.
                chunk._dtype = self._dtype
                yield chunk
        else:
            yield self

    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.

        The returned dictionary has the following contents:

            - "data": a two-element tuple whose first element is a buffer
                      containing the data and whose second element is the data
                      buffer's associated dtype.
            - "validity": a two-element tuple whose first element is a buffer
                          containing mask values indicating missing data and
                          whose second element is the mask value buffer's
                          associated dtype. None if the null representation is
                          not a bit or byte mask.
            - "offsets": a two-element tuple whose first element is a buffer
                         containing the offset values for variable-size binary
                         data (e.g., variable-length strings) and whose second
                         element is the offsets buffer's associated dtype. None
                         if the data buffer does not have an associated offsets
                         buffer.
        """
        buffers: ColumnBuffers = {
            "data": self._get_data_buffer(),
            "validity": None,
            "offsets": None,
        }

        # A missing optional buffer is signalled with NoBufferPresent and
        # simply leaves the corresponding entry as None.
        try:
            buffers["validity"] = self._get_validity_buffer()
        except NoBufferPresent:
            pass

        try:
            buffers["offsets"] = self._get_offsets_buffer()
        except NoBufferPresent:
            pass

        return buffers

    def _get_data_buffer(
        self,
    ) -> Tuple[_PyArrowBuffer, Any]:  # Any is for self.dtype tuple
        """
        Return the buffer containing the data and the buffer's
        associated dtype.
        """
        array = self._col
        dtype = self.dtype

        # In case of dictionary arrays, use indices
        # to define a buffer, codes are transferred through
        # describe_categorical()
        if pa.types.is_dictionary(array.type):
            array = array.indices
            dtype = _PyArrowColumn(array).dtype

        # With two buffers the data is the second one; with three buffers
        # (validity, offsets, data) it is the third one.
        # NOTE(review): any other buffer count falls through and returns None.
        n = len(array.buffers())
        if n == 2:
            return _PyArrowBuffer(array.buffers()[1]), dtype
        elif n == 3:
            return _PyArrowBuffer(array.buffers()[2]), dtype

    def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
        """
        Return the buffer containing the mask values indicating missing data
        and the buffer's associated dtype.
        Raises NoBufferPresent if null representation is not a bit or byte
        mask.
        """
        # Define the dtype of the returned buffer
        dtype = (DtypeKind.BOOL, 1, "b", Endianness.NATIVE)
        array = self._col
        # The first buffer of the array is its validity bitmap, if any
        buff = array.buffers()[0]
        if buff:
            return _PyArrowBuffer(buff), dtype
        else:
            raise NoBufferPresent(
                "There are no missing values so "
                "does not have a separate mask")

    def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
        """
        Return the buffer containing the offset values for variable-size binary
        data (e.g., variable-length strings) and the buffer's associated dtype.
        Raises NoBufferPresent if the data buffer does not have an associated
        offsets buffer.
        """
        array = self._col
        n = len(array.buffers())
        if n == 2:
            raise NoBufferPresent(
                "This column has a fixed-length dtype so "
                "it does not have an offsets buffer"
            )
        elif n == 3:
            # Define the dtype of the returned buffer
            dtype = self._col.type
            if pa.types.is_large_string(dtype):
                dtype = (DtypeKind.INT, 64, "l", Endianness.NATIVE)
            else:
                dtype = (DtypeKind.INT, 32, "i", Endianness.NATIVE)
            return _PyArrowBuffer(array.buffers()[1]), dtype

View File

@@ -0,0 +1,217 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations
from typing import (
Any,
Iterable,
Optional,
Sequence,
)
import pyarrow as pa
from pyarrow.interchange.column import _PyArrowColumn
class _PyArrowDataFrame:
    """
    A data frame class, with only the methods required by the interchange
    protocol defined.

    A "data frame" represents an ordered collection of named columns.
    A column's "name" must be a unique string.
    Columns may be accessed by name or by position.

    This could be a public data frame class, or an object with the methods and
    attributes defined on this DataFrame class could be returned from the
    ``__dataframe__`` method of a public data frame class in a library adhering
    to the dataframe interchange protocol specification.
    """

    def __init__(
        self, df: pa.Table | pa.RecordBatch,
        nan_as_null: bool = False,
        allow_copy: bool = True
    ) -> None:
        """
        Constructor - an instance of this (private) class is returned from
        `pa.Table.__dataframe__` or `pa.RecordBatch.__dataframe__`.

        Raises RuntimeError when ``nan_as_null`` is True, as that option is
        not supported.
        """
        self._df = df
        # ``nan_as_null`` is a keyword intended for the consumer to tell the
        # producer to overwrite null values in the data with ``NaN`` (or
        # ``NaT``).
        if nan_as_null is True:
            raise RuntimeError(
                "nan_as_null=True currently has no effect, "
                "use the default nan_as_null=False"
            )
        self._nan_as_null = nan_as_null
        self._allow_copy = allow_copy

    def __dataframe__(
        self, nan_as_null: bool = False, allow_copy: bool = True
    ) -> _PyArrowDataFrame:
        """
        Construct a new exchange object, potentially changing the parameters.
        ``nan_as_null`` is a keyword intended for the consumer to tell the
        producer to overwrite null values in the data with ``NaN``.
        It is intended for cases where the consumer does not support the bit
        mask or byte mask that is the producer's native representation.
        ``allow_copy`` is a keyword that defines whether or not the library is
        allowed to make a copy of the data. For example, copying data would be
        necessary if a library supports strided buffers, given that this
        protocol specifies contiguous buffers.
        """
        return _PyArrowDataFrame(self._df, nan_as_null, allow_copy)

    @property
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the data frame, as a dictionary with string keys. The
        contents of `metadata` may be anything, they are meant for a library
        to store information that it needs to, e.g., roundtrip losslessly or
        for two implementations to share data that is not (yet) part of the
        interchange protocol specification. For avoiding collisions with other
        entries, please name the keys with the name of the library
        followed by a period and the desired name, e.g, ``pandas.indexcol``.
        """
        # The metadata for the data frame, as a dictionary with string keys.
        # Add schema metadata here (pandas metadata or custom metadata)
        if self._df.schema.metadata:
            # Schema metadata keys and values are bytes: decode them and
            # prefix every key with "pyarrow."
            schema_metadata = {"pyarrow." + k.decode('utf8'): v.decode('utf8')
                               for k, v in self._df.schema.metadata.items()}
            return schema_metadata
        else:
            return {}

    def num_columns(self) -> int:
        """
        Return the number of columns in the DataFrame.
        """
        return self._df.num_columns

    def num_rows(self) -> int:
        """
        Return the number of rows in the DataFrame, if available.
        """
        return self._df.num_rows

    def num_chunks(self) -> int:
        """
        Return the number of chunks the DataFrame consists of.
        """
        if isinstance(self._df, pa.RecordBatch):
            return 1
        else:
            # pyarrow.Table can have columns with different number
            # of chunks so we take the number of chunks that
            # .to_batches() returns as it takes the min chunk size
            # of all the columns (to_batches is a zero copy method)
            batches = self._df.to_batches()
            return len(batches)

    def column_names(self) -> Iterable[str]:
        """
        Return an iterator yielding the column names.
        """
        return self._df.schema.names

    def get_column(self, i: int) -> _PyArrowColumn:
        """
        Return the column at the indicated position.
        """
        return _PyArrowColumn(self._df.column(i),
                              allow_copy=self._allow_copy)

    def get_column_by_name(self, name: str) -> _PyArrowColumn:
        """
        Return the column whose name is the indicated name.
        """
        return _PyArrowColumn(self._df.column(name),
                              allow_copy=self._allow_copy)

    def get_columns(self) -> Iterable[_PyArrowColumn]:
        """
        Return an iterator yielding the columns.
        """
        return [
            _PyArrowColumn(col, allow_copy=self._allow_copy)
            for col in self._df.columns
        ]

    def select_columns(self, indices: Sequence[int]) -> _PyArrowDataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by index.
        """
        return _PyArrowDataFrame(
            self._df.select(list(indices)), self._nan_as_null, self._allow_copy
        )

    def select_columns_by_name(
        self, names: Sequence[str]
    ) -> _PyArrowDataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by name.
        """
        return _PyArrowDataFrame(
            self._df.select(list(names)), self._nan_as_null, self._allow_copy
        )

    def get_chunks(
        self, n_chunks: Optional[int] = None
    ) -> Iterable[_PyArrowDataFrame]:
        """
        Return an iterator yielding the chunks.

        By default (None), yields the chunks that the data is stored as by the
        producer. If given, ``n_chunks`` must be a multiple of
        ``self.num_chunks()``, meaning the producer must subdivide each chunk
        before yielding it.

        Note that the producer must ensure that all columns are chunked the
        same way.
        """
        # Subdivide chunks
        if n_chunks and n_chunks > 1:
            # Rows per chunk, rounded up so that n_chunks chunks cover all rows
            chunk_size, remainder = divmod(self.num_rows(), n_chunks)
            if remainder != 0:
                chunk_size += 1
            if isinstance(self._df, pa.Table):
                batches = self._df.to_batches(max_chunksize=chunk_size)
            else:
                # Iterate over the chunk index rather than over a row range
                # with step ``chunk_size``: a batch without rows has
                # chunk_size == 0 and ``range`` rejects a zero step.
                batches = [
                    self._df.slice(chunk_index * chunk_size, chunk_size)
                    for chunk_index in range(n_chunks)
                ]
            # In case when the size of the chunk is such that the resulting
            # list is one less chunk then n_chunks -> append an empty chunk
            if len(batches) == n_chunks - 1:
                # A zero-length slice of an existing batch keeps the full
                # schema whatever the number of columns. Building the batch
                # from a single empty list only fits one-column schemas.
                batches.append(batches[-1].slice(0, 0))
        # yields the chunks that the data is stored as
        else:
            if isinstance(self._df, pa.Table):
                batches = self._df.to_batches()
            else:
                batches = [self._df]

        # Create an iterator of RecordBatches
        iterator = [_PyArrowDataFrame(batch,
                                      self._nan_as_null,
                                      self._allow_copy)
                    for batch in batches]
        return iterator

View File

@@ -0,0 +1,614 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations
from typing import (
Any,
Tuple,
)
from pyarrow.interchange.column import (
DtypeKind,
ColumnBuffers,
ColumnNullType,
)
import pyarrow as pa
import re
import pyarrow.compute as pc
from pyarrow.interchange.column import Dtype
# A typing protocol could be added later to let Mypy validate code using
# `from_dataframe` better.
# Placeholder aliases for the interchange objects handled in this module.
DataFrameObject = Any
ColumnObject = Any
BufferObject = Any
# Lookup from an interchange (kind, bit width) pair to a PyArrow data type:
# the outer key is the kind, the inner key is the bit width. Boolean data
# with a bit width of 8 (byte-packed) is read as uint8.
_PYARROW_DTYPES: dict[DtypeKind, dict[int, Any]] = {
    DtypeKind.INT: {8: pa.int8(),
                    16: pa.int16(),
                    32: pa.int32(),
                    64: pa.int64()},
    DtypeKind.UINT: {8: pa.uint8(),
                     16: pa.uint16(),
                     32: pa.uint32(),
                     64: pa.uint64()},
    DtypeKind.FLOAT: {16: pa.float16(),
                      32: pa.float32(),
                      64: pa.float64()},
    DtypeKind.BOOL: {1: pa.bool_(),
                     8: pa.uint8()},
    DtypeKind.STRING: {8: pa.string()},
}
def from_dataframe(df: DataFrameObject, allow_copy=True) -> pa.Table:
    """
    Build a ``pa.Table`` from any DataFrame supporting the interchange protocol.

    PyArrow tables are returned as they are and record batches are wrapped
    into a table; anything else has to provide ``__dataframe__``.

    Parameters
    ----------
    df : DataFrameObject
        Object supporting the interchange protocol, i.e. `__dataframe__`
        method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then zero-copy approach is requested).

    Returns
    -------
    pa.Table

    Raises
    ------
    ValueError
        If ``df`` has no ``__dataframe__`` attribute.

    Examples
    --------
    >>> import pyarrow
    >>> from pyarrow.interchange import from_dataframe

    Convert a pandas dataframe to a pyarrow table:

    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...         "n_attendees": [100, 10, 1],
    ...         "country": ["Italy", "Spain", "Slovenia"],
    ...     })
    >>> df
       n_attendees   country
    0          100     Italy
    1           10     Spain
    2            1  Slovenia
    >>> from_dataframe(df)
    pyarrow.Table
    n_attendees: int64
    country: large_string
    ----
    n_attendees: [[100,10,1]]
    country: [["Italy","Spain","Slovenia"]]
    """
    # PyArrow's own containers need no conversion through the protocol
    if isinstance(df, pa.Table):
        return df
    if isinstance(df, pa.RecordBatch):
        return pa.Table.from_batches([df])

    if not hasattr(df, "__dataframe__"):
        raise ValueError("`df` does not support __dataframe__")

    interchange_df = df.__dataframe__(allow_copy=allow_copy)
    return _from_dataframe(interchange_df, allow_copy=allow_copy)
def _from_dataframe(df: DataFrameObject, allow_copy=True):
    """
    Build a ``pa.Table`` from the DataFrame interchange object.

    Every chunk of ``df`` is converted into one record batch. When ``df``
    yields no chunks at all, ``df`` itself is converted as a single chunk.

    Parameters
    ----------
    df : DataFrameObject
        Object supporting the interchange protocol, i.e. `__dataframe__`
        method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then zero-copy approach is requested).

    Returns
    -------
    pa.Table
    """
    batches = [
        protocol_df_chunk_to_pyarrow(chunk, allow_copy)
        for chunk in df.get_chunks()
    ]

    if not batches:
        # The fallback must honour ``allow_copy`` as well, otherwise a
        # zero-copy request would silently permit copies on this path.
        batches.append(protocol_df_chunk_to_pyarrow(df, allow_copy))

    return pa.Table.from_batches(batches)
def protocol_df_chunk_to_pyarrow(
    df: DataFrameObject,
    allow_copy: bool = True
) -> pa.RecordBatch:
    """
    Convert interchange protocol chunk to ``pa.RecordBatch``.

    Parameters
    ----------
    df : DataFrameObject
        Object supporting the interchange protocol, i.e. `__dataframe__`
        method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then zero-copy approach is requested).

    Returns
    -------
    pa.RecordBatch

    Raises
    ------
    ValueError
        If a column name is not a string or appears more than once.
    NotImplementedError
        If a column has a dtype kind without a converter.
    """
    # Converted columns, keyed by name, in the order they were visited
    arrays: dict[str, pa.Array] = {}

    for name in df.column_names():
        if not isinstance(name, str):
            raise ValueError(f"Column {name} is not a string")
        if name in arrays:
            raise ValueError(f"Column {name} is not unique")

        col = df.get_column_by_name(name)
        kind = col.dtype[0]

        # Choose the converter for this kind of column first, run it after
        if kind in (
            DtypeKind.INT,
            DtypeKind.UINT,
            DtypeKind.FLOAT,
            DtypeKind.STRING,
            DtypeKind.DATETIME,
        ):
            convert = column_to_array
        elif kind == DtypeKind.BOOL:
            convert = bool_column_to_array
        elif kind == DtypeKind.CATEGORICAL:
            convert = categorical_column_to_dictionary
        else:
            raise NotImplementedError(f"Data type {kind} not handled yet")

        arrays[name] = convert(col, allow_copy)

    return pa.RecordBatch.from_pydict(arrays)
def column_to_array(
    col: ColumnObject,
    allow_copy: bool = True,
) -> pa.Array:
    """
    Convert a column holding one of the primitive dtypes to a PyArrow array.
    A primitive type is one of: int, uint, float, bool (1 bit).

    Parameters
    ----------
    col : ColumnObject
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then zero-copy approach is requested).

    Returns
    -------
    pa.Array
    """
    column_buffers = col.get_buffers()
    column_dtype = col.dtype
    # Everything needed to rebuild the array is read from the column itself
    return buffers_to_array(
        column_buffers,
        column_dtype,
        col.size(),
        col.describe_null,
        col.offset,
        allow_copy,
    )
def bool_column_to_array(
    col: ColumnObject,
    allow_copy: bool = True,
) -> pa.Array:
    """
    Convert a column holding boolean dtype to a PyArrow array.

    Parameters
    ----------
    col : ColumnObject
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then zero-copy approach is requested).

    Returns
    -------
    pa.Array

    Raises
    ------
    RuntimeError
        If the booleans are byte-packed (bit width 8) and ``allow_copy`` is
        False, as the cast to bit-packed booleans requires a copy.
    """
    buffers = col.get_buffers()
    # Bit width declared by the dtype of the data buffer
    size = buffers["data"][1][1]

    # If booleans are byte-packed a copy to bit-packed will be made
    if size == 8 and not allow_copy:
        raise RuntimeError(
            "Boolean column will be casted from uint8 and a copy "
            "is required which is forbidden by allow_copy=False"
        )

    data_type = col.dtype
    # Forward ``allow_copy`` exactly as ``column_to_array`` does, so the
    # zero-copy request also applies to whatever ``buffers_to_array`` does.
    data = buffers_to_array(buffers, data_type,
                            col.size(),
                            col.describe_null,
                            col.offset,
                            allow_copy)
    if size == 8:
        data = pc.cast(data, pa.bool_())

    return data
def categorical_column_to_dictionary(
    col: ColumnObject,
    allow_copy: bool = True,
) -> pa.DictionaryArray:
    """
    Build a pa.DictionaryArray from a categorical interchange column.

    The categories column becomes the dictionary and the column's own
    data buffer supplies the indices.

    Parameters
    ----------
    col : ColumnObject
        Categorical column to convert.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then zero-copy approach is requested).

    Returns
    -------
    pa.DictionaryArray

    Raises
    ------
    RuntimeError
        If ``allow_copy`` is False.
    NotImplementedError
        If the categorical description is not flagged as a dictionary.
    """
    # NOTE(review): the message mentions uint8, which looks copied from
    # the boolean conversion path - confirm the intended wording.
    if not allow_copy:
        raise RuntimeError(
            "Categorical column will be casted from uint8 and a copy "
            "is required which is forbidden by allow_copy=False"
        )

    cat_info = col.describe_categorical
    if not cat_info["is_dictionary"]:
        raise NotImplementedError(
            "Non-dictionary categoricals not supported yet")

    # Dictionary values come from the dedicated categories column.
    dictionary = column_to_array(cat_info["categories"])

    # The indices must be read with the dtype of the data *buffer*,
    # not with the dtype reported by the column itself.
    col_buffers = col.get_buffers()
    _, index_dtype = col_buffers["data"]
    indices = buffers_to_array(
        col_buffers,
        index_dtype,
        col.size(),
        col.describe_null,
        col.offset,
    )

    return pa.DictionaryArray.from_arrays(indices, dictionary)
def parse_datetime_format_str(format_str):
    """
    Split a timestamp format string ``ts{unit}:{tz}`` into unit and tz.

    Parameters
    ----------
    format_str : str
        Format string starting with ``ts``, a one-letter unit
        (``s``, ``m``, ``u`` or ``n``), a colon, and then the timezone
        (which may be empty).

    Returns
    -------
    tuple of (str, str)
        The unit spelled as ``"s"``, ``"ms"``, ``"us"`` or ``"ns"`` and
        the timezone text found after the colon.

    Raises
    ------
    NotImplementedError
        If ``format_str`` does not start with a timestamp description.
    """
    timestamp_meta = re.match(r"ts([smun]):(.*)", format_str)
    if timestamp_meta is None:
        raise NotImplementedError(
            f"DateTime kind is not supported: {format_str}")

    unit, tz = timestamp_meta.groups()
    # Only the first letter of the unit is encoded in the format string;
    # every unit except plain seconds gets the trailing "s" appended:
    # 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns'
    if unit != "s":
        unit = f"{unit}s"
    return unit, tz
def map_date_type(data_type):
    """
    Return the PyArrow data type for an interchange dtype tuple.

    Parameters
    ----------
    data_type : tuple
        Dtype description ``(kind, bit-width, format string, endianness)``.

    Returns
    -------
    The PyArrow type: a ``pa.timestamp`` for datetime kinds, otherwise
    the entry found in ``_PYARROW_DTYPES`` for the kind and bit width.

    Raises
    ------
    NotImplementedError
        If no entry exists for the given kind and bit width.
    """
    kind, bit_width, f_string, _ = data_type

    # Datetimes carry their unit and timezone in the format string.
    if kind == DtypeKind.DATETIME:
        unit, tz = parse_datetime_format_str(f_string)
        return pa.timestamp(unit, tz=tz)

    pa_dtype = _PYARROW_DTYPES.get(kind, {}).get(bit_width)
    if not pa_dtype:
        # Error if dtype is not supported
        raise NotImplementedError(
            f"Conversion for {data_type} is not yet supported.")
    return pa_dtype
def buffers_to_array(
    buffers: ColumnBuffers,
    data_type: Tuple[DtypeKind, int, str, str],
    length: int,
    describe_null: ColumnNullType,
    offset: int = 0,
    allow_copy: bool = True,
) -> pa.Array:
    """
    Assemble a PyArrow array from the buffers of an interchange column.

    Parameters
    ----------
    buffers : ColumnBuffers
        Dictionary with the keys ``"data"``, ``"validity"`` and
        ``"offsets"``; entries are tuples of an underlying buffer and
        its associated dtype.
    data_type : Tuple[DtypeKind, int, str, str],
        Dtype description of the column as a tuple ``(kind, bit-width,
        format string, endianness)``.
    length : int
        The number of values in the array.
    describe_null : ColumnNullType
        Null representation the column dtype uses,
        as a tuple ``(kind, value)``
    offset : int, default: 0
        Number of elements to offset from the start of the buffer.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then zero-copy approach is requested).

    Returns
    -------
    pa.Array

    Notes
    -----
    The returned array doesn't own the memory. The caller of this function
    is responsible for keeping the memory owner object alive as long as
    the returned PyArrow array is being used.
    """
    data_buff, _ = buffers["data"]

    # Unpacking raises TypeError when an entry is not a (buffer, dtype)
    # pair (presumably None for an absent buffer); treat it as missing.
    try:
        validity_buff, validity_dtype = buffers["validity"]
    except TypeError:
        validity_buff = None
    try:
        offset_buff, offset_dtype = buffers["offsets"]
    except TypeError:
        offset_buff = None

    # Wrap the data memory in a pyarrow Buffer
    data_pa_buffer = pa.foreign_buffer(
        data_buff.ptr, data_buff.bufsize, base=data_buff)

    # Validity comes either from an explicit mask buffer or has to be
    # derived from NaN / sentinel values found in the data itself.
    if validity_buff:
        validity_pa_buff = validity_buffer_from_mask(
            validity_buff, validity_dtype, describe_null,
            length, offset, allow_copy)
    else:
        validity_pa_buff = validity_buffer_nan_sentinel(
            data_pa_buffer, data_type, describe_null,
            length, offset, allow_copy)

    data_dtype = map_date_type(data_type)

    # No offsets buffer: two-buffer layout (validity, data)
    if not offset_buff:
        return pa.Array.from_buffers(
            data_dtype,
            length,
            [validity_pa_buff, data_pa_buffer],
            offset=offset,
        )

    # Offsets buffer present: string layout (validity, offsets, data)
    _, offset_bit_width, _, _ = offset_dtype
    offset_pa_buffer = pa.foreign_buffer(
        offset_buff.ptr, offset_buff.bufsize, base=offset_buff)

    # Format string 'U' or 64-bit offsets select the large string type
    if data_type[2] == 'U' or offset_bit_width == 64:
        string_type = pa.large_string()
    else:
        string_type = pa.string()

    return pa.Array.from_buffers(
        string_type,
        length,
        [validity_pa_buff, offset_pa_buffer, data_pa_buffer],
        offset=offset,
    )
def validity_buffer_from_mask(
    validity_buff: BufferObject,
    validity_dtype: Dtype,
    describe_null: ColumnNullType,
    length: int,
    offset: int = 0,
    allow_copy: bool = True,
) -> pa.Buffer:
    """
    Turn an interchange mask buffer into a PyArrow validity buffer.

    Parameters
    ----------
    validity_buff : BufferObject
        Underlying validity (mask) buffer.
    validity_dtype : Dtype
        Dtype description as a tuple ``(kind, bit-width, format string,
        endianness)``; the kind must be boolean.
    describe_null : ColumnNullType
        Null representation the column dtype uses,
        as a tuple ``(kind, value)``
    length : int
        The number of values in the array.
    offset : int, default: 0
        Number of elements to offset from the start of the buffer.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then zero-copy approach is requested).

    Returns
    -------
    pa.Buffer or None
        ``None`` when the column is described as non-nullable.

    Raises
    ------
    RuntimeError
        If a byte mask has to be converted and ``allow_copy`` is False.
    NotImplementedError
        If the null representation is not one of the handled cases.
    """
    null_kind, sentinel_val = describe_null
    validity_kind, _, _, _ = validity_dtype
    assert validity_kind == DtypeKind.BOOL

    # Sliced array can have a NON_NULLABLE ColumnNullType due
    # to no missing values in that slice of an array though the bitmask
    # exists and validity_buff must be set to None in this case
    if null_kind == ColumnNullType.NON_NULLABLE:
        return None

    is_bytemask = null_kind == ColumnNullType.USE_BYTEMASK
    is_bitmask = null_kind == ColumnNullType.USE_BITMASK

    # A bitmask where 0 marks a null is wrapped and returned unchanged.
    if is_bitmask and sentinel_val == 0:
        return pa.foreign_buffer(validity_buff.ptr,
                                 validity_buff.bufsize,
                                 base=validity_buff)

    if not (is_bytemask or (is_bitmask and sentinel_val == 1)):
        raise NotImplementedError(
            f"{describe_null} null representation is not yet supported.")

    buff = pa.foreign_buffer(validity_buff.ptr,
                             validity_buff.bufsize,
                             base=validity_buff)

    if is_bytemask:
        # One byte per value has to be cast down to one bit per value.
        if not allow_copy:
            raise RuntimeError(
                "To create a bitmask a copy of the data is "
                "required which is forbidden by allow_copy=False"
            )
        byte_mask = pa.Array.from_buffers(pa.int8(), length,
                                          [None, buff],
                                          offset=offset)
        mask_bool = pc.cast(byte_mask, pa.bool_())
    else:
        mask_bool = pa.Array.from_buffers(pa.bool_(), length,
                                          [None, buff],
                                          offset=offset)

    # The mask flags nulls with 1: flip it so that 1 means valid.
    if sentinel_val == 1:
        mask_bool = pc.invert(mask_bool)

    # NOTE(review): this buffer belongs to a computed array; confirm its
    # bit offset matches the ``offset`` the caller applies afterwards.
    return mask_bool.buffers()[1]
def validity_buffer_nan_sentinel(
    data_pa_buffer: BufferObject,
    data_type: Dtype,
    describe_null: ColumnNullType,
    length: int,
    offset: int = 0,
    allow_copy: bool = True,
) -> pa.Buffer:
    """
    Derive a PyArrow validity buffer from NaN or sentinel values.

    Parameters
    ----------
    data_pa_buffer : pa.Buffer
        PyArrow buffer for the column data.
    data_type : Dtype
        Dtype description as a tuple ``(kind, bit-width, format string,
        endianness)``.
    describe_null : ColumnNullType
        Null representation the column dtype uses,
        as a tuple ``(kind, value)``
    length : int
        The number of values in the array.
    offset : int, default: 0
        Number of elements to offset from the start of the buffer.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then zero-copy approach is requested).

    Returns
    -------
    pa.Buffer or None
        ``None`` when the column is described as non-nullable.

    Raises
    ------
    RuntimeError
        If a mask has to be computed and ``allow_copy`` is False.
    NotImplementedError
        For float16 data with NaN nulls, or for a null representation
        that is not handled here.
    """
    kind, bit_width, _, _ = data_type
    data_dtype = map_date_type(data_type)
    null_kind, sentinel_val = describe_null

    copy_forbidden = (
        "To create a bitmask a copy of the data is "
        "required which is forbidden by allow_copy=False"
    )

    # NOTE(review): the returned buffers belong to computed arrays;
    # confirm their bit offset matches the ``offset`` the caller applies.

    # Nulls encoded as float NaN
    if null_kind == ColumnNullType.USE_NAN:
        if not allow_copy:
            raise RuntimeError(copy_forbidden)
        if kind == DtypeKind.FLOAT and bit_width == 16:
            # 'pyarrow.compute.is_nan' kernel not yet implemented
            # for float16
            raise NotImplementedError(
                f"{data_type} with {null_kind} is not yet supported.")
        values = pa.Array.from_buffers(
            data_dtype,
            length,
            [None, data_pa_buffer],
            offset=offset,
        )
        valid = pc.invert(pc.is_nan(values))
        return valid.buffers()[1]

    # Nulls encoded as a sentinel value
    if null_kind == ColumnNullType.USE_SENTINEL:
        if not allow_copy:
            raise RuntimeError(copy_forbidden)
        # Datetime data is read as int64 for the sentinel comparison.
        if kind == DtypeKind.DATETIME:
            sentinel_dtype = pa.int64()
        else:
            sentinel_dtype = data_dtype
        values = pa.Array.from_buffers(
            sentinel_dtype,
            length,
            [None, data_pa_buffer],
            offset=offset,
        )
        valid = pc.invert(pc.equal(values, sentinel_val))
        return valid.buffers()[1]

    # Nothing to build when the column cannot hold nulls.
    if null_kind == ColumnNullType.NON_NULLABLE:
        return None

    raise NotImplementedError(
        f"{describe_null} null representation is not yet supported.")