Initial commit

venv/lib/python3.10/site-packages/pyarrow/acero.py (new file, 418 lines)

@@ -0,0 +1,418 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# ---------------------------------------------------------------------
# Implement Internal ExecPlan bindings

# cython: profile=False
# distutils: language = c++
# cython: language_level = 3

from pyarrow.lib import Table, RecordBatch, array
from pyarrow.compute import Expression, field

try:
    from pyarrow._acero import (  # noqa
        Declaration,
        ExecNodeOptions,
        TableSourceNodeOptions,
        FilterNodeOptions,
        ProjectNodeOptions,
        AggregateNodeOptions,
        OrderByNodeOptions,
        HashJoinNodeOptions,
        AsofJoinNodeOptions,
    )
except ImportError as exc:
    raise ImportError(
        f"The pyarrow installation is not built with support for 'acero' ({str(exc)})"
    ) from None


try:
    import pyarrow.dataset as ds
    from pyarrow._dataset import ScanNodeOptions
except ImportError:
    # Provide a stub so isinstance() checks against ds.Dataset and
    # ds.InMemoryDataset still work when the dataset module is unavailable.
    class DatasetModuleStub:
        class Dataset:
            pass

        class InMemoryDataset:
            pass
    ds = DatasetModuleStub


def _dataset_to_decl(dataset, use_threads=True, implicit_ordering=False):
    decl = Declaration("scan", ScanNodeOptions(
        dataset, use_threads=use_threads,
        implicit_ordering=implicit_ordering))

    # Get rid of the special dataset columns
    # "__fragment_index", "__batch_index", "__last_in_fragment", "__filename"
    projections = [field(f) for f in dataset.schema.names]
    decl = Declaration.from_sequence(
        [decl, Declaration("project", ProjectNodeOptions(projections))]
    )

    filter_expr = dataset._scan_options.get("filter")
    if filter_expr is not None:
        # Filters applied in CScanNodeOptions are "best effort" for the scan
        # node itself, so we always need to inject an additional Filter node
        # to apply them for real.
        decl = Declaration.from_sequence(
            [decl, Declaration("filter", FilterNodeOptions(filter_expr))]
        )

    return decl
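
# A minimal usage sketch of the helper above (illustrative only, not part of
# the module; the in-memory dataset is a hypothetical stand-in). It shows how
# a filtered dataset turns into a scan -> project -> filter declaration chain:
#
#   import pyarrow as pa
#   import pyarrow.dataset as ds
#   dataset = ds.dataset(pa.table({"x": [1, 2, 3]})).filter(ds.field("x") > 1)
#   decl = _dataset_to_decl(dataset)
#   decl.to_table()  # rows where x > 1; the best-effort scan filter is
#                    # re-applied by the injected Filter node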


def _perform_join(join_type, left_operand, left_keys,
                  right_operand, right_keys,
                  left_suffix=None, right_suffix=None,
                  use_threads=True, coalesce_keys=False,
                  output_type=Table, filter_expression=None):
    """
    Perform a join of two tables or datasets.

    The result will be an output table with the result of the join operation.

    Parameters
    ----------
    join_type : str
        One of the supported join types.
    left_operand : Table or Dataset
        The left operand for the join operation.
    left_keys : str or list[str]
        The left key (or keys) on which the join operation should be performed.
    right_operand : Table or Dataset
        The right operand for the join operation.
    right_keys : str or list[str]
        The right key (or keys) on which the join operation should be performed.
    left_suffix : str, default None
        Which suffix to add to left column names. This prevents confusion
        when the columns in left and right operands have colliding names.
    right_suffix : str, default None
        Which suffix to add to right column names. This prevents confusion
        when the columns in left and right operands have colliding names.
    use_threads : bool, default True
        Whether to use multithreading or not.
    coalesce_keys : bool, default False
        Whether duplicated keys should be omitted from one of the sides
        in the join result.
    output_type : Table or InMemoryDataset
        The output type for the exec plan result.
    filter_expression : pyarrow.compute.Expression
        Residual filter which is applied to matching rows.

    Returns
    -------
    result_table : Table or InMemoryDataset
    """
    if not isinstance(left_operand, (Table, ds.Dataset)):
        raise TypeError(f"Expected Table or Dataset, got {type(left_operand)}")
    if not isinstance(right_operand, (Table, ds.Dataset)):
        raise TypeError(f"Expected Table or Dataset, got {type(right_operand)}")

    # Prepare the left and right table keys to send them to the C++ function
    left_keys_order = {}
    if not isinstance(left_keys, (tuple, list)):
        left_keys = [left_keys]
    for idx, key in enumerate(left_keys):
        left_keys_order[key] = idx

    right_keys_order = {}
    if not isinstance(right_keys, (tuple, list)):
        right_keys = [right_keys]
    for idx, key in enumerate(right_keys):
        right_keys_order[key] = idx

    # By default expose all columns on both the left and right table
    left_columns = left_operand.schema.names
    right_columns = right_operand.schema.names

    # Pick the columns to expose based on the join type
    if join_type in ("left semi", "left anti"):
        right_columns = []
    elif join_type in ("right semi", "right anti"):
        left_columns = []
    elif join_type in ("inner", "left outer"):
        right_columns = [
            col for col in right_columns if col not in right_keys_order
        ]
    elif join_type == "right outer":
        left_columns = [
            col for col in left_columns if col not in left_keys_order
        ]
    # Turn the columns into vectors of FieldRefs
    # and set aside the indices of the keys.
    left_column_keys_indices = {}
    for idx, colname in enumerate(left_columns):
        if colname in left_keys:
            left_column_keys_indices[colname] = idx
    right_column_keys_indices = {}
    for idx, colname in enumerate(right_columns):
        if colname in right_keys:
            right_column_keys_indices[colname] = idx

    # Add the join node to the execplan
    if isinstance(left_operand, ds.Dataset):
        left_source = _dataset_to_decl(left_operand, use_threads=use_threads)
    else:
        left_source = Declaration("table_source", TableSourceNodeOptions(left_operand))
    if isinstance(right_operand, ds.Dataset):
        right_source = _dataset_to_decl(right_operand, use_threads=use_threads)
    else:
        right_source = Declaration(
            "table_source", TableSourceNodeOptions(right_operand)
        )

    if coalesce_keys:
        join_opts = HashJoinNodeOptions(
            join_type, left_keys, right_keys, left_columns, right_columns,
            output_suffix_for_left=left_suffix or "",
            output_suffix_for_right=right_suffix or "",
            filter_expression=filter_expression,
        )
    else:
        join_opts = HashJoinNodeOptions(
            join_type, left_keys, right_keys,
            output_suffix_for_left=left_suffix or "",
            output_suffix_for_right=right_suffix or "",
            filter_expression=filter_expression,
        )
    decl = Declaration(
        "hashjoin", options=join_opts, inputs=[left_source, right_source]
    )
    if coalesce_keys and join_type == "full outer":
        # In the case of full outer joins, the join operation will output all
        # columns, so that we can coalesce the keys and exclude duplicates
        # in a subsequent projection.
        left_columns_set = set(left_columns)
        right_columns_set = set(right_columns)
        # Where the right table columns start.
        right_operand_index = len(left_columns)
        projected_col_names = []
        projections = []
        for idx, col in enumerate(left_columns + right_columns):
            if idx < len(left_columns) and col in left_column_keys_indices:
                # Include keys only once and coalesce left+right table keys.
                projected_col_names.append(col)
                # Get the index of the right key that is being paired
                # with this left key. We do so by retrieving the name
                # of the right key that is in the same position in the
                # provided keys and then looking up the index for that
                # name in the right table.
                right_key_index = right_column_keys_indices[
                    right_keys[left_keys_order[col]]]
                projections.append(
                    Expression._call("coalesce", [
                        Expression._field(idx), Expression._field(
                            right_operand_index + right_key_index)
                    ])
                )
            elif idx >= right_operand_index and col in right_column_keys_indices:
                # Do not include right table keys, as they would lead to
                # duplicated keys.
                continue
            else:
                # Include all the other columns as they are, but recompute
                # the suffixes that the join produced, as the projection
                # would lose them otherwise.
                if (
                    left_suffix and idx < right_operand_index
                    and col in right_columns_set
                ):
                    col += left_suffix
                if (
                    right_suffix and idx >= right_operand_index
                    and col in left_columns_set
                ):
                    col += right_suffix
                projected_col_names.append(col)
                projections.append(
                    Expression._field(idx)
                )
        projection = Declaration(
            "project", ProjectNodeOptions(projections, projected_col_names)
        )
        decl = Declaration.from_sequence([decl, projection])

    result_table = decl.to_table(use_threads=use_threads)

    if output_type == Table:
        return result_table
    elif output_type == ds.InMemoryDataset:
        return ds.InMemoryDataset(result_table)
    else:
        raise TypeError("Unsupported output type")
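
# A minimal usage sketch for _perform_join (illustrative only, not part of
# the module; the tables are hypothetical). An inner join on "key" drops the
# right key column and exposes the remaining columns from both sides:
#
#   import pyarrow as pa
#   t1 = pa.table({"key": [1, 2, 3], "a": ["x", "y", "z"]})
#   t2 = pa.table({"key": [2, 3, 4], "b": [10, 20, 30]})
#   _perform_join("inner", t1, "key", t2, "key")
#   # -> Table with columns key, a, b and the two matching rows (keys 2, 3)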


def _perform_join_asof(left_operand, left_on, left_by,
                       right_operand, right_on, right_by,
                       tolerance, use_threads=True,
                       output_type=Table):
    """
    Perform an asof join of two tables or datasets.

    The result will be an output table with the result of the join operation.

    Parameters
    ----------
    left_operand : Table or Dataset
        The left operand for the join operation.
    left_on : str
        The left "on" key, an ordered column on which the asof join should
        be performed.
    left_by : str or list[str]
        The left "by" key (or keys) on which exact equality matching should
        be performed.
    right_operand : Table or Dataset
        The right operand for the join operation.
    right_on : str
        The right "on" key, an ordered column on which the asof join should
        be performed.
    right_by : str or list[str]
        The right "by" key (or keys) on which exact equality matching should
        be performed.
    tolerance : int
        The tolerance to use for the asof join. The tolerance is interpreted
        in the same units as the "on" key.
    output_type : Table or InMemoryDataset
        The output type for the exec plan result.

    Returns
    -------
    result_table : Table or InMemoryDataset
    """
    if not isinstance(left_operand, (Table, ds.Dataset)):
        raise TypeError(f"Expected Table or Dataset, got {type(left_operand)}")
    if not isinstance(right_operand, (Table, ds.Dataset)):
        raise TypeError(f"Expected Table or Dataset, got {type(right_operand)}")

    if not isinstance(left_by, (tuple, list)):
        left_by = [left_by]
    if not isinstance(right_by, (tuple, list)):
        right_by = [right_by]

    # AsofJoin does not return the "on" or "by" columns for the right operand.
    right_columns = [
        col for col in right_operand.schema.names
        if col not in [right_on] + right_by
    ]
    columns_collisions = set(left_operand.schema.names) & set(right_columns)
    if columns_collisions:
        raise ValueError(
            f"Columns {columns_collisions} present in both tables. "
            "AsofJoin does not support column collisions."
        )

    # Add the join node to the execplan
    if isinstance(left_operand, ds.Dataset):
        left_source = _dataset_to_decl(
            left_operand,
            use_threads=use_threads,
            implicit_ordering=True)
    else:
        left_source = Declaration(
            "table_source", TableSourceNodeOptions(left_operand),
        )
    if isinstance(right_operand, ds.Dataset):
        right_source = _dataset_to_decl(
            right_operand, use_threads=use_threads,
            implicit_ordering=True)
    else:
        right_source = Declaration(
            "table_source", TableSourceNodeOptions(right_operand)
        )

    join_opts = AsofJoinNodeOptions(
        left_on, left_by, right_on, right_by, tolerance
    )
    decl = Declaration(
        "asofjoin", options=join_opts, inputs=[left_source, right_source]
    )

    result_table = decl.to_table(use_threads=use_threads)

    if output_type == Table:
        return result_table
    elif output_type == ds.InMemoryDataset:
        return ds.InMemoryDataset(result_table)
    else:
        raise TypeError("Unsupported output type")
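
# A minimal usage sketch for _perform_join_asof (illustrative only, not part
# of the module; the tables are hypothetical, and the assumption here is that
# a negative tolerance matches the closest earlier right "on" value within
# that window). Right "on"/"by" columns are dropped from the output:
#
#   import pyarrow as pa
#   trades = pa.table({"time": [2, 6], "id": ["a", "a"], "px": [10.5, 12.1]})
#   quotes = pa.table({"time": [1, 3, 5], "id": ["a", "a", "a"],
#                      "bid": [10, 11, 12]})
#   _perform_join_asof(trades, "time", "id", quotes, "time", "id",
#                      tolerance=-2)
#   # -> Table with columns time, id, px, bid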


def _filter_table(table, expression):
    """Filter rows of a table based on the provided expression.

    The result will be an output table with only the rows matching
    the provided expression.

    Parameters
    ----------
    table : Table or RecordBatch
        Table or batch that should be filtered.
    expression : Expression
        The boolean expression by which rows should be filtered.

    Returns
    -------
    Table or RecordBatch
    """
    is_batch = False
    if isinstance(table, RecordBatch):
        table = Table.from_batches([table])
        is_batch = True

    decl = Declaration.from_sequence([
        Declaration("table_source", options=TableSourceNodeOptions(table)),
        Declaration("filter", options=FilterNodeOptions(expression))
    ])
    result = decl.to_table(use_threads=True)
    if is_batch:
        if result.num_rows > 0:
            result = result.combine_chunks().to_batches()[0]
        else:
            # An empty result has no batches to extract, so build an empty
            # RecordBatch with the same schema instead.
            arrays = [array([], type=f.type) for f in result.schema]
            result = RecordBatch.from_arrays(arrays, schema=result.schema)
    return result
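
# A minimal usage sketch for _filter_table (illustrative only, not part of
# the module; the batch is hypothetical). A RecordBatch input yields a
# RecordBatch output, even when no rows match:
#
#   import pyarrow as pa
#   import pyarrow.compute as pc
#   batch = pa.record_batch({"x": [1, 2, 3]})
#   _filter_table(batch, pc.field("x") > 1)   # -> RecordBatch with rows 2, 3
#   _filter_table(batch, pc.field("x") > 9)   # -> empty RecordBatch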


def _sort_source(table_or_dataset, sort_keys, output_type=Table, **kwargs):
    """Sort a table or dataset by the given sort keys."""
    if isinstance(table_or_dataset, ds.Dataset):
        data_source = _dataset_to_decl(table_or_dataset, use_threads=True)
    else:
        data_source = Declaration(
            "table_source", TableSourceNodeOptions(table_or_dataset)
        )

    order_by = Declaration("order_by", OrderByNodeOptions(sort_keys, **kwargs))

    decl = Declaration.from_sequence([data_source, order_by])
    result_table = decl.to_table(use_threads=True)

    if output_type == Table:
        return result_table
    elif output_type == ds.InMemoryDataset:
        return ds.InMemoryDataset(result_table)
    else:
        raise TypeError("Unsupported output type")
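
# A minimal usage sketch for _sort_source (illustrative only, not part of the
# module; the table is hypothetical). Sort keys are (name, order) pairs:
#
#   import pyarrow as pa
#   t = pa.table({"x": [3, 1, 2]})
#   _sort_source(t, sort_keys=[("x", "ascending")])
#   # -> Table with x = [1, 2, 3]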


def _group_by(table, aggregates, keys, use_threads=True):
    """Aggregate a table by the given keys with the given aggregations."""
    decl = Declaration.from_sequence([
        Declaration("table_source", TableSourceNodeOptions(table)),
        Declaration("aggregate", AggregateNodeOptions(aggregates, keys=keys))
    ])
    return decl.to_table(use_threads=use_threads)
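
# A minimal usage sketch for _group_by (illustrative only, not part of the
# module; the table is hypothetical, and the assumption here is that grouped
# aggregations use the "hash_"-prefixed function names, as the callers in
# pyarrow do). Each aggregate is a tuple of
# (input column, function name, options, output column name):
#
#   import pyarrow as pa
#   t = pa.table({"k": ["a", "a", "b"], "v": [1, 2, 3]})
#   _group_by(t, [("v", "hash_sum", None, "v_sum")], keys=["k"])
#   # -> Table with columns k and v_sum: ("a", 3), ("b", 3)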