Initial commit
@@ -0,0 +1,623 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// This API is EXPERIMENTAL.

#pragma once

#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "arrow/acero/options.h"
#include "arrow/compute/expression.h"
#include "arrow/compute/type_fwd.h"
#include "arrow/dataset/dataset.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/io/interfaces.h"
#include "arrow/type_fwd.h"
#include "arrow/util/async_generator_fwd.h"
#include "arrow/util/iterator.h"
#include "arrow/util/thread_pool.h"
#include "arrow/util/type_fwd.h"

namespace arrow {

using RecordBatchGenerator = std::function<Future<std::shared_ptr<RecordBatch>>()>;

namespace dataset {

/// \defgroup dataset-scanning Scanning API
///
/// @{

constexpr int64_t kDefaultBatchSize = 1 << 17;  // 128Ki rows
// This will yield 64 batches ~ 8Mi rows
constexpr int32_t kDefaultBatchReadahead = 16;
constexpr int32_t kDefaultFragmentReadahead = 4;
constexpr int32_t kDefaultBytesReadahead = 1 << 25;  // 32MiB

/// Scan-specific options, which can be changed between scans of the same dataset.
struct ARROW_DS_EXPORT ScanOptions {
  /// A row filter (which will be pushed down to partitioning/reading if supported).
  compute::Expression filter = compute::literal(true);
  /// A projection expression (which can add/remove/rename columns).
  compute::Expression projection;

  /// Schema with which batches will be read from fragments. This is also known as the
  /// "reader schema"; it will be used (for example) in constructing CSV file readers to
  /// identify column types for parsing. Usually only a subset of its fields (see
  /// MaterializedFields) will be materialized during a scan.
  std::shared_ptr<Schema> dataset_schema;

  /// Schema of projected record batches. This is independent of dataset_schema as its
  /// fields are derived from the projection. For example, let
  ///
  ///   dataset_schema = {"a": int32, "b": int32, "id": utf8}
  ///   projection = project({equal(field_ref("a"), field_ref("b"))}, {"a_eq_b"})
  ///
  /// (no filter specified). In this case, the projected_schema would be
  ///
  ///   {"a_eq_b": boolean}
  std::shared_ptr<Schema> projected_schema;

  /// Maximum row count for scanned batches.
  int64_t batch_size = kDefaultBatchSize;

  /// How many batches to read ahead within a fragment.
  ///
  /// Set to 0 to disable batch readahead
  ///
  /// Note: May not be supported by all formats
  /// Note: Will be ignored if use_threads is set to false
  int32_t batch_readahead = kDefaultBatchReadahead;

  /// How many files to read ahead
  ///
  /// Set to 0 to disable fragment readahead
  ///
  /// Note: May not be enforced by all scanners
  /// Note: Will be ignored if use_threads is set to false
  int32_t fragment_readahead = kDefaultFragmentReadahead;

  /// A pool from which materialized and scanned arrays will be allocated.
  MemoryPool* pool = arrow::default_memory_pool();

  /// IOContext for any IO tasks
  ///
  /// Note: The IOContext executor will be ignored if use_threads is set to false
  io::IOContext io_context;

  /// Executor for any CPU tasks
  ///
  /// If null, the global CPU executor will be used
  ///
  /// Note: The Executor will be ignored if use_threads is set to false
  arrow::internal::Executor* cpu_executor = NULLPTR;

  /// If true the scanner will scan in parallel
  ///
  /// Note: If true, this will use threads from both the cpu_executor and the
  /// io_context.executor
  /// Note: This must be true in order for any readahead to happen
  bool use_threads = false;

  /// If true the scanner will add augmented fields to the output schema.
  bool add_augmented_fields = true;

  /// Whether to cache metadata when scanning.
  ///
  /// Fragments may typically cache metadata to speed up repeated accesses.
  /// However, in use cases where a single scan is done, or if memory use
  /// is more critical than CPU time, setting this option to false can
  /// lessen memory use.
  bool cache_metadata = true;

  /// Fragment-specific scan options.
  std::shared_ptr<FragmentScanOptions> fragment_scan_options;

  /// Return a vector of FieldRefs that require materialization.
  ///
  /// This is usually the union of the fields referenced in the projection and the
  /// filter expression. Examples:
  ///
  /// - `SELECT a, b WHERE a < 2 && c > 1` => ["a", "b", "a", "c"]
  /// - `SELECT a + b < 3 WHERE a > 1` => ["a", "b", "a"]
  ///
  /// This is needed for expressions where a field may not be directly
  /// used in the final projection but is still required to evaluate the
  /// expression.
  ///
  /// This is used by Fragment implementations to apply the column
  /// sub-selection optimization.
  std::vector<FieldRef> MaterializedFields() const;

  /// Parameters which control when the plan should pause for a slow consumer
  acero::BackpressureOptions backpressure =
      acero::BackpressureOptions::DefaultBackpressure();
};
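
// A minimal usage sketch (illustrative only, not part of the API): most callers go
// through ScannerBuilder, declared below, but ScanOptions can also be populated
// directly. The `dataset` variable and the column name "x" are assumptions made for
// the example.
//
//   auto options = std::make_shared<ScanOptions>();
//   options->dataset_schema = dataset->schema();
//   options->filter = compute::greater(compute::field_ref("x"), compute::literal(10));
//   options->batch_size = 1 << 15;  // smaller batches to bound per-batch memory
//   options->use_threads = true;    // required for batch/fragment readahead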

/// Scan-specific options, which can be changed between scans of the same dataset.
///
/// A dataset consists of one or more individual fragments. A fragment is anything
/// that is independently scannable, often a file.
///
/// Batches from all fragments will be converted to a single schema. This unified
/// schema is referred to as the "dataset schema" and is the output schema for
/// this node.
///
/// Individual fragments may have schemas that are different from the dataset
/// schema. This is sometimes referred to as the physical or fragment schema.
/// Conversion from the fragment schema to the dataset schema is a process
/// known as evolution.
struct ARROW_DS_EXPORT ScanV2Options : public acero::ExecNodeOptions {
  explicit ScanV2Options(std::shared_ptr<Dataset> dataset)
      : dataset(std::move(dataset)) {}

  /// \brief The dataset to scan
  std::shared_ptr<Dataset> dataset;
  /// \brief A row filter
  ///
  /// The filter expression should be written against the dataset schema.
  /// The filter must be unbound.
  ///
  /// This is an opportunistic pushdown filter. Filtering capabilities will
  /// vary between formats. If a format is not capable of applying the filter
  /// then it will ignore it.
  ///
  /// Each fragment will do its best to filter the data based on the information
  /// (partitioning guarantees, statistics) available to it. If it is able to
  /// apply some filtering then it will indicate what filtering it was able to
  /// apply by attaching a guarantee to the batch.
  ///
  /// For example, if a filter is x < 50 && y > 40 then a batch may be able to
  /// apply a guarantee x < 50. Post-scan filtering would then only need to
  /// consider y > 40 (for this specific batch). The next batch may not be able
  /// to attach any guarantee and both clauses would need to be applied to that batch.
  ///
  /// A single guarantee-aware filtering operation should generally be applied to all
  /// resulting batches. The scan node is not responsible for this.
  ///
  /// Fields that are referenced by the filter should be included in the `columns`
  /// vector. The scan node will not automatically fetch fields referenced by the
  /// filter expression. \see AddFieldsNeededForFilter
  ///
  /// If the filter references fields that are not included in `columns` this may or may
  /// not be an error, depending on the format.
  compute::Expression filter = compute::literal(true);

  /// \brief The columns to scan
  ///
  /// This is not a simple list of top-level column indices but instead a set of paths
  /// allowing for partial selection of columns
  ///
  /// These paths refer to the dataset schema
  ///
  /// For example, consider the following dataset schema:
  ///
  ///   schema({
  ///     field("score", int32()),
  ///     field("marker", struct_({
  ///       field("color", utf8()),
  ///       field("location", struct_({
  ///         field("x", float64()),
  ///         field("y", float64())
  ///       }))
  ///     }))
  ///   })
  ///
  /// If `columns` is {{0}, {1,1,0}} then the output schema is:
  ///   schema({field("score", int32()), field("x", float64())})
  ///
  /// If `columns` is {{1,1,1}, {1,1}} then the output schema is:
  ///   schema({
  ///     field("y", float64()),
  ///     field("location", struct_({
  ///       field("x", float64()),
  ///       field("y", float64())
  ///     }))
  ///   })
  std::vector<FieldPath> columns;

  /// \brief Target number of bytes to read ahead in a fragment
  ///
  /// This limit involves some amount of estimation. Formats typically only know
  /// batch boundaries in terms of rows (not decoded bytes) and so an estimation
  /// must be done to guess the average row size. Other formats like CSV and JSON
  /// must make even more generalized guesses.
  ///
  /// This is a best-effort guide. Some formats may need to read ahead further,
  /// for example, if scanning a parquet file that has batches with 100MiB of data
  /// then the actual readahead will be at least 100MiB
  ///
  /// Set to 0 to disable readahead. When disabled, the scanner will read the
  /// dataset one batch at a time
  ///
  /// This limit applies across all fragments. If the limit is 32MiB and the
  /// fragment readahead allows for 20 fragments to be read at once then the
  /// total readahead will still be 32MiB and NOT 20 * 32MiB.
  int32_t target_bytes_readahead = kDefaultBytesReadahead;

  /// \brief Number of fragments to read ahead
  ///
  /// Higher readahead will potentially lead to more efficient I/O but will lead
  /// to the scan operation using more RAM. The default is fairly conservative
  /// and designed for fast local disks (or slow local spinning disks which cannot
  /// handle much parallelism anyways). When using a highly parallel remote filesystem
  /// you will likely want to increase these values.
  ///
  /// Set to 0 to disable fragment readahead. When disabled the dataset will be scanned
  /// one fragment at a time.
  int32_t fragment_readahead = kDefaultFragmentReadahead;
  /// \brief Options specific to the file format
  const FragmentScanOptions* format_options = NULLPTR;

  /// \brief Utility method to get a selection representing all columns in a dataset
  static std::vector<FieldPath> AllColumns(const Schema& dataset_schema);

  /// \brief Utility method to add fields needed for the current filter
  ///
  /// This method adds any fields that are needed by `filter` which are not already
  /// included in the list of columns. Any new fields added will be added to the end
  /// in no particular order.
  static Status AddFieldsNeededForFilter(ScanV2Options* options);
};
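
// Illustrative sketch (assumptions: a `dataset` whose schema matches the example in
// the `columns` documentation above, and a filter on the top-level "score" column).
// It shows how `columns` and AddFieldsNeededForFilter work together; it is not a
// normative example.
//
//   ScanV2Options options(dataset);
//   // Select "score" and "marker.location.x"
//   options.columns = {FieldPath({0}), FieldPath({1, 1, 0})};
//   options.filter = compute::less(compute::field_ref("score"), compute::literal(50));
//   // Ensure fields referenced by the filter are also scanned ("score" is already
//   // selected, so this is a no-op here).
//   ARROW_RETURN_NOT_OK(ScanV2Options::AddFieldsNeededForFilter(&options));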

/// \brief Describes a projection
struct ARROW_DS_EXPORT ProjectionDescr {
  /// \brief The projection expression itself
  /// This expression must be a call to make_struct
  compute::Expression expression;
  /// \brief The output schema of the projection.
  ///
  /// This can be calculated from the input schema and the expression but it
  /// is cached here for convenience.
  std::shared_ptr<Schema> schema;

  /// \brief Create a ProjectionDescr by binding an expression to the dataset schema
  ///
  /// expression must return a struct type
  static Result<ProjectionDescr> FromStructExpression(
      const compute::Expression& expression, const Schema& dataset_schema);

  /// \brief Create a ProjectionDescr from expressions/names for each field
  static Result<ProjectionDescr> FromExpressions(std::vector<compute::Expression> exprs,
                                                 std::vector<std::string> names,
                                                 const Schema& dataset_schema);

  /// \brief Create a default projection referencing fields in the dataset schema
  static Result<ProjectionDescr> FromNames(std::vector<std::string> names,
                                           const Schema& dataset_schema,
                                           bool add_augmented_fields = true);

  /// \brief Make a projection that projects every field in the dataset schema
  static Result<ProjectionDescr> Default(const Schema& dataset_schema,
                                         bool add_augmented_fields = true);
};

/// \brief Utility method to set the projection expression and schema
ARROW_DS_EXPORT void SetProjection(ScanOptions* options, ProjectionDescr projection);
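
// Sketch of wiring a projection into ScanOptions (the `options` variable and the
// column names "a" and "b" are assumptions for illustration):
//
//   ARROW_ASSIGN_OR_RAISE(
//       auto projection,
//       ProjectionDescr::FromNames({"a", "b"}, *options->dataset_schema));
//   SetProjection(options.get(), std::move(projection));
//   // options->projection and options->projected_schema are now consistent.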

/// \brief Combines a record batch with the fragment that the record batch originated
/// from
///
/// Knowing the source fragment can be useful for debugging & understanding loaded
/// data
struct TaggedRecordBatch {
  std::shared_ptr<RecordBatch> record_batch;
  std::shared_ptr<Fragment> fragment;

  friend inline bool operator==(const TaggedRecordBatch& left,
                                const TaggedRecordBatch& right) {
    return left.record_batch == right.record_batch && left.fragment == right.fragment;
  }
};

using TaggedRecordBatchGenerator = std::function<Future<TaggedRecordBatch>()>;
using TaggedRecordBatchIterator = Iterator<TaggedRecordBatch>;

/// \brief Combines a tagged batch with positional information
///
/// This is returned when scanning batches in an unordered fashion. This information is
/// needed if you ever want to reassemble the batches in order
struct EnumeratedRecordBatch {
  Enumerated<std::shared_ptr<RecordBatch>> record_batch;
  Enumerated<std::shared_ptr<Fragment>> fragment;

  friend inline bool operator==(const EnumeratedRecordBatch& left,
                                const EnumeratedRecordBatch& right) {
    return left.record_batch == right.record_batch && left.fragment == right.fragment;
  }
};

using EnumeratedRecordBatchGenerator = std::function<Future<EnumeratedRecordBatch>()>;
using EnumeratedRecordBatchIterator = Iterator<EnumeratedRecordBatch>;
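
// The positional metadata allows an out-of-order scan to be reassembled. A rough
// sketch (assumes `scanner` is a Scanner, declared below):
//
//   ARROW_ASSIGN_OR_RAISE(auto unordered, scanner->ScanBatchesUnordered());
//   ARROW_ASSIGN_OR_RAISE(EnumeratedRecordBatch enumerated, unordered.Next());
//   // enumerated.fragment.index and enumerated.record_batch.index give the batch's
//   // position within the dataset; enumerated.record_batch.last marks the final
//   // batch of its fragment. Sorting on (fragment.index, record_batch.index)
//   // restores the in-order ScanBatches() sequence.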

/// @}

}  // namespace dataset

template <>
struct IterationTraits<dataset::TaggedRecordBatch> {
  static dataset::TaggedRecordBatch End() {
    return dataset::TaggedRecordBatch{NULLPTR, NULLPTR};
  }
  static bool IsEnd(const dataset::TaggedRecordBatch& val) {
    return val.record_batch == NULLPTR;
  }
};

template <>
struct IterationTraits<dataset::EnumeratedRecordBatch> {
  static dataset::EnumeratedRecordBatch End() {
    return dataset::EnumeratedRecordBatch{
        IterationEnd<Enumerated<std::shared_ptr<RecordBatch>>>(),
        IterationEnd<Enumerated<std::shared_ptr<dataset::Fragment>>>()};
  }
  static bool IsEnd(const dataset::EnumeratedRecordBatch& val) {
    return IsIterationEnd(val.fragment);
  }
};

namespace dataset {

/// \defgroup dataset-scanning Scanning API
///
/// @{

/// \brief A scanner glues together several dataset classes to load in data.
/// The dataset contains a collection of fragments and partitioning rules.
///
/// The fragments identify independently loadable units of data (i.e. each fragment has
/// a potentially unique schema and possibly even format; it should be possible to read
/// fragments in parallel if desired).
///
/// The fragment's format contains the logic necessary to actually create a task to load
/// the fragment into memory. That task may or may not support parallel execution of
/// its own.
///
/// The scanner is then responsible for creating scan tasks from every fragment in the
/// dataset and (potentially) sequencing the loaded record batches together.
///
/// The scanner should not buffer the entire dataset in memory (unless asked), instead
/// yielding record batches as soon as they are ready to scan. Various readahead
/// properties control how much data is allowed to be scanned before pausing to let a
/// slow consumer catch up.
///
/// Today the scanner also handles projection & filtering although that may change in
/// the future.
class ARROW_DS_EXPORT Scanner {
 public:
  virtual ~Scanner() = default;

  /// \brief Apply a visitor to each RecordBatch as it is scanned. If multiple threads
  /// are used (via use_threads), the visitor will be invoked from those threads and is
  /// responsible for any synchronization.
  virtual Status Scan(std::function<Status(TaggedRecordBatch)> visitor) = 0;
  /// \brief Convert a Scanner into a Table.
  ///
  /// Use this convenience utility with care. This will serially materialize the
  /// Scan result in memory before creating the Table.
  virtual Result<std::shared_ptr<Table>> ToTable() = 0;
  /// \brief Scan the dataset into a stream of record batches. Each batch is tagged
  /// with the fragment it originated from. The batches will arrive in order. The
  /// order of fragments is determined by the dataset.
  ///
  /// Note: The scanner will perform some readahead but will avoid materializing too
  /// much in memory (this is governed by the readahead options and use_threads option).
  /// If the readahead queue fills up then I/O will pause until the calling thread
  /// catches up.
  virtual Result<TaggedRecordBatchIterator> ScanBatches() = 0;
  virtual Result<TaggedRecordBatchGenerator> ScanBatchesAsync() = 0;
  virtual Result<TaggedRecordBatchGenerator> ScanBatchesAsync(
      ::arrow::internal::Executor* cpu_thread_pool) = 0;
  /// \brief Scan the dataset into a stream of record batches. Unlike ScanBatches this
  /// method may allow record batches to be returned out of order. This allows for more
  /// efficient scanning: some fragments may be accessed more quickly than others (e.g.
  /// may be cached in RAM or just happen to get scheduled earlier by the I/O)
  ///
  /// To make up for the out-of-order iteration each batch is further tagged with
  /// positional information.
  virtual Result<EnumeratedRecordBatchIterator> ScanBatchesUnordered() = 0;
  virtual Result<EnumeratedRecordBatchGenerator> ScanBatchesUnorderedAsync() = 0;
  virtual Result<EnumeratedRecordBatchGenerator> ScanBatchesUnorderedAsync(
      ::arrow::internal::Executor* cpu_thread_pool) = 0;
  /// \brief A convenience to synchronously load the given rows by index.
  ///
  /// Will only consume as many batches as needed from ScanBatches().
  virtual Result<std::shared_ptr<Table>> TakeRows(const Array& indices) = 0;
  /// \brief Get the first N rows.
  virtual Result<std::shared_ptr<Table>> Head(int64_t num_rows) = 0;
  /// \brief Count rows matching a predicate.
  ///
  /// This method will push down the predicate and compute the result based on fragment
  /// metadata if possible.
  virtual Result<int64_t> CountRows() = 0;
  virtual Future<int64_t> CountRowsAsync() = 0;
  /// \brief Convert the Scanner to a RecordBatchReader so it can be
  /// easily used with APIs that expect a reader.
  virtual Result<std::shared_ptr<RecordBatchReader>> ToRecordBatchReader() = 0;

  /// \brief Get the options for this scan.
  const std::shared_ptr<ScanOptions>& options() const { return scan_options_; }
  /// \brief Get the dataset that this scanner will scan
  virtual const std::shared_ptr<Dataset>& dataset() const = 0;

 protected:
  explicit Scanner(std::shared_ptr<ScanOptions> scan_options)
      : scan_options_(std::move(scan_options)) {}

  Result<EnumeratedRecordBatchIterator> AddPositioningToInOrderScan(
      TaggedRecordBatchIterator scan);

  const std::shared_ptr<ScanOptions> scan_options_;
};
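
// Consuming a scanner, a minimal sketch (assumes `scanner` was produced by
// ScannerBuilder::Finish(), declared below, and that the enclosing function
// returns a Status/Result so the ARROW_* macros can propagate errors):
//
//   // Whole-table materialization (use with care on large datasets):
//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Table> table, scanner->ToTable());
//
//   // Streaming, in scan order:
//   ARROW_ASSIGN_OR_RAISE(auto batches, scanner->ScanBatches());
//   while (true) {
//     ARROW_ASSIGN_OR_RAISE(TaggedRecordBatch tagged, batches.Next());
//     if (IsIterationEnd(tagged)) break;  // end-of-stream sentinel
//     // tagged.record_batch holds the data; tagged.fragment says where it came from.
//   }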

/// \brief ScannerBuilder is a factory class to construct a Scanner. It is used
/// to pass information, notably a potential filter expression and a subset of
/// columns to materialize.
class ARROW_DS_EXPORT ScannerBuilder {
 public:
  explicit ScannerBuilder(std::shared_ptr<Dataset> dataset);

  ScannerBuilder(std::shared_ptr<Dataset> dataset,
                 std::shared_ptr<ScanOptions> scan_options);

  ScannerBuilder(std::shared_ptr<Schema> schema, std::shared_ptr<Fragment> fragment,
                 std::shared_ptr<ScanOptions> scan_options);

  /// \brief Make a scanner from a record batch reader.
  ///
  /// The resulting scanner can be scanned only once. This is intended
  /// to support writing data from streaming sources or other sources
  /// that can be iterated only once.
  static std::shared_ptr<ScannerBuilder> FromRecordBatchReader(
      std::shared_ptr<RecordBatchReader> reader);

  /// \brief Set the subset of columns to materialize.
  ///
  /// Columns which are not referenced may not be read from fragments.
  ///
  /// \param[in] columns list of columns to project. Order and duplicates will
  ///            be preserved.
  ///
  /// \return Failure if any column name does not exist in the dataset's
  ///         Schema.
  Status Project(std::vector<std::string> columns);

  /// \brief Set expressions which will be evaluated to produce the materialized
  /// columns.
  ///
  /// Columns which are not referenced may not be read from fragments.
  ///
  /// \param[in] exprs expressions to evaluate to produce columns.
  /// \param[in] names list of names for the resulting columns.
  ///
  /// \return Failure if any referenced column does not exist in the dataset's
  ///         Schema.
  Status Project(std::vector<compute::Expression> exprs, std::vector<std::string> names);

  /// \brief Set the filter expression to return only rows matching the filter.
  ///
  /// The predicate will be passed down to Sources and corresponding
  /// Fragments to exploit predicate pushdown if possible using
  /// partition information or Fragment internal metadata, e.g. Parquet statistics.
  /// Columns which are not referenced may not be read from fragments.
  ///
  /// \param[in] filter expression to filter rows with.
  ///
  /// \return Failure if any referenced column does not exist in the dataset's
  ///         Schema.
  Status Filter(const compute::Expression& filter);

  /// \brief Indicate if the Scanner should make use of the available
  /// ThreadPool found in ScanOptions.
  Status UseThreads(bool use_threads = true);

  /// \brief Indicate if metadata should be cached when scanning
  ///
  /// Fragments may typically cache metadata to speed up repeated accesses.
  /// However, in use cases where a single scan is done, or if memory use
  /// is more critical than CPU time, setting this option to false can
  /// lessen memory use.
  Status CacheMetadata(bool cache_metadata = true);

  /// \brief Set the maximum number of rows per RecordBatch.
  ///
  /// \param[in] batch_size the maximum number of rows.
  /// \returns An error if the batch size is not greater than 0.
  ///
  /// This option provides a control limiting the memory owned by any RecordBatch.
  Status BatchSize(int64_t batch_size);

  /// \brief Set the number of batches to read ahead within a fragment.
  ///
  /// \param[in] batch_readahead How many batches to read ahead within a fragment
  /// \returns an error if this number is less than 0.
  ///
  /// This option provides a control on the RAM vs I/O tradeoff.
  /// It might not be supported by all file formats, in which case it will
  /// simply be ignored.
  Status BatchReadahead(int32_t batch_readahead);

  /// \brief Set the number of fragments to read ahead
  ///
  /// \param[in] fragment_readahead How many fragments to read ahead
  /// \returns an error if this number is less than 0.
  ///
  /// This option provides a control on the RAM vs I/O tradeoff.
  Status FragmentReadahead(int32_t fragment_readahead);

  /// \brief Set the pool from which materialized and scanned arrays will be allocated.
  Status Pool(MemoryPool* pool);

  /// \brief Set fragment-specific scan options.
  Status FragmentScanOptions(std::shared_ptr<FragmentScanOptions> fragment_scan_options);

  /// \brief Override default backpressure configuration
  Status Backpressure(acero::BackpressureOptions backpressure);

  /// \brief Return the current scan options for the builder.
  Result<std::shared_ptr<ScanOptions>> GetScanOptions();

  /// \brief Return the constructed now-immutable Scanner object
  Result<std::shared_ptr<Scanner>> Finish();

  const std::shared_ptr<Schema>& schema() const;
  const std::shared_ptr<Schema>& projected_schema() const;

 private:
  std::shared_ptr<Dataset> dataset_;
  std::shared_ptr<ScanOptions> scan_options_ = std::make_shared<ScanOptions>();
};
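
// End-to-end sketch (the `dataset` variable, column names, and threshold are
// assumptions for the example):
//
//   ScannerBuilder builder(dataset);
//   ARROW_RETURN_NOT_OK(builder.Project({"a", "b"}));
//   ARROW_RETURN_NOT_OK(builder.Filter(
//       compute::less(compute::field_ref("a"), compute::literal(2))));
//   ARROW_RETURN_NOT_OK(builder.UseThreads(true));
//   ARROW_ASSIGN_OR_RAISE(auto scanner, builder.Finish());
//   ARROW_ASSIGN_OR_RAISE(auto table, scanner->ToTable());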

/// \brief Construct a source ExecNode which yields batches from a dataset scan.
///
/// Does not construct associated filter or project nodes.
///
/// Batches are yielded sequentially (as if the scan were single-threaded)
/// when require_sequenced_output=true.
///
/// Yielded batches will be augmented with fragment/batch indices when
/// implicit_ordering=true to enable stable ordering for simple ExecPlans.
class ARROW_DS_EXPORT ScanNodeOptions : public acero::ExecNodeOptions {
 public:
  explicit ScanNodeOptions(std::shared_ptr<Dataset> dataset,
                           std::shared_ptr<ScanOptions> scan_options,
                           bool require_sequenced_output = false,
                           bool implicit_ordering = false)
      : dataset(std::move(dataset)),
        scan_options(std::move(scan_options)),
        require_sequenced_output(require_sequenced_output),
        implicit_ordering(implicit_ordering) {}

  std::shared_ptr<Dataset> dataset;
  std::shared_ptr<ScanOptions> scan_options;
  bool require_sequenced_output;
  bool implicit_ordering;
};
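
// Sketch of using the scan node inside an Acero plan (assumes `dataset` and
// `scan_options` are already configured and that the scan node has been registered,
// see InitializeScanner below; DeclarationToTable comes from arrow/acero/exec_plan.h):
//
//   acero::Declaration scan{"scan", ScanNodeOptions(dataset, scan_options)};
//   ARROW_ASSIGN_OR_RAISE(auto table, acero::DeclarationToTable(std::move(scan)));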

/// @}

namespace internal {
ARROW_DS_EXPORT void InitializeScanner(arrow::acero::ExecFactoryRegistry* registry);
ARROW_DS_EXPORT void InitializeScannerV2(arrow::acero::ExecFactoryRegistry* registry);
}  // namespace internal
}  // namespace dataset
}  // namespace arrow