Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,39 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include "arrow/compute/expression.h"
#include "arrow/dataset/dataset.h"
#include "arrow/dataset/discovery.h"
#include "arrow/dataset/file_base.h"
#ifdef ARROW_CSV
# include "arrow/dataset/file_csv.h"
#endif
#ifdef ARROW_JSON
# include "arrow/dataset/file_json.h"
#endif
#include "arrow/dataset/file_ipc.h"
#ifdef ARROW_ORC
# include "arrow/dataset/file_orc.h"
#endif
#ifdef ARROW_PARQUET
# include "arrow/dataset/file_parquet.h"
#endif
#include "arrow/dataset/scanner.h"

View File

@@ -0,0 +1,491 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <functional>
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>
#include "arrow/compute/expression.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/util/async_generator_fwd.h"
#include "arrow/util/future.h"
#include "arrow/util/macros.h"
#include "arrow/util/mutex.h"
namespace arrow {
namespace internal {
class Executor;
} // namespace internal
namespace dataset {
/// \brief A callable that yields a future to the next RecordBatch in a stream
using RecordBatchGenerator = std::function<Future<std::shared_ptr<RecordBatch>>()>;
/// \brief Description of a column to scan
/// \brief Description of a column to scan
struct ARROW_DS_EXPORT FragmentSelectionColumn {
/// \brief The path to the column to load
FieldPath path;
/// \brief The type of the column in the dataset schema
///
/// A format may choose to ignore this field completely. For example, when
/// reading from IPC the reader can just return the column in the data type
/// that is stored on disk. There is no point in doing anything special.
///
/// However, some formats may be capable of casting on the fly. For example,
/// when reading from CSV, if we know the target type of the column, we can
/// convert from string to the target type as we read.
///
/// NOTE(review): this is a non-owning raw pointer; presumably the dataset
/// schema (which owns the type) must outlive the scan — confirm with callers.
DataType* requested_type;
};
/// \brief A list of columns that should be loaded from a fragment
///
/// The paths in this selection should be referring to the fragment schema. This class
/// contains a virtual destructor as it is expected evolution strategies will need to
/// extend this to add any information needed to later evolve the batches.
///
/// For example, in the basic evolution strategy, we keep track of which columns
/// were missing from the file so that we can fill those in with null when evolving.
class ARROW_DS_EXPORT FragmentSelection {
public:
/// \brief Construct a selection from the columns to load (paths refer to
/// the fragment schema)
explicit FragmentSelection(std::vector<FragmentSelectionColumn> columns)
: columns_(std::move(columns)) {}
virtual ~FragmentSelection() = default;
/// The columns that should be loaded from the fragment
const std::vector<FragmentSelectionColumn>& columns() const { return columns_; }
private:
// Owned list of columns; exposed read-only through columns()
std::vector<FragmentSelectionColumn> columns_;
};
/// \brief Instructions for scanning a particular fragment
///
/// The fragment scan request is derived from ScanV2Options. The main
/// difference is that the scan options are based on the dataset schema
/// while the fragment request is based on the fragment schema.
struct ARROW_DS_EXPORT FragmentScanRequest {
/// \brief A row filter
///
/// The filter expression should be written against the fragment schema.
///
/// \see ScanV2Options for details on how this filter should be applied
compute::Expression filter = compute::literal(true);
/// \brief The columns to scan
///
/// These indices refer to the fragment schema
///
/// Note: This is NOT a simple list of top-level column indices.
/// For more details \see ScanV2Options
///
/// If possible a fragment should only read from disk the data needed
/// to satisfy these columns. If a format cannot partially read a nested
/// column (e.g. JSON) then it must apply the column selection (in memory)
/// before returning the scanned batch.
std::shared_ptr<FragmentSelection> fragment_selection;
/// \brief Options specific to the format being scanned
///
/// NOTE(review): non-owning pointer; the creator of the request appears
/// responsible for keeping the options alive for the duration of the scan —
/// confirm against the scanner implementation.
const FragmentScanOptions* format_scan_options;
};
/// \brief An iterator-like object that can yield batches created from a fragment
class ARROW_DS_EXPORT FragmentScanner {
public:
/// This instance will only be destroyed after all ongoing scan futures
/// have been completed.
///
/// This means any callbacks created as part of the scan can safely
/// capture `this`
virtual ~FragmentScanner() = default;
/// \brief Scan a batch of data from the file
/// \param batch_number The index of the batch to read
/// \return A future resolving to the decoded batch
virtual Future<std::shared_ptr<RecordBatch>> ScanBatch(int batch_number) = 0;
/// \brief Calculate an estimate of how many data bytes the given batch will represent
///
/// "Data bytes" should be the total size of all the buffers once the data has been
/// decoded into the Arrow format.
virtual int64_t EstimatedDataBytes(int batch_number) = 0;
/// \brief The number of batches in the fragment to scan
virtual int NumBatches() = 0;
};
/// \brief Information learned about a fragment through inspection
///
/// This information can be used to figure out which fields need
/// to be read from a file and how the data read in should be evolved
/// to match the dataset schema.
///
/// For example, from a CSV file we can inspect and learn the column
/// names and use those column names to determine which columns to load
/// from the CSV file.
struct ARROW_DS_EXPORT InspectedFragment {
/// \brief Record the column names discovered during inspection
explicit InspectedFragment(std::vector<std::string> column_names)
: column_names(std::move(column_names)) {}
/// Top-level column names found in the fragment, used to map fragment
/// columns onto the dataset schema
std::vector<std::string> column_names;
};
/// \brief A granular piece of a Dataset, such as an individual file.
///
/// A Fragment can be read/scanned separately from other fragments. It yields a
/// collection of RecordBatches when scanned
///
/// Note that Fragments have well defined physical schemas which are reconciled by
/// the Datasets which contain them; these physical schemas may differ from a parent
/// Dataset's schema and the physical schemas of sibling Fragments.
class ARROW_DS_EXPORT Fragment : public std::enable_shared_from_this<Fragment> {
public:
/// \brief An expression that represents no known partition information
static const compute::Expression kNoPartitionInformation;
/// \brief Return the physical schema of the Fragment.
///
/// The physical schema is also called the writer schema.
/// This method is blocking and may suffer from high latency filesystem.
/// The schema is cached after being read once, or may be specified at construction.
Result<std::shared_ptr<Schema>> ReadPhysicalSchema();
/// An asynchronous version of Scan
virtual Result<RecordBatchGenerator> ScanBatchesAsync(
const std::shared_ptr<ScanOptions>& options) = 0;
/// \brief Inspect a fragment to learn basic information
///
/// This will be called before a scan and a fragment should attach whatever
/// information will be needed to figure out an evolution strategy. This information
/// will then be passed to the call to BeginScan
virtual Future<std::shared_ptr<InspectedFragment>> InspectFragment(
const FragmentScanOptions* format_options, compute::ExecContext* exec_context);
/// \brief Start a scan operation
///
/// \param request the filter/selection to apply, phrased in the fragment schema
/// \param inspected_fragment the information gathered by InspectFragment
virtual Future<std::shared_ptr<FragmentScanner>> BeginScan(
const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
const FragmentScanOptions* format_options, compute::ExecContext* exec_context);
/// \brief Count the number of rows in this fragment matching the filter using metadata
/// only. That is, this method may perform I/O, but will not load data.
///
/// If this is not possible, resolve with an empty optional. The fragment can perform
/// I/O (e.g. to read metadata) before deciding whether it can satisfy the request.
virtual Future<std::optional<int64_t>> CountRows(
compute::Expression predicate, const std::shared_ptr<ScanOptions>& options);
/// \brief Clear any metadata that may have been cached by this object.
///
/// A fragment may typically cache metadata to speed up repeated accesses.
/// In use cases when memory use is more critical than CPU time, calling
/// this function can help reclaim memory.
virtual Status ClearCachedMetadata();
/// \brief Short name identifying the kind of fragment (e.g. a file format name)
virtual std::string type_name() const = 0;
virtual std::string ToString() const { return type_name(); }
/// \brief An expression which evaluates to true for all data viewed by this
/// Fragment.
const compute::Expression& partition_expression() const {
return partition_expression_;
}
virtual ~Fragment() = default;
protected:
Fragment() = default;
explicit Fragment(compute::Expression partition_expression,
std::shared_ptr<Schema> physical_schema);
/// Subclass hook that actually reads the schema; called (at most once) by
/// ReadPhysicalSchema when no schema was supplied at construction
virtual Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() = 0;
// Guards lazy initialization of physical_schema_
util::Mutex physical_schema_mutex_;
compute::Expression partition_expression_ = compute::literal(true);
// The physical schema that is inferred from the Fragment
std::shared_ptr<Schema> physical_schema_;
// The physical schema that was passed to the Fragment constructor
std::shared_ptr<Schema> given_physical_schema_;
};
/// \brief Per-scan options for fragment(s) in a dataset.
///
/// These options are not intrinsic to the format or fragment itself, but do affect
/// the results of a scan. These are options which make sense to change between
/// repeated reads of the same dataset, such as format-specific conversion options
/// (that do not affect the schema).
///
/// \ingroup dataset-scanning
class ARROW_DS_EXPORT FragmentScanOptions {
public:
/// \brief Short name identifying which format these options apply to
virtual std::string type_name() const = 0;
virtual std::string ToString() const { return type_name(); }
virtual ~FragmentScanOptions() = default;
};
/// \defgroup dataset-implementations Concrete implementations
///
/// @{
/// \brief A trivial Fragment that yields ScanTask out of a fixed set of
/// RecordBatch.
class ARROW_DS_EXPORT InMemoryFragment : public Fragment {
public:
class Scanner;
/// \brief Construct from an explicit schema and a fixed set of batches
InMemoryFragment(std::shared_ptr<Schema> schema, RecordBatchVector record_batches,
compute::Expression = compute::literal(true));
/// \brief Construct from batches alone; the schema is taken from the batches
explicit InMemoryFragment(RecordBatchVector record_batches,
compute::Expression = compute::literal(true));
Result<RecordBatchGenerator> ScanBatchesAsync(
const std::shared_ptr<ScanOptions>& options) override;
Future<std::optional<int64_t>> CountRows(
compute::Expression predicate,
const std::shared_ptr<ScanOptions>& options) override;
Future<std::shared_ptr<InspectedFragment>> InspectFragment(
const FragmentScanOptions* format_options,
compute::ExecContext* exec_context) override;
Future<std::shared_ptr<FragmentScanner>> BeginScan(
const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
const FragmentScanOptions* format_options,
compute::ExecContext* exec_context) override;
std::string type_name() const override { return "in-memory"; }
protected:
Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override;
// The fixed set of batches this fragment yields when scanned
RecordBatchVector record_batches_;
};
/// @}
/// \brief An asynchronous stream of Fragments
using FragmentGenerator = AsyncGenerator<std::shared_ptr<Fragment>>;
/// \brief Rules for converting the dataset schema to and from fragment schemas
class ARROW_DS_EXPORT FragmentEvolutionStrategy {
public:
/// This instance will only be destroyed when all scan operations for the
/// fragment have completed.
virtual ~FragmentEvolutionStrategy() = default;
/// \brief A guarantee that applies to all batches of this fragment
///
/// For example, if a fragment is missing one of the fields in the dataset
/// schema then a typical evolution strategy is to set that field to null.
///
/// So if the column at index 3 is missing then the guarantee is
/// FieldRef(3) == null
///
/// Individual field guarantees should be AND'd together and returned
/// as a single expression.
virtual Result<compute::Expression> GetGuarantee(
const std::vector<FieldPath>& dataset_schema_selection) const = 0;
/// \brief Return a fragment schema selection given a dataset schema selection
///
/// For example, if the user wants fields 2 & 4 of the dataset schema and
/// in this fragment the field 2 is missing and the field 4 is at index 1 then
/// this should return {1}
virtual Result<std::unique_ptr<FragmentSelection>> DevolveSelection(
const std::vector<FieldPath>& dataset_schema_selection) const = 0;
/// \brief Return a filter expression bound to the fragment schema given
/// a filter expression bound to the dataset schema
///
/// The dataset scan filter will first be simplified by the guarantee returned
/// by GetGuarantee. This means an evolution that only handles dropping or casting
/// fields doesn't need to do anything here except return the given filter.
///
/// On the other hand, an evolution that is doing some kind of aliasing will likely
/// need to convert field references in the filter to the aliased field references
/// where appropriate.
virtual Result<compute::Expression> DevolveFilter(
const compute::Expression& filter) const = 0;
/// \brief Convert a batch from the fragment schema to the dataset schema
///
/// Typically this involves casting columns from the data type stored on disk
/// to the data type of the dataset schema. For example, this fragment might
/// have columns stored as int32 and the dataset schema might have int64 for
/// the column. In this case we should cast the column from int32 to int64.
///
/// Note: A fragment may perform this cast as the data is read from disk. In
/// that case a cast might not be needed.
virtual Result<compute::ExecBatch> EvolveBatch(
const std::shared_ptr<RecordBatch>& batch,
const std::vector<FieldPath>& dataset_selection,
const FragmentSelection& selection) const = 0;
/// \brief Return a string description of this strategy
virtual std::string ToString() const = 0;
};
/// \brief Lookup to create a FragmentEvolutionStrategy for a given fragment
class ARROW_DS_EXPORT DatasetEvolutionStrategy {
public:
virtual ~DatasetEvolutionStrategy() = default;
/// \brief Create a strategy for evolving from the given fragment
/// to the schema of the given dataset
///
/// \param inspected_fragment information learned by Fragment::InspectFragment
virtual std::unique_ptr<FragmentEvolutionStrategy> GetStrategy(
const Dataset& dataset, const Fragment& fragment,
const InspectedFragment& inspected_fragment) = 0;
/// \brief Return a string description of this strategy
virtual std::string ToString() const = 0;
};
/// \brief Create the default DatasetEvolutionStrategy
///
/// This is the strategy a Dataset uses unless another one is installed
/// (it is the default value of Dataset::evolution_strategy_).
ARROW_DS_EXPORT std::unique_ptr<DatasetEvolutionStrategy>
MakeBasicDatasetEvolutionStrategy();
/// \brief A container of zero or more Fragments.
///
/// A Dataset acts as a union of Fragments, e.g. files deeply nested in a
/// directory. A Dataset has a schema to which Fragments must align during a
/// scan operation. This is analogous to Avro's reader and writer schema.
class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this<Dataset> {
public:
/// \brief Begin to build a new Scan operation against this Dataset
Result<std::shared_ptr<ScannerBuilder>> NewScan();
/// \brief GetFragments returns an iterator of Fragments given a predicate.
Result<FragmentIterator> GetFragments(compute::Expression predicate);
/// \brief GetFragments without filtering (all fragments)
Result<FragmentIterator> GetFragments();
/// \brief Async versions of `GetFragments`.
Result<FragmentGenerator> GetFragmentsAsync(compute::Expression predicate);
Result<FragmentGenerator> GetFragmentsAsync();
/// \brief The unified schema all fragments are evolved to during a scan
const std::shared_ptr<Schema>& schema() const { return schema_; }
/// \brief An expression which evaluates to true for all data viewed by this Dataset.
/// May be null, which indicates no information is available.
const compute::Expression& partition_expression() const {
return partition_expression_;
}
/// \brief The name identifying the kind of Dataset
virtual std::string type_name() const = 0;
/// \brief Return a copy of this Dataset with a different schema.
///
/// The copy will view the same Fragments. If the new schema is not compatible with the
/// original dataset's schema then an error will be raised.
virtual Result<std::shared_ptr<Dataset>> ReplaceSchema(
std::shared_ptr<Schema> schema) const = 0;
/// \brief Rules used by this dataset to handle schema evolution
DatasetEvolutionStrategy* evolution_strategy() { return evolution_strategy_.get(); }
virtual ~Dataset() = default;
protected:
explicit Dataset(std::shared_ptr<Schema> schema) : schema_(std::move(schema)) {}
Dataset(std::shared_ptr<Schema> schema, compute::Expression partition_expression);
/// Subclass hook implementing fragment discovery for GetFragments
virtual Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) = 0;
/// \brief Default non-virtual implementation method for the base
/// `GetFragmentsAsyncImpl` method, which creates a fragment generator for
/// the dataset, possibly filtering results with a predicate (forwarding to
/// the synchronous `GetFragmentsImpl` method and moving the computations
/// to the background, using the IO thread pool).
///
/// Currently, `executor` is always the same as `internal::GetCPUThreadPool()`,
/// which means the results from the underlying fragment generator will be
/// transferred to the default CPU thread pool. The generator itself is
/// offloaded to run on the default IO thread pool.
virtual Result<FragmentGenerator> GetFragmentsAsyncImpl(
compute::Expression predicate, arrow::internal::Executor* executor);
std::shared_ptr<Schema> schema_;
compute::Expression partition_expression_ = compute::literal(true);
// Defaults to the basic strategy; subclasses or callers may replace it
std::unique_ptr<DatasetEvolutionStrategy> evolution_strategy_ =
MakeBasicDatasetEvolutionStrategy();
};
/// \addtogroup dataset-implementations
///
/// @{
/// \brief A Source which yields fragments wrapping a stream of record batches.
///
/// The record batches must match the schema provided to the source at construction.
class ARROW_DS_EXPORT InMemoryDataset : public Dataset {
public:
/// \brief Factory of record batch iterators; each Get() call yields a fresh
/// iteration over the same logical stream of batches
class RecordBatchGenerator {
public:
virtual ~RecordBatchGenerator() = default;
virtual RecordBatchIterator Get() const = 0;
};
/// Construct a dataset from a schema and a factory of record batch iterators.
InMemoryDataset(std::shared_ptr<Schema> schema,
std::shared_ptr<RecordBatchGenerator> get_batches)
: Dataset(std::move(schema)), get_batches_(std::move(get_batches)) {}
/// Convenience constructor taking a fixed list of batches
InMemoryDataset(std::shared_ptr<Schema> schema, RecordBatchVector batches);
/// Convenience constructor taking a Table
explicit InMemoryDataset(std::shared_ptr<Table> table);
std::string type_name() const override { return "in-memory"; }
Result<std::shared_ptr<Dataset>> ReplaceSchema(
std::shared_ptr<Schema> schema) const override;
protected:
Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override;
// Source of the batches this dataset serves
std::shared_ptr<RecordBatchGenerator> get_batches_;
};
/// \brief A Dataset wrapping child Datasets.
class ARROW_DS_EXPORT UnionDataset : public Dataset {
public:
/// \brief Construct a UnionDataset wrapping child Datasets.
///
/// \param[in] schema the schema of the resulting dataset.
/// \param[in] children one or more child Datasets. Their schemas must be identical to
/// schema.
static Result<std::shared_ptr<UnionDataset>> Make(std::shared_ptr<Schema> schema,
DatasetVector children);
/// \brief The child datasets whose fragments this dataset unions
const DatasetVector& children() const { return children_; }
std::string type_name() const override { return "union"; }
Result<std::shared_ptr<Dataset>> ReplaceSchema(
std::shared_ptr<Schema> schema) const override;
protected:
Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override;
// Construction goes through Make() so the schemas can be validated first
explicit UnionDataset(std::shared_ptr<Schema> schema, DatasetVector children)
: Dataset(std::move(schema)), children_(std::move(children)) {}
DatasetVector children_;
friend class UnionDatasetFactory;
};
/// @}
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,103 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <string>
#include "arrow/dataset/file_base.h"
#include "arrow/record_batch.h"
#include "arrow/status.h"
#include "arrow/util/async_util.h"
#include "arrow/util/future.h"
namespace arrow {
namespace dataset {
namespace internal {
// This lines up with our other defaults in the scanner and execution plan.
// Note the unit is rows, not bytes: 8 * 1024 * 1024 = 8Mi rows queued before
// the writer applies backpressure.
constexpr uint64_t kDefaultDatasetWriterMaxRowsQueued = 8 * 1024 * 1024;
/// \brief Utility class that manages a set of writers to different paths
///
/// Writers may be closed and reopened (and a new file created) based on the dataset
/// write options (for example, max_rows_per_file or max_open_files)
///
/// The dataset writer enforces its own back pressure based on the # of rows (as opposed
/// to # of batches which is how it is typically enforced elsewhere) and # of files.
class ARROW_DS_EXPORT DatasetWriter {
public:
/// \brief Create a dataset writer
///
/// Will fail if basename_template is invalid or if there is existing data and
/// existing_data_behavior is kError
///
/// \param write_options options to control how the data should be written
/// \param scheduler used to launch the asynchronous write tasks
/// \param pause_callback invoked when the writer wants the producer to pause
/// \param resume_callback invoked when the producer may resume
/// \param finish_callback invoked when all writes have completed
/// \param max_rows_queued max # of rows allowed to be queued before the dataset_writer
/// will ask for backpressure
static Result<std::unique_ptr<DatasetWriter>> Make(
FileSystemDatasetWriteOptions write_options, util::AsyncTaskScheduler* scheduler,
std::function<void()> pause_callback, std::function<void()> resume_callback,
std::function<void()> finish_callback,
uint64_t max_rows_queued = kDefaultDatasetWriterMaxRowsQueued);
// Out-of-line destructor required because DatasetWriterImpl is incomplete here
~DatasetWriter();
/// \brief Write a batch to the dataset
/// \param[in] batch The batch to write
/// \param[in] directory The directory to write to
///
/// Note: The written filename will be {directory}/{filename_factory(i)} where i is a
/// counter controlled by `max_open_files` and `max_rows_per_file`
///
/// If multiple WriteRecordBatch calls arrive with the same `directory` then the batches
/// may be written to the same file.
///
/// The returned future will be marked finished when the record batch has been queued
/// to be written. If the returned future is unfinished then this indicates the dataset
/// writer's queue is full and the data provider should pause.
///
/// This method is NOT async reentrant. The returned future will only be unfinished
/// if back pressure needs to be applied. Async reentrancy is not necessary for
/// concurrent writes to happen. Calling this method again before the previous future
/// completes will not just violate max_rows_queued but likely lead to race conditions.
///
/// One thing to note is that the ordering of your data can affect your maximum
/// potential parallelism. If this seems odd then consider a dataset where the first
/// 1000 batches go to the same directory and then the 1001st batch goes to a different
/// directory. The only way to get two parallel writes immediately would be to queue
/// all 1000 pending writes to the first directory.
void WriteRecordBatch(std::shared_ptr<RecordBatch> batch, const std::string& directory,
const std::string& prefix = "");
/// Finish all pending writes and close any open files
void Finish();
protected:
DatasetWriter(FileSystemDatasetWriteOptions write_options,
util::AsyncTaskScheduler* scheduler, std::function<void()> pause_callback,
std::function<void()> resume_callback,
std::function<void()> finish_callback,
uint64_t max_rows_queued = kDefaultDatasetWriterMaxRowsQueued);
// Pimpl: keeps implementation details (and heavy includes) out of this header
class DatasetWriterImpl;
std::unique_ptr<DatasetWriterImpl> impl_;
};
} // namespace internal
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,275 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
/// Logic for automatically determining the structure of multi-file
/// dataset with possible partitioning according to available
/// partitioning
// This API is EXPERIMENTAL.
#pragma once
#include <memory>
#include <string>
#include <variant>
#include <vector>
#include "arrow/dataset/partition.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/filesystem/type_fwd.h"
#include "arrow/result.h"
#include "arrow/util/macros.h"
namespace arrow {
namespace dataset {
/// \defgroup dataset-discovery Discovery API
///
/// @{
/// \brief Options controlling how many fragments are read when inferring a schema
struct InspectOptions {
/// See `fragments` property.
static constexpr int kInspectAllFragments = -1;
/// Indicate how many fragments should be inspected to infer the unified dataset
/// schema. Limiting the number of fragments accessed improves the latency of
/// the discovery process when dealing with a high number of fragments and/or
/// high latency file systems.
///
/// The default value of `1` inspects the schema of the first (in no particular
/// order) fragment only. If the dataset has a uniform schema for all fragments,
/// this default is the optimal value. In order to inspect all fragments and
/// robustly unify their potentially varying schemas, set this option to
/// `kInspectAllFragments`. A value of `0` disables inspection of fragments
/// altogether so only the partitioning schema will be inspected.
int fragments = 1;
/// Control how to unify types. By default, types are merged strictly (the
/// type must match exactly, except nulls can be merged with other types).
Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults();
};
/// \brief Options for DatasetFactory::Finish
struct FinishOptions {
/// Finalize the dataset with this given schema. If the schema is not
/// provided, infer the schema via the Inspect, see the `inspect_options`
/// property.
std::shared_ptr<Schema> schema = NULLPTR;
/// If the schema is not provided, it will be discovered by passing the
/// following options to `DatasetDiscovery::Inspect`.
InspectOptions inspect_options{};
/// Indicate if the given Schema (when specified), should be validated against
/// the fragments' schemas. `inspect_options` will control how many fragments
/// are checked.
bool validate_fragments = false;
};
/// \brief DatasetFactory provides a way to inspect/discover a Dataset's expected
/// schema before materializing said Dataset.
class ARROW_DS_EXPORT DatasetFactory {
public:
/// \brief Get the schemas of the Fragments and Partitioning.
virtual Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
InspectOptions options) = 0;
/// \brief Get unified schema for the resulting Dataset.
Result<std::shared_ptr<Schema>> Inspect(InspectOptions options = {});
/// \brief Create a Dataset
Result<std::shared_ptr<Dataset>> Finish();
/// \brief Create a Dataset with the given schema (see \a InspectOptions::schema)
Result<std::shared_ptr<Dataset>> Finish(std::shared_ptr<Schema> schema);
/// \brief Create a Dataset with the given options
virtual Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) = 0;
/// \brief Optional root partition for the resulting Dataset.
const compute::Expression& root_partition() const { return root_partition_; }
/// \brief Set the root partition for the resulting Dataset.
Status SetRootPartition(compute::Expression partition) {
root_partition_ = std::move(partition);
return Status::OK();
}
virtual ~DatasetFactory() = default;
protected:
DatasetFactory();
// Partition expression applied to the dataset as a whole
compute::Expression root_partition_;
};
/// @}
/// \brief DatasetFactory provides a way to inspect/discover a Dataset's
/// expected schema before materialization.
/// \ingroup dataset-implementations
class ARROW_DS_EXPORT UnionDatasetFactory : public DatasetFactory {
public:
/// \brief Create a factory from one or more child factories
static Result<std::shared_ptr<DatasetFactory>> Make(
std::vector<std::shared_ptr<DatasetFactory>> factories);
/// \brief Return the list of child DatasetFactory
const std::vector<std::shared_ptr<DatasetFactory>>& factories() const {
return factories_;
}
/// \brief Get the schemas of the Datasets.
///
/// Instead of applying options globally, it applies at each child factory.
/// This will not respect `options.fragments` exactly, but will respect the
/// spirit of peeking the first fragments or all of them.
Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
InspectOptions options) override;
/// \brief Create a Dataset.
Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override;
protected:
explicit UnionDatasetFactory(std::vector<std::shared_ptr<DatasetFactory>> factories);
std::vector<std::shared_ptr<DatasetFactory>> factories_;
};
/// \ingroup dataset-filesystem
/// \ingroup dataset-filesystem
struct FileSystemFactoryOptions {
/// Either an explicit Partitioning or a PartitioningFactory to discover one.
///
/// If a factory is provided, it will be used to infer a schema for partition fields
/// based on file and directory paths then construct a Partitioning. The default
/// is a Partitioning which will yield no partition information.
///
/// The (explicit or discovered) partitioning will be applied to discovered files
/// and the resulting partition information embedded in the Dataset.
PartitioningOrFactory partitioning{Partitioning::Default()};
/// For the purposes of applying the partitioning, paths will be stripped
/// of the partition_base_dir. Files not matching the partition_base_dir
/// prefix will be skipped for partition discovery. The ignored files will still
/// be part of the Dataset, but will not have partition information.
///
/// Example:
/// partition_base_dir = "/dataset";
///
/// - "/dataset/US/sales.csv" -> "US/sales.csv" will be given to the partitioning
///
/// - "/home/john/late_sales.csv" -> Will be ignored for partition discovery.
///
/// This is useful for partitioning which parses directory when ordering
/// is important, e.g. DirectoryPartitioning.
std::string partition_base_dir;
/// Invalid files (via selector or explicitly) will be excluded by checking
/// with the FileFormat::IsSupported method. This will incur IO for each files
/// in a serial and single threaded fashion. Disabling this feature will skip the
/// IO, but unsupported files may be present in the Dataset
/// (resulting in an error at scan time).
bool exclude_invalid_files = false;
/// When discovering from a Selector (and not from an explicit file list), ignore
/// files and directories matching any of these prefixes.
///
/// Example (with selector = "/dataset/**"):
/// selector_ignore_prefixes = {"_", ".DS_STORE" };
///
/// - "/dataset/data.csv" -> not ignored
/// - "/dataset/_metadata" -> ignored
/// - "/dataset/.DS_STORE" -> ignored
/// - "/dataset/_hidden/dat" -> ignored
/// - "/dataset/nested/.DS_STORE" -> ignored
///
/// Note the example above is illustrative; the actual default below ignores
/// any file or directory whose basename starts with "." or "_".
std::vector<std::string> selector_ignore_prefixes = {
".",
"_",
};
};
/// \brief FileSystemDatasetFactory creates a Dataset from a vector of
/// fs::FileInfo or a fs::FileSelector.
/// \ingroup dataset-filesystem
class ARROW_DS_EXPORT FileSystemDatasetFactory : public DatasetFactory {
 public:
  /// \brief Build a FileSystemDatasetFactory from an explicit list of
  /// paths.
  ///
  /// \param[in] filesystem passed to FileSystemDataset
  /// \param[in] paths passed to FileSystemDataset
  /// \param[in] format passed to FileSystemDataset
  /// \param[in] options see FileSystemFactoryOptions for more information.
  static Result<std::shared_ptr<DatasetFactory>> Make(
      std::shared_ptr<fs::FileSystem> filesystem, const std::vector<std::string>& paths,
      std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);
  /// \brief Build a FileSystemDatasetFactory from a fs::FileSelector.
  ///
  /// The selector will expand to a vector of FileInfo. The expansion/crawling
  /// is performed in this function call. Thus, the finalized Dataset is
  /// working with a snapshot of the filesystem.
  ///
  /// If options.partition_base_dir is not provided, it will be overwritten
  /// with selector.base_dir.
  ///
  /// \param[in] filesystem passed to FileSystemDataset
  /// \param[in] selector used to crawl and search files
  /// \param[in] format passed to FileSystemDataset
  /// \param[in] options see FileSystemFactoryOptions for more information.
  static Result<std::shared_ptr<DatasetFactory>> Make(
      std::shared_ptr<fs::FileSystem> filesystem, fs::FileSelector selector,
      std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);
  /// \brief Build a FileSystemDatasetFactory from an uri including filesystem
  /// information.
  ///
  /// \param[in] uri passed to FileSystemDataset
  /// \param[in] format passed to FileSystemDataset
  /// \param[in] options see FileSystemFactoryOptions for more information.
  static Result<std::shared_ptr<DatasetFactory>> Make(std::string uri,
                                                      std::shared_ptr<FileFormat> format,
                                                      FileSystemFactoryOptions options);
  /// \brief Build a FileSystemDatasetFactory from an explicit list of
  /// file information.
  ///
  /// \param[in] filesystem passed to FileSystemDataset
  /// \param[in] files passed to FileSystemDataset
  /// \param[in] format passed to FileSystemDataset
  /// \param[in] options see FileSystemFactoryOptions for more information.
  static Result<std::shared_ptr<DatasetFactory>> Make(
      std::shared_ptr<fs::FileSystem> filesystem, const std::vector<fs::FileInfo>& files,
      std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);
  /// \brief Get the schemas of the discovered files.
  Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
      InspectOptions options) override;
  /// \brief Create a Dataset from the discovered files.
  Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override;

 protected:
  FileSystemDatasetFactory(std::vector<fs::FileInfo> files,
                           std::shared_ptr<fs::FileSystem> filesystem,
                           std::shared_ptr<FileFormat> format,
                           FileSystemFactoryOptions options);
  /// \brief Resolve the schema of the partition fields (declared out of line).
  Result<std::shared_ptr<Schema>> PartitionSchema();
  // Snapshot of the files discovered at construction time.
  std::vector<fs::FileInfo> files_;
  std::shared_ptr<fs::FileSystem> fs_;
  std::shared_ptr<FileFormat> format_;
  FileSystemFactoryOptions options_;
};
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,499 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "arrow/buffer.h"
#include "arrow/dataset/dataset.h"
#include "arrow/dataset/partition.h"
#include "arrow/dataset/scanner.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/filesystem/filesystem.h"
#include "arrow/io/file.h"
#include "arrow/type_fwd.h"
#include "arrow/util/compression.h"
namespace arrow {
namespace dataset {
/// \defgroup dataset-file-formats File formats for reading and writing datasets
/// \defgroup dataset-filesystem File system datasets
///
/// @{
/// \brief The path and filesystem where an actual file is located or a buffer which can
/// be read like a file
class ARROW_DS_EXPORT FileSource : public util::EqualityComparable<FileSource> {
 public:
  /// \brief Construct from a path resolved against the given filesystem.
  FileSource(std::string path, std::shared_ptr<fs::FileSystem> filesystem,
             Compression::type compression = Compression::UNCOMPRESSED)
      : file_info_(std::move(path)),
        filesystem_(std::move(filesystem)),
        compression_(compression) {}
  /// \brief Construct from an already-discovered FileInfo and its filesystem.
  FileSource(fs::FileInfo info, std::shared_ptr<fs::FileSystem> filesystem,
             Compression::type compression = Compression::UNCOMPRESSED)
      : file_info_(std::move(info)),
        filesystem_(std::move(filesystem)),
        compression_(compression) {}
  /// \brief Construct from an in-memory buffer which is read like a file.
  explicit FileSource(std::shared_ptr<Buffer> buffer,
                      Compression::type compression = Compression::UNCOMPRESSED)
      : buffer_(std::move(buffer)), compression_(compression) {}
  /// Signature of a user-provided function which opens the file.
  using CustomOpen = std::function<Result<std::shared_ptr<io::RandomAccessFile>>()>;
  /// \brief Construct from a custom open function and a known size in bytes.
  FileSource(CustomOpen open, int64_t size)
      : custom_open_(std::move(open)), custom_size_(size) {}
  /// Signature of a user-provided open function which receives the compression to apply.
  using CustomOpenWithCompression =
      std::function<Result<std::shared_ptr<io::RandomAccessFile>>(Compression::type)>;
  /// \brief Construct from a compression-aware custom open function.
  ///
  /// The compression argument is bound into the stored opener.
  FileSource(CustomOpenWithCompression open_with_compression, int64_t size,
             Compression::type compression = Compression::UNCOMPRESSED)
      : custom_open_(std::bind(std::move(open_with_compression), compression)),
        custom_size_(size),
        compression_(compression) {}
  /// \brief Construct from an already-opened file with a known size.
  ///
  /// The file handle is captured by copy; every Open() call returns it.
  FileSource(std::shared_ptr<io::RandomAccessFile> file, int64_t size,
             Compression::type compression = Compression::UNCOMPRESSED)
      : custom_open_([=] { return ToResult(file); }),
        custom_size_(size),
        compression_(compression) {}
  /// \brief Construct from an already-opened file (size resolved out of line).
  explicit FileSource(std::shared_ptr<io::RandomAccessFile> file,
                      Compression::type compression = Compression::UNCOMPRESSED);
  /// \brief Construct an uninitialized FileSource; Open() returns Status::Invalid.
  FileSource() : custom_open_(CustomOpen{&InvalidOpen}) {}
  /// \brief Create one FileSource per path, all within the same filesystem.
  static std::vector<FileSource> FromPaths(const std::shared_ptr<fs::FileSystem>& fs,
                                           std::vector<std::string> paths) {
    std::vector<FileSource> sources;
    for (auto&& path : paths) {
      sources.emplace_back(std::move(path), fs);
    }
    return sources;
  }
  /// \brief Return the type of raw compression on the file, if any.
  Compression::type compression() const { return compression_; }
  /// \brief Return the file path, if any. Only valid when file source wraps a path.
  ///
  /// Buffer-backed and custom-open-backed sources both report the
  /// placeholder path "<Buffer>".
  const std::string& path() const {
    static std::string buffer_path = "<Buffer>";
    static std::string custom_open_path = "<Buffer>";
    return filesystem_ ? file_info_.path() : buffer_ ? buffer_path : custom_open_path;
  }
  /// \brief Return the filesystem, if any. Otherwise returns nullptr
  const std::shared_ptr<fs::FileSystem>& filesystem() const { return filesystem_; }
  /// \brief Return the buffer containing the file, if any. Otherwise returns nullptr
  const std::shared_ptr<Buffer>& buffer() const { return buffer_; }
  /// \brief Get a RandomAccessFile which views this file source
  Result<std::shared_ptr<io::RandomAccessFile>> Open() const;
  /// \brief Asynchronous variant of Open()
  Future<std::shared_ptr<io::RandomAccessFile>> OpenAsync() const;
  /// \brief Get the size (in bytes) of the file or buffer
  /// If the file is compressed this should be the compressed (on-disk) size.
  int64_t Size() const;
  /// \brief Get an InputStream which views this file source (and decompresses if needed)
  /// \param[in] compression If nullopt, guess the compression scheme from the
  /// filename, else decompress with the given codec
  Result<std::shared_ptr<io::InputStream>> OpenCompressed(
      std::optional<Compression::type> compression = std::nullopt) const;
  /// \brief equality comparison with another FileSource
  bool Equals(const FileSource& other) const;

 private:
  // Opener installed by the default constructor; always fails.
  static Result<std::shared_ptr<io::RandomAccessFile>> InvalidOpen() {
    return Status::Invalid("Called Open() on an uninitialized FileSource");
  }
  // Path/metadata when this source is backed by a filesystem.
  fs::FileInfo file_info_;
  // Non-null iff this source is backed by a filesystem path.
  std::shared_ptr<fs::FileSystem> filesystem_;
  // Non-null iff this source is backed by an in-memory buffer.
  std::shared_ptr<Buffer> buffer_;
  // Used when neither filesystem_ nor buffer_ applies.
  CustomOpen custom_open_;
  // Size supplied with the custom-open constructors.
  int64_t custom_size_ = 0;
  Compression::type compression_ = Compression::UNCOMPRESSED;
};
/// \brief Base class for file format implementation
class ARROW_DS_EXPORT FileFormat : public std::enable_shared_from_this<FileFormat> {
 public:
  /// Options affecting how this format is scanned.
  ///
  /// The options here can be overridden at scan time.
  std::shared_ptr<FragmentScanOptions> default_fragment_scan_options;
  virtual ~FileFormat() = default;
  /// \brief The name identifying the kind of file format
  virtual std::string type_name() const = 0;
  /// \brief Equality comparison with another FileFormat.
  virtual bool Equals(const FileFormat& other) const = 0;
  /// \brief Indicate if the FileSource is supported/readable by this format.
  virtual Result<bool> IsSupported(const FileSource& source) const = 0;
  /// \brief Return the schema of the file if possible.
  virtual Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const = 0;
  /// \brief Learn what we need about the file before we start scanning it
  virtual Future<std::shared_ptr<InspectedFragment>> InspectFragment(
      const FileSource& source, const FragmentScanOptions* format_options,
      compute::ExecContext* exec_context) const;
  /// \brief Open the fragment and return a generator of its record batches.
  virtual Result<RecordBatchGenerator> ScanBatchesAsync(
      const std::shared_ptr<ScanOptions>& options,
      const std::shared_ptr<FileFragment>& file) const = 0;
  /// \brief Count the rows in the file matching the given predicate, if possible.
  virtual Future<std::optional<int64_t>> CountRows(
      const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
      const std::shared_ptr<ScanOptions>& options);
  /// \brief Begin a scan of a fragment which has already been inspected.
  virtual Future<std::shared_ptr<FragmentScanner>> BeginScan(
      const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
      const FragmentScanOptions* format_options,
      compute::ExecContext* exec_context) const;
  /// \brief Open a fragment
  virtual Result<std::shared_ptr<FileFragment>> MakeFragment(
      FileSource source, compute::Expression partition_expression,
      std::shared_ptr<Schema> physical_schema);
  /// \brief Create a FileFragment for a FileSource.
  Result<std::shared_ptr<FileFragment>> MakeFragment(
      FileSource source, compute::Expression partition_expression);
  /// \brief Create a FileFragment for a FileSource.
  Result<std::shared_ptr<FileFragment>> MakeFragment(
      FileSource source, std::shared_ptr<Schema> physical_schema = NULLPTR);
  /// \brief Create a writer for this format.
  virtual Result<std::shared_ptr<FileWriter>> MakeWriter(
      std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
      std::shared_ptr<FileWriteOptions> options,
      fs::FileLocator destination_locator) const = 0;
  /// \brief Get default write options for this format.
  ///
  /// May return null shared_ptr if this file format does not yet support
  /// writing datasets.
  virtual std::shared_ptr<FileWriteOptions> DefaultWriteOptions() = 0;

 protected:
  explicit FileFormat(std::shared_ptr<FragmentScanOptions> default_fragment_scan_options)
      : default_fragment_scan_options(std::move(default_fragment_scan_options)) {}
};
/// \brief A Fragment that is stored in a file with a known format
class ARROW_DS_EXPORT FileFragment : public Fragment,
                                     public util::EqualityComparable<FileFragment> {
 public:
  /// \brief Scan the file, returning a generator of record batches.
  Result<RecordBatchGenerator> ScanBatchesAsync(
      const std::shared_ptr<ScanOptions>& options) override;
  /// \brief Count the rows in the file matching the given predicate, if possible.
  Future<std::optional<int64_t>> CountRows(
      compute::Expression predicate,
      const std::shared_ptr<ScanOptions>& options) override;
  /// \brief Begin a scan of a fragment which has already been inspected.
  Future<std::shared_ptr<FragmentScanner>> BeginScan(
      const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
      const FragmentScanOptions* format_options,
      compute::ExecContext* exec_context) override;
  /// \brief Learn what we need about the file before we start scanning it.
  Future<std::shared_ptr<InspectedFragment>> InspectFragment(
      const FragmentScanOptions* format_options,
      compute::ExecContext* exec_context) override;
  /// \brief The type name of the wrapped format (e.g. "csv", "ipc").
  std::string type_name() const override { return format_->type_name(); }
  /// \brief Render the fragment as the path of its file source.
  // NOTE: stray trailing semicolon after the inline body removed (-Wextra-semi).
  std::string ToString() const override { return source_.path(); }
  /// \brief The file (or buffer) this fragment reads from.
  const FileSource& source() const { return source_; }
  /// \brief The format used to read this fragment.
  const std::shared_ptr<FileFormat>& format() const { return format_; }
  /// \brief Equality comparison with another FileFragment.
  bool Equals(const FileFragment& other) const;

 protected:
  FileFragment(FileSource source, std::shared_ptr<FileFormat> format,
               compute::Expression partition_expression,
               std::shared_ptr<Schema> physical_schema)
      : Fragment(std::move(partition_expression), std::move(physical_schema)),
        source_(std::move(source)),
        format_(std::move(format)) {}
  Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override;
  FileSource source_;
  std::shared_ptr<FileFormat> format_;
  // FileFormat::MakeFragment constructs instances (protected constructor).
  friend class FileFormat;
};
/// \brief A Dataset of FileFragments.
///
/// A FileSystemDataset is composed of one or more FileFragment. The fragments
/// are independent and don't need to share the same format and/or filesystem.
class ARROW_DS_EXPORT FileSystemDataset : public Dataset {
 public:
  /// \brief Create a FileSystemDataset.
  ///
  /// \param[in] schema the schema of the dataset
  /// \param[in] root_partition the partition expression of the dataset
  /// \param[in] format the format of each FileFragment.
  /// \param[in] filesystem the filesystem of each FileFragment, or nullptr if the
  ///            fragments wrap buffers.
  /// \param[in] fragments list of fragments to create the dataset from.
  /// \param[in] partitioning the Partitioning object in case the dataset is created
  ///            with a known partitioning (e.g. from a discovered partitioning
  ///            through a DatasetFactory), or nullptr if not known.
  ///
  /// Note that fragments wrapping files resident in differing filesystems are not
  /// permitted; to work with multiple filesystems use a UnionDataset.
  ///
  /// \return A constructed dataset.
  static Result<std::shared_ptr<FileSystemDataset>> Make(
      std::shared_ptr<Schema> schema, compute::Expression root_partition,
      std::shared_ptr<FileFormat> format, std::shared_ptr<fs::FileSystem> filesystem,
      std::vector<std::shared_ptr<FileFragment>> fragments,
      std::shared_ptr<Partitioning> partitioning = NULLPTR);
  /// \brief Write a dataset.
  static Status Write(const FileSystemDatasetWriteOptions& write_options,
                      std::shared_ptr<Scanner> scanner);
  /// \brief Return the type name of the dataset.
  std::string type_name() const override { return "filesystem"; }
  /// \brief Replace the schema of the dataset.
  Result<std::shared_ptr<Dataset>> ReplaceSchema(
      std::shared_ptr<Schema> schema) const override;
  /// \brief Return the path of files.
  std::vector<std::string> files() const;
  /// \brief Return the format.
  const std::shared_ptr<FileFormat>& format() const { return format_; }
  /// \brief Return the filesystem. May be nullptr if the fragments wrap buffers.
  const std::shared_ptr<fs::FileSystem>& filesystem() const { return filesystem_; }
  /// \brief Return the partitioning. May be nullptr if the dataset was not constructed
  /// with a partitioning.
  const std::shared_ptr<Partitioning>& partitioning() const { return partitioning_; }
  /// \brief Render a human-readable representation of this dataset.
  std::string ToString() const;

 protected:
  // Index over fragment partition expressions, presumably built by
  // SetupSubtreePruning() and used by GetFragmentsImpl() -- defined out of line.
  struct FragmentSubtrees;
  /// \brief Construct from a schema only.
  explicit FileSystemDataset(std::shared_ptr<Schema> schema)
      : Dataset(std::move(schema)) {}
  /// \brief Construct from a schema and a root partition expression.
  FileSystemDataset(std::shared_ptr<Schema> schema,
                    compute::Expression partition_expression)
      : Dataset(std::move(schema), partition_expression) {}
  Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override;
  void SetupSubtreePruning();
  std::shared_ptr<FileFormat> format_;
  std::shared_ptr<fs::FileSystem> filesystem_;
  std::vector<std::shared_ptr<FileFragment>> fragments_;
  std::shared_ptr<Partitioning> partitioning_;
  std::shared_ptr<FragmentSubtrees> subtrees_;
};
/// \brief Options for writing a file of this format.
class ARROW_DS_EXPORT FileWriteOptions {
 public:
  virtual ~FileWriteOptions() = default;
  /// \brief The format these options apply to.
  const std::shared_ptr<FileFormat>& format() const { return format_; }
  /// \brief The type name of the associated format.
  std::string type_name() const { return format_->type_name(); }

 protected:
  explicit FileWriteOptions(std::shared_ptr<FileFormat> format)
      : format_(std::move(format)) {}
  std::shared_ptr<FileFormat> format_;
};
/// \brief A writer for this format.
class ARROW_DS_EXPORT FileWriter {
 public:
  virtual ~FileWriter() = default;
  /// \brief Write the given batch.
  virtual Status Write(const std::shared_ptr<RecordBatch>& batch) = 0;
  /// \brief Write all batches from the reader.
  Status Write(RecordBatchReader* batches);
  /// \brief Indicate that writing is done.
  virtual Future<> Finish();
  /// \brief The format being written.
  const std::shared_ptr<FileFormat>& format() const { return options_->format(); }
  /// \brief The schema of the batches being written.
  const std::shared_ptr<Schema>& schema() const { return schema_; }
  /// \brief The write options this writer was constructed with.
  const std::shared_ptr<FileWriteOptions>& options() const { return options_; }
  /// \brief The locator of the destination file.
  const fs::FileLocator& destination() const { return destination_locator_; }
  /// \brief After Finish() is called, provides number of bytes written to file.
  Result<int64_t> GetBytesWritten() const;

 protected:
  FileWriter(std::shared_ptr<Schema> schema, std::shared_ptr<FileWriteOptions> options,
             std::shared_ptr<io::OutputStream> destination,
             fs::FileLocator destination_locator)
      : schema_(std::move(schema)),
        options_(std::move(options)),
        destination_(std::move(destination)),
        destination_locator_(std::move(destination_locator)) {}
  /// \brief Format-specific finalization hook (see Finish()).
  virtual Future<> FinishInternal() = 0;
  std::shared_ptr<Schema> schema_;
  std::shared_ptr<FileWriteOptions> options_;
  std::shared_ptr<io::OutputStream> destination_;
  fs::FileLocator destination_locator_;
  // Backing value for GetBytesWritten(); empty until writing completes.
  std::optional<int64_t> bytes_written_;
};
/// \brief Options for writing a dataset.
struct ARROW_DS_EXPORT FileSystemDatasetWriteOptions {
  /// Options for individual fragment writing.
  std::shared_ptr<FileWriteOptions> file_write_options;
  /// FileSystem into which a dataset will be written.
  std::shared_ptr<fs::FileSystem> filesystem;
  /// Root directory into which the dataset will be written.
  std::string base_dir;
  /// Partitioning used to generate fragment paths.
  std::shared_ptr<Partitioning> partitioning;
  /// If true the order of rows in the dataset is preserved when writing with
  /// multiple threads. This may cause notable performance degradation.
  bool preserve_order = false;
  /// Maximum number of partitions any batch may be written into, default is 1K.
  int max_partitions = 1024;
  /// Template string used to generate fragment basenames.
  /// {i} will be replaced by an auto incremented integer.
  std::string basename_template;
  /// A functor which will be applied on an incremented counter. The result will be
  /// inserted into the basename_template in place of {i}.
  ///
  /// This can be used, for example, to left-pad the file counter.
  std::function<std::string(int)> basename_template_functor;
  /// If greater than 0 then this will limit the maximum number of files that can be left
  /// open. If an attempt is made to open too many files then the least recently used file
  /// will be closed. If this setting is set too low you may end up fragmenting your data
  /// into many small files.
  ///
  /// The default is 900 which also allows some # of files to be open by the scanner
  /// before hitting the default Linux limit of 1024
  uint32_t max_open_files = 900;
  /// If greater than 0 then this will limit how many rows are placed in any single file.
  /// Otherwise there will be no limit and one file will be created in each output
  /// directory unless files need to be closed to respect max_open_files
  uint64_t max_rows_per_file = 0;
  /// If greater than 0 then this will cause the dataset writer to batch incoming data
  /// and only write the row groups to the disk when sufficient rows have accumulated.
  /// The final row group size may be less than this value and other options such as
  /// `max_open_files` or `max_rows_per_file` lead to smaller row group sizes.
  uint64_t min_rows_per_group = 0;
  /// If greater than 0 then the dataset writer may split up large incoming batches into
  /// multiple row groups. If this value is set then min_rows_per_group should also be
  /// set or else you may end up with very small row groups (e.g. if the incoming row
  /// group size is just barely larger than this value).
  uint64_t max_rows_per_group = 1 << 20;
  /// Controls what happens if an output directory already exists.
  ExistingDataBehavior existing_data_behavior = ExistingDataBehavior::kError;
  /// \brief If false the dataset writer will not create directories
  /// This is mainly intended for filesystems that do not require directories such as S3.
  bool create_dir = true;
  /// Callback to be invoked against all FileWriters before
  /// they are finalized with FileWriter::Finish().
  ///
  /// The default is a no-op returning Status::OK().
  std::function<Status(FileWriter*)> writer_pre_finish = [](FileWriter*) {
    return Status::OK();
  };
  /// Callback to be invoked against all FileWriters after they have
  /// called FileWriter::Finish().
  ///
  /// The default is a no-op returning Status::OK().
  std::function<Status(FileWriter*)> writer_post_finish = [](FileWriter*) {
    return Status::OK();
  };
  /// \brief The format associated with file_write_options.
  const std::shared_ptr<FileFormat>& format() const {
    return file_write_options->format();
  }
};
/// \brief Wraps FileSystemDatasetWriteOptions for consumption as compute::ExecNodeOptions
class ARROW_DS_EXPORT WriteNodeOptions : public acero::ExecNodeOptions {
 public:
  /// \brief Construct from dataset write options and optional batch metadata.
  explicit WriteNodeOptions(
      FileSystemDatasetWriteOptions options,
      std::shared_ptr<const KeyValueMetadata> custom_metadata = NULLPTR)
      : write_options(std::move(options)), custom_metadata(std::move(custom_metadata)) {}
  /// \brief Options to control how to write the dataset
  FileSystemDatasetWriteOptions write_options;
  /// \brief Optional schema to attach to all written batches
  ///
  /// By default, we will use the output schema of the input.
  ///
  /// This can be used to alter schema metadata, field nullability, or field metadata.
  /// However, this cannot be used to change the type of data. If the custom schema does
  /// not have the same number of fields and the same data types as the input then the
  /// plan will fail.
  std::shared_ptr<Schema> custom_schema;
  /// \brief Optional metadata to attach to written batches
  std::shared_ptr<const KeyValueMetadata> custom_metadata;
};
/// @}
namespace internal {
ARROW_DS_EXPORT void InitializeDatasetWriter(arrow::acero::ExecFactoryRegistry* registry);
}
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,144 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include "arrow/csv/options.h"
#include "arrow/dataset/dataset.h"
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/ipc/type_fwd.h"
#include "arrow/status.h"
#include "arrow/util/compression.h"
namespace arrow {
namespace dataset {
/// Type name reported by CsvFileFormat::type_name().
constexpr char kCsvTypeName[] = "csv";
/// \addtogroup dataset-file-formats
///
/// @{
/// \brief A FileFormat implementation that reads from and writes to Csv files
class ARROW_DS_EXPORT CsvFileFormat : public FileFormat {
 public:
  // TODO(ARROW-18328) Remove this, moved to CsvFragmentScanOptions
  /// Options affecting the parsing of CSV files
  csv::ParseOptions parse_options = csv::ParseOptions::Defaults();
  CsvFileFormat();
  /// \brief The name of this format ("csv").
  std::string type_name() const override { return kCsvTypeName; }
  /// \brief Equality comparison with another FileFormat.
  bool Equals(const FileFormat& other) const override;
  /// \brief Indicate if the FileSource is supported/readable by this format.
  Result<bool> IsSupported(const FileSource& source) const override;
  /// \brief Return the schema of the file if possible.
  Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;
  /// \brief Begin a scan of a fragment which has already been inspected.
  Future<std::shared_ptr<FragmentScanner>> BeginScan(
      const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
      const FragmentScanOptions* format_options,
      compute::ExecContext* exec_context) const override;
  /// \brief Open the fragment and return a generator of its record batches.
  Result<RecordBatchGenerator> ScanBatchesAsync(
      const std::shared_ptr<ScanOptions>& scan_options,
      const std::shared_ptr<FileFragment>& file) const override;
  /// \brief Learn what we need about the file before we start scanning it.
  Future<std::shared_ptr<InspectedFragment>> InspectFragment(
      const FileSource& source, const FragmentScanOptions* format_options,
      compute::ExecContext* exec_context) const override;
  /// \brief Count the rows in the file matching the given predicate, if possible.
  Future<std::optional<int64_t>> CountRows(
      const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
      const std::shared_ptr<ScanOptions>& options) override;
  /// \brief Create a writer for this format.
  Result<std::shared_ptr<FileWriter>> MakeWriter(
      std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
      std::shared_ptr<FileWriteOptions> options,
      fs::FileLocator destination_locator) const override;
  /// \brief Get default write options for this format.
  std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
};
/// \brief Per-scan options for CSV fragments
struct ARROW_DS_EXPORT CsvFragmentScanOptions : public FragmentScanOptions {
  /// \brief The format these options apply to ("csv").
  std::string type_name() const override { return kCsvTypeName; }
  /// Signature of a function wrapping an opened input stream in another stream.
  using StreamWrapFunc = std::function<Result<std::shared_ptr<io::InputStream>>(
      std::shared_ptr<io::InputStream>)>;
  /// CSV conversion options
  csv::ConvertOptions convert_options = csv::ConvertOptions::Defaults();
  /// CSV reading options
  ///
  /// Note that use_threads is always ignored.
  csv::ReadOptions read_options = csv::ReadOptions::Defaults();
  /// CSV parse options
  csv::ParseOptions parse_options = csv::ParseOptions::Defaults();
  /// Optional stream wrapping function
  ///
  /// If defined, all open dataset file fragments will be passed
  /// through this function. One possible use case is to transparently
  /// transcode all input files from a given character set to utf8.
  StreamWrapFunc stream_transform_func{};
};
/// \brief Options for writing CSV files.
class ARROW_DS_EXPORT CsvFileWriteOptions : public FileWriteOptions {
 public:
  /// Options passed to csv::MakeCSVWriter.
  std::shared_ptr<csv::WriteOptions> write_options;

 protected:
  explicit CsvFileWriteOptions(std::shared_ptr<FileFormat> format)
      : FileWriteOptions(std::move(format)) {}
  // Constructed only by CsvFileFormat (protected constructor).
  friend class CsvFileFormat;
};
/// \brief A FileWriter which writes record batches as CSV.
class ARROW_DS_EXPORT CsvFileWriter : public FileWriter {
 public:
  /// \brief Write the given batch to the destination stream.
  Status Write(const std::shared_ptr<RecordBatch>& batch) override;

 private:
  CsvFileWriter(std::shared_ptr<io::OutputStream> destination,
                std::shared_ptr<ipc::RecordBatchWriter> writer,
                std::shared_ptr<Schema> schema,
                std::shared_ptr<CsvFileWriteOptions> options,
                fs::FileLocator destination_locator);
  Future<> FinishInternal() override;
  std::shared_ptr<io::OutputStream> destination_;
  std::shared_ptr<ipc::RecordBatchWriter> batch_writer_;
  // Constructed only by CsvFileFormat (private constructor).
  friend class CsvFileFormat;
};
/// @}
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,123 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <memory>
#include <string>
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/io/type_fwd.h"
#include "arrow/ipc/type_fwd.h"
#include "arrow/result.h"
namespace arrow {
namespace dataset {
/// \addtogroup dataset-file-formats
///
/// @{
/// Type name reported by IpcFileFormat::type_name().
constexpr char kIpcTypeName[] = "ipc";
/// \brief A FileFormat implementation that reads from and writes to Ipc files
class ARROW_DS_EXPORT IpcFileFormat : public FileFormat {
 public:
  /// \brief The name of this format ("ipc").
  std::string type_name() const override { return kIpcTypeName; }
  IpcFileFormat();
  /// \brief Formats are equal if they share the same type name.
  bool Equals(const FileFormat& other) const override {
    return type_name() == other.type_name();
  }
  /// \brief Indicate if the FileSource is supported/readable by this format.
  Result<bool> IsSupported(const FileSource& source) const override;
  /// \brief Return the schema of the file if possible.
  Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;
  /// \brief Open the fragment and return a generator of its record batches.
  Result<RecordBatchGenerator> ScanBatchesAsync(
      const std::shared_ptr<ScanOptions>& options,
      const std::shared_ptr<FileFragment>& file) const override;
  /// \brief Count the rows in the file matching the given predicate, if possible.
  Future<std::optional<int64_t>> CountRows(
      const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
      const std::shared_ptr<ScanOptions>& options) override;
  /// \brief Create a writer for this format.
  Result<std::shared_ptr<FileWriter>> MakeWriter(
      std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
      std::shared_ptr<FileWriteOptions> options,
      fs::FileLocator destination_locator) const override;
  /// \brief Get default write options for this format.
  std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
};
/// \brief Per-scan options for IPC fragments
class ARROW_DS_EXPORT IpcFragmentScanOptions : public FragmentScanOptions {
 public:
  /// \brief The format these options apply to ("ipc").
  std::string type_name() const override { return kIpcTypeName; }
  /// Options passed to the IPC file reader.
  /// included_fields, memory_pool, and use_threads are ignored.
  std::shared_ptr<ipc::IpcReadOptions> options;
  /// If present, the async scanner will enable I/O coalescing.
  /// This is ignored by the sync scanner.
  std::shared_ptr<io::CacheOptions> cache_options;
};
/// \brief Options for writing IPC files.
class ARROW_DS_EXPORT IpcFileWriteOptions : public FileWriteOptions {
 public:
  /// Options passed to ipc::MakeFileWriter. use_threads is ignored
  std::shared_ptr<ipc::IpcWriteOptions> options;
  /// custom_metadata written to the file's footer
  std::shared_ptr<const KeyValueMetadata> metadata;

 protected:
  explicit IpcFileWriteOptions(std::shared_ptr<FileFormat> format)
      : FileWriteOptions(std::move(format)) {}
  // Constructed only by IpcFileFormat (protected constructor).
  friend class IpcFileFormat;
};
/// \brief A FileWriter which writes record batches in the IPC file format.
class ARROW_DS_EXPORT IpcFileWriter : public FileWriter {
 public:
  /// \brief Write the given batch to the destination stream.
  Status Write(const std::shared_ptr<RecordBatch>& batch) override;

 private:
  IpcFileWriter(std::shared_ptr<io::OutputStream> destination,
                std::shared_ptr<ipc::RecordBatchWriter> writer,
                std::shared_ptr<Schema> schema,
                std::shared_ptr<IpcFileWriteOptions> options,
                fs::FileLocator destination_locator);
  Future<> FinishInternal() override;
  std::shared_ptr<io::OutputStream> destination_;
  std::shared_ptr<ipc::RecordBatchWriter> batch_writer_;
  // Constructed only by IpcFileFormat (private constructor).
  friend class IpcFileFormat;
};
/// @}
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,98 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <optional>
#include <string>
#include "arrow/dataset/dataset.h"
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/ipc/type_fwd.h"
#include "arrow/json/options.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/util/future.h"
#include "arrow/util/macros.h"
namespace arrow::dataset {
/// \addtogroup dataset-file-formats
///
/// @{
constexpr char kJsonTypeName[] = "json";
/// \brief A FileFormat implementation that reads from JSON files
class ARROW_DS_EXPORT JsonFileFormat : public FileFormat {
 public:
  JsonFileFormat();

  /// \brief Return kJsonTypeName, identifying this format.
  std::string type_name() const override { return kJsonTypeName; }

  /// \brief Return whether `other` represents the same format.
  bool Equals(const FileFormat& other) const override;

  /// \brief Return whether `source` appears to be readable as JSON.
  Result<bool> IsSupported(const FileSource& source) const override;

  /// \brief Infer the schema of the file by reading it.
  Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;

  /// \brief Asynchronously open `source` and learn enough about it to begin a scan.
  Future<std::shared_ptr<InspectedFragment>> InspectFragment(
      const FileSource& source, const FragmentScanOptions* format_options,
      compute::ExecContext* exec_context) const override;

  /// \brief Asynchronously start a scan of a previously inspected fragment.
  Future<std::shared_ptr<FragmentScanner>> BeginScan(
      const FragmentScanRequest& scan_request, const InspectedFragment& inspected,
      const FragmentScanOptions* format_options,
      compute::ExecContext* exec_context) const override;

  /// \brief Return a generator of record batches scanned from `file`.
  Result<RecordBatchGenerator> ScanBatchesAsync(
      const std::shared_ptr<ScanOptions>& scan_options,
      const std::shared_ptr<FileFragment>& file) const override;

  /// \brief Count rows matching `predicate`, if this can be done cheaply.
  Future<std::optional<int64_t>> CountRows(
      const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
      const std::shared_ptr<ScanOptions>& scan_options) override;

  /// \brief Writing is not supported for JSON; always returns NotImplemented.
  Result<std::shared_ptr<FileWriter>> MakeWriter(
      std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
      std::shared_ptr<FileWriteOptions> options,
      fs::FileLocator destination_locator) const override {
    return Status::NotImplemented("Writing JSON files is not currently supported");
  }

  /// \brief Writing is not supported for JSON; always returns null.
  std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override { return NULLPTR; }
};
/// \brief Per-scan options for JSON fragments
struct ARROW_DS_EXPORT JsonFragmentScanOptions : public FragmentScanOptions {
  /// \brief Return kJsonTypeName, used to match these options with the JSON format.
  std::string type_name() const override { return kJsonTypeName; }

  /// @brief Options that affect JSON parsing
  ///
  /// Note: `explicit_schema` and `unexpected_field_behavior` are ignored.
  json::ParseOptions parse_options = json::ParseOptions::Defaults();

  /// @brief Options that affect JSON reading
  json::ReadOptions read_options = json::ReadOptions::Defaults();
};
/// @}
} // namespace arrow::dataset

View File

@@ -0,0 +1,75 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <memory>
#include <string>
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/io/type_fwd.h"
#include "arrow/result.h"
namespace arrow {
namespace dataset {
/// \addtogroup dataset-file-formats
///
/// @{
constexpr char kOrcTypeName[] = "orc";
/// \brief A FileFormat implementation that reads from and writes to ORC files
class ARROW_DS_EXPORT OrcFileFormat : public FileFormat {
 public:
  OrcFileFormat();

  /// \brief Return kOrcTypeName, identifying this format.
  std::string type_name() const override { return kOrcTypeName; }

  /// \brief Formats are considered equal if they share the same type name.
  bool Equals(const FileFormat& other) const override {
    return type_name() == other.type_name();
  }

  /// \brief Return whether `source` appears to be readable as ORC.
  Result<bool> IsSupported(const FileSource& source) const override;

  /// \brief Return the schema of the file if possible.
  Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;

  /// \brief Return a generator of record batches scanned from `file`.
  Result<RecordBatchGenerator> ScanBatchesAsync(
      const std::shared_ptr<ScanOptions>& options,
      const std::shared_ptr<FileFragment>& file) const override;

  /// \brief Count rows matching `predicate`, if this can be done cheaply.
  Future<std::optional<int64_t>> CountRows(
      const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
      const std::shared_ptr<ScanOptions>& options) override;

  /// \brief Create a writer that emits an ORC file to `destination`.
  Result<std::shared_ptr<FileWriter>> MakeWriter(
      std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
      std::shared_ptr<FileWriteOptions> options,
      fs::FileLocator destination_locator) const override;

  /// \brief Return default options for writing ORC files.
  std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
};
/// @}
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,410 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <memory>
#include <optional>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "arrow/dataset/discovery.h"
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/io/caching.h"
namespace parquet {
class ParquetFileReader;
class Statistics;
class ColumnChunkMetaData;
class RowGroupMetaData;
class FileMetaData;
class FileDecryptionProperties;
class FileEncryptionProperties;
class ReaderProperties;
class ArrowReaderProperties;
class WriterProperties;
class ArrowWriterProperties;
namespace arrow {
class FileReader;
class FileWriter;
struct SchemaManifest;
} // namespace arrow
} // namespace parquet
namespace arrow {
namespace dataset {
struct ParquetDecryptionConfig;
struct ParquetEncryptionConfig;
/// \addtogroup dataset-file-formats
///
/// @{
constexpr char kParquetTypeName[] = "parquet";
/// \brief A FileFormat implementation that reads from Parquet files
class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat {
 public:
  ParquetFileFormat();

  /// Convenience constructor which copies properties from a parquet::ReaderProperties.
  /// memory_pool will be ignored.
  explicit ParquetFileFormat(const parquet::ReaderProperties& reader_properties);

  /// \brief Return kParquetTypeName, identifying this format.
  std::string type_name() const override { return kParquetTypeName; }

  /// \brief Return whether `other` represents the same format and reader options.
  bool Equals(const FileFormat& other) const override;

  struct ReaderOptions {
    /// \defgroup parquet-file-format-arrow-reader-properties properties which correspond
    /// to members of parquet::ArrowReaderProperties.
    ///
    /// We don't embed parquet::ReaderProperties directly because column names (rather
    /// than indices) are used to indicate dictionary columns, and other options are
    /// deferred to scan time.
    ///
    /// @{

    /// Names of columns which should be read as dictionary-encoded.
    std::unordered_set<std::string> dict_columns;
    /// Time unit to which INT96 timestamps are coerced when read.
    arrow::TimeUnit::type coerce_int96_timestamp_unit = arrow::TimeUnit::NANO;
    /// Arrow type used to represent Parquet binary columns.
    Type::type binary_type = Type::BINARY;
    /// Arrow type used to represent Parquet list columns.
    Type::type list_type = Type::LIST;
    /// @}
  } reader_options;

  /// \brief Return whether `source` appears to be readable as Parquet.
  Result<bool> IsSupported(const FileSource& source) const override;

  /// \brief Return the schema of the file if possible.
  Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;

  /// \brief Return a generator of record batches scanned from `file`.
  Result<RecordBatchGenerator> ScanBatchesAsync(
      const std::shared_ptr<ScanOptions>& options,
      const std::shared_ptr<FileFragment>& file) const override;

  /// \brief Count rows matching `predicate`, if this can be done cheaply
  /// (e.g. from row group metadata).
  Future<std::optional<int64_t>> CountRows(
      const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
      const std::shared_ptr<ScanOptions>& options) override;

  using FileFormat::MakeFragment;

  /// \brief Create a Fragment targeting all RowGroups.
  Result<std::shared_ptr<FileFragment>> MakeFragment(
      FileSource source, compute::Expression partition_expression,
      std::shared_ptr<Schema> physical_schema) override;

  /// \brief Create a Fragment, restricted to the specified row groups.
  Result<std::shared_ptr<ParquetFileFragment>> MakeFragment(
      FileSource source, compute::Expression partition_expression,
      std::shared_ptr<Schema> physical_schema, std::vector<int> row_groups);

  /// \brief Return a FileReader on the given source.
  Result<std::shared_ptr<parquet::arrow::FileReader>> GetReader(
      const FileSource& source, const std::shared_ptr<ScanOptions>& options) const;

  /// \brief Return a FileReader on the given source, reusing `metadata` if provided.
  Result<std::shared_ptr<parquet::arrow::FileReader>> GetReader(
      const FileSource& source, const std::shared_ptr<ScanOptions>& options,
      const std::shared_ptr<parquet::FileMetaData>& metadata) const;

  /// \brief Asynchronous variant of GetReader.
  Future<std::shared_ptr<parquet::arrow::FileReader>> GetReaderAsync(
      const FileSource& source, const std::shared_ptr<ScanOptions>& options) const;

  /// \brief Asynchronous variant of GetReader, reusing `metadata` if provided.
  Future<std::shared_ptr<parquet::arrow::FileReader>> GetReaderAsync(
      const FileSource& source, const std::shared_ptr<ScanOptions>& options,
      const std::shared_ptr<parquet::FileMetaData>& metadata) const;

  /// \brief Create a writer that emits a Parquet file to `destination`.
  Result<std::shared_ptr<FileWriter>> MakeWriter(
      std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
      std::shared_ptr<FileWriteOptions> options,
      fs::FileLocator destination_locator) const override;

  /// \brief Return default options for writing Parquet files.
  std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
};
/// \brief A FileFragment with parquet logic.
///
/// ParquetFileFragment provides a lazy (with respect to IO) interface to
/// scan parquet files. Any heavy IO calls are deferred to the Scan() method.
///
/// The caller can provide an optional list of selected RowGroups to limit the
/// number of scanned RowGroups, or to partition the scans across multiple
/// threads.
///
/// Metadata can be explicitly provided, enabling pushdown predicate benefits without
/// the potentially heavy IO of loading Metadata from the file system. This can induce
/// significant performance boost when scanning high latency file systems.
class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment {
 public:
  /// \brief Split this fragment into one fragment per RowGroup matching `predicate`.
  Result<FragmentVector> SplitByRowGroup(compute::Expression predicate);

  /// \brief Return the RowGroups selected by this fragment.
  const std::vector<int>& row_groups() const {
    if (row_groups_) return *row_groups_;
    // No explicit selection: return a reference to a shared empty vector.
    static std::vector<int> empty;
    return empty;
  }

  /// \brief Return the FileMetaData associated with this fragment.
  ///
  /// This may return nullptr if the fragment wasn't scanned yet, or if
  /// `ScanOptions::cache_metadata` was disabled.
  std::shared_ptr<parquet::FileMetaData> metadata();

  /// \brief Ensure this fragment's FileMetaData is in memory.
  Status EnsureCompleteMetadata(parquet::arrow::FileReader* reader = NULLPTR);

  /// \brief Drop any cached metadata, forcing it to be re-read on next use.
  Status ClearCachedMetadata() override;

  /// \brief Return fragment which selects a filtered subset of this fragment's RowGroups.
  Result<std::shared_ptr<Fragment>> Subset(compute::Expression predicate);
  /// \brief Return fragment which selects the given RowGroups of this fragment.
  Result<std::shared_ptr<Fragment>> Subset(std::vector<int> row_group_ids);

  /// \brief Translate row group statistics for `field` into a filter expression,
  /// if possible.
  static std::optional<compute::Expression> EvaluateStatisticsAsExpression(
      const Field& field, const parquet::Statistics& statistics);

  /// \brief Variant of the above using an explicit FieldRef to reference the field.
  static std::optional<compute::Expression> EvaluateStatisticsAsExpression(
      const Field& field, const FieldRef& field_ref,
      const parquet::Statistics& statistics);

 private:
  // Constructed only by ParquetFileFormat (a friend, below).
  ParquetFileFragment(FileSource source, std::shared_ptr<FileFormat> format,
                      compute::Expression partition_expression,
                      std::shared_ptr<Schema> physical_schema,
                      std::optional<std::vector<int>> row_groups);

  // Install (possibly externally supplied) metadata and schema manifest.
  Status SetMetadata(std::shared_ptr<parquet::FileMetaData> metadata,
                     std::shared_ptr<parquet::arrow::SchemaManifest> manifest,
                     std::shared_ptr<parquet::FileMetaData> original_metadata = {});

  // Overridden to opportunistically set metadata since a reader must be opened anyway.
  Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override {
    ARROW_RETURN_NOT_OK(EnsureCompleteMetadata());
    return physical_schema_;
  }

  /// Return a filtered subset of row group indices.
  Result<std::vector<int>> FilterRowGroups(compute::Expression predicate);
  /// Simplify the predicate against the statistics of each row group.
  Result<std::vector<compute::Expression>> TestRowGroups(compute::Expression predicate);
  /// Try to count rows matching the predicate using metadata. Expects
  /// metadata to be present, and expects the predicate to have been
  /// simplified against the partition expression already.
  Result<std::optional<int64_t>> TryCountRows(compute::Expression predicate);

  // The format that created this fragment (always a ParquetFileFormat).
  ParquetFileFormat& parquet_format_;

  /// Indices of row groups selected by this fragment,
  /// or std::nullopt if all row groups are selected.
  std::optional<std::vector<int>> row_groups_;

  // the expressions (combined for all columns for which statistics have been
  // processed) are stored per column group
  std::vector<compute::Expression> statistics_expressions_;
  // statistics status are kept track of by Parquet Schema column indices
  // (i.e. not Arrow schema field index)
  std::vector<bool> statistics_expressions_complete_;
  // Cached file-level metadata; may be null (see metadata()).
  std::shared_ptr<parquet::FileMetaData> metadata_;
  // Mapping between the Parquet schema and the Arrow schema.
  std::shared_ptr<parquet::arrow::SchemaManifest> manifest_;
  // The FileMetaData that owns the SchemaDescriptor pointed by SchemaManifest.
  std::shared_ptr<parquet::FileMetaData> original_metadata_;

  friend class ParquetFileFormat;
  friend class ParquetDatasetFactory;
};
/// \brief Per-scan options for Parquet fragments
class ARROW_DS_EXPORT ParquetFragmentScanOptions : public FragmentScanOptions {
 public:
  ParquetFragmentScanOptions();

  /// \brief Return kParquetTypeName, used to match these options with the
  /// Parquet format.
  std::string type_name() const override { return kParquetTypeName; }

  /// Reader properties. Not all properties are respected: memory_pool comes from
  /// ScanOptions.
  std::shared_ptr<parquet::ReaderProperties> reader_properties;

  /// Arrow reader properties. Not all properties are respected: batch_size comes from
  /// ScanOptions. Additionally, other options come from ParquetFileFormat::ReaderOptions.
  std::shared_ptr<parquet::ArrowReaderProperties> arrow_reader_properties;

  /// A configuration structure that provides decryption properties for a dataset
  std::shared_ptr<ParquetDecryptionConfig> parquet_decryption_config = NULLPTR;
};
/// \brief Options controlling how Parquet files are written by ParquetFileWriter
class ARROW_DS_EXPORT ParquetFileWriteOptions : public FileWriteOptions {
 public:
  /// \brief Parquet writer properties.
  std::shared_ptr<parquet::WriterProperties> writer_properties;

  /// \brief Parquet Arrow writer properties.
  std::shared_ptr<parquet::ArrowWriterProperties> arrow_writer_properties;

  /// \brief A configuration structure that provides encryption properties for a dataset
  std::shared_ptr<ParquetEncryptionConfig> parquet_encryption_config = NULLPTR;

 protected:
  // Not constructible by callers; instances are created by ParquetFileFormat
  // (declared a friend below).
  explicit ParquetFileWriteOptions(std::shared_ptr<FileFormat> format)
      : FileWriteOptions(std::move(format)) {}

  friend class ParquetFileFormat;
};
/// \brief A FileWriter that writes record batches to a Parquet file
class ARROW_DS_EXPORT ParquetFileWriter : public FileWriter {
 public:
  /// \brief Access the underlying parquet::arrow::FileWriter.
  const std::shared_ptr<parquet::arrow::FileWriter>& parquet_writer() const {
    return parquet_writer_;
  }

  /// \brief Write a single record batch to the destination.
  Status Write(const std::shared_ptr<RecordBatch>& batch) override;

 private:
  // Not constructible by callers; instances are created by ParquetFileFormat
  // (declared a friend below).
  ParquetFileWriter(std::shared_ptr<io::OutputStream> destination,
                    std::shared_ptr<parquet::arrow::FileWriter> writer,
                    std::shared_ptr<ParquetFileWriteOptions> options,
                    fs::FileLocator destination_locator);

  // Closes parquet_writer_ and finalizes the destination stream.
  Future<> FinishInternal() override;

  // Underlying writer that serializes batches to the destination.
  std::shared_ptr<parquet::arrow::FileWriter> parquet_writer_;

  friend class ParquetFileFormat;
};
/// \brief Options for making a FileSystemDataset from a Parquet _metadata file.
struct ParquetFactoryOptions {
  /// Either an explicit Partitioning or a PartitioningFactory to discover one.
  ///
  /// If a factory is provided, it will be used to infer a schema for partition fields
  /// based on file and directory paths then construct a Partitioning. The default
  /// is a Partitioning which will yield no partition information.
  ///
  /// The (explicit or discovered) partitioning will be applied to discovered files
  /// and the resulting partition information embedded in the Dataset.
  PartitioningOrFactory partitioning{Partitioning::Default()};

  /// For the purposes of applying the partitioning, paths will be stripped
  /// of the partition_base_dir. Files not matching the partition_base_dir
  /// prefix will be skipped for partition discovery. The ignored files will still
  /// be part of the Dataset, but will not have partition information.
  ///
  /// Example:
  /// partition_base_dir = "/dataset";
  ///
  /// - "/dataset/US/sales.csv" -> "US/sales.csv" will be given to the partitioning
  ///
  /// - "/home/john/late_sales.csv" -> Will be ignored for partition discovery.
  ///
  /// This is useful for partitioning which parses directory when ordering
  /// is important, e.g. DirectoryPartitioning.
  std::string partition_base_dir;

  /// Assert that all ColumnChunk paths are consistent. The parquet spec allows for
  /// ColumnChunk data to be stored in multiple files, but ParquetDatasetFactory
  /// supports only a single file with all ColumnChunk data. If this flag is set
  /// construction of a ParquetDatasetFactory will raise an error if ColumnChunk
  /// data is not resident in a single file.
  bool validate_column_chunk_paths = false;
};
/// \brief Create FileSystemDataset from custom `_metadata` cache file.
///
/// Dask and other systems will generate a cache metadata file by concatenating
/// the RowGroupMetaData of multiple parquet files into a single parquet file
/// that only contains metadata and no ColumnChunk data.
///
/// ParquetDatasetFactory creates a FileSystemDataset composed of
/// ParquetFileFragment where each fragment is pre-populated with the exact
/// number of row groups and statistics for each columns.
class ARROW_DS_EXPORT ParquetDatasetFactory : public DatasetFactory {
 public:
  /// \brief Create a ParquetDatasetFactory from a metadata path.
  ///
  /// The `metadata_path` will be read from `filesystem`. Each RowGroup
  /// contained in the metadata file will be relative to `dirname(metadata_path)`.
  ///
  /// \param[in] metadata_path path of the metadata parquet file
  /// \param[in] filesystem from which to open/read the path
  /// \param[in] format to read the file with.
  /// \param[in] options see ParquetFactoryOptions
  static Result<std::shared_ptr<DatasetFactory>> Make(
      const std::string& metadata_path, std::shared_ptr<fs::FileSystem> filesystem,
      std::shared_ptr<ParquetFileFormat> format, ParquetFactoryOptions options);

  /// \brief Create a ParquetDatasetFactory from a metadata source.
  ///
  /// Similar to the previous Make definition, but the metadata can be a Buffer
  /// and the base_path is explicit instead of inferred from the metadata
  /// path.
  ///
  /// \param[in] metadata source to open the metadata parquet file from
  /// \param[in] base_path used as the prefix of every parquet files referenced
  /// \param[in] filesystem from which to read the files referenced.
  /// \param[in] format to read the file with.
  /// \param[in] options see ParquetFactoryOptions
  static Result<std::shared_ptr<DatasetFactory>> Make(
      const FileSource& metadata, const std::string& base_path,
      std::shared_ptr<fs::FileSystem> filesystem,
      std::shared_ptr<ParquetFileFormat> format, ParquetFactoryOptions options);

  /// \brief Return the schemas of the dataset (physical and partition).
  Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
      InspectOptions options) override;

  /// \brief Build the FileSystemDataset described by the metadata file.
  Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override;

 protected:
  ParquetDatasetFactory(
      std::shared_ptr<fs::FileSystem> filesystem,
      std::shared_ptr<ParquetFileFormat> format,
      std::shared_ptr<parquet::FileMetaData> metadata,
      std::shared_ptr<parquet::arrow::SchemaManifest> manifest,
      std::shared_ptr<Schema> physical_schema, std::string base_path,
      ParquetFactoryOptions options,
      std::vector<std::pair<std::string, std::vector<int>>> paths_with_row_group_ids)
      : filesystem_(std::move(filesystem)),
        format_(std::move(format)),
        metadata_(std::move(metadata)),
        manifest_(std::move(manifest)),
        physical_schema_(std::move(physical_schema)),
        base_path_(std::move(base_path)),
        options_(std::move(options)),
        paths_with_row_group_ids_(std::move(paths_with_row_group_ids)) {}

  // Filesystem used to resolve the referenced data files.
  std::shared_ptr<fs::FileSystem> filesystem_;
  // Format applied to every discovered fragment.
  std::shared_ptr<ParquetFileFormat> format_;
  // Metadata parsed from the `_metadata` file.
  std::shared_ptr<parquet::FileMetaData> metadata_;
  // Mapping between the Parquet schema and the Arrow schema.
  std::shared_ptr<parquet::arrow::SchemaManifest> manifest_;
  // Schema of the data files (without partition fields).
  std::shared_ptr<Schema> physical_schema_;
  // Prefix applied to every file path referenced by the metadata.
  std::string base_path_;
  ParquetFactoryOptions options_;
  // For each referenced file path, the row group indices it contains.
  std::vector<std::pair<std::string, std::vector<int>>> paths_with_row_group_ids_;

 private:
  // Build one ParquetFileFragment per referenced file, applying `partitioning`.
  Result<std::vector<std::shared_ptr<FileFragment>>> CollectParquetFragments(
      const Partitioning& partitioning);

  // Resolve the partition schema from the (explicit or discovered) partitioning.
  Result<std::shared_ptr<Schema>> PartitionSchema();
};
/// @}
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,75 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "arrow/dataset/type_fwd.h"
namespace parquet::encryption {
class CryptoFactory;
struct KmsConnectionConfig;
struct EncryptionConfiguration;
struct DecryptionConfiguration;
} // namespace parquet::encryption
namespace arrow {
namespace dataset {
/// \brief Core configuration class encapsulating parameters for high-level encryption
/// within Parquet framework.
///
/// ParquetEncryptionConfig serves as a bridge, passing encryption-related
/// parameters to appropriate components within the Parquet library. It holds references
/// to objects defining encryption strategy, Key Management Service (KMS) configuration,
/// and specific encryption configurations for Parquet data.
struct ARROW_DS_EXPORT ParquetEncryptionConfig {
  /// Shared pointer to CryptoFactory object, responsible for creating cryptographic
  /// components like encryptors and decryptors.
  std::shared_ptr<parquet::encryption::CryptoFactory> crypto_factory;

  /// Shared pointer to KmsConnectionConfig object, holding configuration parameters for
  /// connecting to a Key Management Service (KMS).
  std::shared_ptr<parquet::encryption::KmsConnectionConfig> kms_connection_config;

  /// Shared pointer to EncryptionConfiguration object, defining specific encryption
  /// settings for Parquet data, like keys for different columns.
  std::shared_ptr<parquet::encryption::EncryptionConfiguration> encryption_config;
};
/// \brief Core configuration class encapsulating parameters for high-level decryption
/// within Parquet framework.
///
/// ParquetDecryptionConfig is designed to pass decryption-related parameters to
/// appropriate decryption components within Parquet library. It holds references to
/// objects defining decryption strategy, Key Management Service (KMS) configuration,
/// and specific decryption configurations for reading encrypted Parquet data.
struct ARROW_DS_EXPORT ParquetDecryptionConfig {
  /// Shared pointer to CryptoFactory object, pivotal in creating cryptographic
  /// components for decryption process.
  std::shared_ptr<parquet::encryption::CryptoFactory> crypto_factory;

  /// Shared pointer to KmsConnectionConfig object, containing parameters for connecting
  /// to a Key Management Service (KMS) during decryption.
  std::shared_ptr<parquet::encryption::KmsConnectionConfig> kms_connection_config;

  /// Shared pointer to DecryptionConfiguration object, specifying decryption settings
  /// for reading encrypted Parquet data.
  std::shared_ptr<parquet::encryption::DecryptionConfiguration> decryption_config;
};
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,432 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <functional>
#include <iosfwd>
#include <memory>
#include <optional>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "arrow/compute/expression.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/util/compare.h"
namespace arrow {
namespace dataset {
constexpr char kFilenamePartitionSep = '_';
/// \brief The pieces of a partitioned path: a directory component and a
/// filename component, as produced by Partitioning::Format.
struct ARROW_DS_EXPORT PartitionPathFormat {
  std::string directory, filename;
};
// ----------------------------------------------------------------------
// Partitioning
/// \defgroup dataset-partitioning Partitioning API
///
/// @{
/// \brief Interface for parsing partition expressions from string partition
/// identifiers.
///
/// For example, the identifier "foo=5" might be parsed to an equality expression
/// between the "foo" field and the value 5.
///
/// Some partitionings may store the field names in a metadata
/// store instead of in file paths, for example
/// dataset_root/2009/11/... could be used when the partition fields
/// are "year" and "month"
///
/// Paths are consumed from left to right. Paths must be relative to
/// the root of a partition; path prefixes must be removed before passing
/// the path to a partitioning for parsing.
class ARROW_DS_EXPORT Partitioning : public util::EqualityComparable<Partitioning> {
 public:
  virtual ~Partitioning() = default;

  /// \brief The name identifying the kind of partitioning
  virtual std::string type_name() const = 0;

  /// \brief Return whether the partitionings are equal
  virtual bool Equals(const Partitioning& other) const {
    // Base implementation only compares partition schemas; subclasses may
    // refine this with additional state.
    return schema_->Equals(other.schema_, /*check_metadata=*/false);
  }

  /// \brief If the input batch shares any fields with this partitioning,
  /// produce sub-batches which satisfy mutually exclusive Expressions.
  struct PartitionedBatches {
    /// One sub-batch per distinct partition value combination.
    RecordBatchVector batches;
    /// The partition expression satisfied by the corresponding batch.
    std::vector<compute::Expression> expressions;
  };
  virtual Result<PartitionedBatches> Partition(
      const std::shared_ptr<RecordBatch>& batch) const = 0;

  /// \brief Parse a path into a partition expression
  virtual Result<compute::Expression> Parse(const std::string& path) const = 0;

  /// \brief Format a partition expression into directory and filename components.
  virtual Result<PartitionPathFormat> Format(const compute::Expression& expr) const = 0;

  /// \brief A default Partitioning which is a DirectoryPartitioning
  /// with an empty schema.
  static std::shared_ptr<Partitioning> Default();

  /// \brief The partition schema.
  const std::shared_ptr<Schema>& schema() const { return schema_; }

 protected:
  explicit Partitioning(std::shared_ptr<Schema> schema) : schema_(std::move(schema)) {}

  std::shared_ptr<Schema> schema_;
};
/// \brief The encoding of partition segments.
enum class SegmentEncoding : int8_t {
  /// No encoding.
  None = 0,
  /// Segment values are URL-encoded.
  Uri = 1,
};
/// \brief Output a human-readable representation of a SegmentEncoding.
ARROW_DS_EXPORT
std::ostream& operator<<(std::ostream& os, SegmentEncoding segment_encoding);
/// \brief Options for key-value based partitioning (hive/directory).
struct ARROW_DS_EXPORT KeyValuePartitioningOptions {
  /// After splitting a path into components, decode the path components
  /// before parsing according to this scheme.
  SegmentEncoding segment_encoding = SegmentEncoding::Uri;
};
/// \brief Options for inferring a partitioning.
struct ARROW_DS_EXPORT PartitioningFactoryOptions {
  /// When inferring a schema for partition fields, yield dictionary encoded types
  /// instead of plain. This can be more efficient when materializing virtual
  /// columns, and Expressions parsed by the finished Partitioning will include
  /// dictionaries of all unique inspected values for each field.
  bool infer_dictionary = false;
  /// Optionally, an expected schema can be provided, in which case inference
  /// will only check discovered fields against the schema and update internal
  /// state (such as dictionaries).
  std::shared_ptr<Schema> schema;
  /// After splitting a path into components, decode the path components
  /// before parsing according to this scheme.
  SegmentEncoding segment_encoding = SegmentEncoding::Uri;

  /// \brief Convert to the subset of options used by KeyValuePartitioning.
  KeyValuePartitioningOptions AsPartitioningOptions() const;
};
/// \brief Options for inferring a hive-style partitioning.
struct ARROW_DS_EXPORT HivePartitioningFactoryOptions : PartitioningFactoryOptions {
  /// The hive partitioning scheme maps null to a hard coded fallback string.
  std::string null_fallback;

  /// \brief Convert to the subset of options used by HivePartitioning.
  HivePartitioningOptions AsHivePartitioningOptions() const;
};
/// \brief PartitioningFactory provides creation of a partitioning when the
/// specific schema must be inferred from available paths (no explicit schema is known).
class ARROW_DS_EXPORT PartitioningFactory {
 public:
  virtual ~PartitioningFactory() = default;

  /// \brief The name identifying the kind of partitioning
  virtual std::string type_name() const = 0;

  /// Get the schema for the resulting Partitioning.
  /// This may reset internal state, for example dictionaries of unique representations.
  virtual Result<std::shared_ptr<Schema>> Inspect(
      const std::vector<std::string>& paths) = 0;

  /// Create a partitioning using the provided schema
  /// (fields may be dropped).
  virtual Result<std::shared_ptr<Partitioning>> Finish(
      const std::shared_ptr<Schema>& schema) const = 0;
};
/// \brief Subclass for the common case of a partitioning which yields an equality
/// expression for each segment
class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning {
 public:
  /// An unconverted equality expression consisting of a field name and the representation
  /// of a scalar value
  struct Key {
    /// Name of the partition field.
    std::string name;
    /// String representation of the value, or std::nullopt for null.
    std::optional<std::string> value;
  };

  Result<PartitionedBatches> Partition(
      const std::shared_ptr<RecordBatch>& batch) const override;

  Result<compute::Expression> Parse(const std::string& path) const override;

  Result<PartitionPathFormat> Format(const compute::Expression& expr) const override;

  /// \brief Per-field dictionaries, parallel to the partition schema's fields.
  const ArrayVector& dictionaries() const { return dictionaries_; }

  /// \brief The encoding applied to path segments before parsing.
  SegmentEncoding segment_encoding() const { return options_.segment_encoding; }

  bool Equals(const Partitioning& other) const override;

 protected:
  KeyValuePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries,
                       KeyValuePartitioningOptions options)
      : Partitioning(std::move(schema)),
        dictionaries_(std::move(dictionaries)),
        options_(options) {
    // Ensure dictionaries_ is always parallel to the schema's fields,
    // even when no dictionaries were supplied.
    if (dictionaries_.empty()) {
      dictionaries_.resize(schema_->num_fields());
    }
  }

  /// Split `path` into unconverted Keys; the scheme is subclass-specific.
  virtual Result<std::vector<Key>> ParseKeys(const std::string& path) const = 0;

  /// Format scalar partition values into path components; subclass-specific.
  virtual Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const = 0;

  /// Convert a Key to a full expression.
  Result<compute::Expression> ConvertKey(const Key& key) const;

  /// Format one path segment per partition field from the given scalar values.
  Result<std::vector<std::string>> FormatPartitionSegments(
      const ScalarVector& values) const;

  /// Parse already-split path segments into unconverted Keys.
  Result<std::vector<Key>> ParsePartitionSegments(
      const std::vector<std::string>& segments) const;

  ArrayVector dictionaries_;
  KeyValuePartitioningOptions options_;
};
/// \brief DirectoryPartitioning parses one segment of a path for each field in its
/// schema. All fields are required, so paths passed to DirectoryPartitioning::Parse
/// must contain segments for each field.
///
/// For example given schema<year:int16, month:int8> the path "/2009/11" would be
/// parsed to ("year"_ == 2009 and "month"_ == 11)
class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning {
 public:
  /// If a field in schema is of dictionary type, the corresponding element of
  /// dictionaries must contain the dictionary of values for that field.
  explicit DirectoryPartitioning(std::shared_ptr<Schema> schema,
                                 ArrayVector dictionaries = {},
                                 KeyValuePartitioningOptions options = {});
  std::string type_name() const override { return "directory"; }
  bool Equals(const Partitioning& other) const override;
  /// \brief Create a factory for a directory partitioning.
  ///
  /// \param[in] field_names The names for the partition fields. Types will be
  /// inferred.
  static std::shared_ptr<PartitioningFactory> MakeFactory(
      std::vector<std::string> field_names, PartitioningFactoryOptions = {});

 private:
  Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
  Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
};
/// \brief The default fallback used for null values in a Hive-style partitioning.
static constexpr char kDefaultHiveNullFallback[] = "__HIVE_DEFAULT_PARTITION__";

/// \brief Options for Hive-style partitioning, adding a configurable path
/// segment used to represent null partition values.
struct ARROW_DS_EXPORT HivePartitioningOptions : public KeyValuePartitioningOptions {
  /// The path segment used in place of a null partition value.
  std::string null_fallback = kDefaultHiveNullFallback;

  /// \brief Convenience factory: default options with the given null fallback.
  static HivePartitioningOptions DefaultsWithNullFallback(std::string fallback) {
    HivePartitioningOptions options;
    options.null_fallback = std::move(fallback);
    return options;
  }
};
/// \brief Multi-level, directory based partitioning
/// originating from Apache Hive with all data files stored in the
/// leaf directories. Data is partitioned by static values of a
/// particular column in the schema. Partition keys are represented in
/// the form $key=$value in directory names.
/// Field order is ignored, as are missing or unrecognized field names.
///
/// For example given schema<year:int16, month:int8, day:int8> the path
/// "/day=321/ignored=3.4/year=2009" parses to ("year"_ == 2009 and "day"_ == 321)
class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning {
 public:
  /// If a field in schema is of dictionary type, the corresponding element of
  /// dictionaries must contain the dictionary of values for that field.
  explicit HivePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries = {},
                            std::string null_fallback = kDefaultHiveNullFallback)
      : KeyValuePartitioning(std::move(schema), std::move(dictionaries),
                             KeyValuePartitioningOptions()),
        hive_options_(
            HivePartitioningOptions::DefaultsWithNullFallback(std::move(null_fallback))) {
  }

  /// Construct from full HivePartitioningOptions.
  // Note: `options` is passed (sliced to KeyValuePartitioningOptions) to the
  // base and additionally stored whole in hive_options_, so it is used twice
  // and deliberately not moved.
  explicit HivePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries,
                            HivePartitioningOptions options)
      : KeyValuePartitioning(std::move(schema), std::move(dictionaries), options),
        hive_options_(options) {}

  std::string type_name() const override { return "hive"; }
  /// \brief The path segment representing a null partition value.
  std::string null_fallback() const { return hive_options_.null_fallback; }
  const HivePartitioningOptions& options() const { return hive_options_; }

  /// \brief Parse a single path segment into a Key.
  // NOTE(review): the optional return presumably allows segments which do not
  // form a $key=$value pair to be skipped (the class doc says unrecognized
  // segments are ignored) -- confirm in the implementation.
  static Result<std::optional<Key>> ParseKey(const std::string& segment,
                                             const HivePartitioningOptions& options);

  bool Equals(const Partitioning& other) const override;

  /// \brief Create a factory for a hive partitioning.
  static std::shared_ptr<PartitioningFactory> MakeFactory(
      HivePartitioningFactoryOptions = {});

 private:
  const HivePartitioningOptions hive_options_;
  Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
  Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
};
/// \brief Implementation provided by lambda or other callable
class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning {
 public:
  using ParseImpl = std::function<Result<compute::Expression>(const std::string&)>;
  using FormatImpl =
      std::function<Result<PartitionPathFormat>(const compute::Expression&)>;

  /// \brief Wrap callables implementing Parse (required) and Format (optional).
  FunctionPartitioning(std::shared_ptr<Schema> schema, ParseImpl parse_impl,
                       FormatImpl format_impl = NULLPTR, std::string name = "function")
      : Partitioning(std::move(schema)),
        parse_impl_(std::move(parse_impl)),
        format_impl_(std::move(format_impl)),
        name_(std::move(name)) {}

  std::string type_name() const override { return name_; }

  // Wrapped callables cannot be compared, so two FunctionPartitionings are
  // never considered equal.
  bool Equals(const Partitioning& other) const override { return false; }

  /// Delegate parsing to the user-supplied callable.
  Result<compute::Expression> Parse(const std::string& path) const override {
    return parse_impl_(path);
  }

  /// Delegate formatting to the user-supplied callable, if one was given.
  Result<PartitionPathFormat> Format(const compute::Expression& expr) const override {
    if (!format_impl_) {
      return Status::NotImplemented("formatting paths from ", type_name(),
                                    " Partitioning");
    }
    return format_impl_(expr);
  }

  // Batch partitioning is not expressible through the callable interface.
  Result<PartitionedBatches> Partition(
      const std::shared_ptr<RecordBatch>& batch) const override {
    return Status::NotImplemented("partitioning batches from ", type_name(),
                                  " Partitioning");
  }

 private:
  ParseImpl parse_impl_;
  FormatImpl format_impl_;
  std::string name_;
};
// NOTE(review): judging by type_name() == "filename", keys appear to be
// encoded in file names rather than directory segments -- confirm against
// the .cc implementation.
class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning {
 public:
  /// \brief Construct a FilenamePartitioning from its components.
  ///
  /// If a field in schema is of dictionary type, the corresponding element of
  /// dictionaries must contain the dictionary of values for that field.
  explicit FilenamePartitioning(std::shared_ptr<Schema> schema,
                                ArrayVector dictionaries = {},
                                KeyValuePartitioningOptions options = {});
  std::string type_name() const override { return "filename"; }
  /// \brief Create a factory for a filename partitioning.
  ///
  /// \param[in] field_names The names for the partition fields. Types will be
  /// inferred.
  static std::shared_ptr<PartitioningFactory> MakeFactory(
      std::vector<std::string> field_names, PartitioningFactoryOptions = {});
  bool Equals(const Partitioning& other) const override;

 private:
  Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
  Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
};
/// \brief Remove `prefix` from the front of `path`, returning the remainder.
ARROW_DS_EXPORT std::string StripPrefix(const std::string& path,
                                        const std::string& prefix);
/// \brief Extracts the directory portion of a path, removing both the prefix
/// and the filename
///
/// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") ->
/// "year=2019"`
ARROW_DS_EXPORT std::string StripPrefixAndFilename(const std::string& path,
                                                   const std::string& prefix);
/// \brief Vector version of StripPrefixAndFilename.
ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
    const std::vector<std::string>& paths, const std::string& prefix);
/// \brief Vector version of StripPrefixAndFilename, operating on the paths of
/// fs::FileInfo entries.
ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
    const std::vector<fs::FileInfo>& files, const std::string& prefix);
/// \brief Either a Partitioning or a PartitioningFactory
class ARROW_DS_EXPORT PartitioningOrFactory {
 public:
  explicit PartitioningOrFactory(std::shared_ptr<Partitioning> partitioning)
      : partitioning_(std::move(partitioning)) {}

  explicit PartitioningOrFactory(std::shared_ptr<PartitioningFactory> factory)
      : factory_(std::move(factory)) {}

  /// Assigning a Partitioning clears any previously held factory.
  PartitioningOrFactory& operator=(std::shared_ptr<Partitioning> partitioning) {
    partitioning_ = std::move(partitioning);
    factory_.reset();
    return *this;
  }

  /// Assigning a PartitioningFactory clears any previously held partitioning.
  PartitioningOrFactory& operator=(std::shared_ptr<PartitioningFactory> factory) {
    factory_ = std::move(factory);
    partitioning_.reset();
    return *this;
  }

  /// \brief The partitioning (if given).
  const std::shared_ptr<Partitioning>& partitioning() const { return partitioning_; }

  /// \brief The partition factory (if given).
  const std::shared_ptr<PartitioningFactory>& factory() const { return factory_; }

  /// \brief Get the partition schema, inferring it with the given factory if needed.
  Result<std::shared_ptr<Schema>> GetOrInferSchema(const std::vector<std::string>& paths);

 private:
  std::shared_ptr<PartitioningFactory> factory_;
  std::shared_ptr<Partitioning> partitioning_;
};
/// @}
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,33 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once

#include "arrow/dataset/visibility.h"

namespace arrow {
namespace dataset {
namespace internal {

/// \brief Register dataset-based exec nodes with the exec node registry
///
/// This function must be called before using dataset ExecNode factories
ARROW_DS_EXPORT void Initialize();

}  // namespace internal
}  // namespace dataset
}  // namespace arrow

View File

@@ -0,0 +1,32 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include "arrow/dataset/visibility.h"
#include "arrow/type_fwd.h"
namespace arrow {
namespace dataset {

// FIXME this is superseded by compute::Expression::Bind
/// \brief Validate a projection between two schemas.
// NOTE(review): presumably checks that batches with schema `from` can be
// projected onto schema `to` -- confirm against the implementation.
ARROW_DS_EXPORT Status CheckProjectable(const Schema& from, const Schema& to);

}  // namespace dataset
}  // namespace arrow

View File

@@ -0,0 +1,623 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "arrow/acero/options.h"
#include "arrow/compute/expression.h"
#include "arrow/compute/type_fwd.h"
#include "arrow/dataset/dataset.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/io/interfaces.h"
#include "arrow/type_fwd.h"
#include "arrow/util/async_generator_fwd.h"
#include "arrow/util/iterator.h"
#include "arrow/util/thread_pool.h"
#include "arrow/util/type_fwd.h"
namespace arrow {
using RecordBatchGenerator = std::function<Future<std::shared_ptr<RecordBatch>>()>;
namespace dataset {
/// \defgroup dataset-scanning Scanning API
///
/// @{
constexpr int64_t kDefaultBatchSize = 1 << 17;  // 128Ki rows
// With the defaults below (16 batches/fragment x 4 fragments), readahead can
// hold up to 64 batches ~ 8Mi rows in flight.
constexpr int32_t kDefaultBatchReadahead = 16;
constexpr int32_t kDefaultFragmentReadahead = 4;
constexpr int32_t kDefaultBytesReadahead = 1 << 25;  // 32MiB
/// Scan-specific options, which can be changed between scans of the same dataset.
struct ARROW_DS_EXPORT ScanOptions {
  /// A row filter (which will be pushed down to partitioning/reading if supported).
  compute::Expression filter = compute::literal(true);
  /// A projection expression (which can add/remove/rename columns).
  ///
  /// Default-constructed (unset) until a projection is chosen; see
  /// SetProjection / ProjectionDescr.
  compute::Expression projection;
  /// Schema with which batches will be read from fragments. This is also known as the
  /// "reader schema" it will be used (for example) in constructing CSV file readers to
  /// identify column types for parsing. Usually only a subset of its fields (see
  /// MaterializedFields) will be materialized during a scan.
  std::shared_ptr<Schema> dataset_schema;
  /// Schema of projected record batches. This is independent of dataset_schema as its
  /// fields are derived from the projection. For example, let
  ///
  ///   dataset_schema = {"a": int32, "b": int32, "id": utf8}
  ///   projection = project({equal(field_ref("a"), field_ref("b"))}, {"a_plus_b"})
  ///
  /// (no filter specified). In this case, the projected_schema would be
  ///
  ///   {"a_plus_b": int32}
  std::shared_ptr<Schema> projected_schema;
  /// Maximum row count for scanned batches.
  int64_t batch_size = kDefaultBatchSize;
  /// How many batches to read ahead within a fragment.
  ///
  /// Set to 0 to disable batch readahead
  ///
  /// Note: May not be supported by all formats
  /// Note: Will be ignored if use_threads is set to false
  int32_t batch_readahead = kDefaultBatchReadahead;
  /// How many files to read ahead
  ///
  /// Set to 0 to disable fragment readahead
  ///
  /// Note: May not be enforced by all scanners
  /// Note: Will be ignored if use_threads is set to false
  int32_t fragment_readahead = kDefaultFragmentReadahead;
  /// A pool from which materialized and scanned arrays will be allocated.
  MemoryPool* pool = arrow::default_memory_pool();
  /// IOContext for any IO tasks
  ///
  /// Note: The IOContext executor will be ignored if use_threads is set to false
  io::IOContext io_context;
  /// Executor for any CPU tasks
  ///
  /// If null, the global CPU executor will be used
  ///
  /// Note: The Executor will be ignored if use_threads is set to false
  arrow::internal::Executor* cpu_executor = NULLPTR;
  /// If true the scanner will scan in parallel
  ///
  /// Note: If true, this will use threads from both the cpu_executor and the
  /// io_context.executor
  /// Note: This must be true in order for any readahead to happen
  bool use_threads = false;
  /// If true the scanner will add augmented fields to the output schema.
  bool add_augmented_fields = true;
  /// Whether to cache metadata when scanning.
  ///
  /// Fragments may typically cache metadata to speed up repeated accesses.
  /// However, in use cases where a single scan is done, or if memory use
  /// is more critical than CPU time, setting this option to false can
  /// lessen memory use.
  bool cache_metadata = true;
  /// Fragment-specific scan options.
  std::shared_ptr<FragmentScanOptions> fragment_scan_options;
  /// Return a vector of FieldRefs that require materialization.
  ///
  /// This is usually the union of the fields referenced in the projection and the
  /// filter expression. Examples (note that duplicates are preserved):
  ///
  /// - `SELECT a, b WHERE a < 2 && c > 1` => ["a", "b", "a", "c"]
  /// - `SELECT a + b < 3 WHERE a > 1` => ["a", "b", "a"]
  ///
  /// This is needed for expression where a field may not be directly
  /// used in the final projection but is still required to evaluate the
  /// expression.
  ///
  /// This is used by Fragment implementations to apply the column
  /// sub-selection optimization.
  std::vector<FieldRef> MaterializedFields() const;
  /// Parameters which control when the plan should pause for a slow consumer
  acero::BackpressureOptions backpressure =
      acero::BackpressureOptions::DefaultBackpressure();
};
/// Scan-specific options, which can be changed between scans of the same dataset.
///
/// A dataset consists of one or more individual fragments. A fragment is anything
/// that is independently scannable, often a file.
///
/// Batches from all fragments will be converted to a single schema. This unified
/// schema is referred to as the "dataset schema" and is the output schema for
/// this node.
///
/// Individual fragments may have schemas that are different from the dataset
/// schema. This is sometimes referred to as the physical or fragment schema.
/// Conversion from the fragment schema to the dataset schema is a process
/// known as evolution.
struct ARROW_DS_EXPORT ScanV2Options : public acero::ExecNodeOptions {
  explicit ScanV2Options(std::shared_ptr<Dataset> dataset)
      : dataset(std::move(dataset)) {}
  /// \brief The dataset to scan
  std::shared_ptr<Dataset> dataset;
  /// \brief A row filter
  ///
  /// The filter expression should be written against the dataset schema.
  /// The filter must be unbound.
  ///
  /// This is an opportunistic pushdown filter. Filtering capabilities will
  /// vary between formats. If a format is not capable of applying the filter
  /// then it will ignore it.
  ///
  /// Each fragment will do its best to filter the data based on the information
  /// (partitioning guarantees, statistics) available to it. If it is able to
  /// apply some filtering then it will indicate what filtering it was able to
  /// apply by attaching a guarantee to the batch.
  ///
  /// For example, if a filter is x < 50 && y > 40 then a batch may be able to
  /// apply a guarantee x < 50. Post-scan filtering would then only need to
  /// consider y > 40 (for this specific batch). The next batch may not be able
  /// to attach any guarantee and both clauses would need to be applied to that batch.
  ///
  /// A single guarantee-aware filtering operation should generally be applied to all
  /// resulting batches. The scan node is not responsible for this.
  ///
  /// Fields that are referenced by the filter should be included in the `columns` vector.
  /// The scan node will not automatically fetch fields referenced by the filter
  /// expression. \see AddFieldsNeededForFilter
  ///
  /// If the filter references fields that are not included in `columns` this may or may
  /// not be an error, depending on the format.
  compute::Expression filter = compute::literal(true);
  /// \brief The columns to scan
  ///
  /// This is not a simple list of top-level column indices but instead a set of paths
  /// allowing for partial selection of columns
  ///
  /// These paths refer to the dataset schema
  ///
  /// For example, consider the following dataset schema:
  ///   schema({
  ///     field("score", int32()),
  ///     field("marker", struct_({
  ///       field("color", utf8()),
  ///       field("location", struct_({
  ///         field("x", float64()),
  ///         field("y", float64())
  ///       }))
  ///     }))
  ///   })
  ///
  /// If `columns` is {{0}, {1,1,0}} then the output schema is:
  ///   schema({field("score", int32()), field("x", float64())})
  ///
  /// If `columns` is {{1,1,1}, {1,1}} then the output schema is:
  ///   schema({
  ///     field("y", float64()),
  ///     field("location", struct_({
  ///       field("x", float64()),
  ///       field("y", float64())
  ///     }))
  ///   })
  std::vector<FieldPath> columns;
  /// \brief Target number of bytes to read ahead in a fragment
  ///
  /// This limit involves some amount of estimation. Formats typically only know
  /// batch boundaries in terms of rows (not decoded bytes) and so an estimation
  /// must be done to guess the average row size. Other formats like CSV and JSON
  /// must make even more generalized guesses.
  ///
  /// This is a best-effort guide. Some formats may need to read ahead further,
  /// for example, if scanning a parquet file that has batches with 100MiB of data
  /// then the actual readahead will be at least 100MiB
  ///
  /// Set to 0 to disable readahead. When disabled, the scanner will read the
  /// dataset one batch at a time
  ///
  /// This limit applies across all fragments. If the limit is 32MiB and the
  /// fragment readahead allows for 20 fragments to be read at once then the
  /// total readahead will still be 32MiB and NOT 20 * 32MiB.
  int32_t target_bytes_readahead = kDefaultBytesReadahead;
  /// \brief Number of fragments to read ahead
  ///
  /// Higher readahead will potentially lead to more efficient I/O but will lead
  /// to the scan operation using more RAM. The default is fairly conservative
  /// and designed for fast local disks (or slow local spinning disks which cannot
  /// handle much parallelism anyways). When using a highly parallel remote filesystem
  /// you will likely want to increase these values.
  ///
  /// Set to 0 to disable fragment readahead. When disabled the dataset will be scanned
  /// one fragment at a time.
  int32_t fragment_readahead = kDefaultFragmentReadahead;
  /// \brief Options specific to the file format
  const FragmentScanOptions* format_options = NULLPTR;
  /// \brief Utility method to get a selection representing all columns in a dataset
  static std::vector<FieldPath> AllColumns(const Schema& dataset_schema);
  /// \brief Utility method to add fields needed for the current filter
  ///
  /// This method adds any fields that are needed by `filter` which are not already
  /// included in the list of columns. Any new fields added will be added to the end
  /// in no particular order.
  static Status AddFieldsNeededForFilter(ScanV2Options* options);
};
/// \brief Describes a projection
struct ARROW_DS_EXPORT ProjectionDescr {
  /// \brief The projection expression itself
  /// This expression must be a call to make_struct
  compute::Expression expression;
  /// \brief The output schema of the projection.
  /// This can be calculated from the input schema and the expression but it
  /// is cached here for convenience.
  std::shared_ptr<Schema> schema;
  /// \brief Create a ProjectionDescr by binding an expression to the dataset schema
  ///
  /// expression must return a struct type
  static Result<ProjectionDescr> FromStructExpression(
      const compute::Expression& expression, const Schema& dataset_schema);
  /// \brief Create a ProjectionDescr from expressions/names for each field
  static Result<ProjectionDescr> FromExpressions(std::vector<compute::Expression> exprs,
                                                 std::vector<std::string> names,
                                                 const Schema& dataset_schema);
  /// \brief Create a default projection referencing fields in the dataset schema
  // `add_augmented_fields` mirrors ScanOptions::add_augmented_fields (whether
  // scanner-generated augmented fields are included in the output schema).
  static Result<ProjectionDescr> FromNames(std::vector<std::string> names,
                                           const Schema& dataset_schema,
                                           bool add_augmented_fields = true);
  /// \brief Make a projection that projects every field in the dataset schema
  static Result<ProjectionDescr> Default(const Schema& dataset_schema,
                                         bool add_augmented_fields = true);
};

/// \brief Utility method to set the projection expression and schema
ARROW_DS_EXPORT void SetProjection(ScanOptions* options, ProjectionDescr projection);
/// \brief Combines a record batch with the fragment that the record batch originated
/// from
///
/// Knowing the source fragment can be useful for debugging & understanding loaded
/// data
struct TaggedRecordBatch {
  std::shared_ptr<RecordBatch> record_batch;
  std::shared_ptr<Fragment> fragment;
  // Equality is pointer identity of both members (shared_ptr comparison),
  // not value equality of the batch contents.
  friend inline bool operator==(const TaggedRecordBatch& left,
                                const TaggedRecordBatch& right) {
    return left.record_batch == right.record_batch && left.fragment == right.fragment;
  }
};
using TaggedRecordBatchGenerator = std::function<Future<TaggedRecordBatch>()>;
using TaggedRecordBatchIterator = Iterator<TaggedRecordBatch>;
/// \brief Combines a tagged batch with positional information
///
/// This is returned when scanning batches in an unordered fashion. This information is
/// needed if you ever want to reassemble the batches in order
struct EnumeratedRecordBatch {
  Enumerated<std::shared_ptr<RecordBatch>> record_batch;
  Enumerated<std::shared_ptr<Fragment>> fragment;
  // Equality compares the enumerated wrappers (position plus pointer identity).
  friend inline bool operator==(const EnumeratedRecordBatch& left,
                                const EnumeratedRecordBatch& right) {
    return left.record_batch == right.record_batch && left.fragment == right.fragment;
  }
};
using EnumeratedRecordBatchGenerator = std::function<Future<EnumeratedRecordBatch>()>;
using EnumeratedRecordBatchIterator = Iterator<EnumeratedRecordBatch>;
/// @}
} // namespace dataset
template <>
struct IterationTraits<dataset::TaggedRecordBatch> {
  /// The end-of-iteration sentinel: both members null.
  static dataset::TaggedRecordBatch End() { return {NULLPTR, NULLPTR}; }
  /// A TaggedRecordBatch without a RecordBatch is the end sentinel.
  static bool IsEnd(const dataset::TaggedRecordBatch& val) {
    return !val.record_batch;
  }
};
template <>
struct IterationTraits<dataset::EnumeratedRecordBatch> {
  /// End-of-iteration sentinel: both enumerated members at their iteration ends.
  static dataset::EnumeratedRecordBatch End() {
    return dataset::EnumeratedRecordBatch{
        IterationEnd<Enumerated<std::shared_ptr<RecordBatch>>>(),
        IterationEnd<Enumerated<std::shared_ptr<dataset::Fragment>>>()};
  }
  // Only the fragment member is inspected to detect the end sentinel.
  static bool IsEnd(const dataset::EnumeratedRecordBatch& val) {
    return IsIterationEnd(val.fragment);
  }
};
namespace dataset {
/// \defgroup dataset-scanning Scanning API
///
/// @{
/// \brief A scanner glues together several dataset classes to load in data.
/// The dataset contains a collection of fragments and partitioning rules.
///
/// The fragments identify independently loadable units of data (i.e. each fragment has
/// a potentially unique schema and possibly even format. It should be possible to read
/// fragments in parallel if desired).
///
/// The fragment's format contains the logic necessary to actually create a task to load
/// the fragment into memory. That task may or may not support parallel execution of
/// its own.
///
/// The scanner is then responsible for creating scan tasks from every fragment in the
/// dataset and (potentially) sequencing the loaded record batches together.
///
/// The scanner should not buffer the entire dataset in memory (unless asked) instead
/// yielding record batches as soon as they are ready to scan. Various readahead
/// properties control how much data is allowed to be scanned before pausing to let a
/// slow consumer catch up.
///
/// Today the scanner also handles projection & filtering although that may change in
/// the future.
class ARROW_DS_EXPORT Scanner {
 public:
  virtual ~Scanner() = default;
  /// \brief Apply a visitor to each RecordBatch as it is scanned. If multiple threads
  /// are used (via use_threads), the visitor will be invoked from those threads and is
  /// responsible for any synchronization.
  virtual Status Scan(std::function<Status(TaggedRecordBatch)> visitor) = 0;
  /// \brief Convert a Scanner into a Table.
  ///
  /// Use this convenience utility with care. This will serially materialize the
  /// Scan result in memory before creating the Table.
  virtual Result<std::shared_ptr<Table>> ToTable() = 0;
  /// \brief Scan the dataset into a stream of record batches. Each batch is tagged
  /// with the fragment it originated from. The batches will arrive in order. The
  /// order of fragments is determined by the dataset.
  ///
  /// Note: The scanner will perform some readahead but will avoid materializing too
  /// much in memory (this is governed by the readahead options and use_threads option).
  /// If the readahead queue fills up then I/O will pause until the calling thread catches
  /// up.
  virtual Result<TaggedRecordBatchIterator> ScanBatches() = 0;
  /// \brief Async version of ScanBatches.
  virtual Result<TaggedRecordBatchGenerator> ScanBatchesAsync() = 0;
  /// \brief Async version of ScanBatches running CPU tasks on the given executor.
  virtual Result<TaggedRecordBatchGenerator> ScanBatchesAsync(
      ::arrow::internal::Executor* cpu_thread_pool) = 0;
  /// \brief Scan the dataset into a stream of record batches. Unlike ScanBatches this
  /// method may allow record batches to be returned out of order. This allows for more
  /// efficient scanning: some fragments may be accessed more quickly than others (e.g.
  /// may be cached in RAM or just happen to get scheduled earlier by the I/O)
  ///
  /// To make up for the out-of-order iteration each batch is further tagged with
  /// positional information.
  virtual Result<EnumeratedRecordBatchIterator> ScanBatchesUnordered() = 0;
  /// \brief Async version of ScanBatchesUnordered.
  virtual Result<EnumeratedRecordBatchGenerator> ScanBatchesUnorderedAsync() = 0;
  /// \brief Async version of ScanBatchesUnordered running CPU tasks on the given
  /// executor.
  virtual Result<EnumeratedRecordBatchGenerator> ScanBatchesUnorderedAsync(
      ::arrow::internal::Executor* cpu_thread_pool) = 0;
  /// \brief A convenience to synchronously load the given rows by index.
  ///
  /// Will only consume as many batches as needed from ScanBatches().
  virtual Result<std::shared_ptr<Table>> TakeRows(const Array& indices) = 0;
  /// \brief Get the first N rows.
  virtual Result<std::shared_ptr<Table>> Head(int64_t num_rows) = 0;
  /// \brief Count rows matching a predicate.
  ///
  /// This method will push down the predicate and compute the result based on fragment
  /// metadata if possible.
  virtual Result<int64_t> CountRows() = 0;
  /// \brief Async version of CountRows.
  virtual Future<int64_t> CountRowsAsync() = 0;
  /// \brief Convert the Scanner to a RecordBatchReader so it can be
  /// easily used with APIs that expect a reader.
  virtual Result<std::shared_ptr<RecordBatchReader>> ToRecordBatchReader() = 0;
  /// \brief Get the options for this scan.
  const std::shared_ptr<ScanOptions>& options() const { return scan_options_; }
  /// \brief Get the dataset that this scanner will scan
  virtual const std::shared_ptr<Dataset>& dataset() const = 0;

 protected:
  explicit Scanner(std::shared_ptr<ScanOptions> scan_options)
      : scan_options_(std::move(scan_options)) {}
  /// Wrap an in-order scan with the positional information required by the
  /// enumerated (unordered) interface.
  Result<EnumeratedRecordBatchIterator> AddPositioningToInOrderScan(
      TaggedRecordBatchIterator scan);
  const std::shared_ptr<ScanOptions> scan_options_;
};
/// \brief ScannerBuilder is a factory class to construct a Scanner. It is used
/// to pass information, notably a potential filter expression and a subset of
/// columns to materialize.
class ARROW_DS_EXPORT ScannerBuilder {
public:
explicit ScannerBuilder(std::shared_ptr<Dataset> dataset);
/// \brief Construct a builder that will scan the given Dataset.
ScannerBuilder(std::shared_ptr<Dataset> dataset,
               std::shared_ptr<ScanOptions> scan_options);
/// \brief Construct a builder that will scan a single Fragment using the given schema.
ScannerBuilder(std::shared_ptr<Schema> schema, std::shared_ptr<Fragment> fragment,
               std::shared_ptr<ScanOptions> scan_options);
/// \brief Make a scanner from a record batch reader.
///
/// The resulting scanner can be scanned only once. This is intended
/// to support writing data from streaming sources or other sources
/// that can be iterated only once.
static std::shared_ptr<ScannerBuilder> FromRecordBatchReader(
    std::shared_ptr<RecordBatchReader> reader);
/// \brief Set the subset of columns to materialize.
///
/// Columns which are not referenced may not be read from fragments.
///
/// \param[in] columns list of columns to project. Order and duplicates will
/// be preserved.
///
/// \return Failure if any column name does not exist in the dataset's
/// Schema.
Status Project(std::vector<std::string> columns);
/// \brief Set expressions which will be evaluated to produce the materialized
/// columns.
///
/// Columns which are not referenced may not be read from fragments.
///
/// \param[in] exprs expressions to evaluate to produce columns.
/// \param[in] names list of names for the resulting columns.
///
/// \return Failure if any referenced column does not exist in the dataset's
/// Schema.
Status Project(std::vector<compute::Expression> exprs, std::vector<std::string> names);
/// \brief Set the filter expression to return only rows matching the filter.
///
/// The predicate will be passed down to Sources and corresponding
/// Fragments to exploit predicate pushdown if possible using
/// partition information or Fragment internal metadata, e.g. Parquet statistics.
/// Columns which are not referenced may not be read from fragments.
///
/// \param[in] filter expression to filter rows with.
///
/// \return Failure if any referenced column does not exist in the dataset's
/// Schema.
Status Filter(const compute::Expression& filter);
/// \brief Indicate if the Scanner should make use of the available
/// ThreadPool found in ScanOptions.
Status UseThreads(bool use_threads = true);
/// \brief Indicate if metadata should be cached when scanning
///
/// Fragments may typically cache metadata to speed up repeated accesses.
/// However, in use cases where a single scan is done, or if memory use
/// is more critical than CPU time, setting this option to false can
/// lessen memory use.
Status CacheMetadata(bool cache_metadata = true);
/// \brief Set the maximum number of rows per RecordBatch.
///
/// \param[in] batch_size the maximum number of rows.
/// \returns An error if the batch size is not greater than 0.
///
/// This option provides a control limiting the memory owned by any RecordBatch.
Status BatchSize(int64_t batch_size);
/// \brief Set the number of batches to read ahead within a fragment.
///
/// \param[in] batch_readahead How many batches to read ahead within a fragment
/// \returns an error if this number is less than 0.
///
/// This option provides a control on the RAM vs I/O tradeoff.
/// It might not be supported by all file formats, in which case it will
/// simply be ignored.
Status BatchReadahead(int32_t batch_readahead);
/// \brief Set the number of fragments to read ahead
///
/// \param[in] fragment_readahead How many fragments to read ahead
/// \returns an error if this number is less than 0.
///
/// This option provides a control on the RAM vs I/O tradeoff.
Status FragmentReadahead(int32_t fragment_readahead);
/// \brief Set the pool from which materialized and scanned arrays will be allocated.
Status Pool(MemoryPool* pool);
/// \brief Set fragment-specific scan options.
Status FragmentScanOptions(std::shared_ptr<FragmentScanOptions> fragment_scan_options);
/// \brief Override default backpressure configuration
Status Backpressure(acero::BackpressureOptions backpressure);
/// \brief Return the current scan options for the builder.
Result<std::shared_ptr<ScanOptions>> GetScanOptions();
/// \brief Return the constructed now-immutable Scanner object
Result<std::shared_ptr<Scanner>> Finish();
/// \brief Return the schema of the dataset or fragment being scanned.
const std::shared_ptr<Schema>& schema() const;
/// \brief Return the projected schema (the schema of materialized batches).
const std::shared_ptr<Schema>& projected_schema() const;
 private:
std::shared_ptr<Dataset> dataset_;
std::shared_ptr<ScanOptions> scan_options_ = std::make_shared<ScanOptions>();
};
/// \brief Construct a source ExecNode which yields batches from a dataset scan.
///
/// Does not construct associated filter or project nodes.
///
/// Batches are yielded sequentially, like single-threaded,
/// when require_sequenced_output=true.
///
/// Yielded batches will be augmented with fragment/batch indices when
/// implicit_ordering=true to enable stable ordering for simple ExecPlans.
class ARROW_DS_EXPORT ScanNodeOptions : public acero::ExecNodeOptions {
 public:
explicit ScanNodeOptions(std::shared_ptr<Dataset> dataset,
                         std::shared_ptr<ScanOptions> scan_options,
                         bool require_sequenced_output = false,
                         bool implicit_ordering = false)
    : dataset(std::move(dataset)),
      scan_options(std::move(scan_options)),
      require_sequenced_output(require_sequenced_output),
      implicit_ordering(implicit_ordering) {}
/// The dataset to scan.
std::shared_ptr<Dataset> dataset;
/// Options controlling the scan (projection, filter, batch size, readahead, ...).
std::shared_ptr<ScanOptions> scan_options;
/// If true, batches are yielded sequentially, as in a single-threaded scan.
bool require_sequenced_output;
/// If true, yielded batches are augmented with fragment/batch indices so that
/// downstream nodes can recover a stable ordering.
bool implicit_ordering;
};
/// @}
namespace internal {
/// \brief Register the dataset scan ExecNode factory with \p registry.
/// NOTE(review): implementation not visible here — presumably registers the
/// "scan" node used by ScanNodeOptions; confirm against the definition.
ARROW_DS_EXPORT void InitializeScanner(arrow::acero::ExecFactoryRegistry* registry);
/// \brief Register the second-generation scan ExecNode factory with \p registry.
ARROW_DS_EXPORT void InitializeScannerV2(arrow::acero::ExecFactoryRegistry* registry);
}  // namespace internal
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,113 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <memory>
#include <vector>
#include "arrow/compute/type_fwd.h" // IWYU pragma: export
#include "arrow/dataset/visibility.h"
#include "arrow/filesystem/type_fwd.h" // IWYU pragma: export
#include "arrow/type_fwd.h" // IWYU pragma: export
namespace arrow {
namespace dataset {
// ---- Datasets and discovery ----
class Dataset;
class DatasetFactory;
using DatasetVector = std::vector<std::shared_ptr<Dataset>>;
class UnionDataset;
class UnionDatasetFactory;
// ---- Fragments ----
class Fragment;
using FragmentIterator = Iterator<std::shared_ptr<Fragment>>;
using FragmentVector = std::vector<std::shared_ptr<Fragment>>;
class FragmentScanOptions;
// ---- File-based datasets ----
class FileSource;
class FileFormat;
class FileFragment;
class FileWriter;
class FileWriteOptions;
class FileSystemDataset;
class FileSystemDatasetFactory;
struct FileSystemDatasetWriteOptions;
class WriteNodeOptions;
/// \brief Controls what happens if files exist in an output directory during a dataset
/// write
enum class ExistingDataBehavior : int8_t {
  /// Deletes all files in a directory the first time that directory is encountered
  kDeleteMatchingPartitions,
  /// Ignores existing files, overwriting any that happen to have the same name as an
  /// output file
  kOverwriteOrIgnore,
  /// Returns an error if there are any files or subdirectories in the output directory
  kError,
};
class InMemoryDataset;
// ---- Per-format classes (availability depends on ARROW_CSV/JSON/PARQUET builds) ----
class CsvFileFormat;
class CsvFileWriter;
class CsvFileWriteOptions;
struct CsvFragmentScanOptions;
class JsonFileFormat;
class JsonFileWriter;
class JsonFileWriteOptions;
struct JsonFragmentScanOptions;
class IpcFileFormat;
class IpcFileWriter;
class IpcFileWriteOptions;
class IpcFragmentScanOptions;
class ParquetFileFormat;
class ParquetFileFragment;
class ParquetFragmentScanOptions;
class ParquetFileWriter;
class ParquetFileWriteOptions;
// ---- Partitioning ----
class Partitioning;
class PartitioningFactory;
class PartitioningOrFactory;
struct KeyValuePartitioningOptions;
class DirectoryPartitioning;
class HivePartitioning;
struct HivePartitioningOptions;
class FilenamePartitioning;
struct FilenamePartitioningOptions;
// ---- Scanning ----
class ScanNodeOptions;
struct ScanOptions;
class Scanner;
class ScannerBuilder;
class ScanTask;
using ScanTaskVector = std::vector<std::shared_ptr<ScanTask>>;
using ScanTaskIterator = Iterator<std::shared_ptr<ScanTask>>;
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,50 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#if defined(_WIN32) || defined(__CYGWIN__)
#  if defined(_MSC_VER)
// MSVC C4251 ("class needs to have dll-interface...") fires for every exported
// class with STL members; suppress it while this header's macros are in effect.
#    pragma warning(push)
#    pragma warning(disable : 4251)
#  else
// MinGW/Cygwin GCC warns that ELF visibility attributes are ignored on Windows.
#    pragma GCC diagnostic ignored "-Wattributes"
#  endif
#  ifdef ARROW_DS_STATIC
// Static library build: no import/export decoration needed.
#    define ARROW_DS_EXPORT
#  elif defined(ARROW_DS_EXPORTING)
// Building the dataset DLL itself: export the annotated symbols.
#    define ARROW_DS_EXPORT __declspec(dllexport)
#  else
// Consuming the dataset DLL: import the annotated symbols.
#    define ARROW_DS_EXPORT __declspec(dllimport)
#  endif
#  define ARROW_DS_NO_EXPORT
#else  // Not Windows
// ELF/Mach-O platforms: use symbol visibility attributes instead of declspec.
#  ifndef ARROW_DS_EXPORT
#    define ARROW_DS_EXPORT __attribute__((visibility("default")))
#  endif
#  ifndef ARROW_DS_NO_EXPORT
#    define ARROW_DS_NO_EXPORT __attribute__((visibility("hidden")))
#  endif
#endif  // Non-Windows
// Matches the warning(push) above; pops the C4251 suppression.
#if defined(_MSC_VER)
#  pragma warning(pop)
#endif