Initial commit
This commit is contained in:
@@ -0,0 +1,39 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/compute/expression.h"
|
||||
#include "arrow/dataset/dataset.h"
|
||||
#include "arrow/dataset/discovery.h"
|
||||
#include "arrow/dataset/file_base.h"
|
||||
#ifdef ARROW_CSV
|
||||
# include "arrow/dataset/file_csv.h"
|
||||
#endif
|
||||
#ifdef ARROW_JSON
|
||||
# include "arrow/dataset/file_json.h"
|
||||
#endif
|
||||
#include "arrow/dataset/file_ipc.h"
|
||||
#ifdef ARROW_ORC
|
||||
# include "arrow/dataset/file_orc.h"
|
||||
#endif
|
||||
#ifdef ARROW_PARQUET
|
||||
# include "arrow/dataset/file_parquet.h"
|
||||
#endif
|
||||
#include "arrow/dataset/scanner.h"
|
||||
@@ -0,0 +1,491 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/expression.h"
|
||||
#include "arrow/dataset/type_fwd.h"
|
||||
#include "arrow/dataset/visibility.h"
|
||||
#include "arrow/util/async_generator_fwd.h"
|
||||
#include "arrow/util/future.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/mutex.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace internal {
|
||||
class Executor;
|
||||
} // namespace internal
|
||||
|
||||
namespace dataset {
|
||||
|
||||
using RecordBatchGenerator = std::function<Future<std::shared_ptr<RecordBatch>>()>;
|
||||
|
||||
/// \brief Description of a column to scan
struct ARROW_DS_EXPORT FragmentSelectionColumn {
  /// \brief The path to the column to load
  ///
  /// The path refers to the fragment schema (not the dataset schema).
  FieldPath path;
  /// \brief The type of the column in the dataset schema
  ///
  /// A format may choose to ignore this field completely. For example, when
  /// reading from IPC the reader can just return the column in the data type
  /// that is stored on disk. There is no point in doing anything special.
  ///
  /// However, some formats may be capable of casting on the fly. For example,
  /// when reading from CSV, if we know the target type of the column, we can
  /// convert from string to the target type as we read.
  ///
  /// NOTE(review): raw pointer — presumably non-owning, so the pointed-to type
  /// must outlive this selection; confirm against the creating code.
  DataType* requested_type;
};
|
||||
|
||||
/// \brief A list of columns that should be loaded from a fragment
///
/// The paths in this selection should be referring to the fragment schema. This class
/// contains a virtual destructor as it is expected evolution strategies will need to
/// extend this to add any information needed to later evolve the batches.
///
/// For example, in the basic evolution strategy, we keep track of which columns
/// were missing from the file so that we can fill those in with null when evolving.
class ARROW_DS_EXPORT FragmentSelection {
 public:
  explicit FragmentSelection(std::vector<FragmentSelectionColumn> columns)
      : columns_(std::move(columns)) {}
  virtual ~FragmentSelection() = default;
  /// The columns that should be loaded from the fragment
  const std::vector<FragmentSelectionColumn>& columns() const { return columns_; }

 private:
  std::vector<FragmentSelectionColumn> columns_;
};
|
||||
|
||||
/// \brief Instructions for scanning a particular fragment
///
/// The fragment scan request is derived from ScanV2Options. The main
/// difference is that the scan options are based on the dataset schema
/// while the fragment request is based on the fragment schema.
struct ARROW_DS_EXPORT FragmentScanRequest {
  /// \brief A row filter
  ///
  /// The filter expression should be written against the fragment schema.
  ///
  /// \see ScanV2Options for details on how this filter should be applied
  compute::Expression filter = compute::literal(true);

  /// \brief The columns to scan
  ///
  /// These indices refer to the fragment schema
  ///
  /// Note: This is NOT a simple list of top-level column indices.
  /// For more details \see ScanV2Options
  ///
  /// If possible a fragment should only read from disk the data needed
  /// to satisfy these columns. If a format cannot partially read a nested
  /// column (e.g. JSON) then it must apply the column selection (in memory)
  /// before returning the scanned batch.
  std::shared_ptr<FragmentSelection> fragment_selection;
  /// \brief Options specific to the format being scanned
  ///
  /// NOTE(review): raw pointer — presumably non-owning; the options object must
  /// outlive the scan. Confirm against the scanner implementation.
  const FragmentScanOptions* format_scan_options;
};
|
||||
|
||||
/// \brief An iterator-like object that can yield batches created from a fragment
class ARROW_DS_EXPORT FragmentScanner {
 public:
  /// This instance will only be destroyed after all ongoing scan futures
  /// have been completed.
  ///
  /// This means any callbacks created as part of the scan can safely
  /// capture `this`
  virtual ~FragmentScanner() = default;
  /// \brief Scan a batch of data from the file
  /// \param batch_number The index of the batch to read
  ///   (expected to be in the range [0, NumBatches()))
  virtual Future<std::shared_ptr<RecordBatch>> ScanBatch(int batch_number) = 0;
  /// \brief Calculate an estimate of how many data bytes the given batch will represent
  ///
  /// "Data bytes" should be the total size of all the buffers once the data has been
  /// decoded into the Arrow format.
  virtual int64_t EstimatedDataBytes(int batch_number) = 0;
  /// \brief The number of batches in the fragment to scan
  virtual int NumBatches() = 0;
};
|
||||
|
||||
/// \brief Information learned about a fragment through inspection
///
/// This information can be used to figure out which fields need
/// to be read from a file and how the data read in should be evolved
/// to match the dataset schema.
///
/// For example, from a CSV file we can inspect and learn the column
/// names and use those column names to determine which columns to load
/// from the CSV file.
struct ARROW_DS_EXPORT InspectedFragment {
  explicit InspectedFragment(std::vector<std::string> column_names)
      : column_names(std::move(column_names)) {}
  /// \brief Names of the top-level columns discovered in the fragment
  std::vector<std::string> column_names;
};
|
||||
|
||||
/// \brief A granular piece of a Dataset, such as an individual file.
///
/// A Fragment can be read/scanned separately from other fragments. It yields a
/// collection of RecordBatches when scanned.
///
/// Note that Fragments have well defined physical schemas which are reconciled by
/// the Datasets which contain them; these physical schemas may differ from a parent
/// Dataset's schema and the physical schemas of sibling Fragments.
class ARROW_DS_EXPORT Fragment : public std::enable_shared_from_this<Fragment> {
 public:
  /// \brief An expression that represents no known partition information
  static const compute::Expression kNoPartitionInformation;

  /// \brief Return the physical schema of the Fragment.
  ///
  /// The physical schema is also called the writer schema.
  /// This method is blocking and may suffer from high-latency filesystems.
  /// The schema is cached after being read once, or may be specified at construction.
  Result<std::shared_ptr<Schema>> ReadPhysicalSchema();

  /// An asynchronous version of Scan
  virtual Result<RecordBatchGenerator> ScanBatchesAsync(
      const std::shared_ptr<ScanOptions>& options) = 0;

  /// \brief Inspect a fragment to learn basic information
  ///
  /// This will be called before a scan and a fragment should attach whatever
  /// information will be needed to figure out an evolution strategy. This information
  /// will then be passed to the call to BeginScan
  virtual Future<std::shared_ptr<InspectedFragment>> InspectFragment(
      const FragmentScanOptions* format_options, compute::ExecContext* exec_context);

  /// \brief Start a scan operation
  virtual Future<std::shared_ptr<FragmentScanner>> BeginScan(
      const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
      const FragmentScanOptions* format_options, compute::ExecContext* exec_context);

  /// \brief Count the number of rows in this fragment matching the filter using metadata
  /// only. That is, this method may perform I/O, but will not load data.
  ///
  /// If this is not possible, resolve with an empty optional. The fragment can perform
  /// I/O (e.g. to read metadata) before deciding whether it can satisfy the request.
  virtual Future<std::optional<int64_t>> CountRows(
      compute::Expression predicate, const std::shared_ptr<ScanOptions>& options);

  /// \brief Clear any metadata that may have been cached by this object.
  ///
  /// A fragment may typically cache metadata to speed up repeated accesses.
  /// In use cases when memory use is more critical than CPU time, calling
  /// this function can help reclaim memory.
  virtual Status ClearCachedMetadata();

  /// \brief The name identifying the kind of Fragment (e.g. "in-memory")
  virtual std::string type_name() const = 0;
  virtual std::string ToString() const { return type_name(); }

  /// \brief An expression which evaluates to true for all data viewed by this
  /// Fragment.
  const compute::Expression& partition_expression() const {
    return partition_expression_;
  }

  virtual ~Fragment() = default;

 protected:
  Fragment() = default;
  explicit Fragment(compute::Expression partition_expression,
                    std::shared_ptr<Schema> physical_schema);

  /// \brief Subclass hook used by ReadPhysicalSchema to obtain the schema
  virtual Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() = 0;

  // NOTE(review): presumably guards the lazy caching performed by
  // ReadPhysicalSchema — confirm against the implementation file.
  util::Mutex physical_schema_mutex_;
  compute::Expression partition_expression_ = compute::literal(true);
  // The physical schema that is inferred from the Fragment
  std::shared_ptr<Schema> physical_schema_;
  // The physical schema that was passed to the Fragment constructor
  std::shared_ptr<Schema> given_physical_schema_;
};
|
||||
|
||||
/// \brief Per-scan options for fragment(s) in a dataset.
///
/// These options are not intrinsic to the format or fragment itself, but do affect
/// the results of a scan. These are options which make sense to change between
/// repeated reads of the same dataset, such as format-specific conversion options
/// (that do not affect the schema).
///
/// \ingroup dataset-scanning
class ARROW_DS_EXPORT FragmentScanOptions {
 public:
  /// \brief The name identifying the format these options apply to
  virtual std::string type_name() const = 0;
  virtual std::string ToString() const { return type_name(); }
  virtual ~FragmentScanOptions() = default;
};
|
||||
|
||||
/// \defgroup dataset-implementations Concrete implementations
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief A trivial Fragment that yields ScanTask out of a fixed set of
/// RecordBatch.
class ARROW_DS_EXPORT InMemoryFragment : public Fragment {
 public:
  class Scanner;
  /// \brief Construct from an explicit schema and a fixed set of batches
  InMemoryFragment(std::shared_ptr<Schema> schema, RecordBatchVector record_batches,
                   compute::Expression = compute::literal(true));
  /// \brief Construct from a fixed set of batches
  explicit InMemoryFragment(RecordBatchVector record_batches,
                            compute::Expression = compute::literal(true));

  Result<RecordBatchGenerator> ScanBatchesAsync(
      const std::shared_ptr<ScanOptions>& options) override;
  Future<std::optional<int64_t>> CountRows(
      compute::Expression predicate,
      const std::shared_ptr<ScanOptions>& options) override;

  Future<std::shared_ptr<InspectedFragment>> InspectFragment(
      const FragmentScanOptions* format_options,
      compute::ExecContext* exec_context) override;
  Future<std::shared_ptr<FragmentScanner>> BeginScan(
      const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
      const FragmentScanOptions* format_options,
      compute::ExecContext* exec_context) override;

  std::string type_name() const override { return "in-memory"; }

 protected:
  Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override;

  // The fixed set of batches this fragment yields when scanned
  RecordBatchVector record_batches_;
};
|
||||
|
||||
/// @}
|
||||
|
||||
using FragmentGenerator = AsyncGenerator<std::shared_ptr<Fragment>>;
|
||||
|
||||
/// \brief Rules for converting the dataset schema to and from fragment schemas
class ARROW_DS_EXPORT FragmentEvolutionStrategy {
 public:
  /// This instance will only be destroyed when all scan operations for the
  /// fragment have completed.
  virtual ~FragmentEvolutionStrategy() = default;
  /// \brief A guarantee that applies to all batches of this fragment
  ///
  /// For example, if a fragment is missing one of the fields in the dataset
  /// schema then a typical evolution strategy is to set that field to null.
  ///
  /// So if the column at index 3 is missing then the guarantee is
  /// FieldRef(3) == null
  ///
  /// Individual field guarantees should be AND'd together and returned
  /// as a single expression.
  virtual Result<compute::Expression> GetGuarantee(
      const std::vector<FieldPath>& dataset_schema_selection) const = 0;

  /// \brief Return a fragment schema selection given a dataset schema selection
  ///
  /// For example, if the user wants fields 2 & 4 of the dataset schema and
  /// in this fragment the field 2 is missing and the field 4 is at index 1 then
  /// this should return {1}
  virtual Result<std::unique_ptr<FragmentSelection>> DevolveSelection(
      const std::vector<FieldPath>& dataset_schema_selection) const = 0;

  /// \brief Return a filter expression bound to the fragment schema given
  /// a filter expression bound to the dataset schema
  ///
  /// The dataset scan filter will first be simplified by the guarantee returned
  /// by GetGuarantee. This means an evolution that only handles dropping or casting
  /// fields doesn't need to do anything here except return the given filter.
  ///
  /// On the other hand, an evolution that is doing some kind of aliasing will likely
  /// need to convert field references in the filter to the aliased field references
  /// where appropriate.
  virtual Result<compute::Expression> DevolveFilter(
      const compute::Expression& filter) const = 0;

  /// \brief Convert a batch from the fragment schema to the dataset schema
  ///
  /// Typically this involves casting columns from the data type stored on disk
  /// to the data type of the dataset schema. For example, this fragment might
  /// have columns stored as int32 and the dataset schema might have int64 for
  /// the column. In this case we should cast the column from int32 to int64.
  ///
  /// Note: A fragment may perform this cast as the data is read from disk. In
  /// that case a cast might not be needed.
  virtual Result<compute::ExecBatch> EvolveBatch(
      const std::shared_ptr<RecordBatch>& batch,
      const std::vector<FieldPath>& dataset_selection,
      const FragmentSelection& selection) const = 0;

  /// \brief Return a string description of this strategy
  virtual std::string ToString() const = 0;
};
|
||||
|
||||
/// \brief Lookup to create a FragmentEvolutionStrategy for a given fragment
class ARROW_DS_EXPORT DatasetEvolutionStrategy {
 public:
  virtual ~DatasetEvolutionStrategy() = default;
  /// \brief Create a strategy for evolving from the given fragment
  /// to the schema of the given dataset
  virtual std::unique_ptr<FragmentEvolutionStrategy> GetStrategy(
      const Dataset& dataset, const Fragment& fragment,
      const InspectedFragment& inspected_fragment) = 0;

  /// \brief Return a string description of this strategy
  virtual std::string ToString() const = 0;
};
|
||||
|
||||
/// \brief Create the default ("basic") dataset evolution strategy
///
/// In the basic evolution strategy, columns that are missing from a fragment
/// are filled in with null when batches are evolved to the dataset schema.
ARROW_DS_EXPORT std::unique_ptr<DatasetEvolutionStrategy>
MakeBasicDatasetEvolutionStrategy();
|
||||
|
||||
/// \brief A container of zero or more Fragments.
///
/// A Dataset acts as a union of Fragments, e.g. files deeply nested in a
/// directory. A Dataset has a schema to which Fragments must align during a
/// scan operation. This is analogous to Avro's reader and writer schema.
class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this<Dataset> {
 public:
  /// \brief Begin to build a new Scan operation against this Dataset
  Result<std::shared_ptr<ScannerBuilder>> NewScan();

  /// \brief GetFragments returns an iterator of Fragments given a predicate.
  Result<FragmentIterator> GetFragments(compute::Expression predicate);
  Result<FragmentIterator> GetFragments();

  /// \brief Async versions of `GetFragments`.
  Result<FragmentGenerator> GetFragmentsAsync(compute::Expression predicate);
  Result<FragmentGenerator> GetFragmentsAsync();

  /// \brief The unified schema all fragments align to during a scan
  const std::shared_ptr<Schema>& schema() const { return schema_; }

  /// \brief An expression which evaluates to true for all data viewed by this Dataset.
  /// May be null, which indicates no information is available.
  const compute::Expression& partition_expression() const {
    return partition_expression_;
  }

  /// \brief The name identifying the kind of Dataset
  virtual std::string type_name() const = 0;

  /// \brief Return a copy of this Dataset with a different schema.
  ///
  /// The copy will view the same Fragments. If the new schema is not compatible with the
  /// original dataset's schema then an error will be raised.
  virtual Result<std::shared_ptr<Dataset>> ReplaceSchema(
      std::shared_ptr<Schema> schema) const = 0;

  /// \brief Rules used by this dataset to handle schema evolution
  DatasetEvolutionStrategy* evolution_strategy() { return evolution_strategy_.get(); }

  virtual ~Dataset() = default;

 protected:
  explicit Dataset(std::shared_ptr<Schema> schema) : schema_(std::move(schema)) {}

  Dataset(std::shared_ptr<Schema> schema, compute::Expression partition_expression);

  /// \brief Subclass hook backing the synchronous `GetFragments` methods
  virtual Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) = 0;
  /// \brief Default non-virtual implementation method for the base
  /// `GetFragmentsAsyncImpl` method, which creates a fragment generator for
  /// the dataset, possibly filtering results with a predicate (forwarding to
  /// the synchronous `GetFragmentsImpl` method and moving the computations
  /// to the background, using the IO thread pool).
  ///
  /// Currently, `executor` is always the same as `internal::GetCPUThreadPool()`,
  /// which means the results from the underlying fragment generator will be
  /// transferred to the default CPU thread pool. The generator itself is
  /// offloaded to run on the default IO thread pool.
  virtual Result<FragmentGenerator> GetFragmentsAsyncImpl(
      compute::Expression predicate, arrow::internal::Executor* executor);

  std::shared_ptr<Schema> schema_;
  compute::Expression partition_expression_ = compute::literal(true);
  std::unique_ptr<DatasetEvolutionStrategy> evolution_strategy_ =
      MakeBasicDatasetEvolutionStrategy();
};
|
||||
|
||||
/// \addtogroup dataset-implementations
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief A Source which yields fragments wrapping a stream of record batches.
///
/// The record batches must match the schema provided to the source at construction.
class ARROW_DS_EXPORT InMemoryDataset : public Dataset {
 public:
  /// \brief A factory of record batch iterators
  class RecordBatchGenerator {
   public:
    virtual ~RecordBatchGenerator() = default;
    virtual RecordBatchIterator Get() const = 0;
  };

  /// Construct a dataset from a schema and a factory of record batch iterators.
  InMemoryDataset(std::shared_ptr<Schema> schema,
                  std::shared_ptr<RecordBatchGenerator> get_batches)
      : Dataset(std::move(schema)), get_batches_(std::move(get_batches)) {}

  /// Convenience constructor taking a fixed list of batches
  InMemoryDataset(std::shared_ptr<Schema> schema, RecordBatchVector batches);

  /// Convenience constructor taking a Table
  explicit InMemoryDataset(std::shared_ptr<Table> table);

  std::string type_name() const override { return "in-memory"; }

  Result<std::shared_ptr<Dataset>> ReplaceSchema(
      std::shared_ptr<Schema> schema) const override;

 protected:
  Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override;

  std::shared_ptr<RecordBatchGenerator> get_batches_;
};
|
||||
|
||||
/// \brief A Dataset wrapping child Datasets.
class ARROW_DS_EXPORT UnionDataset : public Dataset {
 public:
  /// \brief Construct a UnionDataset wrapping child Datasets.
  ///
  /// \param[in] schema the schema of the resulting dataset.
  /// \param[in] children one or more child Datasets. Their schemas must be identical to
  /// schema.
  static Result<std::shared_ptr<UnionDataset>> Make(std::shared_ptr<Schema> schema,
                                                    DatasetVector children);

  /// \brief The child datasets this union is composed of
  const DatasetVector& children() const { return children_; }

  std::string type_name() const override { return "union"; }

  Result<std::shared_ptr<Dataset>> ReplaceSchema(
      std::shared_ptr<Schema> schema) const override;

 protected:
  Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override;

  // Constructor is protected; use Make (which can validate the children)
  explicit UnionDataset(std::shared_ptr<Schema> schema, DatasetVector children)
      : Dataset(std::move(schema)), children_(std::move(children)) {}

  DatasetVector children_;

  friend class UnionDatasetFactory;
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,103 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "arrow/dataset/file_base.h"
|
||||
#include "arrow/record_batch.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/async_util.h"
|
||||
#include "arrow/util/future.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace dataset {
|
||||
namespace internal {
|
||||
|
||||
// Default cap on the number of rows the dataset writer will queue before
// asking for backpressure.
// This lines up with our other defaults in the scanner and execution plan
constexpr uint64_t kDefaultDatasetWriterMaxRowsQueued = 8 * 1024 * 1024;
|
||||
|
||||
/// \brief Utility class that manages a set of writers to different paths
///
/// Writers may be closed and reopened (and a new file created) based on the dataset
/// write options (for example, max_rows_per_file or max_open_files)
///
/// The dataset writer enforces its own back pressure based on the # of rows (as opposed
/// to # of batches which is how it is typically enforced elsewhere) and # of files.
class ARROW_DS_EXPORT DatasetWriter {
 public:
  /// \brief Create a dataset writer
  ///
  /// Will fail if basename_template is invalid or if there is existing data and
  /// existing_data_behavior is kError
  ///
  /// \param write_options options to control how the data should be written
  /// \param max_rows_queued max # of rows allowed to be queued before the dataset_writer
  /// will ask for backpressure
  static Result<std::unique_ptr<DatasetWriter>> Make(
      FileSystemDatasetWriteOptions write_options, util::AsyncTaskScheduler* scheduler,
      std::function<void()> pause_callback, std::function<void()> resume_callback,
      std::function<void()> finish_callback,
      uint64_t max_rows_queued = kDefaultDatasetWriterMaxRowsQueued);

  ~DatasetWriter();

  /// \brief Write a batch to the dataset
  /// \param[in] batch The batch to write
  /// \param[in] directory The directory to write to
  ///
  /// Note: The written filename will be {directory}/{filename_factory(i)} where i is a
  /// counter controlled by `max_open_files` and `max_rows_per_file`
  ///
  /// If multiple WriteRecordBatch calls arrive with the same `directory` then the batches
  /// may be written to the same file.
  ///
  /// The returned future will be marked finished when the record batch has been queued
  /// to be written. If the returned future is unfinished then this indicates the dataset
  /// writer's queue is full and the data provider should pause.
  ///
  /// This method is NOT async reentrant. The returned future will only be unfinished
  /// if back pressure needs to be applied. Async reentrancy is not necessary for
  /// concurrent writes to happen. Calling this method again before the previous future
  /// completes will not just violate max_rows_queued but likely lead to race conditions.
  ///
  /// One thing to note is that the ordering of your data can affect your maximum
  /// potential parallelism. If this seems odd then consider a dataset where the first
  /// 1000 batches go to the same directory and then the 1001st batch goes to a different
  /// directory. The only way to get two parallel writes immediately would be to queue
  /// all 1000 pending writes to the first directory.
  ///
  /// NOTE(review): the doc above mentions a returned future but the declared return
  /// type is void — presumably backpressure is signaled via the pause/resume
  /// callbacks instead; confirm against the implementation.
  void WriteRecordBatch(std::shared_ptr<RecordBatch> batch, const std::string& directory,
                        const std::string& prefix = "");

  /// Finish all pending writes and close any open files
  void Finish();

 protected:
  /// \brief Protected constructor; use Make to create an instance
  DatasetWriter(FileSystemDatasetWriteOptions write_options,
                util::AsyncTaskScheduler* scheduler, std::function<void()> pause_callback,
                std::function<void()> resume_callback,
                std::function<void()> finish_callback,
                uint64_t max_rows_queued = kDefaultDatasetWriterMaxRowsQueued);

  // pimpl: keeps implementation details out of this header
  class DatasetWriterImpl;
  std::unique_ptr<DatasetWriterImpl> impl_;
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,275 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
/// Logic for automatically determining the structure of multi-file
|
||||
/// dataset with possible partitioning according to available
|
||||
/// partitioning
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <variant>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/dataset/partition.h"
|
||||
#include "arrow/dataset/type_fwd.h"
|
||||
#include "arrow/dataset/visibility.h"
|
||||
#include "arrow/filesystem/type_fwd.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace dataset {
|
||||
|
||||
/// \defgroup dataset-discovery Discovery API
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Options controlling schema inference during dataset discovery
struct InspectOptions {
  /// See `fragments` property.
  static constexpr int kInspectAllFragments = -1;

  /// Indicate how many fragments should be inspected to infer the unified dataset
  /// schema. Limiting the number of fragments accessed improves the latency of
  /// the discovery process when dealing with a high number of fragments and/or
  /// high latency file systems.
  ///
  /// The default value of `1` inspects the schema of the first (in no particular
  /// order) fragment only. If the dataset has a uniform schema for all fragments,
  /// this default is the optimal value. In order to inspect all fragments and
  /// robustly unify their potentially varying schemas, set this option to
  /// `kInspectAllFragments`. A value of `0` disables inspection of fragments
  /// altogether so only the partitioning schema will be inspected.
  int fragments = 1;

  /// Control how to unify types. By default, types are merged strictly (the
  /// type must match exactly, except nulls can be merged with other types).
  Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults();
};
|
||||
|
||||
/// \brief Options controlling DatasetFactory::Finish.
struct FinishOptions {
  /// Finalize the dataset with this given schema. If the schema is not
  /// provided, infer the schema via the Inspect, see the `inspect_options`
  /// property.
  std::shared_ptr<Schema> schema = NULLPTR;

  /// If the schema is not provided, it will be discovered by passing the
  /// following options to `DatasetDiscovery::Inspect`.
  InspectOptions inspect_options{};

  /// Indicate if the given Schema (when specified), should be validated against
  /// the fragments' schemas. `inspect_options` will control how many fragments
  /// are checked.
  bool validate_fragments = false;
};
|
||||
|
||||
/// \brief DatasetFactory provides a way to inspect/discover a Dataset's expected
/// schema before materializing said Dataset.
class ARROW_DS_EXPORT DatasetFactory {
 public:
  /// \brief Get the schemas of the Fragments and Partitioning.
  ///
  /// \param[in] options controls how many fragments are inspected
  /// \return one inspected schema per source
  virtual Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
      InspectOptions options) = 0;

  /// \brief Get unified schema for the resulting Dataset.
  Result<std::shared_ptr<Schema>> Inspect(InspectOptions options = {});

  /// \brief Create a Dataset
  Result<std::shared_ptr<Dataset>> Finish();
  /// \brief Create a Dataset with the given schema (see \a InspectOptions::schema)
  Result<std::shared_ptr<Dataset>> Finish(std::shared_ptr<Schema> schema);
  /// \brief Create a Dataset with the given options
  virtual Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) = 0;

  /// \brief Optional root partition for the resulting Dataset.
  const compute::Expression& root_partition() const { return root_partition_; }
  /// \brief Set the root partition for the resulting Dataset.
  ///
  /// Currently always returns Status::OK().
  Status SetRootPartition(compute::Expression partition) {
    root_partition_ = std::move(partition);
    return Status::OK();
  }

  virtual ~DatasetFactory() = default;

 protected:
  DatasetFactory();

  // Partition expression applied to the whole resulting Dataset;
  // see root_partition() / SetRootPartition().
  compute::Expression root_partition_;
};
|
||||
|
||||
/// @}
|
||||
|
||||
/// \brief A DatasetFactory composed of several child factories, allowing the
/// union of their Datasets to be inspected/discovered before materialization.
/// \ingroup dataset-implementations
class ARROW_DS_EXPORT UnionDatasetFactory : public DatasetFactory {
 public:
  /// \brief Construct a UnionDatasetFactory wrapping the given child factories.
  static Result<std::shared_ptr<DatasetFactory>> Make(
      std::vector<std::shared_ptr<DatasetFactory>> factories);

  /// \brief Return the list of child DatasetFactory
  const std::vector<std::shared_ptr<DatasetFactory>>& factories() const {
    return factories_;
  }

  /// \brief Get the schemas of the Datasets.
  ///
  /// Instead of applying options globally, it applies at each child factory.
  /// This will not respect `options.fragments` exactly, but will respect the
  /// spirit of peeking the first fragments or all of them.
  Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
      InspectOptions options) override;

  /// \brief Create a Dataset.
  Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override;

 protected:
  explicit UnionDatasetFactory(std::vector<std::shared_ptr<DatasetFactory>> factories);

  std::vector<std::shared_ptr<DatasetFactory>> factories_;
};
|
||||
|
||||
/// \brief Options governing file and partition discovery for a
/// FileSystemDatasetFactory.
/// \ingroup dataset-filesystem
struct FileSystemFactoryOptions {
  /// Either an explicit Partitioning or a PartitioningFactory to discover one.
  ///
  /// If a factory is provided, it will be used to infer a schema for partition fields
  /// based on file and directory paths then construct a Partitioning. The default
  /// is a Partitioning which will yield no partition information.
  ///
  /// The (explicit or discovered) partitioning will be applied to discovered files
  /// and the resulting partition information embedded in the Dataset.
  PartitioningOrFactory partitioning{Partitioning::Default()};

  /// For the purposes of applying the partitioning, paths will be stripped
  /// of the partition_base_dir. Files not matching the partition_base_dir
  /// prefix will be skipped for partition discovery. The ignored files will still
  /// be part of the Dataset, but will not have partition information.
  ///
  /// Example:
  /// partition_base_dir = "/dataset";
  ///
  /// - "/dataset/US/sales.csv" -> "US/sales.csv" will be given to the partitioning
  ///
  /// - "/home/john/late_sales.csv" -> Will be ignored for partition discovery.
  ///
  /// This is useful for partitioning which parses directory when ordering
  /// is important, e.g. DirectoryPartitioning.
  std::string partition_base_dir;

  /// Invalid files (via selector or explicitly) will be excluded by checking
  /// with the FileFormat::IsSupported method. This will incur IO for each file
  /// in a serial and single threaded fashion. Disabling this feature will skip the
  /// IO, but unsupported files may be present in the Dataset
  /// (resulting in an error at scan time).
  bool exclude_invalid_files = false;

  /// When discovering from a Selector (and not from an explicit file list), ignore
  /// files and directories matching any of these prefixes.
  ///
  /// Example (with selector = "/dataset/**"):
  /// selector_ignore_prefixes = {"_", ".DS_STORE" };
  ///
  /// - "/dataset/data.csv" -> not ignored
  /// - "/dataset/_metadata" -> ignored
  /// - "/dataset/.DS_STORE" -> ignored
  /// - "/dataset/_hidden/dat" -> ignored
  /// - "/dataset/nested/.DS_STORE" -> ignored
  std::vector<std::string> selector_ignore_prefixes = {
      ".",
      "_",
  };
};
|
||||
|
||||
/// \brief FileSystemDatasetFactory creates a Dataset from a vector of
/// fs::FileInfo or a fs::FileSelector.
/// \ingroup dataset-filesystem
class ARROW_DS_EXPORT FileSystemDatasetFactory : public DatasetFactory {
 public:
  /// \brief Build a FileSystemDatasetFactory from an explicit list of
  /// paths.
  ///
  /// \param[in] filesystem passed to FileSystemDataset
  /// \param[in] paths passed to FileSystemDataset
  /// \param[in] format passed to FileSystemDataset
  /// \param[in] options see FileSystemFactoryOptions for more information.
  static Result<std::shared_ptr<DatasetFactory>> Make(
      std::shared_ptr<fs::FileSystem> filesystem, const std::vector<std::string>& paths,
      std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);

  /// \brief Build a FileSystemDatasetFactory from a fs::FileSelector.
  ///
  /// The selector will expand to a vector of FileInfo. The expansion/crawling
  /// is performed in this function call. Thus, the finalized Dataset is
  /// working with a snapshot of the filesystem.
  ///
  /// If options.partition_base_dir is not provided, it will be overwritten
  /// with selector.base_dir.
  ///
  /// \param[in] filesystem passed to FileSystemDataset
  /// \param[in] selector used to crawl and search files
  /// \param[in] format passed to FileSystemDataset
  /// \param[in] options see FileSystemFactoryOptions for more information.
  static Result<std::shared_ptr<DatasetFactory>> Make(
      std::shared_ptr<fs::FileSystem> filesystem, fs::FileSelector selector,
      std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);

  /// \brief Build a FileSystemDatasetFactory from an uri including filesystem
  /// information.
  ///
  /// \param[in] uri passed to FileSystemDataset
  /// \param[in] format passed to FileSystemDataset
  /// \param[in] options see FileSystemFactoryOptions for more information.
  static Result<std::shared_ptr<DatasetFactory>> Make(std::string uri,
                                                      std::shared_ptr<FileFormat> format,
                                                      FileSystemFactoryOptions options);

  /// \brief Build a FileSystemDatasetFactory from an explicit list of
  /// file information.
  ///
  /// \param[in] filesystem passed to FileSystemDataset
  /// \param[in] files passed to FileSystemDataset
  /// \param[in] format passed to FileSystemDataset
  /// \param[in] options see FileSystemFactoryOptions for more information.
  static Result<std::shared_ptr<DatasetFactory>> Make(
      std::shared_ptr<fs::FileSystem> filesystem, const std::vector<fs::FileInfo>& files,
      std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);

  /// \brief Get the schemas of the discovered files and of the partitioning.
  Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
      InspectOptions options) override;

  /// \brief Materialize the discovered files into a FileSystemDataset.
  Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override;

 protected:
  FileSystemDatasetFactory(std::vector<fs::FileInfo> files,
                           std::shared_ptr<fs::FileSystem> filesystem,
                           std::shared_ptr<FileFormat> format,
                           FileSystemFactoryOptions options);

  /// \brief Resolve the schema contributed by the (explicit or discovered)
  /// partitioning.
  Result<std::shared_ptr<Schema>> PartitionSchema();

  std::vector<fs::FileInfo> files_;
  std::shared_ptr<fs::FileSystem> fs_;
  std::shared_ptr<FileFormat> format_;
  FileSystemFactoryOptions options_;
};
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,499 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/dataset/dataset.h"
|
||||
#include "arrow/dataset/partition.h"
|
||||
#include "arrow/dataset/scanner.h"
|
||||
#include "arrow/dataset/type_fwd.h"
|
||||
#include "arrow/dataset/visibility.h"
|
||||
#include "arrow/filesystem/filesystem.h"
|
||||
#include "arrow/io/file.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/compression.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace dataset {
|
||||
|
||||
/// \defgroup dataset-file-formats File formats for reading and writing datasets
|
||||
/// \defgroup dataset-filesystem File system datasets
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief The path and filesystem where an actual file is located or a buffer which can
|
||||
/// be read like a file
|
||||
class ARROW_DS_EXPORT FileSource : public util::EqualityComparable<FileSource> {
|
||||
public:
|
||||
FileSource(std::string path, std::shared_ptr<fs::FileSystem> filesystem,
|
||||
Compression::type compression = Compression::UNCOMPRESSED)
|
||||
: file_info_(std::move(path)),
|
||||
filesystem_(std::move(filesystem)),
|
||||
compression_(compression) {}
|
||||
|
||||
FileSource(fs::FileInfo info, std::shared_ptr<fs::FileSystem> filesystem,
|
||||
Compression::type compression = Compression::UNCOMPRESSED)
|
||||
: file_info_(std::move(info)),
|
||||
filesystem_(std::move(filesystem)),
|
||||
compression_(compression) {}
|
||||
|
||||
explicit FileSource(std::shared_ptr<Buffer> buffer,
|
||||
Compression::type compression = Compression::UNCOMPRESSED)
|
||||
: buffer_(std::move(buffer)), compression_(compression) {}
|
||||
|
||||
using CustomOpen = std::function<Result<std::shared_ptr<io::RandomAccessFile>>()>;
|
||||
FileSource(CustomOpen open, int64_t size)
|
||||
: custom_open_(std::move(open)), custom_size_(size) {}
|
||||
|
||||
using CustomOpenWithCompression =
|
||||
std::function<Result<std::shared_ptr<io::RandomAccessFile>>(Compression::type)>;
|
||||
FileSource(CustomOpenWithCompression open_with_compression, int64_t size,
|
||||
Compression::type compression = Compression::UNCOMPRESSED)
|
||||
: custom_open_(std::bind(std::move(open_with_compression), compression)),
|
||||
custom_size_(size),
|
||||
compression_(compression) {}
|
||||
|
||||
FileSource(std::shared_ptr<io::RandomAccessFile> file, int64_t size,
|
||||
Compression::type compression = Compression::UNCOMPRESSED)
|
||||
: custom_open_([=] { return ToResult(file); }),
|
||||
custom_size_(size),
|
||||
compression_(compression) {}
|
||||
|
||||
explicit FileSource(std::shared_ptr<io::RandomAccessFile> file,
|
||||
Compression::type compression = Compression::UNCOMPRESSED);
|
||||
|
||||
FileSource() : custom_open_(CustomOpen{&InvalidOpen}) {}
|
||||
|
||||
static std::vector<FileSource> FromPaths(const std::shared_ptr<fs::FileSystem>& fs,
|
||||
std::vector<std::string> paths) {
|
||||
std::vector<FileSource> sources;
|
||||
for (auto&& path : paths) {
|
||||
sources.emplace_back(std::move(path), fs);
|
||||
}
|
||||
return sources;
|
||||
}
|
||||
|
||||
/// \brief Return the type of raw compression on the file, if any.
|
||||
Compression::type compression() const { return compression_; }
|
||||
|
||||
/// \brief Return the file path, if any. Only valid when file source wraps a path.
|
||||
const std::string& path() const {
|
||||
static std::string buffer_path = "<Buffer>";
|
||||
static std::string custom_open_path = "<Buffer>";
|
||||
return filesystem_ ? file_info_.path() : buffer_ ? buffer_path : custom_open_path;
|
||||
}
|
||||
|
||||
/// \brief Return the filesystem, if any. Otherwise returns nullptr
|
||||
const std::shared_ptr<fs::FileSystem>& filesystem() const { return filesystem_; }
|
||||
|
||||
/// \brief Return the buffer containing the file, if any. Otherwise returns nullptr
|
||||
const std::shared_ptr<Buffer>& buffer() const { return buffer_; }
|
||||
|
||||
/// \brief Get a RandomAccessFile which views this file source
|
||||
Result<std::shared_ptr<io::RandomAccessFile>> Open() const;
|
||||
Future<std::shared_ptr<io::RandomAccessFile>> OpenAsync() const;
|
||||
|
||||
/// \brief Get the size (in bytes) of the file or buffer
|
||||
/// If the file is compressed this should be the compressed (on-disk) size.
|
||||
int64_t Size() const;
|
||||
|
||||
/// \brief Get an InputStream which views this file source (and decompresses if needed)
|
||||
/// \param[in] compression If nullopt, guess the compression scheme from the
|
||||
/// filename, else decompress with the given codec
|
||||
Result<std::shared_ptr<io::InputStream>> OpenCompressed(
|
||||
std::optional<Compression::type> compression = std::nullopt) const;
|
||||
|
||||
/// \brief equality comparison with another FileSource
|
||||
bool Equals(const FileSource& other) const;
|
||||
|
||||
private:
|
||||
static Result<std::shared_ptr<io::RandomAccessFile>> InvalidOpen() {
|
||||
return Status::Invalid("Called Open() on an uninitialized FileSource");
|
||||
}
|
||||
|
||||
fs::FileInfo file_info_;
|
||||
std::shared_ptr<fs::FileSystem> filesystem_;
|
||||
std::shared_ptr<Buffer> buffer_;
|
||||
CustomOpen custom_open_;
|
||||
int64_t custom_size_ = 0;
|
||||
Compression::type compression_ = Compression::UNCOMPRESSED;
|
||||
};
|
||||
|
||||
/// \brief Base class for file format implementation
class ARROW_DS_EXPORT FileFormat : public std::enable_shared_from_this<FileFormat> {
 public:
  /// Options affecting how this format is scanned.
  ///
  /// The options here can be overridden at scan time.
  std::shared_ptr<FragmentScanOptions> default_fragment_scan_options;

  virtual ~FileFormat() = default;

  /// \brief The name identifying the kind of file format
  virtual std::string type_name() const = 0;

  /// \brief Return true if this format is equal to `other`.
  virtual bool Equals(const FileFormat& other) const = 0;

  /// \brief Indicate if the FileSource is supported/readable by this format.
  virtual Result<bool> IsSupported(const FileSource& source) const = 0;

  /// \brief Return the schema of the file if possible.
  virtual Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const = 0;

  /// \brief Learn what we need about the file before we start scanning it
  virtual Future<std::shared_ptr<InspectedFragment>> InspectFragment(
      const FileSource& source, const FragmentScanOptions* format_options,
      compute::ExecContext* exec_context) const;

  /// \brief Open a generator of record batches scanned from `file`
  /// according to `options`.
  virtual Result<RecordBatchGenerator> ScanBatchesAsync(
      const std::shared_ptr<ScanOptions>& options,
      const std::shared_ptr<FileFragment>& file) const = 0;

  /// \brief Count the rows in `file` matching `predicate`.
  ///
  /// NOTE(review): presumably resolves to nullopt when the count cannot be
  /// computed without scanning -- confirm against concrete implementations.
  virtual Future<std::optional<int64_t>> CountRows(
      const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
      const std::shared_ptr<ScanOptions>& options);

  /// \brief Begin scanning a fragment previously inspected via InspectFragment.
  virtual Future<std::shared_ptr<FragmentScanner>> BeginScan(
      const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
      const FragmentScanOptions* format_options,
      compute::ExecContext* exec_context) const;

  /// \brief Open a fragment
  virtual Result<std::shared_ptr<FileFragment>> MakeFragment(
      FileSource source, compute::Expression partition_expression,
      std::shared_ptr<Schema> physical_schema);

  /// \brief Create a FileFragment for a FileSource.
  Result<std::shared_ptr<FileFragment>> MakeFragment(
      FileSource source, compute::Expression partition_expression);

  /// \brief Create a FileFragment for a FileSource.
  Result<std::shared_ptr<FileFragment>> MakeFragment(
      FileSource source, std::shared_ptr<Schema> physical_schema = NULLPTR);

  /// \brief Create a writer for this format.
  virtual Result<std::shared_ptr<FileWriter>> MakeWriter(
      std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
      std::shared_ptr<FileWriteOptions> options,
      fs::FileLocator destination_locator) const = 0;

  /// \brief Get default write options for this format.
  ///
  /// May return null shared_ptr if this file format does not yet support
  /// writing datasets.
  virtual std::shared_ptr<FileWriteOptions> DefaultWriteOptions() = 0;

 protected:
  explicit FileFormat(std::shared_ptr<FragmentScanOptions> default_fragment_scan_options)
      : default_fragment_scan_options(std::move(default_fragment_scan_options)) {}
};
|
||||
|
||||
/// \brief A Fragment that is stored in a file with a known format
|
||||
class ARROW_DS_EXPORT FileFragment : public Fragment,
|
||||
public util::EqualityComparable<FileFragment> {
|
||||
public:
|
||||
Result<RecordBatchGenerator> ScanBatchesAsync(
|
||||
const std::shared_ptr<ScanOptions>& options) override;
|
||||
Future<std::optional<int64_t>> CountRows(
|
||||
compute::Expression predicate,
|
||||
const std::shared_ptr<ScanOptions>& options) override;
|
||||
Future<std::shared_ptr<FragmentScanner>> BeginScan(
|
||||
const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
|
||||
const FragmentScanOptions* format_options,
|
||||
compute::ExecContext* exec_context) override;
|
||||
Future<std::shared_ptr<InspectedFragment>> InspectFragment(
|
||||
const FragmentScanOptions* format_options,
|
||||
compute::ExecContext* exec_context) override;
|
||||
|
||||
std::string type_name() const override { return format_->type_name(); }
|
||||
std::string ToString() const override { return source_.path(); };
|
||||
|
||||
const FileSource& source() const { return source_; }
|
||||
const std::shared_ptr<FileFormat>& format() const { return format_; }
|
||||
|
||||
bool Equals(const FileFragment& other) const;
|
||||
|
||||
protected:
|
||||
FileFragment(FileSource source, std::shared_ptr<FileFormat> format,
|
||||
compute::Expression partition_expression,
|
||||
std::shared_ptr<Schema> physical_schema)
|
||||
: Fragment(std::move(partition_expression), std::move(physical_schema)),
|
||||
source_(std::move(source)),
|
||||
format_(std::move(format)) {}
|
||||
|
||||
Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override;
|
||||
|
||||
FileSource source_;
|
||||
std::shared_ptr<FileFormat> format_;
|
||||
|
||||
friend class FileFormat;
|
||||
};
|
||||
|
||||
/// \brief A Dataset of FileFragments.
///
/// A FileSystemDataset is composed of one or more FileFragment. The fragments
/// are independent and don't need to share the same format and/or filesystem.
class ARROW_DS_EXPORT FileSystemDataset : public Dataset {
 public:
  /// \brief Create a FileSystemDataset.
  ///
  /// \param[in] schema the schema of the dataset
  /// \param[in] root_partition the partition expression of the dataset
  /// \param[in] format the format of each FileFragment.
  /// \param[in] filesystem the filesystem of each FileFragment, or nullptr if the
  ///            fragments wrap buffers.
  /// \param[in] fragments list of fragments to create the dataset from.
  /// \param[in] partitioning the Partitioning object in case the dataset is created
  ///            with a known partitioning (e.g. from a discovered partitioning
  ///            through a DatasetFactory), or nullptr if not known.
  ///
  /// Note that fragments wrapping files resident in differing filesystems are not
  /// permitted; to work with multiple filesystems use a UnionDataset.
  ///
  /// \return A constructed dataset.
  static Result<std::shared_ptr<FileSystemDataset>> Make(
      std::shared_ptr<Schema> schema, compute::Expression root_partition,
      std::shared_ptr<FileFormat> format, std::shared_ptr<fs::FileSystem> filesystem,
      std::vector<std::shared_ptr<FileFragment>> fragments,
      std::shared_ptr<Partitioning> partitioning = NULLPTR);

  /// \brief Write a dataset.
  static Status Write(const FileSystemDatasetWriteOptions& write_options,
                      std::shared_ptr<Scanner> scanner);

  /// \brief Return the type name of the dataset.
  std::string type_name() const override { return "filesystem"; }

  /// \brief Replace the schema of the dataset.
  Result<std::shared_ptr<Dataset>> ReplaceSchema(
      std::shared_ptr<Schema> schema) const override;

  /// \brief Return the path of files.
  std::vector<std::string> files() const;

  /// \brief Return the format.
  const std::shared_ptr<FileFormat>& format() const { return format_; }

  /// \brief Return the filesystem. May be nullptr if the fragments wrap buffers.
  const std::shared_ptr<fs::FileSystem>& filesystem() const { return filesystem_; }

  /// \brief Return the partitioning. May be nullptr if the dataset was not constructed
  /// with a partitioning.
  const std::shared_ptr<Partitioning>& partitioning() const { return partitioning_; }

  /// \brief Human-readable representation of the dataset.
  std::string ToString() const;

 protected:
  // Opaque index structure used by SetupSubtreePruning/GetFragmentsImpl.
  struct FragmentSubtrees;

  explicit FileSystemDataset(std::shared_ptr<Schema> schema)
      : Dataset(std::move(schema)) {}

  FileSystemDataset(std::shared_ptr<Schema> schema,
                    compute::Expression partition_expression)
      : Dataset(std::move(schema), partition_expression) {}

  Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override;

  /// \brief Build the subtree index used to prune fragments by predicate.
  void SetupSubtreePruning();

  std::shared_ptr<FileFormat> format_;
  std::shared_ptr<fs::FileSystem> filesystem_;
  std::vector<std::shared_ptr<FileFragment>> fragments_;
  std::shared_ptr<Partitioning> partitioning_;

  std::shared_ptr<FragmentSubtrees> subtrees_;
};
|
||||
|
||||
/// \brief Options for writing a file of this format.
class ARROW_DS_EXPORT FileWriteOptions {
 public:
  virtual ~FileWriteOptions() = default;

  /// \brief The format these options apply to.
  const std::shared_ptr<FileFormat>& format() const { return format_; }

  /// \brief The format's type name (delegates to the FileFormat).
  std::string type_name() const { return format_->type_name(); }

 protected:
  explicit FileWriteOptions(std::shared_ptr<FileFormat> format)
      : format_(std::move(format)) {}

  std::shared_ptr<FileFormat> format_;
};
|
||||
|
||||
/// \brief A writer for this format.
class ARROW_DS_EXPORT FileWriter {
 public:
  virtual ~FileWriter() = default;

  /// \brief Write the given batch.
  virtual Status Write(const std::shared_ptr<RecordBatch>& batch) = 0;

  /// \brief Write all batches from the reader.
  Status Write(RecordBatchReader* batches);

  /// \brief Indicate that writing is done.
  virtual Future<> Finish();

  /// \brief The format being written (delegates to the write options).
  const std::shared_ptr<FileFormat>& format() const { return options_->format(); }
  /// \brief Schema of the batches being written.
  const std::shared_ptr<Schema>& schema() const { return schema_; }
  /// \brief Options this writer was created with.
  const std::shared_ptr<FileWriteOptions>& options() const { return options_; }
  /// \brief Locator of the destination file.
  const fs::FileLocator& destination() const { return destination_locator_; }

  /// \brief After Finish() is called, provides number of bytes written to file.
  Result<int64_t> GetBytesWritten() const;

 protected:
  /// \param[in] schema schema of the batches to be written
  /// \param[in] options format-specific write options
  /// \param[in] destination output stream receiving the bytes
  /// \param[in] destination_locator locator describing the destination file
  FileWriter(std::shared_ptr<Schema> schema, std::shared_ptr<FileWriteOptions> options,
             std::shared_ptr<io::OutputStream> destination,
             fs::FileLocator destination_locator)
      : schema_(std::move(schema)),
        options_(std::move(options)),
        destination_(std::move(destination)),
        destination_locator_(std::move(destination_locator)) {}

  /// \brief Format-specific finalization, invoked by Finish().
  virtual Future<> FinishInternal() = 0;

  std::shared_ptr<Schema> schema_;
  std::shared_ptr<FileWriteOptions> options_;
  std::shared_ptr<io::OutputStream> destination_;
  fs::FileLocator destination_locator_;
  // Unset until Finish() completes; see GetBytesWritten().
  std::optional<int64_t> bytes_written_;
};
|
||||
|
||||
/// \brief Options for writing a dataset.
struct ARROW_DS_EXPORT FileSystemDatasetWriteOptions {
  /// Options for individual fragment writing.
  std::shared_ptr<FileWriteOptions> file_write_options;

  /// FileSystem into which a dataset will be written.
  std::shared_ptr<fs::FileSystem> filesystem;

  /// Root directory into which the dataset will be written.
  std::string base_dir;

  /// Partitioning used to generate fragment paths.
  std::shared_ptr<Partitioning> partitioning;

  /// If true the order of rows in the dataset is preserved when writing with
  /// multiple threads. This may cause notable performance degradation.
  bool preserve_order = false;

  /// Maximum number of partitions any batch may be written into, default is 1K.
  int max_partitions = 1024;

  /// Template string used to generate fragment basenames.
  /// {i} will be replaced by an auto incremented integer.
  std::string basename_template;

  /// A functor which will be applied on an incremented counter. The result will be
  /// inserted into the basename_template in place of {i}.
  ///
  /// This can be used, for example, to left-pad the file counter.
  std::function<std::string(int)> basename_template_functor;

  /// If greater than 0 then this will limit the maximum number of files that can be left
  /// open. If an attempt is made to open too many files then the least recently used file
  /// will be closed. If this setting is set too low you may end up fragmenting your data
  /// into many small files.
  ///
  /// The default is 900 which also allows some # of files to be open by the scanner
  /// before hitting the default Linux limit of 1024
  uint32_t max_open_files = 900;

  /// If greater than 0 then this will limit how many rows are placed in any single file.
  /// Otherwise there will be no limit and one file will be created in each output
  /// directory unless files need to be closed to respect max_open_files
  uint64_t max_rows_per_file = 0;

  /// If greater than 0 then this will cause the dataset writer to batch incoming data
  /// and only write the row groups to the disk when sufficient rows have accumulated.
  /// The final row group size may be less than this value and other options such as
  /// `max_open_files` or `max_rows_per_file` lead to smaller row group sizes.
  uint64_t min_rows_per_group = 0;

  /// If greater than 0 then the dataset writer may split up large incoming batches into
  /// multiple row groups. If this value is set then min_rows_per_group should also be
  /// set or else you may end up with very small row groups (e.g. if the incoming row
  /// group size is just barely larger than this value).
  uint64_t max_rows_per_group = 1 << 20;

  /// Controls what happens if an output directory already exists.
  ExistingDataBehavior existing_data_behavior = ExistingDataBehavior::kError;

  /// \brief If false the dataset writer will not create directories
  /// This is mainly intended for filesystems that do not require directories such as S3.
  bool create_dir = true;

  /// Callback to be invoked against all FileWriters before
  /// they are finalized with FileWriter::Finish().
  ///
  /// Defaults to a no-op that returns Status::OK().
  std::function<Status(FileWriter*)> writer_pre_finish = [](FileWriter*) {
    return Status::OK();
  };

  /// Callback to be invoked against all FileWriters after they have
  /// called FileWriter::Finish().
  ///
  /// Defaults to a no-op that returns Status::OK().
  std::function<Status(FileWriter*)> writer_post_finish = [](FileWriter*) {
    return Status::OK();
  };

  /// \brief The format being written (delegates to file_write_options).
  const std::shared_ptr<FileFormat>& format() const {
    return file_write_options->format();
  }
};
|
||||
|
||||
/// \brief Wraps FileSystemDatasetWriteOptions for consumption as compute::ExecNodeOptions
class ARROW_DS_EXPORT WriteNodeOptions : public acero::ExecNodeOptions {
 public:
  /// \param[in] options how the dataset should be written
  /// \param[in] custom_metadata optional metadata attached to written batches
  explicit WriteNodeOptions(
      FileSystemDatasetWriteOptions options,
      std::shared_ptr<const KeyValueMetadata> custom_metadata = NULLPTR)
      : write_options(std::move(options)), custom_metadata(std::move(custom_metadata)) {}

  /// \brief Options to control how to write the dataset
  FileSystemDatasetWriteOptions write_options;
  /// \brief Optional schema to attach to all written batches
  ///
  /// By default, we will use the output schema of the input.
  ///
  /// This can be used to alter schema metadata, field nullability, or field metadata.
  /// However, this cannot be used to change the type of data. If the custom schema does
  /// not have the same number of fields and the same data types as the input then the
  /// plan will fail.
  std::shared_ptr<Schema> custom_schema;
  /// \brief Optional metadata to attach to written batches
  std::shared_ptr<const KeyValueMetadata> custom_metadata;
};
|
||||
|
||||
/// @}
|
||||
|
||||
namespace internal {
// Registers the dataset writer exec node with `registry` so dataset writes can
// participate in an Acero execution plan.
// NOTE(review): the registered factory name is not visible in this header —
// confirm against the corresponding .cc file.
ARROW_DS_EXPORT void InitializeDatasetWriter(arrow::acero::ExecFactoryRegistry* registry);
}  // namespace internal
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
// ===== File boundary (extraction residue "@@ -0,0 +1,144 @@"): start of the CSV format header =====
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "arrow/csv/options.h"
|
||||
#include "arrow/dataset/dataset.h"
|
||||
#include "arrow/dataset/file_base.h"
|
||||
#include "arrow/dataset/type_fwd.h"
|
||||
#include "arrow/dataset/visibility.h"
|
||||
#include "arrow/ipc/type_fwd.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/compression.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace dataset {
|
||||
|
||||
constexpr char kCsvTypeName[] = "csv";

/// \addtogroup dataset-file-formats
///
/// @{

/// \brief A FileFormat implementation that reads from and writes to Csv files
class ARROW_DS_EXPORT CsvFileFormat : public FileFormat {
 public:
  // TODO(ARROW-18328) Remove this, moved to CsvFragmentScanOptions
  /// Options affecting the parsing of CSV files
  csv::ParseOptions parse_options = csv::ParseOptions::Defaults();

  CsvFileFormat();

  /// \brief The name identifying this format ("csv")
  std::string type_name() const override { return kCsvTypeName; }

  bool Equals(const FileFormat& other) const override;

  /// \brief Check whether `source` appears to be a readable CSV file
  Result<bool> IsSupported(const FileSource& source) const override;

  /// \brief Return the schema of the file if possible.
  Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;

  Future<std::shared_ptr<FragmentScanner>> BeginScan(
      const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
      const FragmentScanOptions* format_options,
      compute::ExecContext* exec_context) const override;

  Result<RecordBatchGenerator> ScanBatchesAsync(
      const std::shared_ptr<ScanOptions>& scan_options,
      const std::shared_ptr<FileFragment>& file) const override;

  Future<std::shared_ptr<InspectedFragment>> InspectFragment(
      const FileSource& source, const FragmentScanOptions* format_options,
      compute::ExecContext* exec_context) const override;

  Future<std::optional<int64_t>> CountRows(
      const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
      const std::shared_ptr<ScanOptions>& options) override;

  /// \brief Create a writer emitting CSV to `destination`
  Result<std::shared_ptr<FileWriter>> MakeWriter(
      std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
      std::shared_ptr<FileWriteOptions> options,
      fs::FileLocator destination_locator) const override;

  std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
};
|
||||
|
||||
/// \brief Per-scan options for CSV fragments
struct ARROW_DS_EXPORT CsvFragmentScanOptions : public FragmentScanOptions {
  std::string type_name() const override { return kCsvTypeName; }

  /// Signature for a function that wraps each fragment's raw input stream
  /// (see `stream_transform_func` below).
  using StreamWrapFunc = std::function<Result<std::shared_ptr<io::InputStream>>(
      std::shared_ptr<io::InputStream>)>;

  /// CSV conversion options
  csv::ConvertOptions convert_options = csv::ConvertOptions::Defaults();

  /// CSV reading options
  ///
  /// Note that use_threads is always ignored.
  csv::ReadOptions read_options = csv::ReadOptions::Defaults();

  /// CSV parse options
  csv::ParseOptions parse_options = csv::ParseOptions::Defaults();

  /// Optional stream wrapping function
  ///
  /// If defined, all open dataset file fragments will be passed
  /// through this function. One possible use case is to transparently
  /// transcode all input files from a given character set to utf8.
  StreamWrapFunc stream_transform_func{};
};
|
||||
|
||||
/// \brief Write options for CSV files
class ARROW_DS_EXPORT CsvFileWriteOptions : public FileWriteOptions {
 public:
  /// Options passed to csv::MakeCSVWriter.
  std::shared_ptr<csv::WriteOptions> write_options;

 protected:
  // The constructor is protected: only CsvFileFormat (a friend) creates
  // instances, via DefaultWriteOptions().
  explicit CsvFileWriteOptions(std::shared_ptr<FileFormat> format)
      : FileWriteOptions(std::move(format)) {}

  friend class CsvFileFormat;
};
|
||||
|
||||
/// \brief A FileWriter that emits record batches as CSV
class ARROW_DS_EXPORT CsvFileWriter : public FileWriter {
 public:
  /// \brief Write a record batch to the destination stream
  Status Write(const std::shared_ptr<RecordBatch>& batch) override;

 private:
  // The constructor is private: only CsvFileFormat (a friend) creates
  // instances, via MakeWriter().
  CsvFileWriter(std::shared_ptr<io::OutputStream> destination,
                std::shared_ptr<ipc::RecordBatchWriter> writer,
                std::shared_ptr<Schema> schema,
                std::shared_ptr<CsvFileWriteOptions> options,
                fs::FileLocator destination_locator);

  // Closes the writer; completes when all pending writes are flushed.
  Future<> FinishInternal() override;

  std::shared_ptr<io::OutputStream> destination_;
  std::shared_ptr<ipc::RecordBatchWriter> batch_writer_;

  friend class CsvFileFormat;
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
// ===== File boundary (extraction residue "@@ -0,0 +1,123 @@"): start of the IPC format header =====
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "arrow/dataset/file_base.h"
|
||||
#include "arrow/dataset/type_fwd.h"
|
||||
#include "arrow/dataset/visibility.h"
|
||||
#include "arrow/io/type_fwd.h"
|
||||
#include "arrow/ipc/type_fwd.h"
|
||||
#include "arrow/result.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace dataset {
|
||||
|
||||
/// \addtogroup dataset-file-formats
|
||||
///
|
||||
/// @{
|
||||
|
||||
constexpr char kIpcTypeName[] = "ipc";

/// \brief A FileFormat implementation that reads from and writes to Ipc files
class ARROW_DS_EXPORT IpcFileFormat : public FileFormat {
 public:
  /// \brief The name identifying this format ("ipc")
  std::string type_name() const override { return kIpcTypeName; }

  IpcFileFormat();

  /// \brief Two formats are equal if they are both IPC; this format carries
  /// no distinguishing options of its own.
  bool Equals(const FileFormat& other) const override {
    return type_name() == other.type_name();
  }

  /// \brief Check whether `source` appears to be a readable IPC file
  Result<bool> IsSupported(const FileSource& source) const override;

  /// \brief Return the schema of the file if possible.
  Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;

  Result<RecordBatchGenerator> ScanBatchesAsync(
      const std::shared_ptr<ScanOptions>& options,
      const std::shared_ptr<FileFragment>& file) const override;

  Future<std::optional<int64_t>> CountRows(
      const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
      const std::shared_ptr<ScanOptions>& options) override;

  /// \brief Create a writer emitting IPC to `destination`
  Result<std::shared_ptr<FileWriter>> MakeWriter(
      std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
      std::shared_ptr<FileWriteOptions> options,
      fs::FileLocator destination_locator) const override;

  std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
};
|
||||
|
||||
/// \brief Per-scan options for IPC fragments
class ARROW_DS_EXPORT IpcFragmentScanOptions : public FragmentScanOptions {
 public:
  std::string type_name() const override { return kIpcTypeName; }

  /// Options passed to the IPC file reader.
  /// included_fields, memory_pool, and use_threads are ignored.
  std::shared_ptr<ipc::IpcReadOptions> options;
  /// If present, the async scanner will enable I/O coalescing.
  /// This is ignored by the sync scanner.
  std::shared_ptr<io::CacheOptions> cache_options;
};
|
||||
|
||||
/// \brief Write options for IPC files
class ARROW_DS_EXPORT IpcFileWriteOptions : public FileWriteOptions {
 public:
  /// Options passed to ipc::MakeFileWriter. use_threads is ignored
  std::shared_ptr<ipc::IpcWriteOptions> options;

  /// custom_metadata written to the file's footer
  std::shared_ptr<const KeyValueMetadata> metadata;

 protected:
  // The constructor is protected: only IpcFileFormat (a friend) creates
  // instances, via DefaultWriteOptions().
  explicit IpcFileWriteOptions(std::shared_ptr<FileFormat> format)
      : FileWriteOptions(std::move(format)) {}

  friend class IpcFileFormat;
};
|
||||
|
||||
/// \brief A FileWriter that emits record batches in the IPC file format
class ARROW_DS_EXPORT IpcFileWriter : public FileWriter {
 public:
  /// \brief Write a record batch to the destination stream
  Status Write(const std::shared_ptr<RecordBatch>& batch) override;

 private:
  // The constructor is private: only IpcFileFormat (a friend) creates
  // instances, via MakeWriter().
  IpcFileWriter(std::shared_ptr<io::OutputStream> destination,
                std::shared_ptr<ipc::RecordBatchWriter> writer,
                std::shared_ptr<Schema> schema,
                std::shared_ptr<IpcFileWriteOptions> options,
                fs::FileLocator destination_locator);

  // Closes the writer; completes when all pending writes are flushed.
  Future<> FinishInternal() override;

  std::shared_ptr<io::OutputStream> destination_;
  std::shared_ptr<ipc::RecordBatchWriter> batch_writer_;

  friend class IpcFileFormat;
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
// ===== File boundary (extraction residue "@@ -0,0 +1,98 @@"): start of the JSON format header =====
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
#include "arrow/dataset/dataset.h"
|
||||
#include "arrow/dataset/file_base.h"
|
||||
#include "arrow/dataset/type_fwd.h"
|
||||
#include "arrow/dataset/visibility.h"
|
||||
#include "arrow/ipc/type_fwd.h"
|
||||
#include "arrow/json/options.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/future.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow::dataset {
|
||||
|
||||
/// \addtogroup dataset-file-formats
|
||||
///
|
||||
/// @{
|
||||
|
||||
constexpr char kJsonTypeName[] = "json";

/// \brief A FileFormat implementation that reads from JSON files
class ARROW_DS_EXPORT JsonFileFormat : public FileFormat {
 public:
  JsonFileFormat();

  /// \brief The name identifying this format ("json")
  std::string type_name() const override { return kJsonTypeName; }

  bool Equals(const FileFormat& other) const override;

  /// \brief Check whether `source` appears to be a readable JSON file
  Result<bool> IsSupported(const FileSource& source) const override;

  /// \brief Return the schema of the file if possible.
  Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;

  Future<std::shared_ptr<InspectedFragment>> InspectFragment(
      const FileSource& source, const FragmentScanOptions* format_options,
      compute::ExecContext* exec_context) const override;

  Future<std::shared_ptr<FragmentScanner>> BeginScan(
      const FragmentScanRequest& scan_request, const InspectedFragment& inspected,
      const FragmentScanOptions* format_options,
      compute::ExecContext* exec_context) const override;

  Result<RecordBatchGenerator> ScanBatchesAsync(
      const std::shared_ptr<ScanOptions>& scan_options,
      const std::shared_ptr<FileFragment>& file) const override;

  Future<std::optional<int64_t>> CountRows(
      const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
      const std::shared_ptr<ScanOptions>& scan_options) override;

  /// \brief Writing JSON is not supported; always returns NotImplemented
  Result<std::shared_ptr<FileWriter>> MakeWriter(
      std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
      std::shared_ptr<FileWriteOptions> options,
      fs::FileLocator destination_locator) const override {
    return Status::NotImplemented("Writing JSON files is not currently supported");
  }

  /// \brief No write options exist for this (read-only) format; returns null
  std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override { return NULLPTR; }
};
|
||||
|
||||
/// \brief Per-scan options for JSON fragments
struct ARROW_DS_EXPORT JsonFragmentScanOptions : public FragmentScanOptions {
  std::string type_name() const override { return kJsonTypeName; }

  /// @brief Options that affect JSON parsing
  ///
  /// Note: `explicit_schema` and `unexpected_field_behavior` are ignored.
  json::ParseOptions parse_options = json::ParseOptions::Defaults();

  /// @brief Options that affect JSON reading
  json::ReadOptions read_options = json::ReadOptions::Defaults();
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow::dataset
|
||||
// ===== File boundary (extraction residue "@@ -0,0 +1,75 @@"): start of the ORC format header =====
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "arrow/dataset/file_base.h"
|
||||
#include "arrow/dataset/type_fwd.h"
|
||||
#include "arrow/dataset/visibility.h"
|
||||
#include "arrow/io/type_fwd.h"
|
||||
#include "arrow/result.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace dataset {
|
||||
|
||||
/// \addtogroup dataset-file-formats
|
||||
///
|
||||
/// @{
|
||||
|
||||
constexpr char kOrcTypeName[] = "orc";

/// \brief A FileFormat implementation that reads from and writes to ORC files
class ARROW_DS_EXPORT OrcFileFormat : public FileFormat {
 public:
  OrcFileFormat();

  /// \brief The name identifying this format ("orc")
  std::string type_name() const override { return kOrcTypeName; }

  /// \brief Two formats are equal if they are both ORC; this format carries
  /// no distinguishing options of its own.
  bool Equals(const FileFormat& other) const override {
    return type_name() == other.type_name();
  }

  /// \brief Check whether `source` appears to be a readable ORC file
  Result<bool> IsSupported(const FileSource& source) const override;

  /// \brief Return the schema of the file if possible.
  Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;

  Result<RecordBatchGenerator> ScanBatchesAsync(
      const std::shared_ptr<ScanOptions>& options,
      const std::shared_ptr<FileFragment>& file) const override;

  Future<std::optional<int64_t>> CountRows(
      const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
      const std::shared_ptr<ScanOptions>& options) override;

  /// \brief Create a writer emitting ORC to `destination`
  Result<std::shared_ptr<FileWriter>> MakeWriter(
      std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
      std::shared_ptr<FileWriteOptions> options,
      fs::FileLocator destination_locator) const override;

  std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
// ===== File boundary (extraction residue "@@ -0,0 +1,410 @@"): start of the Parquet format header =====
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <unordered_set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/dataset/discovery.h"
|
||||
#include "arrow/dataset/file_base.h"
|
||||
#include "arrow/dataset/type_fwd.h"
|
||||
#include "arrow/dataset/visibility.h"
|
||||
#include "arrow/io/caching.h"
|
||||
|
||||
// Forward declarations for parquet types used below, so this header does not
// need to include the corresponding parquet headers.
namespace parquet {
class ParquetFileReader;
class Statistics;
class ColumnChunkMetaData;
class RowGroupMetaData;
class FileMetaData;
class FileDecryptionProperties;
class FileEncryptionProperties;

class ReaderProperties;
class ArrowReaderProperties;

class WriterProperties;
class ArrowWriterProperties;

namespace arrow {
class FileReader;
class FileWriter;
struct SchemaManifest;
}  // namespace arrow
}  // namespace parquet
|
||||
|
||||
namespace arrow {
|
||||
namespace dataset {
|
||||
|
||||
// Forward declarations; the full definitions live elsewhere.
struct ParquetDecryptionConfig;
struct ParquetEncryptionConfig;

/// \addtogroup dataset-file-formats
///
/// @{

constexpr char kParquetTypeName[] = "parquet";

/// \brief A FileFormat implementation that reads from Parquet files
class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat {
 public:
  ParquetFileFormat();

  /// Convenience constructor which copies properties from a parquet::ReaderProperties.
  /// memory_pool will be ignored.
  explicit ParquetFileFormat(const parquet::ReaderProperties& reader_properties);

  /// \brief The name identifying this format ("parquet")
  std::string type_name() const override { return kParquetTypeName; }

  bool Equals(const FileFormat& other) const override;

  struct ReaderOptions {
    /// \defgroup parquet-file-format-arrow-reader-properties properties which correspond
    /// to members of parquet::ArrowReaderProperties.
    ///
    /// We don't embed parquet::ReaderProperties directly because column names (rather
    /// than indices) are used to indicate dictionary columns, and other options are
    /// deferred to scan time.
    ///
    /// @{
    std::unordered_set<std::string> dict_columns;
    arrow::TimeUnit::type coerce_int96_timestamp_unit = arrow::TimeUnit::NANO;
    Type::type binary_type = Type::BINARY;
    Type::type list_type = Type::LIST;
    /// @}
  } reader_options;

  /// \brief Check whether `source` appears to be a readable Parquet file
  Result<bool> IsSupported(const FileSource& source) const override;

  /// \brief Return the schema of the file if possible.
  Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;

  Result<RecordBatchGenerator> ScanBatchesAsync(
      const std::shared_ptr<ScanOptions>& options,
      const std::shared_ptr<FileFragment>& file) const override;

  Future<std::optional<int64_t>> CountRows(
      const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
      const std::shared_ptr<ScanOptions>& options) override;

  using FileFormat::MakeFragment;

  /// \brief Create a Fragment targeting all RowGroups.
  Result<std::shared_ptr<FileFragment>> MakeFragment(
      FileSource source, compute::Expression partition_expression,
      std::shared_ptr<Schema> physical_schema) override;

  /// \brief Create a Fragment, restricted to the specified row groups.
  Result<std::shared_ptr<ParquetFileFragment>> MakeFragment(
      FileSource source, compute::Expression partition_expression,
      std::shared_ptr<Schema> physical_schema, std::vector<int> row_groups);

  /// \brief Return a FileReader on the given source.
  Result<std::shared_ptr<parquet::arrow::FileReader>> GetReader(
      const FileSource& source, const std::shared_ptr<ScanOptions>& options) const;

  /// \brief Return a FileReader on the given source
  /// (overload accepting pre-parsed file metadata).
  Result<std::shared_ptr<parquet::arrow::FileReader>> GetReader(
      const FileSource& source, const std::shared_ptr<ScanOptions>& options,
      const std::shared_ptr<parquet::FileMetaData>& metadata) const;

  /// \brief Asynchronous variant of GetReader.
  Future<std::shared_ptr<parquet::arrow::FileReader>> GetReaderAsync(
      const FileSource& source, const std::shared_ptr<ScanOptions>& options) const;

  /// \brief Asynchronous variant of GetReader
  /// (overload accepting pre-parsed file metadata).
  Future<std::shared_ptr<parquet::arrow::FileReader>> GetReaderAsync(
      const FileSource& source, const std::shared_ptr<ScanOptions>& options,
      const std::shared_ptr<parquet::FileMetaData>& metadata) const;

  /// \brief Create a writer emitting Parquet to `destination`
  Result<std::shared_ptr<FileWriter>> MakeWriter(
      std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
      std::shared_ptr<FileWriteOptions> options,
      fs::FileLocator destination_locator) const override;

  std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
};
|
||||
|
||||
/// \brief A FileFragment with parquet logic.
///
/// ParquetFileFragment provides a lazy (with respect to IO) interface to
/// scan parquet files. Any heavy IO calls are deferred to the Scan() method.
///
/// The caller can provide an optional list of selected RowGroups to limit the
/// number of scanned RowGroups, or to partition the scans across multiple
/// threads.
///
/// Metadata can be explicitly provided, enabling pushdown predicate benefits without
/// the potentially heavy IO of loading Metadata from the file system. This can induce
/// significant performance boost when scanning high latency file systems.
class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment {
 public:
  /// \brief Split into a vector of fragments (one per row group selected by `predicate`)
  Result<FragmentVector> SplitByRowGroup(compute::Expression predicate);

  /// \brief Return the RowGroups selected by this fragment.
  const std::vector<int>& row_groups() const {
    if (row_groups_) return *row_groups_;
    // No explicit selection was made; return a reference to a shared empty list.
    static std::vector<int> empty;
    return empty;
  }

  /// \brief Return the FileMetaData associated with this fragment.
  ///
  /// This may return nullptr if the fragment wasn't scanned yet, or if
  /// `ScanOptions::cache_metadata` was disabled.
  std::shared_ptr<parquet::FileMetaData> metadata();

  /// \brief Ensure this fragment's FileMetaData is in memory.
  Status EnsureCompleteMetadata(parquet::arrow::FileReader* reader = NULLPTR);

  /// \brief Drop any cached metadata held by this fragment.
  Status ClearCachedMetadata() override;

  /// \brief Return fragment which selects a filtered subset of this fragment's RowGroups.
  Result<std::shared_ptr<Fragment>> Subset(compute::Expression predicate);
  /// \brief Return fragment which selects the given RowGroups by index.
  Result<std::shared_ptr<Fragment>> Subset(std::vector<int> row_group_ids);

  /// \brief Translate column statistics into an expression usable for
  /// predicate pushdown, or std::nullopt if not possible.
  static std::optional<compute::Expression> EvaluateStatisticsAsExpression(
      const Field& field, const parquet::Statistics& statistics);

  /// \brief Overload taking an explicit FieldRef for the statistics' column.
  static std::optional<compute::Expression> EvaluateStatisticsAsExpression(
      const Field& field, const FieldRef& field_ref,
      const parquet::Statistics& statistics);

 private:
  // Constructed only by ParquetFileFormat / ParquetDatasetFactory (friends below).
  ParquetFileFragment(FileSource source, std::shared_ptr<FileFormat> format,
                      compute::Expression partition_expression,
                      std::shared_ptr<Schema> physical_schema,
                      std::optional<std::vector<int>> row_groups);

  Status SetMetadata(std::shared_ptr<parquet::FileMetaData> metadata,
                     std::shared_ptr<parquet::arrow::SchemaManifest> manifest,
                     std::shared_ptr<parquet::FileMetaData> original_metadata = {});

  // Overridden to opportunistically set metadata since a reader must be opened anyway.
  Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override {
    ARROW_RETURN_NOT_OK(EnsureCompleteMetadata());
    return physical_schema_;
  }

  /// Return a filtered subset of row group indices.
  Result<std::vector<int>> FilterRowGroups(compute::Expression predicate);
  /// Simplify the predicate against the statistics of each row group.
  Result<std::vector<compute::Expression>> TestRowGroups(compute::Expression predicate);
  /// Try to count rows matching the predicate using metadata. Expects
  /// metadata to be present, and expects the predicate to have been
  /// simplified against the partition expression already.
  Result<std::optional<int64_t>> TryCountRows(compute::Expression predicate);

  ParquetFileFormat& parquet_format_;

  /// Indices of row groups selected by this fragment,
  /// or std::nullopt if all row groups are selected.
  std::optional<std::vector<int>> row_groups_;

  // the expressions (combined for all columns for which statistics have been
  // processed) are stored per column group
  std::vector<compute::Expression> statistics_expressions_;
  // statistics status are kept track of by Parquet Schema column indices
  // (i.e. not Arrow schema field index)
  std::vector<bool> statistics_expressions_complete_;
  std::shared_ptr<parquet::FileMetaData> metadata_;
  std::shared_ptr<parquet::arrow::SchemaManifest> manifest_;
  // The FileMetaData that owns the SchemaDescriptor pointed by SchemaManifest.
  std::shared_ptr<parquet::FileMetaData> original_metadata_;

  friend class ParquetFileFormat;
  friend class ParquetDatasetFactory;
};
|
||||
|
||||
/// \brief Per-scan options for Parquet fragments
class ARROW_DS_EXPORT ParquetFragmentScanOptions : public FragmentScanOptions {
 public:
  ParquetFragmentScanOptions();
  std::string type_name() const override { return kParquetTypeName; }

  /// Reader properties. Not all properties are respected: memory_pool comes from
  /// ScanOptions.
  std::shared_ptr<parquet::ReaderProperties> reader_properties;
  /// Arrow reader properties. Not all properties are respected: batch_size comes from
  /// ScanOptions. Additionally, other options come from ParquetFileFormat::ReaderOptions.
  std::shared_ptr<parquet::ArrowReaderProperties> arrow_reader_properties;
  /// A configuration structure that provides decryption properties for a dataset
  std::shared_ptr<ParquetDecryptionConfig> parquet_decryption_config = NULLPTR;
};
|
||||
|
||||
/// \brief Write options for Parquet files
class ARROW_DS_EXPORT ParquetFileWriteOptions : public FileWriteOptions {
 public:
  /// \brief Parquet writer properties.
  std::shared_ptr<parquet::WriterProperties> writer_properties;

  /// \brief Parquet Arrow writer properties.
  std::shared_ptr<parquet::ArrowWriterProperties> arrow_writer_properties;

  /// \brief A configuration structure that provides encryption properties for a dataset
  std::shared_ptr<ParquetEncryptionConfig> parquet_encryption_config = NULLPTR;

 protected:
  // The constructor is protected: only ParquetFileFormat (a friend) creates
  // instances, via DefaultWriteOptions().
  explicit ParquetFileWriteOptions(std::shared_ptr<FileFormat> format)
      : FileWriteOptions(std::move(format)) {}

  friend class ParquetFileFormat;
};
|
||||
|
||||
/// \brief A FileWriter that emits record batches to a Parquet file
class ARROW_DS_EXPORT ParquetFileWriter : public FileWriter {
 public:
  /// \brief The underlying parquet::arrow::FileWriter
  const std::shared_ptr<parquet::arrow::FileWriter>& parquet_writer() const {
    return parquet_writer_;
  }

  /// \brief Write a record batch to the destination
  Status Write(const std::shared_ptr<RecordBatch>& batch) override;

 private:
  // The constructor is private: only ParquetFileFormat (a friend) creates
  // instances, via MakeWriter().
  ParquetFileWriter(std::shared_ptr<io::OutputStream> destination,
                    std::shared_ptr<parquet::arrow::FileWriter> writer,
                    std::shared_ptr<ParquetFileWriteOptions> options,
                    fs::FileLocator destination_locator);

  // Closes the writer; completes when all pending writes are flushed.
  Future<> FinishInternal() override;

  std::shared_ptr<parquet::arrow::FileWriter> parquet_writer_;

  friend class ParquetFileFormat;
};
|
||||
|
||||
/// \brief Options for making a FileSystemDataset from a Parquet _metadata file.
struct ParquetFactoryOptions {
  /// Either an explicit Partitioning or a PartitioningFactory to discover one.
  ///
  /// If a factory is provided, it will be used to infer a schema for partition fields
  /// based on file and directory paths then construct a Partitioning. The default
  /// is a Partitioning which will yield no partition information.
  ///
  /// The (explicit or discovered) partitioning will be applied to discovered files
  /// and the resulting partition information embedded in the Dataset.
  PartitioningOrFactory partitioning{Partitioning::Default()};

  /// For the purposes of applying the partitioning, paths will be stripped
  /// of the partition_base_dir. Files not matching the partition_base_dir
  /// prefix will be skipped for partition discovery. The ignored files will still
  /// be part of the Dataset, but will not have partition information.
  ///
  /// Example:
  /// partition_base_dir = "/dataset";
  ///
  /// - "/dataset/US/sales.csv" -> "US/sales.csv" will be given to the partitioning
  ///
  /// - "/home/john/late_sales.csv" -> Will be ignored for partition discovery.
  ///
  /// This is useful for partitioning which parses directory when ordering
  /// is important, e.g. DirectoryPartitioning.
  std::string partition_base_dir;

  /// Assert that all ColumnChunk paths are consistent. The parquet spec allows for
  /// ColumnChunk data to be stored in multiple files, but ParquetDatasetFactory
  /// supports only a single file with all ColumnChunk data. If this flag is set
  /// construction of a ParquetDatasetFactory will raise an error if ColumnChunk
  /// data is not resident in a single file.
  bool validate_column_chunk_paths = false;
};
|
||||
|
||||
/// \brief Create FileSystemDataset from custom `_metadata` cache file.
///
/// Dask and other systems will generate a cache metadata file by concatenating
/// the RowGroupMetaData of multiple parquet files into a single parquet file
/// that only contains metadata and no ColumnChunk data.
///
/// ParquetDatasetFactory creates a FileSystemDataset composed of
/// ParquetFileFragment where each fragment is pre-populated with the exact
/// number of row groups and statistics for each column.
class ARROW_DS_EXPORT ParquetDatasetFactory : public DatasetFactory {
 public:
  /// \brief Create a ParquetDatasetFactory from a metadata path.
  ///
  /// The `metadata_path` will be read from `filesystem`. Each RowGroup
  /// contained in the metadata file will be relative to `dirname(metadata_path)`.
  ///
  /// \param[in] metadata_path path of the metadata parquet file
  /// \param[in] filesystem from which to open/read the path
  /// \param[in] format to read the file with.
  /// \param[in] options see ParquetFactoryOptions
  static Result<std::shared_ptr<DatasetFactory>> Make(
      const std::string& metadata_path, std::shared_ptr<fs::FileSystem> filesystem,
      std::shared_ptr<ParquetFileFormat> format, ParquetFactoryOptions options);

  /// \brief Create a ParquetDatasetFactory from a metadata source.
  ///
  /// Similar to the previous Make definition, but the metadata can be a Buffer
  /// and the base_path is explicit instead of inferred from the metadata
  /// path.
  ///
  /// \param[in] metadata source to open the metadata parquet file from
  /// \param[in] base_path used as the prefix of every parquet file referenced
  /// \param[in] filesystem from which to read the files referenced.
  /// \param[in] format to read the file with.
  /// \param[in] options see ParquetFactoryOptions
  static Result<std::shared_ptr<DatasetFactory>> Make(
      const FileSource& metadata, const std::string& base_path,
      std::shared_ptr<fs::FileSystem> filesystem,
      std::shared_ptr<ParquetFileFormat> format, ParquetFactoryOptions options);

  /// \brief Inspect the schemas (physical plus partition) of the dataset-to-be.
  Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
      InspectOptions options) override;

  /// \brief Build the FileSystemDataset described by the metadata file.
  Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override;

 protected:
  // Members are pre-computed by Make(); the constructor just stores them.
  ParquetDatasetFactory(
      std::shared_ptr<fs::FileSystem> filesystem,
      std::shared_ptr<ParquetFileFormat> format,
      std::shared_ptr<parquet::FileMetaData> metadata,
      std::shared_ptr<parquet::arrow::SchemaManifest> manifest,
      std::shared_ptr<Schema> physical_schema, std::string base_path,
      ParquetFactoryOptions options,
      std::vector<std::pair<std::string, std::vector<int>>> paths_with_row_group_ids)
      : filesystem_(std::move(filesystem)),
        format_(std::move(format)),
        metadata_(std::move(metadata)),
        manifest_(std::move(manifest)),
        physical_schema_(std::move(physical_schema)),
        base_path_(std::move(base_path)),
        options_(std::move(options)),
        paths_with_row_group_ids_(std::move(paths_with_row_group_ids)) {}

  std::shared_ptr<fs::FileSystem> filesystem_;
  std::shared_ptr<ParquetFileFormat> format_;
  // Parsed contents of the `_metadata` file.
  std::shared_ptr<parquet::FileMetaData> metadata_;
  std::shared_ptr<parquet::arrow::SchemaManifest> manifest_;
  std::shared_ptr<Schema> physical_schema_;
  std::string base_path_;
  ParquetFactoryOptions options_;
  // For each referenced file path, the row group indices it contains.
  std::vector<std::pair<std::string, std::vector<int>>> paths_with_row_group_ids_;

 private:
  // Builds one FileFragment per referenced file, applying `partitioning`.
  Result<std::vector<std::shared_ptr<FileFragment>>> CollectParquetFragments(
      const Partitioning& partitioning);

  Result<std::shared_ptr<Schema>> PartitionSchema();
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,75 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/dataset/type_fwd.h"
|
||||
|
||||
namespace parquet::encryption {
|
||||
class CryptoFactory;
|
||||
struct KmsConnectionConfig;
|
||||
struct EncryptionConfiguration;
|
||||
struct DecryptionConfiguration;
|
||||
} // namespace parquet::encryption
|
||||
|
||||
namespace arrow {
|
||||
namespace dataset {
|
||||
|
||||
/// \brief Core configuration class encapsulating parameters for high-level encryption
/// within the Parquet framework.
///
/// ParquetEncryptionConfig serves as a bridge, passing encryption-related
/// parameters to appropriate components within the Parquet library. It holds references
/// to objects defining the encryption strategy, Key Management Service (KMS)
/// configuration, and specific encryption configurations for Parquet data.
struct ARROW_DS_EXPORT ParquetEncryptionConfig {
  /// Shared pointer to the CryptoFactory object, responsible for creating cryptographic
  /// components like encryptors and decryptors.
  std::shared_ptr<parquet::encryption::CryptoFactory> crypto_factory;

  /// Shared pointer to the KmsConnectionConfig object, holding configuration parameters
  /// for connecting to a Key Management Service (KMS).
  std::shared_ptr<parquet::encryption::KmsConnectionConfig> kms_connection_config;

  /// Shared pointer to the EncryptionConfiguration object, defining specific encryption
  /// settings for Parquet data, like keys for different columns.
  std::shared_ptr<parquet::encryption::EncryptionConfiguration> encryption_config;
};
|
||||
|
||||
/// \brief Core configuration class encapsulating parameters for high-level decryption
/// within the Parquet framework.
///
/// ParquetDecryptionConfig is designed to pass decryption-related parameters to
/// appropriate decryption components within the Parquet library. It holds references to
/// objects defining the decryption strategy, Key Management Service (KMS)
/// configuration, and specific decryption configurations for reading encrypted
/// Parquet data.
struct ARROW_DS_EXPORT ParquetDecryptionConfig {
  /// Shared pointer to the CryptoFactory object, pivotal in creating cryptographic
  /// components for the decryption process.
  std::shared_ptr<parquet::encryption::CryptoFactory> crypto_factory;

  /// Shared pointer to the KmsConnectionConfig object, containing parameters for
  /// connecting to a Key Management Service (KMS) during decryption.
  std::shared_ptr<parquet::encryption::KmsConnectionConfig> kms_connection_config;

  /// Shared pointer to the DecryptionConfiguration object, specifying decryption
  /// settings for reading encrypted Parquet data.
  std::shared_ptr<parquet::encryption::DecryptionConfiguration> decryption_config;
};
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,432 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <iosfwd>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/expression.h"
|
||||
#include "arrow/dataset/type_fwd.h"
|
||||
#include "arrow/dataset/visibility.h"
|
||||
#include "arrow/util/compare.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace dataset {
|
||||
|
||||
/// Separator character between partition fields embedded in a file name
/// (presumably consumed by FilenamePartitioning below — confirm in the .cc).
constexpr char kFilenamePartitionSep = '_';

/// \brief The directory and filename components produced when formatting a
/// partition expression (see Partitioning::Format).
struct ARROW_DS_EXPORT PartitionPathFormat {
  std::string directory, filename;
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Partitioning
|
||||
|
||||
/// \defgroup dataset-partitioning Partitioning API
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Interface for parsing partition expressions from string partition
/// identifiers.
///
/// For example, the identifier "foo=5" might be parsed to an equality expression
/// between the "foo" field and the value 5.
///
/// Some partitionings may store the field names in a metadata
/// store instead of in file paths, for example
/// dataset_root/2009/11/... could be used when the partition fields
/// are "year" and "month"
///
/// Paths are consumed from left to right. Paths must be relative to
/// the root of a partition; path prefixes must be removed before passing
/// the path to a partitioning for parsing.
class ARROW_DS_EXPORT Partitioning : public util::EqualityComparable<Partitioning> {
 public:
  virtual ~Partitioning() = default;

  /// \brief The name identifying the kind of partitioning
  virtual std::string type_name() const = 0;

  /// \brief Return whether the partitionings are equal
  ///
  /// The base implementation compares only schemas (ignoring metadata);
  /// subclasses refine this.
  virtual bool Equals(const Partitioning& other) const {
    return schema_->Equals(other.schema_, /*check_metadata=*/false);
  }

  /// \brief If the input batch shares any fields with this partitioning,
  /// produce sub-batches which satisfy mutually exclusive Expressions.
  struct PartitionedBatches {
    // batches[i] is the subset of rows satisfying expressions[i].
    RecordBatchVector batches;
    std::vector<compute::Expression> expressions;
  };
  virtual Result<PartitionedBatches> Partition(
      const std::shared_ptr<RecordBatch>& batch) const = 0;

  /// \brief Parse a path into a partition expression
  virtual Result<compute::Expression> Parse(const std::string& path) const = 0;

  /// \brief Format a partition expression into directory/filename components.
  virtual Result<PartitionPathFormat> Format(const compute::Expression& expr) const = 0;

  /// \brief A default Partitioning which is a DirectoryPartitioning
  /// with an empty schema.
  static std::shared_ptr<Partitioning> Default();

  /// \brief The partition schema.
  const std::shared_ptr<Schema>& schema() const { return schema_; }

 protected:
  explicit Partitioning(std::shared_ptr<Schema> schema) : schema_(std::move(schema)) {}

  std::shared_ptr<Schema> schema_;
};
|
||||
|
||||
/// \brief The encoding of partition segments.
enum class SegmentEncoding : int8_t {
  /// No encoding: segment values are taken verbatim.
  None = 0,
  /// Segment values are URL-encoded (percent-encoded) and must be decoded
  /// before parsing.
  Uri = 1,
};
|
||||
|
||||
/// \brief Stream output operator for SegmentEncoding.
ARROW_DS_EXPORT
std::ostream& operator<<(std::ostream& os, SegmentEncoding segment_encoding);
|
||||
|
||||
/// \brief Options for key-value based partitioning (hive/directory).
struct ARROW_DS_EXPORT KeyValuePartitioningOptions {
  /// After splitting a path into components, decode the path components
  /// before parsing according to this scheme. Defaults to URL-decoding.
  SegmentEncoding segment_encoding = SegmentEncoding::Uri;
};
|
||||
|
||||
/// \brief Options for inferring a partitioning.
struct ARROW_DS_EXPORT PartitioningFactoryOptions {
  /// When inferring a schema for partition fields, yield dictionary encoded types
  /// instead of plain. This can be more efficient when materializing virtual
  /// columns, and Expressions parsed by the finished Partitioning will include
  /// dictionaries of all unique inspected values for each field.
  bool infer_dictionary = false;
  /// Optionally, an expected schema can be provided, in which case inference
  /// will only check discovered fields against the schema and update internal
  /// state (such as dictionaries).
  std::shared_ptr<Schema> schema;
  /// After splitting a path into components, decode the path components
  /// before parsing according to this scheme. Defaults to URL-decoding.
  SegmentEncoding segment_encoding = SegmentEncoding::Uri;

  /// Project these options onto the subset relevant to a constructed
  /// KeyValuePartitioning.
  KeyValuePartitioningOptions AsPartitioningOptions() const;
};
|
||||
|
||||
/// \brief Options for inferring a hive-style partitioning.
struct ARROW_DS_EXPORT HivePartitioningFactoryOptions : PartitioningFactoryOptions {
  /// The hive partitioning scheme maps null to a hard coded fallback string.
  std::string null_fallback;

  /// Project these options onto the subset relevant to a constructed
  /// HivePartitioning.
  HivePartitioningOptions AsHivePartitioningOptions() const;
};
|
||||
|
||||
/// \brief PartitioningFactory provides creation of a partitioning when the
/// specific schema must be inferred from available paths (no explicit schema is known).
class ARROW_DS_EXPORT PartitioningFactory {
 public:
  virtual ~PartitioningFactory() = default;

  /// \brief The name identifying the kind of partitioning
  virtual std::string type_name() const = 0;

  /// Get the schema for the resulting Partitioning.
  /// This may reset internal state, for example dictionaries of unique
  /// representations accumulated from previously inspected paths.
  virtual Result<std::shared_ptr<Schema>> Inspect(
      const std::vector<std::string>& paths) = 0;

  /// Create a partitioning using the provided schema
  /// (fields may be dropped).
  virtual Result<std::shared_ptr<Partitioning>> Finish(
      const std::shared_ptr<Schema>& schema) const = 0;
};
|
||||
|
||||
/// \brief Subclass for the common case of a partitioning which yields an equality
/// expression for each segment
class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning {
 public:
  /// An unconverted equality expression consisting of a field name and the
  /// representation of a scalar value. A null (missing) value is represented
  /// by std::nullopt.
  struct Key {
    std::string name;
    std::optional<std::string> value;
  };

  Result<PartitionedBatches> Partition(
      const std::shared_ptr<RecordBatch>& batch) const override;

  Result<compute::Expression> Parse(const std::string& path) const override;

  Result<PartitionPathFormat> Format(const compute::Expression& expr) const override;

  /// \brief One dictionary per schema field (entries may be null for
  /// non-dictionary fields).
  const ArrayVector& dictionaries() const { return dictionaries_; }

  /// \brief The segment encoding used when parsing/formatting paths.
  SegmentEncoding segment_encoding() const { return options_.segment_encoding; }

  bool Equals(const Partitioning& other) const override;

 protected:
  KeyValuePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries,
                       KeyValuePartitioningOptions options)
      : Partitioning(std::move(schema)),
        dictionaries_(std::move(dictionaries)),
        options_(options) {
    // Ensure one (possibly null) dictionary slot per schema field.
    if (dictionaries_.empty()) {
      dictionaries_.resize(schema_->num_fields());
    }
  }

  /// Split a path into unconverted Keys; implemented per scheme (hive,
  /// directory, filename).
  virtual Result<std::vector<Key>> ParseKeys(const std::string& path) const = 0;

  /// Render scalar values into path components; implemented per scheme.
  virtual Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const = 0;

  /// Convert a Key to a full expression.
  Result<compute::Expression> ConvertKey(const Key& key) const;

  Result<std::vector<std::string>> FormatPartitionSegments(
      const ScalarVector& values) const;
  Result<std::vector<Key>> ParsePartitionSegments(
      const std::vector<std::string>& segments) const;

  ArrayVector dictionaries_;
  KeyValuePartitioningOptions options_;
};
|
||||
|
||||
/// \brief DirectoryPartitioning parses one segment of a path for each field in its
/// schema. All fields are required, so paths passed to DirectoryPartitioning::Parse
/// must contain segments for each field.
///
/// For example given schema<year:int16, month:int8> the path "/2009/11" would be
/// parsed to ("year"_ == 2009 and "month"_ == 11)
class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning {
 public:
  /// If a field in schema is of dictionary type, the corresponding element of
  /// dictionaries must contain the dictionary of values for that field.
  explicit DirectoryPartitioning(std::shared_ptr<Schema> schema,
                                 ArrayVector dictionaries = {},
                                 KeyValuePartitioningOptions options = {});

  std::string type_name() const override { return "directory"; }

  bool Equals(const Partitioning& other) const override;

  /// \brief Create a factory for a directory partitioning.
  ///
  /// \param[in] field_names The names for the partition fields. Types will be
  /// inferred.
  static std::shared_ptr<PartitioningFactory> MakeFactory(
      std::vector<std::string> field_names, PartitioningFactoryOptions = {});

 private:
  Result<std::vector<Key>> ParseKeys(const std::string& path) const override;

  Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
};
|
||||
|
||||
/// \brief The default fallback used for null values in a Hive-style partitioning.
///
/// `inline` (C++17) so every translation unit including this header shares a
/// single definition; a namespace-scope `static constexpr` array would give
/// each translation unit its own internal-linkage copy, which risks ODR
/// violations if the array is odr-used from inline functions.
inline constexpr char kDefaultHiveNullFallback[] = "__HIVE_DEFAULT_PARTITION__";
|
||||
|
||||
/// \brief Options for hive-style partitioning (key=value path segments).
struct ARROW_DS_EXPORT HivePartitioningOptions : public KeyValuePartitioningOptions {
  /// String substituted into paths in place of a null partition value.
  std::string null_fallback = kDefaultHiveNullFallback;

  /// \brief Return default options, overriding only the null placeholder.
  static HivePartitioningOptions DefaultsWithNullFallback(std::string fallback) {
    HivePartitioningOptions result;
    result.null_fallback = std::move(fallback);
    return result;
  }
};
|
||||
|
||||
/// \brief Multi-level, directory based partitioning
/// originating from Apache Hive with all data files stored in the
/// leaf directories. Data is partitioned by static values of a
/// particular column in the schema. Partition keys are represented in
/// the form $key=$value in directory names.
/// Field order is ignored, as are missing or unrecognized field names.
///
/// For example given schema<year:int16, month:int8, day:int8> the path
/// "/day=321/ignored=3.4/year=2009" parses to ("year"_ == 2009 and "day"_ == 321)
class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning {
 public:
  /// If a field in schema is of dictionary type, the corresponding element of
  /// dictionaries must contain the dictionary of values for that field.
  explicit HivePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries = {},
                            std::string null_fallback = kDefaultHiveNullFallback)
      : KeyValuePartitioning(std::move(schema), std::move(dictionaries),
                             KeyValuePartitioningOptions()),
        hive_options_(
            HivePartitioningOptions::DefaultsWithNullFallback(std::move(null_fallback))) {
  }

  /// Construct with full HivePartitioningOptions (both the base class and
  /// hive_options_ receive a copy of `options`).
  explicit HivePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries,
                            HivePartitioningOptions options)
      : KeyValuePartitioning(std::move(schema), std::move(dictionaries), options),
        hive_options_(options) {}

  std::string type_name() const override { return "hive"; }
  /// \brief The string used in paths in place of a null partition value.
  std::string null_fallback() const { return hive_options_.null_fallback; }
  const HivePartitioningOptions& options() const { return hive_options_; }

  /// \brief Parse a single "$key=$value" segment; nullopt result indicates
  /// the segment carried no partition information.
  static Result<std::optional<Key>> ParseKey(const std::string& segment,
                                             const HivePartitioningOptions& options);

  bool Equals(const Partitioning& other) const override;

  /// \brief Create a factory for a hive partitioning.
  static std::shared_ptr<PartitioningFactory> MakeFactory(
      HivePartitioningFactoryOptions = {});

 private:
  const HivePartitioningOptions hive_options_;
  Result<std::vector<Key>> ParseKeys(const std::string& path) const override;

  Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
};
|
||||
|
||||
/// \brief Implementation provided by lambda or other callable
class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning {
 public:
  /// Callable mapping a path to a partition expression.
  using ParseImpl = std::function<Result<compute::Expression>(const std::string&)>;

  /// Callable mapping a partition expression to path components.
  using FormatImpl =
      std::function<Result<PartitionPathFormat>(const compute::Expression&)>;

  /// \param[in] schema the partition schema
  /// \param[in] parse_impl required; implements Parse()
  /// \param[in] format_impl optional; if absent, Format() returns NotImplemented
  /// \param[in] name reported by type_name()
  FunctionPartitioning(std::shared_ptr<Schema> schema, ParseImpl parse_impl,
                       FormatImpl format_impl = NULLPTR, std::string name = "function")
      : Partitioning(std::move(schema)),
        parse_impl_(std::move(parse_impl)),
        format_impl_(std::move(format_impl)),
        name_(std::move(name)) {}

  std::string type_name() const override { return name_; }

  // Callables cannot be meaningfully compared, so two FunctionPartitionings
  // are never considered equal.
  bool Equals(const Partitioning& other) const override { return false; }

  Result<compute::Expression> Parse(const std::string& path) const override {
    return parse_impl_(path);
  }

  Result<PartitionPathFormat> Format(const compute::Expression& expr) const override {
    if (format_impl_) {
      return format_impl_(expr);
    }
    return Status::NotImplemented("formatting paths from ", type_name(), " Partitioning");
  }

  // Batch partitioning is not supported by callable-based partitionings.
  Result<PartitionedBatches> Partition(
      const std::shared_ptr<RecordBatch>& batch) const override {
    return Status::NotImplemented("partitioning batches from ", type_name(),
                                  " Partitioning");
  }

 private:
  ParseImpl parse_impl_;
  FormatImpl format_impl_;
  std::string name_;
};
|
||||
|
||||
/// \brief Partitioning whose key values are embedded in file names rather
/// than directory segments.
class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning {
 public:
  /// \brief Construct a FilenamePartitioning from its components.
  ///
  /// If a field in schema is of dictionary type, the corresponding element of
  /// dictionaries must contain the dictionary of values for that field.
  explicit FilenamePartitioning(std::shared_ptr<Schema> schema,
                                ArrayVector dictionaries = {},
                                KeyValuePartitioningOptions options = {});

  std::string type_name() const override { return "filename"; }

  /// \brief Create a factory for a filename partitioning.
  ///
  /// \param[in] field_names The names for the partition fields. Types will be
  /// inferred.
  static std::shared_ptr<PartitioningFactory> MakeFactory(
      std::vector<std::string> field_names, PartitioningFactoryOptions = {});

  bool Equals(const Partitioning& other) const override;

 private:
  Result<std::vector<Key>> ParseKeys(const std::string& path) const override;

  Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
};
|
||||
|
||||
/// \brief Remove a leading `prefix` from `path`.
ARROW_DS_EXPORT std::string StripPrefix(const std::string& path,
                                        const std::string& prefix);
|
||||
|
||||
/// \brief Removes the prefix and the filename of a path, leaving only the
/// intermediate directory portion.
///
/// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") ->
/// "year=2019"` (the original example showed a pair, but the return type is a
/// single string — confirm against the implementation).
ARROW_DS_EXPORT std::string StripPrefixAndFilename(const std::string& path,
                                                   const std::string& prefix);

/// \brief Vector version of StripPrefixAndFilename.
ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
    const std::vector<std::string>& paths, const std::string& prefix);

/// \brief Vector version of StripPrefixAndFilename taking FileInfo entries.
ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
    const std::vector<fs::FileInfo>& files, const std::string& prefix);
|
||||
|
||||
/// \brief Either a Partitioning or a PartitioningFactory
///
/// Exactly one of partitioning() / factory() is non-null; assigning either
/// kind replaces the whole object, clearing the other member.
class ARROW_DS_EXPORT PartitioningOrFactory {
 public:
  explicit PartitioningOrFactory(std::shared_ptr<Partitioning> partitioning)
      : partitioning_(std::move(partitioning)) {}

  explicit PartitioningOrFactory(std::shared_ptr<PartitioningFactory> factory)
      : factory_(std::move(factory)) {}

  PartitioningOrFactory& operator=(std::shared_ptr<Partitioning> partitioning) {
    return *this = PartitioningOrFactory(std::move(partitioning));
  }

  PartitioningOrFactory& operator=(std::shared_ptr<PartitioningFactory> factory) {
    return *this = PartitioningOrFactory(std::move(factory));
  }

  /// \brief The partitioning (if given).
  const std::shared_ptr<Partitioning>& partitioning() const { return partitioning_; }

  /// \brief The partition factory (if given).
  const std::shared_ptr<PartitioningFactory>& factory() const { return factory_; }

  /// \brief Get the partition schema, inferring it with the given factory if needed.
  Result<std::shared_ptr<Schema>> GetOrInferSchema(const std::vector<std::string>& paths);

 private:
  std::shared_ptr<PartitioningFactory> factory_;
  std::shared_ptr<Partitioning> partitioning_;
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,33 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.

// Missing include guard added: every sibling dataset header uses
// `#pragma once`; without it, double inclusion of this header would
// redeclare the contents (harmless for these decls but inconsistent and
// fragile if definitions are ever added).
#pragma once

#include "arrow/dataset/visibility.h"

namespace arrow {
namespace dataset {
namespace internal {

/// \brief Register dataset-based exec nodes with the exec node registry
///
/// This function must be called before using dataset ExecNode factories
ARROW_DS_EXPORT void Initialize();

}  // namespace internal
}  // namespace dataset
}  // namespace arrow
|
||||
@@ -0,0 +1,32 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/dataset/visibility.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace dataset {
|
||||
|
||||
// FIXME this is superseded by compute::Expression::Bind
/// \brief Validate that schema `from` can be projected to schema `to`,
/// returning an error Status otherwise (exact compatibility rules live in the
/// implementation).
ARROW_DS_EXPORT Status CheckProjectable(const Schema& from, const Schema& to);
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,623 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/acero/options.h"
|
||||
#include "arrow/compute/expression.h"
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
#include "arrow/dataset/dataset.h"
|
||||
#include "arrow/dataset/type_fwd.h"
|
||||
#include "arrow/dataset/visibility.h"
|
||||
#include "arrow/io/interfaces.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/async_generator_fwd.h"
|
||||
#include "arrow/util/iterator.h"
|
||||
#include "arrow/util/thread_pool.h"
|
||||
#include "arrow/util/type_fwd.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using RecordBatchGenerator = std::function<Future<std::shared_ptr<RecordBatch>>()>;
|
||||
|
||||
namespace dataset {
|
||||
|
||||
/// \defgroup dataset-scanning Scanning API
|
||||
///
|
||||
/// @{
|
||||
|
||||
constexpr int64_t kDefaultBatchSize = 1 << 17; // 128Ki rows
|
||||
// This will yield 64 batches ~ 8Mi rows
|
||||
constexpr int32_t kDefaultBatchReadahead = 16;
|
||||
constexpr int32_t kDefaultFragmentReadahead = 4;
|
||||
constexpr int32_t kDefaultBytesReadahead = 1 << 25; // 32MiB
|
||||
|
||||
/// Scan-specific options, which can be changed between scans of the same dataset.
struct ARROW_DS_EXPORT ScanOptions {
  /// A row filter (which will be pushed down to partitioning/reading if supported).
  compute::Expression filter = compute::literal(true);
  /// A projection expression (which can add/remove/rename columns).
  compute::Expression projection;

  /// Schema with which batches will be read from fragments. This is also known as the
  /// "reader schema"; it will be used (for example) in constructing CSV file readers to
  /// identify column types for parsing. Usually only a subset of its fields (see
  /// MaterializedFields) will be materialized during a scan.
  std::shared_ptr<Schema> dataset_schema;

  /// Schema of projected record batches. This is independent of dataset_schema as its
  /// fields are derived from the projection. For example, let
  ///
  ///   dataset_schema = {"a": int32, "b": int32, "id": utf8}
  ///   projection = project({equal(field_ref("a"), field_ref("b"))}, {"a_plus_b"})
  ///
  /// (no filter specified). In this case, the projected_schema would be
  ///
  ///   {"a_plus_b": int32}
  std::shared_ptr<Schema> projected_schema;

  /// Maximum row count for scanned batches.
  int64_t batch_size = kDefaultBatchSize;

  /// How many batches to read ahead within a fragment.
  ///
  /// Set to 0 to disable batch readahead.
  ///
  /// Note: May not be supported by all formats.
  /// Note: Will be ignored if use_threads is set to false.
  int32_t batch_readahead = kDefaultBatchReadahead;

  /// How many files to read ahead.
  ///
  /// Set to 0 to disable fragment readahead.
  ///
  /// Note: May not be enforced by all scanners.
  /// Note: Will be ignored if use_threads is set to false.
  int32_t fragment_readahead = kDefaultFragmentReadahead;

  /// A pool from which materialized and scanned arrays will be allocated.
  MemoryPool* pool = arrow::default_memory_pool();

  /// IOContext for any IO tasks.
  ///
  /// Note: The IOContext executor will be ignored if use_threads is set to false.
  io::IOContext io_context;

  /// Executor for any CPU tasks.
  ///
  /// If null, the global CPU executor will be used.
  ///
  /// Note: The Executor will be ignored if use_threads is set to false.
  arrow::internal::Executor* cpu_executor = NULLPTR;

  /// If true the scanner will scan in parallel.
  ///
  /// Note: If true, this will use threads from both the cpu_executor and the
  /// io_context.executor.
  /// Note: This must be true in order for any readahead to happen.
  bool use_threads = false;

  /// If true the scanner will add augmented fields to the output schema.
  bool add_augmented_fields = true;

  /// Whether to cache metadata when scanning.
  ///
  /// Fragments may typically cache metadata to speed up repeated accesses.
  /// However, in use cases where a single scan is done, or if memory use
  /// is more critical than CPU time, setting this option to false can
  /// lessen memory use.
  bool cache_metadata = true;

  /// Fragment-specific scan options.
  std::shared_ptr<FragmentScanOptions> fragment_scan_options;

  /// Return a vector of FieldRefs that require materialization.
  ///
  /// This is usually the union of the fields referenced in the projection and the
  /// filter expression. Examples:
  ///
  /// - `SELECT a, b WHERE a < 2 && c > 1` => ["a", "b", "a", "c"]
  /// - `SELECT a + b < 3 WHERE a > 1` => ["a", "b", "a"]
  ///
  /// This is needed for expressions where a field may not be directly
  /// used in the final projection but is still required to evaluate the
  /// expression.
  ///
  /// This is used by Fragment implementations to apply the column
  /// sub-selection optimization.
  std::vector<FieldRef> MaterializedFields() const;

  /// Parameters which control when the plan should pause for a slow consumer.
  acero::BackpressureOptions backpressure =
      acero::BackpressureOptions::DefaultBackpressure();
};
|
||||
|
||||
/// \brief Options for a scan node which reads a dataset as a stream of batches
///
/// A dataset consists of one or more individual fragments. A fragment is anything
/// that is independently scannable, often a file.
///
/// Batches from all fragments will be converted to a single schema. This unified
/// schema is referred to as the "dataset schema" and is the output schema for
/// this node.
///
/// Individual fragments may have schemas that are different from the dataset
/// schema. This is sometimes referred to as the physical or fragment schema.
/// Conversion from the fragment schema to the dataset schema is a process
/// known as evolution.
struct ARROW_DS_EXPORT ScanV2Options : public acero::ExecNodeOptions {
  explicit ScanV2Options(std::shared_ptr<Dataset> dataset)
      : dataset(std::move(dataset)) {}

  /// \brief The dataset to scan
  std::shared_ptr<Dataset> dataset;
  /// \brief A row filter
  ///
  /// The filter expression should be written against the dataset schema.
  /// The filter must be unbound.
  ///
  /// This is an opportunistic pushdown filter. Filtering capabilities will
  /// vary between formats. If a format is not capable of applying the filter
  /// then it will ignore it.
  ///
  /// Each fragment will do its best to filter the data based on the information
  /// (partitioning guarantees, statistics) available to it. If it is able to
  /// apply some filtering then it will indicate what filtering it was able to
  /// apply by attaching a guarantee to the batch.
  ///
  /// For example, if a filter is x < 50 && y > 40 then a batch may be able to
  /// apply a guarantee x < 50. Post-scan filtering would then only need to
  /// consider y > 40 (for this specific batch). The next batch may not be able
  /// to attach any guarantee and both clauses would need to be applied to that batch.
  ///
  /// A single guarantee-aware filtering operation should generally be applied to all
  /// resulting batches. The scan node is not responsible for this.
  ///
  /// Fields that are referenced by the filter should be included in the `columns`
  /// vector. The scan node will not automatically fetch fields referenced by the
  /// filter expression. \see AddFieldsNeededForFilter
  ///
  /// If the filter references fields that are not included in `columns` this may or may
  /// not be an error, depending on the format.
  compute::Expression filter = compute::literal(true);

  /// \brief The columns to scan
  ///
  /// This is not a simple list of top-level column indices but instead a set of paths
  /// allowing for partial selection of columns.
  ///
  /// These paths refer to the dataset schema.
  ///
  /// For example, consider the following dataset schema:
  ///
  ///   schema({
  ///     field("score", int32()),
  ///     "marker", struct_({
  ///        field("color", utf8()),
  ///        field("location", struct_({
  ///            field("x", float64()),
  ///            field("y", float64())
  ///        })
  ///     })
  ///   })
  ///
  /// If `columns` is {{0}, {1,1,0}} then the output schema is:
  ///
  ///   schema({field("score", int32()), field("x", float64())})
  ///
  /// If `columns` is {{1,1,1}, {1,1}} then the output schema is:
  ///
  ///   schema({
  ///     field("y", float64()),
  ///     field("location", struct_({
  ///         field("x", float64()),
  ///         field("y", float64())
  ///     })
  ///   })
  std::vector<FieldPath> columns;

  /// \brief Target number of bytes to read ahead in a fragment
  ///
  /// This limit involves some amount of estimation. Formats typically only know
  /// batch boundaries in terms of rows (not decoded bytes) and so an estimation
  /// must be done to guess the average row size. Other formats like CSV and JSON
  /// must make even more generalized guesses.
  ///
  /// This is a best-effort guide. Some formats may need to read ahead further,
  /// for example, if scanning a parquet file that has batches with 100MiB of data
  /// then the actual readahead will be at least 100MiB.
  ///
  /// Set to 0 to disable readahead. When disabled, the scanner will read the
  /// dataset one batch at a time.
  ///
  /// This limit applies across all fragments. If the limit is 32MiB and the
  /// fragment readahead allows for 20 fragments to be read at once then the
  /// total readahead will still be 32MiB and NOT 20 * 32MiB.
  int32_t target_bytes_readahead = kDefaultBytesReadahead;

  /// \brief Number of fragments to read ahead
  ///
  /// Higher readahead will potentially lead to more efficient I/O but will lead
  /// to the scan operation using more RAM. The default is fairly conservative
  /// and designed for fast local disks (or slow local spinning disks which cannot
  /// handle much parallelism anyways). When using a highly parallel remote filesystem
  /// you will likely want to increase these values.
  ///
  /// Set to 0 to disable fragment readahead. When disabled the dataset will be scanned
  /// one fragment at a time.
  int32_t fragment_readahead = kDefaultFragmentReadahead;
  /// \brief Options specific to the file format
  const FragmentScanOptions* format_options = NULLPTR;

  /// \brief Utility method to get a selection representing all columns in a dataset
  static std::vector<FieldPath> AllColumns(const Schema& dataset_schema);

  /// \brief Utility method to add fields needed for the current filter
  ///
  /// This method adds any fields that are needed by `filter` which are not already
  /// included in the list of columns. Any new fields added will be added to the end
  /// in no particular order.
  static Status AddFieldsNeededForFilter(ScanV2Options* options);
};
|
||||
|
||||
/// \brief Describes a projection
struct ARROW_DS_EXPORT ProjectionDescr {
  /// \brief The projection expression itself
  ///
  /// This expression must be a call to make_struct.
  compute::Expression expression;
  /// \brief The output schema of the projection
  ///
  /// This can be calculated from the input schema and the expression but it
  /// is cached here for convenience.
  std::shared_ptr<Schema> schema;

  /// \brief Create a ProjectionDescr by binding an expression to the dataset schema
  ///
  /// `expression` must return a struct type.
  static Result<ProjectionDescr> FromStructExpression(
      const compute::Expression& expression, const Schema& dataset_schema);

  /// \brief Create a ProjectionDescr from expressions/names for each field
  ///
  /// `exprs` and `names` are parallel: the i-th name labels the column produced
  /// by the i-th expression.
  static Result<ProjectionDescr> FromExpressions(std::vector<compute::Expression> exprs,
                                                 std::vector<std::string> names,
                                                 const Schema& dataset_schema);

  /// \brief Create a default projection referencing fields in the dataset schema
  static Result<ProjectionDescr> FromNames(std::vector<std::string> names,
                                           const Schema& dataset_schema,
                                           bool add_augmented_fields = true);

  /// \brief Make a projection that projects every field in the dataset schema
  static Result<ProjectionDescr> Default(const Schema& dataset_schema,
                                         bool add_augmented_fields = true);
};
|
||||
|
||||
/// \brief Utility method to set the projection expression and schema
///
/// Applies `projection` (both its expression and cached output schema) to
/// `options`. `options` must be non-null.
ARROW_DS_EXPORT void SetProjection(ScanOptions* options, ProjectionDescr projection);
|
||||
|
||||
/// \brief Combines a record batch with the fragment that the record batch originated
/// from
///
/// Knowing the source fragment can be useful for debugging & understanding loaded
/// data
struct TaggedRecordBatch {
  std::shared_ptr<RecordBatch> record_batch;
  std::shared_ptr<Fragment> fragment;

  /// Shallow equality: compares the shared_ptr identities (same objects), not
  /// the batch/fragment contents.
  friend inline bool operator==(const TaggedRecordBatch& left,
                                const TaggedRecordBatch& right) {
    return left.record_batch == right.record_batch && left.fragment == right.fragment;
  }
};

using TaggedRecordBatchGenerator = std::function<Future<TaggedRecordBatch>()>;
using TaggedRecordBatchIterator = Iterator<TaggedRecordBatch>;
|
||||
|
||||
/// \brief Combines a tagged batch with positional information
///
/// This is returned when scanning batches in an unordered fashion. This information is
/// needed if you ever want to reassemble the batches in order.
struct EnumeratedRecordBatch {
  Enumerated<std::shared_ptr<RecordBatch>> record_batch;
  Enumerated<std::shared_ptr<Fragment>> fragment;

  /// Shallow equality on the enumerated members (delegates to
  /// Enumerated::operator==).
  friend inline bool operator==(const EnumeratedRecordBatch& left,
                                const EnumeratedRecordBatch& right) {
    return left.record_batch == right.record_batch && left.fragment == right.fragment;
  }
};

using EnumeratedRecordBatchGenerator = std::function<Future<EnumeratedRecordBatch>()>;
using EnumeratedRecordBatchIterator = Iterator<EnumeratedRecordBatch>;
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace dataset
|
||||
|
||||
/// Iterator end-sentinel support for TaggedRecordBatch: a null record_batch
/// marks the end of iteration.
template <>
struct IterationTraits<dataset::TaggedRecordBatch> {
  static dataset::TaggedRecordBatch End() {
    return dataset::TaggedRecordBatch{NULLPTR, NULLPTR};
  }
  static bool IsEnd(const dataset::TaggedRecordBatch& val) {
    // Only record_batch is inspected; End() sets both members to null.
    return val.record_batch == NULLPTR;
  }
};
|
||||
|
||||
/// Iterator end-sentinel support for EnumeratedRecordBatch: both enumerated
/// members are set to their respective end sentinels.
template <>
struct IterationTraits<dataset::EnumeratedRecordBatch> {
  static dataset::EnumeratedRecordBatch End() {
    return dataset::EnumeratedRecordBatch{
        IterationEnd<Enumerated<std::shared_ptr<RecordBatch>>>(),
        IterationEnd<Enumerated<std::shared_ptr<dataset::Fragment>>>()};
  }
  static bool IsEnd(const dataset::EnumeratedRecordBatch& val) {
    // Checking the fragment alone suffices: End() sets both members to their
    // end sentinels together.
    return IsIterationEnd(val.fragment);
  }
};
|
||||
|
||||
namespace dataset {
|
||||
|
||||
/// \defgroup dataset-scanning Scanning API
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief A scanner glues together several dataset classes to load in data.
/// The dataset contains a collection of fragments and partitioning rules.
///
/// The fragments identify independently loadable units of data (i.e. each fragment has
/// a potentially unique schema and possibly even format. It should be possible to read
/// fragments in parallel if desired).
///
/// The fragment's format contains the logic necessary to actually create a task to load
/// the fragment into memory. That task may or may not support parallel execution of
/// its own.
///
/// The scanner is then responsible for creating scan tasks from every fragment in the
/// dataset and (potentially) sequencing the loaded record batches together.
///
/// The scanner should not buffer the entire dataset in memory (unless asked) instead
/// yielding record batches as soon as they are ready to scan. Various readahead
/// properties control how much data is allowed to be scanned before pausing to let a
/// slow consumer catch up.
///
/// Today the scanner also handles projection & filtering although that may change in
/// the future.
class ARROW_DS_EXPORT Scanner {
 public:
  virtual ~Scanner() = default;

  /// \brief Apply a visitor to each RecordBatch as it is scanned. If multiple threads
  /// are used (via use_threads), the visitor will be invoked from those threads and is
  /// responsible for any synchronization.
  virtual Status Scan(std::function<Status(TaggedRecordBatch)> visitor) = 0;
  /// \brief Convert a Scanner into a Table.
  ///
  /// Use this convenience utility with care. This will serially materialize the
  /// Scan result in memory before creating the Table.
  virtual Result<std::shared_ptr<Table>> ToTable() = 0;
  /// \brief Scan the dataset into a stream of record batches. Each batch is tagged
  /// with the fragment it originated from. The batches will arrive in order. The
  /// order of fragments is determined by the dataset.
  ///
  /// Note: The scanner will perform some readahead but will avoid materializing too
  /// much in memory (this is governed by the readahead options and use_threads option).
  /// If the readahead queue fills up then I/O will pause until the calling thread
  /// catches up.
  virtual Result<TaggedRecordBatchIterator> ScanBatches() = 0;
  /// \brief Async variant of ScanBatches, using the default CPU executor.
  virtual Result<TaggedRecordBatchGenerator> ScanBatchesAsync() = 0;
  /// \brief Async variant of ScanBatches running CPU work on `cpu_thread_pool`.
  virtual Result<TaggedRecordBatchGenerator> ScanBatchesAsync(
      ::arrow::internal::Executor* cpu_thread_pool) = 0;
  /// \brief Scan the dataset into a stream of record batches. Unlike ScanBatches this
  /// method may allow record batches to be returned out of order. This allows for more
  /// efficient scanning: some fragments may be accessed more quickly than others (e.g.
  /// may be cached in RAM or just happen to get scheduled earlier by the I/O)
  ///
  /// To make up for the out-of-order iteration each batch is further tagged with
  /// positional information.
  virtual Result<EnumeratedRecordBatchIterator> ScanBatchesUnordered() = 0;
  /// \brief Async variant of ScanBatchesUnordered, using the default CPU executor.
  virtual Result<EnumeratedRecordBatchGenerator> ScanBatchesUnorderedAsync() = 0;
  /// \brief Async variant of ScanBatchesUnordered running CPU work on
  /// `cpu_thread_pool`.
  virtual Result<EnumeratedRecordBatchGenerator> ScanBatchesUnorderedAsync(
      ::arrow::internal::Executor* cpu_thread_pool) = 0;
  /// \brief A convenience to synchronously load the given rows by index.
  ///
  /// Will only consume as many batches as needed from ScanBatches().
  virtual Result<std::shared_ptr<Table>> TakeRows(const Array& indices) = 0;
  /// \brief Get the first N rows.
  virtual Result<std::shared_ptr<Table>> Head(int64_t num_rows) = 0;
  /// \brief Count rows matching a predicate.
  ///
  /// This method will push down the predicate and compute the result based on fragment
  /// metadata if possible.
  virtual Result<int64_t> CountRows() = 0;
  /// \brief Async variant of CountRows.
  virtual Future<int64_t> CountRowsAsync() = 0;
  /// \brief Convert the Scanner to a RecordBatchReader so it can be
  /// easily used with APIs that expect a reader.
  virtual Result<std::shared_ptr<RecordBatchReader>> ToRecordBatchReader() = 0;

  /// \brief Get the options for this scan.
  const std::shared_ptr<ScanOptions>& options() const { return scan_options_; }
  /// \brief Get the dataset that this scanner will scan.
  virtual const std::shared_ptr<Dataset>& dataset() const = 0;

 protected:
  explicit Scanner(std::shared_ptr<ScanOptions> scan_options)
      : scan_options_(std::move(scan_options)) {}

  /// Wrap an in-order scan with the positional information that the unordered
  /// (enumerated) API exposes.
  Result<EnumeratedRecordBatchIterator> AddPositioningToInOrderScan(
      TaggedRecordBatchIterator scan);

  const std::shared_ptr<ScanOptions> scan_options_;
};
|
||||
|
||||
/// \brief ScannerBuilder is a factory class to construct a Scanner. It is used
/// to pass information, notably a potential filter expression and a subset of
/// columns to materialize.
class ARROW_DS_EXPORT ScannerBuilder {
 public:
  /// \brief Construct a builder that scans `dataset` with default ScanOptions.
  explicit ScannerBuilder(std::shared_ptr<Dataset> dataset);

  /// \brief Construct a builder that scans `dataset` with the given options.
  ScannerBuilder(std::shared_ptr<Dataset> dataset,
                 std::shared_ptr<ScanOptions> scan_options);

  /// \brief Construct a builder scanning a single fragment with the given schema
  /// and options.
  ScannerBuilder(std::shared_ptr<Schema> schema, std::shared_ptr<Fragment> fragment,
                 std::shared_ptr<ScanOptions> scan_options);

  /// \brief Make a scanner from a record batch reader.
  ///
  /// The resulting scanner can be scanned only once. This is intended
  /// to support writing data from streaming sources or other sources
  /// that can be iterated only once.
  static std::shared_ptr<ScannerBuilder> FromRecordBatchReader(
      std::shared_ptr<RecordBatchReader> reader);

  /// \brief Set the subset of columns to materialize.
  ///
  /// Columns which are not referenced may not be read from fragments.
  ///
  /// \param[in] columns list of columns to project. Order and duplicates will
  ///            be preserved.
  ///
  /// \return Failure if any column name does not exist in the dataset's
  ///         Schema.
  Status Project(std::vector<std::string> columns);

  /// \brief Set expressions which will be evaluated to produce the materialized
  /// columns.
  ///
  /// Columns which are not referenced may not be read from fragments.
  ///
  /// \param[in] exprs expressions to evaluate to produce columns.
  /// \param[in] names list of names for the resulting columns.
  ///
  /// \return Failure if any referenced column does not exist in the dataset's
  ///         Schema.
  Status Project(std::vector<compute::Expression> exprs, std::vector<std::string> names);

  /// \brief Set the filter expression to return only rows matching the filter.
  ///
  /// The predicate will be passed down to Sources and corresponding
  /// Fragments to exploit predicate pushdown if possible using
  /// partition information or Fragment internal metadata, e.g. Parquet statistics.
  /// Columns which are not referenced may not be read from fragments.
  ///
  /// \param[in] filter expression to filter rows with.
  ///
  /// \return Failure if any referenced column does not exist in the dataset's
  ///         Schema.
  Status Filter(const compute::Expression& filter);

  /// \brief Indicate if the Scanner should make use of the available
  /// ThreadPool found in ScanOptions.
  Status UseThreads(bool use_threads = true);

  /// \brief Indicate if metadata should be cached when scanning.
  ///
  /// Fragments may typically cache metadata to speed up repeated accesses.
  /// However, in use cases where a single scan is done, or if memory use
  /// is more critical than CPU time, setting this option to false can
  /// lessen memory use.
  Status CacheMetadata(bool cache_metadata = true);

  /// \brief Set the maximum number of rows per RecordBatch.
  ///
  /// \param[in] batch_size the maximum number of rows.
  /// \returns An error if the number for batch is not greater than 0.
  ///
  /// This option provides a control limiting the memory owned by any RecordBatch.
  Status BatchSize(int64_t batch_size);

  /// \brief Set the number of batches to read ahead within a fragment.
  ///
  /// \param[in] batch_readahead How many batches to read ahead within a fragment
  /// \returns an error if this number is less than 0.
  ///
  /// This option provides a control on the RAM vs I/O tradeoff.
  /// It might not be supported by all file formats, in which case it will
  /// simply be ignored.
  Status BatchReadahead(int32_t batch_readahead);

  /// \brief Set the number of fragments to read ahead.
  ///
  /// \param[in] fragment_readahead How many fragments to read ahead
  /// \returns an error if this number is less than 0.
  ///
  /// This option provides a control on the RAM vs I/O tradeoff.
  Status FragmentReadahead(int32_t fragment_readahead);

  /// \brief Set the pool from which materialized and scanned arrays will be allocated.
  Status Pool(MemoryPool* pool);

  /// \brief Set fragment-specific scan options.
  Status FragmentScanOptions(std::shared_ptr<FragmentScanOptions> fragment_scan_options);

  /// \brief Override default backpressure configuration.
  Status Backpressure(acero::BackpressureOptions backpressure);

  /// \brief Return the current scan options for the builder.
  Result<std::shared_ptr<ScanOptions>> GetScanOptions();

  /// \brief Return the constructed now-immutable Scanner object.
  Result<std::shared_ptr<Scanner>> Finish();

  /// \brief The dataset schema this builder scans against.
  const std::shared_ptr<Schema>& schema() const;
  /// \brief The schema of the projected output batches.
  const std::shared_ptr<Schema>& projected_schema() const;

 private:
  std::shared_ptr<Dataset> dataset_;
  std::shared_ptr<ScanOptions> scan_options_ = std::make_shared<ScanOptions>();
};
|
||||
|
||||
/// \brief Construct a source ExecNode which yields batches from a dataset scan.
///
/// Does not construct associated filter or project nodes.
///
/// Batches are yielded sequentially, like single-threaded,
/// when require_sequenced_output=true.
///
/// Yielded batches will be augmented with fragment/batch indices when
/// implicit_ordering=true to enable stable ordering for simple ExecPlans.
class ARROW_DS_EXPORT ScanNodeOptions : public acero::ExecNodeOptions {
 public:
  explicit ScanNodeOptions(std::shared_ptr<Dataset> dataset,
                           std::shared_ptr<ScanOptions> scan_options,
                           bool require_sequenced_output = false,
                           bool implicit_ordering = false)
      : dataset(std::move(dataset)),
        scan_options(std::move(scan_options)),
        require_sequenced_output(require_sequenced_output),
        implicit_ordering(implicit_ordering) {}

  /// The dataset to scan.
  std::shared_ptr<Dataset> dataset;
  /// Scan-level options (filter, projection, readahead, ...).
  std::shared_ptr<ScanOptions> scan_options;
  /// If true, batches are yielded sequentially (as in a single-threaded scan).
  bool require_sequenced_output;
  /// If true, yielded batches are augmented with fragment/batch indices to
  /// enable stable ordering for simple ExecPlans.
  bool implicit_ordering;
};
|
||||
|
||||
/// @}
|
||||
|
||||
namespace internal {
// NOTE(review): presumably registers the scan ExecNode factory with `registry`
// so it is available to acero ExecPlans — confirm at the definition site.
ARROW_DS_EXPORT void InitializeScanner(arrow::acero::ExecFactoryRegistry* registry);
// Same, for the V2 scan node (see ScanV2Options above).
ARROW_DS_EXPORT void InitializeScannerV2(arrow::acero::ExecFactoryRegistry* registry);
}  // namespace internal
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,113 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/type_fwd.h" // IWYU pragma: export
|
||||
#include "arrow/dataset/visibility.h"
|
||||
#include "arrow/filesystem/type_fwd.h" // IWYU pragma: export
|
||||
#include "arrow/type_fwd.h" // IWYU pragma: export
|
||||
|
||||
namespace arrow {
|
||||
namespace dataset {
|
||||
|
||||
class Dataset;
|
||||
class DatasetFactory;
|
||||
using DatasetVector = std::vector<std::shared_ptr<Dataset>>;
|
||||
|
||||
class UnionDataset;
|
||||
class UnionDatasetFactory;
|
||||
|
||||
class Fragment;
|
||||
using FragmentIterator = Iterator<std::shared_ptr<Fragment>>;
|
||||
using FragmentVector = std::vector<std::shared_ptr<Fragment>>;
|
||||
|
||||
class FragmentScanOptions;
|
||||
|
||||
class FileSource;
|
||||
class FileFormat;
|
||||
class FileFragment;
|
||||
class FileWriter;
|
||||
class FileWriteOptions;
|
||||
class FileSystemDataset;
|
||||
class FileSystemDatasetFactory;
|
||||
struct FileSystemDatasetWriteOptions;
|
||||
class WriteNodeOptions;
|
||||
|
||||
/// \brief Controls what happens if files exist in an output directory during a dataset
/// write
enum class ExistingDataBehavior : int8_t {
  /// Deletes all files in a directory the first time that directory is encountered.
  kDeleteMatchingPartitions,
  /// Ignores existing files, overwriting any that happen to have the same name as an
  /// output file.
  kOverwriteOrIgnore,
  /// Returns an error if there are any files or subdirectories in the output directory.
  kError,
};
|
||||
|
||||
class InMemoryDataset;
|
||||
|
||||
class CsvFileFormat;
|
||||
class CsvFileWriter;
|
||||
class CsvFileWriteOptions;
|
||||
struct CsvFragmentScanOptions;
|
||||
|
||||
class JsonFileFormat;
|
||||
class JsonFileWriter;
|
||||
class JsonFileWriteOptions;
|
||||
struct JsonFragmentScanOptions;
|
||||
|
||||
class IpcFileFormat;
|
||||
class IpcFileWriter;
|
||||
class IpcFileWriteOptions;
|
||||
class IpcFragmentScanOptions;
|
||||
|
||||
class ParquetFileFormat;
|
||||
class ParquetFileFragment;
|
||||
class ParquetFragmentScanOptions;
|
||||
class ParquetFileWriter;
|
||||
class ParquetFileWriteOptions;
|
||||
|
||||
class Partitioning;
|
||||
class PartitioningFactory;
|
||||
class PartitioningOrFactory;
|
||||
struct KeyValuePartitioningOptions;
|
||||
class DirectoryPartitioning;
|
||||
class HivePartitioning;
|
||||
struct HivePartitioningOptions;
|
||||
class FilenamePartitioning;
|
||||
struct FilenamePartitioningOptions;
|
||||
|
||||
class ScanNodeOptions;
|
||||
struct ScanOptions;
|
||||
|
||||
class Scanner;
|
||||
|
||||
class ScannerBuilder;
|
||||
|
||||
class ScanTask;
|
||||
using ScanTaskVector = std::vector<std::shared_ptr<ScanTask>>;
|
||||
using ScanTaskIterator = Iterator<std::shared_ptr<ScanTask>>;
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,50 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#if defined(_WIN32) || defined(__CYGWIN__)
// Windows builds must explicitly export/import DLL symbols.
#  if defined(_MSC_VER)
// C4251: a dll-exported class member uses a non-exported type. Silenced here;
// the matching pop is at the bottom of this header.
#    pragma warning(push)
#    pragma warning(disable : 4251)
#  else
#    pragma GCC diagnostic ignored "-Wattributes"
#  endif

#  ifdef ARROW_DS_STATIC
// Static library build: no import/export decoration needed.
#    define ARROW_DS_EXPORT
#  elif defined(ARROW_DS_EXPORTING)
// Building the datasets DLL itself: export public symbols.
#    define ARROW_DS_EXPORT __declspec(dllexport)
#  else
// Consuming the datasets DLL: import public symbols.
#    define ARROW_DS_EXPORT __declspec(dllimport)
#  endif

#  define ARROW_DS_NO_EXPORT
#else  // Not Windows
// ELF/Mach-O targets: mark public symbols default-visible, internal ones hidden.
#  ifndef ARROW_DS_EXPORT
#    define ARROW_DS_EXPORT __attribute__((visibility("default")))
#  endif
#  ifndef ARROW_DS_NO_EXPORT
#    define ARROW_DS_NO_EXPORT __attribute__((visibility("hidden")))
#  endif
#endif  // Non-Windows

#if defined(_MSC_VER)
// Restore the warning state pushed above (MSVC implies the Windows branch ran).
#  pragma warning(pop)
#endif
|
||||
Reference in New Issue
Block a user